Merge remote-tracking branch 'origin/main' into GraniteFour

* origin/main:
  readme: add Mayan EDMS to community integrations (#11543)
  kvcache: Group shift operations into batches
  CONTRIBUTING: fix typo in commit message example (#11528)
2025-07-28 10:33:49 -04:00 · 2025-07-28 10:33:49 -04:00 · 444c2bf248
parent 11a0d7376c bbf66c0b96
commit 444c2bf248
3 changed files with 45 additions and 37 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,7 +65,7 @@ continuation of the sentence:
 Examples:

      llm/backend/mlx: support the llama architecture
-      CONTRIBUTING: provide clairity on good commit messages, and bad
+      CONTRIBUTING: provide clarity on good commit messages, and bad

 Bad Examples:

--- a/README.md
+++ b/README.md
@@ -410,6 +410,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
+- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)

 ### Cloud

--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -25,6 +25,9 @@ type Causal struct {

 	opts CausalOptions

+	// maxBatch is the largest batch that we might receive
+	maxBatch int
+
 	// config controls mostly backend-specific optimizations
 	config *ml.CacheConfig

@@ -147,6 +150,7 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 	c.DType = dtype
 	c.cellRanges = make(map[int]cellRange)
 	c.backend = backend
+	c.maxBatch = maxBatch
 }

 func (c *Causal) SetConfig(config ml.CacheConfig) {
@@ -639,48 +643,51 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		return ErrNotSupported
 	}

-	ctx := c.backend.NewContext()
-	defer ctx.Close()
-
 	seqRange := c.cellRanges[seq]
-	size := seqRange.max - seqRange.min + 1

-	offsets := make([]int32, size)
-	for i := range offsets {
-		cell := c.cells[seqRange.min+i]
+	for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
+		ctx := c.backend.NewContext()

-		if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
-			offsets[i] = offset
+		size := min(seqRange.max-start+1, c.maxBatch)
+		offsets := make([]int32, size)
+		for i := range offsets {
+			cell := c.cells[start+i]
+
+			if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
+				offsets[i] = offset
+			}
 		}
+
+		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+
+		for i, key := range c.keys {
+			if key == nil {
+				continue
+			}
+
+			kHeadDim := key.Dim(0)
+			numKVHeads := key.Dim(1)
+			rowSize := key.Stride(2)
+
+			key = key.View(ctx, rowSize*start,
+				kHeadDim, key.Stride(1),
+				numKVHeads, key.Stride(2),
+				size,
+			)
+
+			roped, err := c.shiftFn(ctx, i, key, kShift)
+			if err != nil {
+				ctx.Close()
+				return err
+			}
+
+			ctx.Forward(roped.Copy(ctx, key))
+		}
+
+		ctx.Compute()
+		ctx.Close()
 	}

-	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
-
-	for i, key := range c.keys {
-		if key == nil {
-			continue
-		}
-
-		kHeadDim := key.Dim(0)
-		numKVHeads := key.Dim(1)
-		rowSize := key.Stride(2)
-
-		key = key.View(ctx, rowSize*seqRange.min,
-			kHeadDim, key.Stride(1),
-			numKVHeads, key.Stride(2),
-			size,
-		)
-
-		roped, err := c.shiftFn(ctx, i, key, kShift)
-		if err != nil {
-			return err
-		}
-
-		ctx.Forward(roped.Copy(ctx, key))
-	}
-
-	ctx.Compute()
-
 	return nil
 }