diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c7028e004..455e7c698 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -65,7 +65,7 @@ continuation of the sentence: Examples: llm/backend/mlx: support the llama architecture - CONTRIBUTING: provide clairity on good commit messages, and bad + CONTRIBUTING: provide clarity on good commit messages, and bad Bad Examples: diff --git a/README.md b/README.md index 1ea24e75f..d5049d3eb 100644 --- a/README.md +++ b/README.md @@ -410,6 +410,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.) - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.) - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.) +- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.) ### Cloud diff --git a/kvcache/causal.go b/kvcache/causal.go index b594d0b41..8b101a817 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -25,6 +25,9 @@ type Causal struct { opts CausalOptions + // maxBatch is the largest batch that we might receive + maxBatch int + // config controls mostly backend-specific optimizations config *ml.CacheConfig @@ -147,6 +150,7 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity c.DType = dtype c.cellRanges = make(map[int]cellRange) c.backend = backend + c.maxBatch = maxBatch } func (c *Causal) SetConfig(config ml.CacheConfig) { @@ -639,48 +643,51 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { return ErrNotSupported } - ctx := c.backend.NewContext() - defer ctx.Close() - seqRange := c.cellRanges[seq] - size := seqRange.max - seqRange.min + 1 - offsets := make([]int32, size) - for i := range offsets { - cell := c.cells[seqRange.min+i] + for start := seqRange.min; start <= seqRange.max; start += c.maxBatch { + ctx := c.backend.NewContext() - if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { - offsets[i] = offset + size := min(seqRange.max-start+1, c.maxBatch) + offsets := make([]int32, size) + for i := range offsets { + cell := c.cells[start+i] + + if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { + offsets[i] = offset + } } + + kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) + + for i, key := range c.keys { + if key == nil { + continue + } + + kHeadDim := key.Dim(0) + numKVHeads := key.Dim(1) + rowSize := key.Stride(2) + + key = key.View(ctx, rowSize*start, + kHeadDim, key.Stride(1), + numKVHeads, key.Stride(2), + size, + ) + + roped, err := c.shiftFn(ctx, i, key, kShift) + if err != nil { + ctx.Close() + return err + } + + ctx.Forward(roped.Copy(ctx, key)) + } + + ctx.Compute() + ctx.Close() } - kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) - - for i, key := range c.keys { - if key == nil { - continue - } - - kHeadDim := key.Dim(0) - numKVHeads := key.Dim(1) - rowSize := key.Stride(2) - - key = key.View(ctx, rowSize*seqRange.min, - kHeadDim, key.Stride(1), - numKVHeads, key.Stride(2), - size, - ) - - roped, err := c.shiftFn(ctx, i, key, kShift) - if err != nil { - return err - } - - ctx.Forward(roped.Copy(ctx, key)) - } - - ctx.Compute() - return nil }