more linter feeding

feed the linter
fix causal test
2025-02-18 13:32:58 -08:00 · 2025-02-18 13:16:43 -08:00 · 2025-02-18 13:02:44 -08:00 · 2025-02-18 12:47:34 -08:00 · 2025-02-18 12:40:12 -08:00 · 2025-02-18 12:40:02 -08:00
23 changed files with 548 additions and 498 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -329,7 +329,9 @@ jobs:
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
+            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
+          done
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,3 @@ test_data
 __debug_bin*
 llama/build
 llama/vendor
-model/testdata/models/*
-!model/testdata/models/*.md
-!model/testdata/models/*.json
--- a/README.md
+++ b/README.md
@@ -381,6 +381,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
+- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)

 ### Cloud

@@ -548,6 +549,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
+- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (Telegram bot, primarily for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration, etc.)

 ### Supported backends

--- a/docs/windows.md
+++ b/docs/windows.md
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
 ## Troubleshooting

 Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `<cmd>+R` and type in:
+the explorer window by hitting `<Win>+R` and typing in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
    - *app.log* contains most resent logs from the GUI application
    - *server.log* contains the most recent server logs
--- a/format/format_test.go
+++ b/format/format_test.go
@@ -12,6 +12,9 @@ func TestHumanNumber(t *testing.T) {

 	testCases := []testCase{
 		{0, "0"},
+		{999, "999"},
+		{1000, "1K"},
+		{1001, "1K"},
 		{1000000, "1M"},
 		{125000000, "125M"},
 		{500500000, "500.50M"},
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -120,6 +120,15 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
 	return s
 }

+// Floats returns the float32 array stored under key.
+//
+// The lookup goes through keyValue with an empty array as its default, so the
+// result has r.size elements. defaultValue is accepted in the signature, like
+// the defaultValue parameter of Uints, but this implementation never reads it.
+// Every element is type-asserted to float32, so an array holding any other
+// element type panics.
+func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]float32, r.size)
+	for i := range r.size {
+		// The assertion already yields a float32; no further conversion is needed.
+		s[i] = r.values[i].(float32)
+	}
+	return s
+}
+
 func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
--- a/go.mod
+++ b/go.mod
@@ -18,6 +18,7 @@ require (
 	github.com/agnivade/levenshtein v1.1.1
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
+	github.com/emirpasic/gods v1.18.1
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
--- a/go.sum
+++ b/go.sum
@@ -44,6 +44,8 @@ github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
 github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
 github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
+github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
+github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
 github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -305,6 +305,10 @@ func (b *testBackend) NewContext() ml.Context {
 	return &testContext{}
 }

+// SystemInfo returns a fixed placeholder string; the test backend has no real
+// system information to report.
+func (b *testBackend) SystemInfo() string {
+	return "not implemented"
+}
+
 type testContext struct{}

 func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
@@ -430,7 +434,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
 	panic("not implemented")
 }

-func (t *testTensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
+func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim, ropeType uint32, base, scale float32) ml.Tensor {
 	panic("not implemented")
 }

--- a/ml/backend.go
+++ b/ml/backend.go
@@ -17,12 +17,14 @@ type Config interface {

 	Strings(string, ...[]string) []string
 	Uints(string, ...[]uint32) []uint32
+	Floats(string, ...[]float32) []float32
 }

 type Backend interface {
 	Config() Config
 	Get(name string) Tensor
 	NewContext() Context
+	SystemInfo() string
 }

 var backends = make(map[string]func(*os.File) (Backend, error))
@@ -43,42 +45,6 @@ func NewBackend(f *os.File) (Backend, error) {
 	return nil, fmt.Errorf("unsupported backend")
 }

-// RopeType specifies the type of RoPE (Rotary Position Embedding) to use, these types are implemented in the backend
-type RopeType int
-
-const (
-	RopeTypeStandard RopeType = iota
-	_                         // not yet used
-	RopeTypeNeoX
-)
-
-// RopeConfig contains all configuration for the RoPE (Rotary Position Embedding) operation
-type RopeConfig struct {
-	// PositionIDs contains the position indices for each token in the sequence
-	// These indices are used to calculate the rotary embeddings
-	PositionIDs Tensor
-
-	// RopeFactors is an optional tensor containing pre-computed rotation factors
-	RopeFactors Tensor
-
-	// RopeDim specifies the dimension size for the rotary embeddings
-	RopeDim uint32
-
-	// RopeType indicates which RoPE variant to use (e.g. normal or neox)
-	RopeType RopeType
-
-	// OrigCtxLen stores the original context length the model was trained with
-	OrigCtxLen int
-
-	// RopeBase is the base value used in the frequency calculation
-	RopeBase float32
-
-	// RopeScale is a scaling factor applied to position indices
-	RopeScale float32
-
-	// YaRN parameters can be added here if they need to be configurable
-}
-
 type Context interface {
 	Zeros(dtype DType, shape ...int) Tensor
 	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
@@ -111,7 +77,7 @@ type Tensor interface {
 	Scale(ctx Context, s float64) Tensor

 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, rc RopeConfig) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor

 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1,11 +1,27 @@
 package ggml

-// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
-// #include <stdlib.h>
-// #include <stdint.h>
-// #include "ggml.h"
-// #include "ggml-cpu.h"
-// #include "ggml-backend.h"
+/*
+#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+#include <stdlib.h>
+#include <stdint.h>
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-backend.h"
+
+// getBackendFeatures calls fp, cast to ggml_backend_get_features_t, with reg.
+static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
+
+// getNextBackendFeatures returns a pointer to the element after feature.
+static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
+
+// get_compiler reports which compiler built this preamble, based on the
+// predefined __clang__ / __GNUC__ macros. It is static inline so the
+// definition is always available to the cgo wrapper even when the call is not
+// inlined.
+typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
+static inline COMPILER get_compiler(void) {
+#if defined(__clang__)
+	return COMP_CLANG;
+#elif defined(__GNUC__)
+	return COMP_GCC;
+#else
+	return COMP_UNKNOWN;
+#endif
+}
+
+*/
 import "C"

 import (
@@ -579,9 +595,16 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	}
 }

-func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
-	if rc.RopeFactors == nil {
-		rc.RopeFactors = &Tensor{}
+// RoPE variant identifiers.
+// NOTE(review): these look like they mirror the GGML_ROPE_TYPE_* values in
+// ggml.h — confirm against the vendored header.
+const (
+	ropeTypeNorm   C.int = 0
+	ropeTypeNeox   C.int = 2
+	ropeTypeMrope  C.int = 8
+	ropeTypeVision C.int = 24
+)
+
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+	if ropeFactors == nil {
+		ropeFactors = &Tensor{}
 	}

 	dequant := t.t
@@ -591,15 +614,12 @@ func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {

 	return &Tensor{
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx,
-			dequant,
-			rc.PositionIDs.(*Tensor).t,
-			rc.RopeFactors.(*Tensor).t,
-			C.int(rc.RopeDim),
-			C.int(rc.RopeType),
-			C.int(rc.OrigCtxLen),
-			C.float(rc.RopeBase),
-			C.float(rc.RopeScale),
+			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			C.int(ropeDim),
+			C.int(ropeType),
+			131072, // YaRN n_ctx_train
+			C.float(ropeBase),
+			C.float(ropeScale),
 			0.,  // YaRN ext_factor
 			1.,  // YaRN attn_factor
 			32., // YaRN beta_fast
@@ -625,3 +645,34 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
 	}
 }
+
+// SystemInfo returns a one-line summary of the registered ggml backends.
+//
+// For every backend registration that exposes a "ggml_backend_get_features"
+// entry point, the result contains "<backend name> : " followed by
+// "<feature> = <value> | " for each feature the backend reports. The string
+// ends with the C compiler that built the cgo preamble, e.g. "cgo(gcc)".
+func (b *Backend) SystemInfo() string {
+	var compiler string
+	switch C.get_compiler() {
+	case C.COMP_UNKNOWN:
+		compiler = "cgo(unknown_compiler)"
+	case C.COMP_GCC:
+		compiler = "cgo(gcc)"
+	case C.COMP_CLANG:
+		compiler = "cgo(clang)"
+	}
+
+	// The symbol name is the same for every backend, so allocate the C string
+	// once instead of once per iteration with a deferred free piling up.
+	fName := C.CString("ggml_backend_get_features")
+	defer C.free(unsafe.Pointer(fName))
+
+	var s string
+	for i := range C.ggml_backend_reg_count() {
+		reg := C.ggml_backend_reg_get(i)
+		getFeaturesFn := C.ggml_backend_reg_get_proc_address(reg, fName)
+		if getFeaturesFn == nil {
+			// This backend does not export a feature list.
+			continue
+		}
+
+		s += C.GoString(C.ggml_backend_reg_name(reg))
+		s += " : "
+		// The feature array is terminated by an entry whose name is nil.
+		for features := C.getBackendFeatures(getFeaturesFn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
+			s += C.GoString(features.name)
+			s += " = "
+			s += C.GoString(features.value)
+			s += " | "
+		}
+	}
+	return s + compiler
+}
--- a/model/model.go
+++ b/model/model.go
@@ -21,6 +21,7 @@ import (
 	_ "github.com/ollama/ollama/ml/backend"
 )

+// Options contains the inputs for a model forward pass
 type Options struct {
 	Inputs    []int32
 	Positions []int32
@@ -34,11 +35,13 @@ type config struct {
 	Cache kvcache.Cache
 }

+// Base implements the common fields and methods for all models
 type Base struct {
 	b ml.Backend
 	config
 }

+// Backend returns the underlying backend that will run the model
 func (m *Base) Backend() ml.Backend {
 	return m.b
 }
@@ -47,6 +50,7 @@ func (m *Base) Config() config {
 	return m.config
 }

+// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, Options) (ml.Tensor, error)

@@ -56,6 +60,7 @@ type Model interface {

 var models = make(map[string]func(ml.Config) (Model, error))

+// Register registers a model constructor for the given architecture
 func Register(name string, f func(ml.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
@@ -64,8 +69,9 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 	models[name] = f
 }

-func New(s string) (Model, error) {
-	r, err := os.Open(s)
+// New initializes a new model instance with the provided configuration based on the metadata in the model file
+func New(modelPath string) (Model, error) {
+	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
--- a/model/model_external_test.go
+++ b/model/model_external_test.go
@@ -1,138 +0,0 @@
-// Package model_test provides external tests for the model package.
-// This test file specifically tests the forward pass functionality on models.
-// It is in a separate package (model_test) to avoid import cycles while still
-// being able to test the public API of the model package.
-package model_test
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-	"strings"
-	"testing"
-
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/sample"
-
-	_ "github.com/ollama/ollama/model/models"
-)
-
-type modelTest struct {
-	Prompt            string   `json:"prompt"`
-	OutputContainsOne []string `json:"output_contains_one"`
-}
-
-func TestForwardSimple(t *testing.T) {
-	if testing.Short() {
-		t.Skip("skipping in short mode")
-	}
-
-	// Read all JSON files from testdata/models
-	files, err := os.ReadDir("testdata/models")
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	for _, file := range files {
-		if !strings.HasSuffix(file.Name(), ".json") {
-			continue
-		}
-
-		jsonPath := filepath.Join("testdata/models", file.Name())
-		ggufPath := filepath.Join("testdata/models", strings.TrimSuffix(file.Name(), ".json")+".gguf")
-
-		// Skip if no corresponding .gguf file exists
-		if _, err := os.Stat(ggufPath); err != nil {
-			t.Logf("skipping %s: no corresponding GGUF file found", file.Name())
-			continue
-		}
-
-		data, err := os.ReadFile(jsonPath)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		var test modelTest
-		if err := json.Unmarshal(data, &test); err != nil {
-			t.Fatal(err)
-		}
-
-		t.Run(strings.TrimSuffix(file.Name(), ".json"), func(t *testing.T) {
-			m, err := model.New(ggufPath)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
-
-			inputs, err := m.(model.TextProcessor).Encode(test.Prompt)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			var result []string
-			for len(result) < 100 { // Limit to 100 tokens max
-				options := model.Options{
-					Inputs:    inputs,
-					Positions: make([]int32, len(inputs)),
-					Sequences: make([]int, len(inputs)),
-					Outputs:   []int32{int32(len(inputs) - 1)},
-				}
-				for i := range options.Positions {
-					options.Positions[i] = int32(i)
-					options.Sequences[i] = 0
-				}
-
-				ctx := m.Backend().NewContext()
-
-				modelOutput, err := model.Forward(ctx, m, options)
-				if err != nil {
-					ctx.Close()
-					t.Fatal(fmt.Errorf("forward pass failed: %v", err))
-				}
-
-				f32s := modelOutput.Floats()
-				logits := make([]float64, len(f32s))
-				for i, f32 := range f32s {
-					logits[i] = float64(f32)
-				}
-
-				token, err := sample.Sample(logits, sample.Greedy())
-				if err != nil {
-					ctx.Close()
-					t.Fatal(fmt.Errorf("sampling failed: %v", err))
-				}
-
-				ctx.Close()
-
-				// Greedy sampling: take the token with the highest logit
-				nextToken := int32(token[0])
-				if m.(model.TextProcessor).Is(nextToken, model.SpecialEOS) {
-					break
-				}
-
-				piece, err := m.(model.TextProcessor).Decode([]int32{nextToken})
-				if err != nil {
-					t.Fatal(err)
-				}
-
-				result = append(result, piece)
-				output := strings.Join(result, "")
-
-				for _, expectedOutput := range test.OutputContainsOne {
-					if strings.Contains(output, expectedOutput) {
-						t.Logf("Test passed with output: %q (matched expected: %q)", output, expectedOutput)
-						return
-					}
-				}
-
-				// Maintain full context by appending new token
-				inputs = append(inputs, nextToken)
-			}
-
-			t.Fatalf("Expected output containing one of %q but got: %q", test.OutputContainsOne, strings.Join(result, ""))
-		})
-	}
-}
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -0,0 +1,193 @@
+package gemma2
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+// Options holds the gemma2 hyperparameters that New reads from the model
+// config and that the layers consume during the forward pass.
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	attnKeyLen, attnValLen           int
+	eps, ropeBase, ropeScale         float32
+	attnLogitSoftcap                 float32 // soft cap applied to attention scores (cap * tanh(x / cap))
+	finalLogitSoftcap                float32 // soft cap applied to the final logits (cap * tanh(x / cap))
+}
+
+// Model is the gemma2 text model: a token embedding, a stack of Layers, a
+// final norm and an output projection, with a SentencePiece tokenizer and the
+// shared model.Base embedded.
+type Model struct {
+	model.Base
+	model.SentencePieceModel
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`           // RMS (root mean square) norm applied before the output projection
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"` // NOTE(review): presumably falls back to the token_embd tensor when no output tensor exists — confirm the alt tag semantics
+
+	*Options
+}
+
+// New builds a gemma2 Model from the config c. It creates the SentencePiece
+// tokenizer from the tokenizer.* keys, allocates one Layer per block_count,
+// fills Options from the architecture keys, and installs the KV cache.
+// The returned error is always nil.
+func New(c ml.Config) (model.Model, error) {
+	m := Model{
+		SentencePieceModel: model.NewSentencePieceModel(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+			},
+		),
+		Layers: make([]Layer, c.Uint("block_count")),
+		Options: &Options{
+			hiddenSize:        int(c.Uint("embedding_length")),
+			numHeads:          int(c.Uint("attention.head_count")),
+			numKVHeads:        int(c.Uint("attention.head_count_kv")),
+			attnKeyLen:        int(c.Uint("attention.key_length")),
+			attnValLen:        int(c.Uint("attention.value_length")),
+			eps:               c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:          c.Float("rope.freq_base", 10000.0),
+			ropeScale:         c.Float("rope.freq_scale", 1.0),
+			attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
+			finalLogitSoftcap: c.Float("final_logit_softcapping"),
+		},
+	}
+
+	// Wrap a sliding-window cache and a causal cache together; both use
+	// m.Shift as their shift callback.
+	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
+	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))
+
+	return &m, nil
+}
+
+// SelfAttention holds the query, key, value and output projections of one
+// attention block, bound to the attn_* tensors via the gguf tags.
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+// Forward computes self-attention for one layer. It projects hiddenState to
+// query, key and value, applies RoPE to the query and key, stores the new key
+// and value in cache, then attends over whatever cache.Get returns. The
+// attention scores are soft-capped before the mask and softmax are applied,
+// and the result is passed through the output projection.
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	ropeType := uint32(2) // 2 is the value of ropeTypeNeox in the ggml backend
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+
+	// todo: this should be 1.0/math.Sqrt(float64(headDim)) for 27B models
+	q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
+
+	// Store this batch's key/value, then read back the key, value and mask to attend over.
+	cache.Put(ctx, k, v)
+	k, v, mask := cache.Get(ctx)
+
+	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	kq := k.Mulmat(ctx, q)
+
+	// logit softcap: cap * tanh(kq / cap)
+	kq = kq.Scale(ctx, 1.0/float64(opts.attnLogitSoftcap))
+	kq = kq.Tanh(ctx)
+	kq = kq.Scale(ctx, float64(opts.attnLogitSoftcap))
+
+	kq = kq.Add(ctx, mask)
+	kq = kq.Softmax(ctx)
+
+	kqv := v.Mulmat(ctx, kq)
+	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+// Shift applies RoPE to key using shift as the position tensor, with the same
+// dimension, rope type (2), base and scale that SelfAttention.Forward uses.
+// New passes it to both caches as their shift callback. layer is unused and
+// the returned error is always nil.
+func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
+}
+
+// MLP holds the up, down and gate projections of a layer's feed-forward
+// block, bound to the ffn_* tensors via the gguf tags.
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+// Forward runs the gated feed-forward block: the GELU-activated gate
+// projection is multiplied by the up projection and the product is passed
+// through the down projection. opts is not used.
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+	gated := mlp.Gate.Forward(ctx, hiddenState).GELU(ctx)
+	up := mlp.Up.Forward(ctx, hiddenState)
+	return mlp.Down.Forward(ctx, gated.Mul(ctx, up))
+}
+
+// Layer is one gemma2 block: self-attention and an MLP, each with a norm
+// applied before and after it.
+type Layer struct {
+	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention     *SelfAttention
+	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"`
+	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP               *MLP
+	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
+}
+
+// Forward runs one block over hiddenState. Each of the two sub-blocks
+// (attention, then MLP) is normalized on the way in, normalized again on the
+// way out, and added back to the sub-block's own input as a residual.
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	// Attention sub-block; hiddenState itself is the residual.
+	attn := l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	attn = l.SelfAttention.Forward(ctx, attn, positionIDs, cache, opts)
+	attn = l.PostAttentionNorm.Forward(ctx, attn, opts.eps)
+	afterAttn := attn.Add(ctx, hiddenState)
+
+	// MLP sub-block; the attention output is the residual.
+	ffn := l.MLPNorm.Forward(ctx, afterAttn, opts.eps)
+	ffn = l.MLP.Forward(ctx, ffn, opts)
+	ffn = l.PostMLPNorm.Forward(ctx, ffn, opts.eps)
+	return ffn.Add(ctx, afterAttn)
+}
+
+// Forward runs the model over opts.Inputs at opts.Positions and returns the
+// soft-capped logits for the rows selected by opts.Outputs.
+//
+// It returns an error only when one of the input slices cannot be converted
+// to a tensor. It panics if m.Cache is not a *kvcache.WrapperCache, which is
+// the type New installs.
+func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+	inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
+
+	// The cache's concrete type is the same for every layer, so resolve it
+	// once rather than on each iteration.
+	wc := m.Cache.(*kvcache.WrapperCache)
+	for i := range m.Layers {
+		m.Cache.SetLayer(i)
+		// Layer type alternates: 0 for even layers, 1 for odd layers.
+		wc.SetLayerType(i % 2)
+		hiddenState = m.Layers[i].Forward(ctx, hiddenState, positions, m.Cache, m.Options)
+	}
+
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	hiddenState = m.Output.Forward(ctx, hiddenState)
+
+	// final logit softcap: cap * tanh(logits / cap)
+	hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap))
+	hiddenState = hiddenState.Tanh(ctx)
+	hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap))
+
+	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
+	if err != nil {
+		return nil, err
+	}
+
+	return hiddenState.Rows(ctx, outputs), nil
+}
+
+// init registers New as the constructor for the "gemma2" architecture.
+func init() {
+	model.Register("gemma2", New)
+}
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -10,10 +10,10 @@ import (
 )

 type Options struct {
-	RopeFactors                              ml.Tensor `gguf:"rope_freqs.weight"`
-	ctxLen, hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale                 float32
-	ropeDim                                  uint32
+	RopeFactors                      ml.Tensor `gguf:"rope_freqs.weight"`
+	hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32
 }

 type Model struct {
@@ -46,7 +46,6 @@ func New(c ml.Config) (model.Model, error) {
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ctxLen:     int(c.Uint("context_length")),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
 			ropeDim:    c.Uint("rope.dimension_count"),
@@ -68,23 +67,15 @@ type SelfAttention struct {
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	rc := ml.RopeConfig{
-		PositionIDs: positionIDs,
-		RopeFactors: opts.RopeFactors,
-		RopeDim:     opts.ropeDim,
-		RopeType:    ml.RopeTypeStandard,
-		OrigCtxLen:  opts.ctxLen,
-		RopeBase:    opts.ropeBase,
-		RopeScale:   opts.ropeScale,
-	}
+	ropeType := uint32(0)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, rc)
+	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, rc)
+	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -109,18 +100,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(
-		ctx,
-		ml.RopeConfig{
-			PositionIDs: shift,
-			RopeFactors: m.Options.RopeFactors,
-			RopeDim:     m.Options.ropeDim,
-			RopeType:    ml.RopeTypeStandard,
-			OrigCtxLen:  m.Options.ctxLen,
-			RopeBase:    m.Options.ropeBase,
-			RopeScale:   m.Options.ropeScale,
-		},
-	), nil
+	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, uint32(0), m.Options.ropeBase, m.Options.ropeScale), nil
 }

 type MLP struct {
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -19,23 +19,15 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	rc := ml.RopeConfig{
-		PositionIDs: positions,
-		RopeFactors: opts.RopeFactors,
-		RopeDim:     opts.ropeDim,
-		RopeType:    ml.RopeTypeStandard,
-		OrigCtxLen:  opts.ctxLen,
-		RopeBase:    opts.ropeBase,
-		RopeScale:   opts.ropeScale,
-	}
+	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, rc)
+	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, rc)
+	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -61,18 +53,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
-	return key.RoPE(
-		ctx,
-		ml.RopeConfig{
-			PositionIDs: shift,
-			RopeFactors: m.RopeFactors,
-			RopeDim:     m.ropeDim,
-			RopeType:    ml.RopeTypeStandard,
-			OrigCtxLen:  m.ctxLen,
-			RopeBase:    m.ropeBase,
-			RopeScale:   m.ropeScale,
-		},
-	), nil
+	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 }

 type TextMLP struct {
@@ -209,9 +190,9 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, cr
 type TextModelOptions struct {
 	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`

-	ctxLen, hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale                 float32
-	ropeDim                                  uint32
+	hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32

 	crossAttentionLayers []uint32
 }
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -1,7 +1,7 @@
 package models

 import (
+	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/mllama"
-	_ "github.com/ollama/ollama/model/models/qwen2"
 )
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -1,222 +0,0 @@
-package qwen2
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
-)
-
-type Options struct {
-	RopeFactors    ml.Tensor `gguf:"rope_freqs.weight"`
-	contextLength  int
-	hiddenSize     int
-	numAttnHeads   int
-	numKVHeads     int
-	modelEpsilon   float32
-	ropeBaseFreq   float32
-	ropeFreqScale  float32
-	ropeDimensions uint32
-}
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
-	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
-
-	*Options
-}
-
-func New(c ml.Config) (model.Model, error) {
-	m := &Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Uints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-			},
-		),
-		Layers: make([]Layer, c.Uint("block_count")),
-		Options: &Options{
-			hiddenSize:     int(c.Uint("embedding_length")),
-			numAttnHeads:   int(c.Uint("attention.head_count")),
-			numKVHeads:     int(c.Uint("attention.head_count_kv")),
-			modelEpsilon:   c.Float("attention.layer_norm_rms_epsilon"),
-			contextLength:  int(c.Uint("context_length")),
-			ropeBaseFreq:   c.Float("rope.freq_base"),
-			ropeFreqScale:  c.Float("rope.freq_scale", 1),
-			ropeDimensions: c.Uint("rope.dimension_count", 64),
-		},
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-
-	return m, nil
-}
-
-// Shift applies rotary position embeddings to the key tensor for causal attention caching
-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(
-		ctx,
-		ml.RopeConfig{
-			PositionIDs: shift,
-			RopeFactors: m.Options.RopeFactors,
-			RopeDim:     m.Options.ropeDimensions,
-			RopeType:    ml.RopeTypeNeoX,
-			OrigCtxLen:  m.Options.contextLength,
-			RopeBase:    m.Options.ropeBaseFreq,
-			RopeScale:   m.Options.ropeFreqScale,
-		},
-	), nil
-}
-
-// SelfAttention implements the multi-head self-attention mechanism
-// with separate projections for query, key, value and output transformations
-type SelfAttention struct {
-	Query  *nn.Linear `gguf:"attn_q"`
-	Key    *nn.Linear `gguf:"attn_k"`
-	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
-}
-
-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, inputPositions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	// Initialize dimensions and configuration
-	batchSize := hiddenState.Dim(1)
-	headDimension := opts.hiddenSize / opts.numAttnHeads
-	ropeConfig := ml.RopeConfig{
-		PositionIDs: inputPositions,
-		RopeFactors: nil,
-		RopeDim:     opts.ropeDimensions,
-		RopeType:    ml.RopeTypeNeoX,
-		OrigCtxLen:  opts.contextLength,
-		RopeBase:    opts.ropeBaseFreq,
-		RopeScale:   opts.ropeFreqScale,
-	}
-
-	// Project and reshape query states with rotary embeddings
-	queryStates := sa.Query.Forward(ctx, hiddenState)
-	queryStates = queryStates.Reshape(ctx, headDimension, opts.numAttnHeads, batchSize)
-	queryStates = queryStates.RoPE(ctx, ropeConfig)
-
-	// Project and reshape key states with rotary embeddings
-	keyStates := sa.Key.Forward(ctx, hiddenState)
-	keyStates = keyStates.Reshape(ctx, headDimension, opts.numKVHeads, batchSize)
-	keyStates = keyStates.RoPE(ctx, ropeConfig)
-
-	// Project and reshape value states
-	valueStates := sa.Value.Forward(ctx, hiddenState)
-	valueStates = valueStates.Reshape(ctx, headDimension, opts.numKVHeads, batchSize)
-
-	// Update and retrieve from KV cache
-	cache.Put(ctx, keyStates, valueStates)
-	keyStates, valueStates, attentionMask := cache.Get(ctx)
-
-	// Prepare tensors for attention computation
-	queryStates = queryStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	keyStates = keyStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	valueStates = valueStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-
-	// Apply scaling and attention mask to scores
-	attentionScores := keyStates.MulmatFullPrec(ctx, queryStates)
-	attentionScores = attentionScores.Scale(ctx, 1.0/math.Sqrt(float64(headDimension)))
-	attentionScores = attentionScores.Add(ctx, attentionMask)
-	// Compute scaled dot-product attention
-	attentionProbs := attentionScores.Softmax(ctx)
-
-	// Apply attention weights and reshape
-	weightedStates := valueStates.Mulmat(ctx, attentionProbs)
-	weightedStates = weightedStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	weightedStates = weightedStates.Reshape(ctx, opts.hiddenSize, batchSize)
-
-	// Project to output dimension
-	return sa.Output.Forward(ctx, weightedStates)
-}
-
-// MLP implements the feed-forward network component with SwiGLU activation
-type MLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-	Gate *nn.Linear `gguf:"ffn_gate"`
-}
-
-func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
-	// Apply SwiGLU activation gating
-	gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
-	upProjection := mlp.Up.Forward(ctx, hiddenState)
-	intermediateStates := gateActivation.Mul(ctx, upProjection)
-
-	// Project back to hidden dimension
-	return mlp.Down.Forward(ctx, intermediateStates)
-}
-
-// Layer represents a single transformer layer combining self-attention and feed-forward components
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	SelfAttention *SelfAttention
-	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP           *MLP
-}
-
-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	// Self-attention branch with residual connection
-	residual := hiddenState
-
-	normalizedAttention := l.AttentionNorm.Forward(ctx, hiddenState, opts.modelEpsilon)
-	attentionOutput := l.SelfAttention.Forward(ctx, normalizedAttention, positionIDs, cache, opts)
-	hiddenState = attentionOutput.Add(ctx, residual)
-
-	// Feed-forward branch with residual connection
-	residual = hiddenState
-	normalizedMLP := l.MLPNorm.Forward(ctx, hiddenState, opts.modelEpsilon)
-	mlpOutput := l.MLP.Forward(ctx, normalizedMLP, opts)
-	output := mlpOutput.Add(ctx, residual)
-
-	return output
-}
-
-func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
-	// Convert input tokens and positions to tensors
-	inputTensor, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
-	if err != nil {
-		return nil, err
-	}
-
-	positionsTensor, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	// Initial token embedding
-	hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)
-
-	// Process through transformer layers
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-		hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
-	}
-
-	// Final layer normalization and output projection
-	normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
-	logits := m.Output.Forward(ctx, normalizedOutput)
-
-	// Extract requested output token positions
-	outputsTensor, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
-	if err != nil {
-		return nil, err
-	}
-
-	return logits.Rows(ctx, outputsTensor), nil
-}
-
-func init() {
-	model.Register("qwen2", New)
-}
--- a/model/process_text.go
+++ b/model/process_text.go
@@ -18,6 +18,15 @@ const (
 	SpecialEOS
 )

+// Token type identifiers compared against the entries of Vocabulary.Types.
+// The values start at 1 (iota + 1), so TOKEN_TYPE_CONTROL is 3.
+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
 type TextProcessor interface {
 	Encode(string) ([]int32, error)
 	Decode([]int32) (string, error)
@@ -27,7 +36,7 @@ type TextProcessor interface {
 type Vocabulary struct {
 	Values []string
 	Types  []uint32
-	Scores []uint32
+	Scores []float32
 	Merges []string

 	BOS, EOS int32
@@ -75,7 +84,7 @@ func (v *Vocabulary) Decode(id int32) string {
 func (v *Vocabulary) SpecialVocabulary() []string {
 	v.specialOnce.Do(func() {
 		for i := range v.Values {
-			if v.Types[i] == 3 {
+			if v.Types[i] == TOKEN_TYPE_CONTROL {
 				v.special = append(v.special, v.Values[i])
 			}
 		}
--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -0,0 +1,220 @@
+package model
+
+import (
+	"iter"
+	"log/slog"
+	"strings"
+
+	"github.com/dlclark/regexp2"
+	queue "github.com/emirpasic/gods/queues/priorityqueue"
+)
+
+// spmWhitespaceSep is the marker that stands in for a space character in
+// tokenized text.
+const spmWhitespaceSep = "▁"
+
+// replaceWhitespaceBySeperator returns s with every ASCII space replaced by
+// spmWhitespaceSep. Other whitespace characters are left untouched.
+func replaceWhitespaceBySeperator(s string) string {
+	return strings.ReplaceAll(s, " ", spmWhitespaceSep)
+}
+
+// SentencePieceModel is a text processor that encodes and decodes text
+// against a scored Vocabulary.
+type SentencePieceModel struct {
+	// maxTokenLen is the longest token value, in bytes, among the normal,
+	// user-defined and unused tokens seen by NewSentencePieceModel.
+	maxTokenLen int
+	// pre is the compiled pre-tokenizer pattern used to split input text.
+	pre *regexp2.Regexp
+	// vocab holds the token values, types and scores.
+	vocab *Vocabulary
+}
+
+// NewSentencePieceModel builds a SentencePieceModel from the pre-tokenizer
+// pattern pre and the vocabulary vocab.
+//
+// It records the longest token (in bytes) among normal, user-defined and
+// unused tokens and logs per-type token counts at debug level. The pattern is
+// compiled with regexp2.MustCompile, so an invalid pattern panics.
+func NewSentencePieceModel(pre string, vocab *Vocabulary) SentencePieceModel {
+	// Preview at most the first three entries of each slice; a fixed [:3]
+	// would panic on a vocabulary with fewer than three tokens.
+	slog.Debug("Tokens", "num tokens", len(vocab.Values),
+		"vals", vocab.Values[:min(3, len(vocab.Values))],
+		"scores", vocab.Scores[:min(3, len(vocab.Scores))],
+		"types", vocab.Types[:min(3, len(vocab.Types))])
+
+	counter := map[int]int{}
+	var maxTokenLen int
+	for i, typ := range vocab.Types {
+		// Only these token types contribute to the maximum token length;
+		// every type is counted.
+		switch typ {
+		case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
+			maxTokenLen = max(maxTokenLen, len(vocab.Values[i]))
+		}
+		counter[int(typ)]++
+	}
+
+	slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
+		"max token len", maxTokenLen)
+
+	return SentencePieceModel{
+		maxTokenLen: maxTokenLen,
+		pre:         regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
+		vocab:       vocab,
+	}
+}
+
+// Is reports whether id is the given special token; it delegates to the
+// underlying vocabulary.
+func (spm SentencePieceModel) Is(id int32, special Special) bool {
+	return spm.vocab.Is(id, special)
+}
+
+// split returns an iterator over the successive matches of the pre-tokenizer
+// pattern in s. Iteration stops when the consumer stops or no match remains.
+func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
+	return func(yield func(string) bool) {
+		// Errors returned by the regexp engine are discarded; the loop only
+		// looks at whether a match was returned.
+		for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {
+			if !yield(m.String()) {
+				break
+			}
+		}
+	}
+}
+
+// Encode tokenizes s into vocabulary ids.
+//
+// Special (control) tokens that appear verbatim in s are mapped directly to
+// their ids. The remaining text is split with the pre-tokenizer pattern,
+// spaces are replaced by the whitespace marker, and each split is either
+// looked up whole or assembled from single runes by repeatedly merging the
+// best-scoring adjacent pair that exists in the vocabulary. Pieces left
+// without a vocabulary entry are logged at debug level and dropped. The
+// returned error is always nil.
+func (spm SentencePieceModel) Encode(s string) ([]int32, error) {
+	fragments := []fragment{{value: s}}
+	for _, special := range spm.vocab.SpecialVocabulary() {
+		// TODO: process special tokens concurrently
+		id := spm.vocab.Encode(special)
+		for i := 0; i < len(fragments); i++ {
+			frag := fragments[i]
+			if len(frag.ids) > 0 {
+				// Already resolved to a special token.
+				continue
+			}
+
+			// Split the fragment around the first occurrence of the special
+			// token; any remainder is revisited on a later iteration.
+			var middle []fragment
+			switch at := strings.Index(frag.value, special); {
+			case at < 0:
+				middle = append(middle, frag)
+			case at > 0:
+				middle = append(middle, fragment{value: frag.value[:at]})
+				fallthrough
+			default:
+				middle = append(middle, fragment{value: special, ids: []int32{id}})
+				if rest := frag.value[at+len(special):]; rest != "" {
+					middle = append(middle, fragment{value: rest})
+				}
+			}
+
+			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
+		}
+	}
+	slog.Debug("fragments", "frags", fragments)
+
+	var ids []int32
+	for _, frag := range fragments {
+		if len(frag.ids) > 0 {
+			ids = append(ids, frag.ids...)
+			continue
+		}
+
+		for split := range spm.split(frag.value) {
+			split = replaceWhitespaceBySeperator(split)
+
+			// Fast path: the whole split is a single vocabulary entry.
+			if id := spm.vocab.Encode(split); id >= 0 {
+				ids = append(ids, id)
+				continue
+			}
+
+			runes := []rune(split)
+
+			// The queue dequeues the element the comparator ranks lowest, so
+			// the preferred candidate (higher score, then leftmost) must
+			// compare as smaller.
+			pq := queue.NewWith(func(a, b any) int {
+				priA := a.(*candidate)
+				priB := b.(*candidate)
+				if priA.score > priB.score || (priA.score == priB.score && priA.a < priB.a) {
+					return -1
+				}
+				return 1
+			})
+
+			// merges is a doubly linked list over the runes; p and n are the
+			// indices of the previous and next live entries.
+			merges := make([]merge, len(runes))
+			for r := range runes {
+				merges[r] = merge{
+					p:     r - 1,
+					n:     r + 1,
+					runes: []rune{runes[r]},
+				}
+			}
+
+			// pairwise returns a merge candidate for entries a and b when
+			// their concatenation is a vocabulary token, and nil otherwise.
+			pairwise := func(a, b int) *candidate {
+				if a < 0 || b >= len(runes) {
+					return nil
+				}
+
+				left, right := string(merges[a].runes), string(merges[b].runes)
+				if id := spm.vocab.Encode(left + right); id >= 0 {
+					return &candidate{
+						a: a,
+						b: b,
+						// Byte size of the pair at creation time, used to
+						// detect candidates that have gone stale.
+						length: len(left) + len(right),
+						score:  spm.vocab.Scores[id],
+					}
+				}
+				return nil
+			}
+
+			for i := range len(runes) - 1 {
+				if pair := pairwise(i, i+1); pair != nil {
+					pq.Enqueue(pair)
+				}
+			}
+
+			pqv := pq.Values()
+			for _, v := range pqv {
+				e := v.(*candidate)
+				slog.Debug("candidate", "candidate", e)
+			}
+
+			for !pq.Empty() {
+				v, _ := pq.Dequeue()
+				pair := v.(*candidate)
+				left, right := merges[pair.a], merges[pair.b]
+
+				// Skip stale candidates: one side was consumed by an earlier
+				// merge, or one side has grown since the candidate was
+				// queued, in which case the concatenation was never checked
+				// against the vocabulary.
+				if len(left.runes) == 0 || len(right.runes) == 0 {
+					continue
+				}
+				if len(string(left.runes))+len(string(right.runes)) != pair.length {
+					continue
+				}
+
+				merges[pair.a].runes = append(left.runes, right.runes...)
+				merges[pair.b].runes = nil
+				merges[pair.a].n = right.n
+				if right.n < len(merges) {
+					merges[right.n].p = pair.a
+				}
+
+				// The merged entry may now pair with either neighbour.
+				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
+					pq.Enqueue(pair)
+				}
+
+				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
+					pq.Enqueue(pair)
+				}
+			}
+
+			slog.Debug("merges", "merges", merges)
+
+			for _, m := range merges {
+				if len(m.runes) > 0 {
+					if id := spm.vocab.Encode(string(m.runes)); id >= 0 {
+						ids = append(ids, id)
+					} else {
+						slog.Debug("missing token", "token", string(m.runes))
+					}
+				}
+			}
+		}
+	}
+	slog.Debug("encoded", "ids", ids)
+
+	return ids, nil
+}
+
+// candidate is a proposed merge of two adjacent entries, queued by Encode.
+type candidate struct {
+	// a and b are the indices of the left and right entries to merge.
+	a, b int
+	// score is the vocabulary score of the token the merge would produce.
+	score float32
+	// length is a size of the pair's text recorded when the candidate was
+	// created.
+	length int
+}
+
+// Decode turns ids back into text by concatenating the vocabulary entry of
+// each id, with every whitespace marker converted back to a plain space.
+func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
+	var out strings.Builder
+	for i := range ids {
+		piece := strings.ReplaceAll(spm.vocab.Decode(ids[i]), spmWhitespaceSep, " ")
+		if _, err := out.WriteString(piece); err != nil {
+			return "", err
+		}
+	}
+
+	text := out.String()
+	slog.Debug("decoded", "ids", ids, "text", text)
+	return text, nil
+}
--- a/model/testdata/models/README.md
+++ b/model/testdata/models/README.md
@@ -1,10 +0,0 @@
-# Test Model Directory
-
-This directory is used for storing model files (like `.gguf` files) that are required to run the tests in `model_external_test.go`. 
-
-## Usage
-
- Place any model files you need for testing in this directory
- The test file will look for any model files here (e.g., `llama3.gguf`)
- All non-markdown files in this directory are git-ignored to prevent large model files from being committed to the repository
- Only `.md` files (like this README) will be tracked in git
--- a/model/testdata/models/qwen2_5.json
+++ b/model/testdata/models/qwen2_5.json
@@ -1,7 +0,0 @@
-{
-  "prompt": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\n",
-  "output_contains_one": [
-    "Hello",
-    "Hi"
-  ]
-}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -813,6 +813,8 @@ func (s *Server) loadModel(
 		panic(err)
 	}

+	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
+
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
@@ -881,7 +883,6 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting ollama engine")
-	// TODO(jessegross): Some system info would be useful

 	server := &Server{
 		batchSize: *batchSize,
Author	SHA1	Message	Date
Patrick Devine	b7349a4efd	more linter feeding	2025-02-18 13:32:58 -08:00
Patrick Devine	4cda3e3622	feed the linter	2025-02-18 13:16:43 -08:00
Patrick Devine	95fbf1da12	fix causal test	2025-02-18 13:02:44 -08:00
Patrick Devine	83d1a1ab55	cleanup	2025-02-18 12:47:34 -08:00
Patrick Devine	035e69799e	clean up	2025-02-18 12:40:12 -08:00
Patrick Devine	10e06d0a45	gemma2 ftw	2025-02-18 12:40:02 -08:00
Patrick Devine	8cf1ea4fd8	add sentence piece tokenizer	2025-02-18 12:39:45 -08:00
Patrick Devine	d231229122	cache is king	2025-02-18 12:39:27 -08:00
Patrick Devine	fad98fabab	gemma2 impl	2025-02-18 12:39:17 -08:00
Michael Yang	7b5d916a9a	ci: set owner/group in tarball set owner and group when building the linux tarball so extracted files are consistent. this is the behaviour of release tarballs in version 0.5.7 and lower	2025-02-18 20:11:09 +00:00
benhaotang	33ad61b112	Add OpenDeepResearcher-via-searxng to Community Integrations (#9138 )	2025-02-18 11:39:11 -08:00
L. Jiang	716e365615	test: add test cases for HumanNumber (#9108 )	2025-02-18 11:35:26 -08:00
innightwolfsleep	3b4424ff98	readme: add LLM Telegram Bot to community integrations (#9150 )	2025-02-18 10:04:30 -05:00
James-William-Kincaid-III	0667baddc6	docs: fix incorrect shortcut key in windows.md (#9098 )	2025-02-15 15:38:24 -05:00
Bruce MacDonald	d006e1e09b	model: document high-level model interface (#9122 )	2025-02-14 16:01:00 -08:00
Daniel Hiltgen	df2680b4b9	Wire up system info log for new engine (#9123 )	2025-02-14 15:55:33 -08:00