more linter feeding

feed the linter
fix causal test
2025-02-18 13:32:58 -08:00 · 2025-02-18 13:16:43 -08:00 · 2025-02-18 13:02:44 -08:00 · 2025-02-18 12:47:34 -08:00 · 2025-02-18 12:40:12 -08:00 · 2025-02-18 12:40:02 -08:00
22 changed files with 542 additions and 31 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -329,7 +329,9 @@ jobs:
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
+            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
+          done
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
--- a/README.md
+++ b/README.md
@@ -381,6 +381,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
+- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)

 ### Cloud

@@ -548,6 +549,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
+- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)

 ### Supported backends

--- a/discover/path.go
+++ b/discover/path.go
@@ -19,6 +19,10 @@ var LibOllamaPath string = func() string {
 		return ""
 	}

+	if eval, err := filepath.EvalSymlinks(exe); err == nil {
+		exe = eval
+	}
+
 	var libPath string
 	switch runtime.GOOS {
 	case "windows":
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
 ## Troubleshooting

 Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `<cmd>+R` and type in:
+the explorer window by hitting `<Ctrl>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
    - *app.log* contains most resent logs from the GUI application
    - *server.log* contains the most recent server logs
--- a/format/format_test.go
+++ b/format/format_test.go
@@ -12,6 +12,9 @@ func TestHumanNumber(t *testing.T) {

 	testCases := []testCase{
 		{0, "0"},
+		{999, "999"},
+		{1000, "1K"},
+		{1001, "1K"},
 		{1000000, "1M"},
 		{125000000, "125M"},
 		{500500000, "500.50M"},
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -120,6 +120,15 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
 	return s
 }

+func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]float32, r.size)
+	for i := range r.size {
+		s[i] = float32(r.values[i].(float32))
+	}
+	return s
+}
+
 func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
--- a/go.mod
+++ b/go.mod
@@ -18,6 +18,7 @@ require (
 	github.com/agnivade/levenshtein v1.1.1
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
+	github.com/emirpasic/gods v1.18.1
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
--- a/go.sum
+++ b/go.sum
@@ -44,6 +44,8 @@ github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
 github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
 github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
+github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
+github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
 github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -305,6 +305,10 @@ func (b *testBackend) NewContext() ml.Context {
 	return &testContext{}
 }

+func (b *testBackend) SystemInfo() string {
+	return "not implemented"
+}
+
 type testContext struct{}

 func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
@@ -430,7 +434,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
 	panic("not implemented")
 }

-func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
+func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim, ropeType uint32, base, scale float32) ml.Tensor {
 	panic("not implemented")
 }

--- a/llm/server.go
+++ b/llm/server.go
@@ -320,6 +320,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
 		}

+		if eval, err := filepath.EvalSymlinks(exe); err == nil {
+			exe = eval
+		}
+
 		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 		s := &llmServer{
 			port:        port,
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -17,12 +17,14 @@ type Config interface {

 	Strings(string, ...[]string) []string
 	Uints(string, ...[]uint32) []uint32
+	Floats(string, ...[]float32) []float32
 }

 type Backend interface {
 	Config() Config
 	Get(name string) Tensor
 	NewContext() Context
+	SystemInfo() string
 }

 var backends = make(map[string]func(*os.File) (Backend, error))
@@ -75,7 +77,7 @@ type Tensor interface {
 	Scale(ctx Context, s float64) Tensor

 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor

 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1,11 +1,27 @@
 package ggml

-// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
-// #include <stdlib.h>
-// #include <stdint.h>
-// #include "ggml.h"
-// #include "ggml-cpu.h"
-// #include "ggml-backend.h"
+/*
+#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+#include <stdlib.h>
+#include <stdint.h>
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-backend.h"
+static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
+static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
+
+typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
+COMPILER inline get_compiler() {
+#if defined(__clang__)
+	return COMP_CLANG;
+#elif defined(__GNUC__)
+	return COMP_GCC;
+#else
+	return UNKNOWN_COMPILER;
+#endif
+}
+
+*/
 import "C"

 import (
@@ -580,10 +596,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 }

 const (
-	ropeTypeNorm C.int = iota
+	ropeTypeNorm   C.int = 0
+	ropeTypeNeox   C.int = 2
+	ropeTypeMrope  C.int = 8
+	ropeTypeVision C.int = 24
 )

-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{}
 	}
@@ -597,8 +616,8 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 		t: C.ggml_rope_ext(
 			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
-			131072,       // YaRN n_ctx_train
-			ropeTypeNorm, // ROPE_TYPE_NORM
+			C.int(ropeType),
+			131072, // YaRN n_ctx_train
 			C.float(ropeBase),
 			C.float(ropeScale),
 			0.,  // YaRN ext_factor
@@ -626,3 +645,34 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
 	}
 }
+
+func (b *Backend) SystemInfo() string {
+	var compiler string
+	switch C.get_compiler() {
+	case C.COMP_UNKNOWN:
+		compiler = "cgo(unknown_compiler)"
+	case C.COMP_GCC:
+		compiler = "cgo(gcc)"
+	case C.COMP_CLANG:
+		compiler = "cgo(clang)"
+	}
+
+	var s string
+	for i := range C.ggml_backend_reg_count() {
+		reg := C.ggml_backend_reg_get(i)
+		fName := C.CString("ggml_backend_get_features")
+		defer C.free(unsafe.Pointer(fName))
+		get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName)
+		if get_features_fn != nil {
+			s += C.GoString(C.ggml_backend_reg_name(reg))
+			s += " : "
+			for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
+				s += C.GoString(features.name)
+				s += " = "
+				s += C.GoString(features.value)
+				s += " | "
+			}
+		}
+	}
+	return s + compiler
+}
--- a/ml/backend/ggml/ggml/src/ggml.go
+++ b/ml/backend/ggml/ggml/src/ggml.go
@@ -47,10 +47,6 @@ var OnceLoad = sync.OnceFunc(func() {
 		exe = "."
 	}

-	if eval, err := filepath.EvalSymlinks(exe); err == nil {
-		exe = eval
-	}
-
 	// PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
 	// set by the parent process, however, use a default value
 	// if the environment variable is not set.
--- a/model/model.go
+++ b/model/model.go
@@ -21,6 +21,7 @@ import (
 	_ "github.com/ollama/ollama/ml/backend"
 )

+// Options contains the inputs for a model forward pass
 type Options struct {
 	Inputs    []int32
 	Positions []int32
@@ -34,11 +35,13 @@ type config struct {
 	Cache kvcache.Cache
 }

+// Base implements the common fields and methods for all models
 type Base struct {
 	b ml.Backend
 	config
 }

+// Backend returns the underlying backend that will run the model
 func (m *Base) Backend() ml.Backend {
 	return m.b
 }
@@ -47,6 +50,7 @@ func (m *Base) Config() config {
 	return m.config
 }

+// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, Options) (ml.Tensor, error)

@@ -56,6 +60,7 @@ type Model interface {

 var models = make(map[string]func(ml.Config) (Model, error))

+// Register registers a model constructor for the given architecture
 func Register(name string, f func(ml.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
@@ -64,8 +69,9 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 	models[name] = f
 }

-func New(s string) (Model, error) {
-	r, err := os.Open(s)
+// New initializes a new model instance with the provided configuration based on the metadata in the model file
+func New(modelPath string) (Model, error) {
+	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -0,0 +1,193 @@
+package gemma2
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	attnKeyLen, attnValLen           int
+	eps, ropeBase, ropeScale         float32
+	attnLogitSoftcap                 float32
+	finalLogitSoftcap                float32
+}
+
+type Model struct {
+	model.Base
+	model.SentencePieceModel
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`           // is this supposed to be root means square?
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"` // just set to token_embd?
+
+	*Options
+}
+
+func New(c ml.Config) (model.Model, error) {
+	m := Model{
+		SentencePieceModel: model.NewSentencePieceModel(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+			},
+		),
+		Layers: make([]Layer, c.Uint("block_count")),
+		Options: &Options{
+			hiddenSize:        int(c.Uint("embedding_length")),
+			numHeads:          int(c.Uint("attention.head_count")),
+			numKVHeads:        int(c.Uint("attention.head_count_kv")),
+			attnKeyLen:        int(c.Uint("attention.key_length")),
+			attnValLen:        int(c.Uint("attention.value_length")),
+			eps:               c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:          c.Float("rope.freq_base", 10000.0),
+			ropeScale:         c.Float("rope.freq_scale", 1.0),
+			attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
+			finalLogitSoftcap: c.Float("final_logit_softcapping"),
+		},
+	}
+
+	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
+	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))
+
+	return &m, nil
+}
+
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	ropeType := uint32(2)
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+
+	// todo: this should be 1.0/math.Sqrt(float64(headDim)) for 27B models
+	q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
+
+	cache.Put(ctx, k, v)
+	k, v, mask := cache.Get(ctx)
+
+	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	kq := k.Mulmat(ctx, q)
+
+	// logit softcap
+	kq = kq.Scale(ctx, 1.0/float64(opts.attnLogitSoftcap))
+	kq = kq.Tanh(ctx)
+	kq = kq.Scale(ctx, float64(opts.attnLogitSoftcap))
+
+	kq = kq.Add(ctx, mask)
+	kq = kq.Softmax(ctx)
+
+	kqv := v.Mulmat(ctx, kq)
+	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
+}
+
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+type Layer struct {
+	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention     *SelfAttention
+	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"`
+	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP               *MLP
+	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
+}
+
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
+	return hiddenState.Add(ctx, residual)
+}
+
+func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+	inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
+
+	for i, layer := range m.Layers {
+		cacheType := i % 2
+		m.Cache.SetLayer(i)
+		wc := m.Cache.(*kvcache.WrapperCache)
+		wc.SetLayerType(cacheType)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, m.Cache, m.Options)
+	}
+
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	hiddenState = m.Output.Forward(ctx, hiddenState)
+
+	// final logit softcap
+	hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap))
+	hiddenState = hiddenState.Tanh(ctx)
+	hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap))
+
+	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
+	if err != nil {
+		return nil, err
+	}
+
+	return hiddenState.Rows(ctx, outputs), nil
+}
+
+func init() {
+	model.Register("gemma2", New)
+}
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -67,14 +67,15 @@ type SelfAttention struct {
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -99,7 +100,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
+	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, uint32(0), m.Options.ropeBase, m.Options.ropeScale), nil
 }

 type MLP struct {
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -19,14 +19,15 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -52,7 +53,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
-	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 }

 type TextMLP struct {
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -1,6 +1,7 @@
 package models

 import (
+	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/mllama"
 )
--- a/model/process_text.go
+++ b/model/process_text.go
@@ -18,6 +18,15 @@ const (
 	SpecialEOS
 )

+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
 type TextProcessor interface {
 	Encode(string) ([]int32, error)
 	Decode([]int32) (string, error)
@@ -27,7 +36,7 @@ type TextProcessor interface {
 type Vocabulary struct {
 	Values []string
 	Types  []uint32
-	Scores []uint32
+	Scores []float32
 	Merges []string

 	BOS, EOS int32
@@ -75,7 +84,7 @@ func (v *Vocabulary) Decode(id int32) string {
 func (v *Vocabulary) SpecialVocabulary() []string {
 	v.specialOnce.Do(func() {
 		for i := range v.Values {
-			if v.Types[i] == 3 {
+			if v.Types[i] == TOKEN_TYPE_CONTROL {
 				v.special = append(v.special, v.Values[i])
 			}
 		}
--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -0,0 +1,220 @@
+package model
+
+import (
+	"iter"
+	"log/slog"
+	"strings"
+
+	"github.com/dlclark/regexp2"
+	queue "github.com/emirpasic/gods/queues/priorityqueue"
+)
+
+const spmWhitespaceSep = "▁"
+
+func replaceWhitespaceBySeperator(s string) string {
+	return strings.ReplaceAll(s, " ", spmWhitespaceSep)
+}
+
+type SentencePieceModel struct {
+	maxTokenLen int
+	pre         *regexp2.Regexp
+	vocab       *Vocabulary
+}
+
+func NewSentencePieceModel(pre string, vocab *Vocabulary) SentencePieceModel {
+	slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:3], "scores", vocab.Scores[:3], "types", vocab.Types[:3])
+
+	counter := map[int]int{}
+	var maxTokenLen int
+	for cnt := range vocab.Types {
+		switch vocab.Types[cnt] {
+		case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
+			maxTokenLen = max(maxTokenLen, len(vocab.Values[cnt]))
+			fallthrough
+		default:
+			counter[int(vocab.Types[cnt])] += 1
+		}
+	}
+
+	slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
+		"max token len", maxTokenLen)
+
+	return SentencePieceModel{
+		maxTokenLen: maxTokenLen,
+		pre:         regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
+		vocab:       vocab,
+	}
+}
+
+func (spm SentencePieceModel) Is(id int32, special Special) bool {
+	return spm.vocab.Is(id, special)
+}
+
+func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
+	return func(yield func(string) bool) {
+		for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {
+			if !yield(m.String()) {
+				break
+			}
+		}
+	}
+}
+
+func (spm SentencePieceModel) Encode(s string) ([]int32, error) {
+	fragments := []fragment{{value: s}}
+	for _, special := range spm.vocab.SpecialVocabulary() {
+		// TODO: process special tokens concurrently
+		id := spm.vocab.Encode(special)
+		for i := 0; i < len(fragments); i++ {
+			frag := fragments[i]
+			if len(frag.ids) > 0 {
+				continue
+			}
+
+			var middle []fragment
+			switch i := strings.Index(frag.value, special); {
+			case i < 0:
+				middle = append(middle, frag)
+			case i > 0:
+				middle = append(middle, fragment{value: frag.value[:i]})
+				fallthrough
+			default:
+				middle = append(middle, fragment{value: special, ids: []int32{id}})
+				if rest := frag.value[i+len(special):]; rest != "" {
+					middle = append(middle, fragment{value: rest})
+				}
+			}
+
+			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
+		}
+	}
+	slog.Debug("fragments", "frags", fragments)
+
+	var ids []int32
+	for _, frag := range fragments {
+		if len(frag.ids) > 0 {
+			ids = append(ids, frag.ids...)
+			continue
+		}
+
+		for split := range spm.split(frag.value) {
+			split = replaceWhitespaceBySeperator(split)
+
+			var sb strings.Builder
+			sb.Write([]byte(split))
+			if id := spm.vocab.Encode(sb.String()); id >= 0 {
+				ids = append(ids, id)
+				continue
+			}
+
+			runes := []rune(sb.String())
+			pq := queue.NewWith(func(a, b any) int {
+				priA := a.(*candidate)
+				priB := b.(*candidate)
+				if priA.score > priB.score || (priA.score == priB.score && priA.a < priB.a) {
+					return 1
+				}
+				return -1
+			})
+
+			merges := make([]merge, len(runes))
+			for r := range runes {
+				merges[r] = merge{
+					p:     r - 1,
+					n:     r + 1,
+					runes: []rune{runes[r]},
+				}
+			}
+
+			pairwise := func(a, b int) *candidate {
+				if a < 0 || b >= len(runes) {
+					return nil
+				}
+
+				left, right := string(merges[a].runes), string(merges[b].runes)
+				if id := spm.vocab.Encode(left + right); id >= 0 {
+					return &candidate{
+						a:      a,
+						b:      b,
+						length: len(left + " " + right),
+						score:  spm.vocab.Scores[id],
+					}
+				}
+				return nil
+			}
+
+			for i := range len(runes) - 1 {
+				if pair := pairwise(i, i+1); pair != nil {
+					pq.Enqueue(pair)
+				}
+			}
+
+			pqv := pq.Values()
+			for _, v := range pqv {
+				e := v.(*candidate)
+				slog.Debug("candidate", "candidate", e)
+			}
+
+			for !pq.Empty() {
+				v, _ := pq.Dequeue()
+				pair := v.(*candidate)
+				left, right := merges[pair.a], merges[pair.b]
+
+				if len(left.runes) == 0 || len(right.runes) == 0 {
+					continue
+				}
+
+				merges[pair.a].runes = append(left.runes, right.runes...)
+				merges[pair.b].runes = nil
+				merges[pair.a].n = right.n
+				if right.n < len(merges) {
+					merges[right.n].p = pair.a
+				}
+
+				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
+					pq.Enqueue(pair)
+				}
+
+				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
+					pq.Enqueue(pair)
+				}
+			}
+
+			slog.Debug("merges", "merges", merges)
+
+			for _, merge := range merges {
+				if len(merge.runes) > 0 {
+					if id := spm.vocab.Encode(string(merge.runes)); id >= 0 {
+						ids = append(ids, id)
+					} else {
+						slog.Debug("missing token", "token", string(merge.runes))
+					}
+				}
+			}
+		}
+	}
+	slog.Debug("encoded", "ids", ids)
+
+	return ids, nil
+}
+
+type candidate struct {
+	a, b   int
+	score  float32
+	length int
+}
+
+func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
+	var sb strings.Builder
+	for _, id := range ids {
+		data := spm.vocab.Decode(id)
+		data = strings.ReplaceAll(data, spmWhitespaceSep, " ")
+		if _, err := sb.WriteString(data); err != nil {
+			return "", err
+		}
+	}
+
+	slog.Debug("decoded", "ids", ids, "text", sb.String())
+	return sb.String(), nil
+}
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -845,8 +845,6 @@ func (s *Server) loadModel(
 	threads int,
 	multiUserCache bool,
 ) {
-	llama.BackendInit()
-
 	var err error
 	s.model, err = llama.LoadModelFromFile(mpath, params)
 	if err != nil {
@@ -932,6 +930,8 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting go runner")
+
+	llama.BackendInit()
 	slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)

 	server := &Server{
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -813,6 +813,8 @@ func (s *Server) loadModel(
 		panic(err)
 	}

+	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
+
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
@@ -881,7 +883,6 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting ollama engine")
-	// TODO(jessegross): Some system info would be useful

 	server := &Server{
 		batchSize: *batchSize,
Author	SHA1	Message	Date
Patrick Devine	b7349a4efd	more linter feeding	2025-02-18 13:32:58 -08:00
Patrick Devine	4cda3e3622	feed the linter	2025-02-18 13:16:43 -08:00
Patrick Devine	95fbf1da12	fix causal test	2025-02-18 13:02:44 -08:00
Patrick Devine	83d1a1ab55	cleanup	2025-02-18 12:47:34 -08:00
Patrick Devine	035e69799e	clean up	2025-02-18 12:40:12 -08:00
Patrick Devine	10e06d0a45	gemma2 ftw	2025-02-18 12:40:02 -08:00
Patrick Devine	8cf1ea4fd8	add sentence piece tokenizer	2025-02-18 12:39:45 -08:00
Patrick Devine	d231229122	cache is king	2025-02-18 12:39:27 -08:00
Patrick Devine	fad98fabab	gemma2 impl	2025-02-18 12:39:17 -08:00
Michael Yang	7b5d916a9a	ci: set owner/group in tarball set owner and group when building the linux tarball so extracted files are consistent. this is the behaviour of release tarballs in version 0.5.7 and lower	2025-02-18 20:11:09 +00:00
benhaotang	33ad61b112	Add OpenDeepResearcher-via-searxng to Community Integrations (#9138 )	2025-02-18 11:39:11 -08:00
L. Jiang	716e365615	test: add test cases for HumanNumber (#9108 )	2025-02-18 11:35:26 -08:00
innightwolfsleep	3b4424ff98	readme: add LLM Telegram Bot to community integrations (#9150 )	2025-02-18 10:04:30 -05:00
James-William-Kincaid-III	0667baddc6	docs: fix incorrect shortcut key in windows.md (#9098 )	2025-02-15 15:38:24 -05:00
Bruce MacDonald	d006e1e09b	model: document high-level model interface (#9122 )	2025-02-14 16:01:00 -08:00
Daniel Hiltgen	df2680b4b9	Wire up system info log for new engine (#9123 )	2025-02-14 15:55:33 -08:00
Jesse Gross	010313bb63	llamarunner: Init GGML before printing system info We currently print system info before the GGML backends are loaded. This results in only getting information about the default lowest common denominator runner. If we move up the GGML init then we can see what we are actually running. Before: time=2025-02-14T11:15:07.606-08:00 level=INFO source=runner.go:935 msg=system info="CPU : LLAMAFILE = 1 \| CPU : LLAMAFILE = 1 \| cgo(gcc)" threads=24 After: time=2025-02-14T11:16:02.936-08:00 level=INFO source=runner.go:935 msg=system info="CPU : LLAMAFILE = 1 \| CPU : LLAMAFILE = 1 \| CUDA : ARCHS = 890 \| USE_GRAPHS = 1 \| PEER_MAX_BATCH_SIZE = 128 \| CPU : SSE3 = 1 \| SSSE3 = 1 \| AVX = 1 \| AVX2 = 1 \| F16C = 1 \| FMA = 1 \| AVX512 = 1 \| AVX512_VBMI = 1 \| AVX512_VNNI = 1 \| LLAMAFILE = 1 \| cgo(gcc)" threads=24	2025-02-14 11:41:53 -08:00
Jeffrey Morgan	5296f487a8	llm: attempt to evaluate symlinks, but do not fail (#9089 ) provides a better approach to #9088 that will attempt to evaluate symlinks (important for macOS where 'ollama' is often a symlink), but use the result of os.Executable() as a fallback in scenarios where filepath.EvalSymlinks fails due to permission erorrs or other issues	2025-02-13 22:37:59 -08:00