Compare commits

4 commits: progress-f ... brucemacd/
| Author | SHA1 | Date |
|---|---|---|
| | 7fa9694359 | |
| | 96510b9353 | |
| | 9f8c89354b | |
| | 8815a8ee25 | |
.github/workflows/release.yaml (4 changes, vendored)
```diff
@@ -329,9 +329,7 @@ jobs:
           done
         working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
       - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
-            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
-          done
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
       - uses: actions/upload-artifact@v4
         with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
```
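The two variants differ in more than layout: the multi-line loop passes GNU tar's `--owner 0 --group 0`, which pins every archive entry's ownership to root instead of the CI user, while the one-liner omits that normalization. In both, `${ARCHIVE//.*/}` strips everything from the first `.` of the manifest path, so each `*.tar.in` file produces a matching `*.tgz` via `basename`.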
.gitignore (3 changes, vendored)
```diff
@@ -14,3 +14,6 @@ test_data
 __debug_bin*
 llama/build
 llama/vendor
+model/testdata/models/*
+!model/testdata/models/*.md
+!model/testdata/models/*.json
```
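These ignore rules pair with the new `model/testdata/models` directory introduced later in this diff: everything under it is ignored except `.md` and `.json` files, so large `.gguf` test weights stay out of the repository while the README and expected-output fixtures remain tracked.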
```diff
@@ -24,7 +24,7 @@ set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)

-if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
     OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
   set(GGML_CPU_ALL_VARIANTS ON)
 endif()
```
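Both forms gate `GGML_CPU_ALL_VARIANTS` (building every CPU micro-architecture variant) to non-ARM targets. The difference is the extra `CMAKE_OSX_ARCHITECTURES AND` guard: when no macOS architectures are set, an empty variable does not match `"arm64"`, so the unguarded first clause is spuriously true even on ARM hosts; the guarded form makes that case fall through to the second clause's `CMAKE_SYSTEM_PROCESSOR` check instead.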
```diff
@@ -381,7 +381,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
 - [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)

 ### Cloud
```

```diff
@@ -549,7 +548,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)

 ### Supported backends
```
```diff
@@ -126,8 +126,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 			return err
 		}
 	}

-	return ctx.Err()
+	return nil
 }

 const maxBufferSize = 512 * format.KiloByte
```

```diff
@@ -190,7 +189,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 		}
 	}

-	return ctx.Err()
+	return nil
 }

 // GenerateResponseFunc is a function that [Client.Generate] invokes every time
```
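Both hunks change the value returned after the response stream has been fully consumed. A minimal sketch of the surrounding pattern (simplified signature and a hypothetical helper name; the real method also sets up the HTTP request and decodes error payloads) shows why `nil` is the natural result once the scanner loop exits normally:

```go
package api

import (
	"bufio"
	"context"
	"io"
)

const maxBufSize = 512 * 1024 // stands in for maxBufferSize above

// streamLines is a hypothetical reduction of the tail of Client.stream.
func streamLines(ctx context.Context, body io.Reader, fn func([]byte) error) error {
	scanner := bufio.NewScanner(body)
	scanner.Buffer(make([]byte, 0, maxBufSize), maxBufSize)
	for scanner.Scan() {
		select {
		case <-ctx.Done():
			return ctx.Err() // cancellation mid-stream still surfaces to the caller
		default:
			if err := fn(scanner.Bytes()); err != nil {
				return err
			}
		}
	}
	// Returning ctx.Err() here meant a stream that had already completed could
	// still be reported as "context canceled"; with the Ctrl-C handling added
	// in cmd/cmd.go below, returning nil avoids that spurious error.
	return nil
}
```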
cmd/cmd.go (38 changes)
```diff
@@ -15,11 +15,13 @@ import (
 	"net"
 	"net/http"
 	"os"
+	"os/signal"
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync/atomic"
+	"syscall"
 	"time"

 	"github.com/containerd/console"
```

```diff
@@ -328,7 +330,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		if err := PullHandler(cmd, []string{name}); err != nil {
 			return nil, err
 		}

 		return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
 	}
 	return info, err
```

```diff
@@ -857,6 +858,17 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	spinner := progress.NewSpinner("")
 	p.Add("", spinner)

+	cancelCtx, cancel := context.WithCancel(cmd.Context())
+	defer cancel()
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT)
+
+	go func() {
+		<-sigChan
+		cancel()
+	}()
+
 	var state *displayResponseState = &displayResponseState{}
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
```

```diff
@@ -891,7 +903,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		req.KeepAlive = opts.KeepAlive
 	}

-	if err := client.Chat(cmd.Context(), req, fn); err != nil {
+	if err := client.Chat(cancelCtx, req, fn); err != nil {
+		if errors.Is(err, context.Canceled) {
+			return nil, nil
+		}
 		return nil, err
 	}
```

```diff
@@ -931,6 +946,17 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		generateContext = []int{}
 	}

+	ctx, cancel := context.WithCancel(cmd.Context())
+	defer cancel()
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT)
+
+	go func() {
+		<-sigChan
+		cancel()
+	}()
+
 	var state *displayResponseState = &displayResponseState{}

 	fn := func(response api.GenerateResponse) error {
```

```diff
@@ -966,7 +992,10 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		KeepAlive: opts.KeepAlive,
 	}

-	if err := client.Generate(cmd.Context(), &request, fn); err != nil {
+	if err := client.Generate(ctx, &request, fn); err != nil {
+		if errors.Is(err, context.Canceled) {
+			return nil
+		}
 		return err
 	}
```

```diff
@@ -988,7 +1017,8 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		latest.Summary()
 	}

-	cmd.SetContext(context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context))
+	ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)

 	return nil
 }
```
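Taken together, the cmd/cmd.go hunks move interrupt handling out of `main` and next to each request: SIGINT cancels the in-flight `client.Chat`/`client.Generate` call through context cancellation, and the resulting `context.Canceled` error is swallowed so the CLI exits cleanly. A self-contained sketch of the same pattern, with a stand-in `run` function in place of the API calls:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

// run stands in for a long-running call like client.Chat or client.Generate.
func run(ctx context.Context) error {
	<-ctx.Done() // block until canceled
	return ctx.Err()
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT)
	go func() {
		<-sigChan
		cancel() // Ctrl-C cancels the request instead of killing the process
	}()

	if err := run(ctx); err != nil && !errors.Is(err, context.Canceled) {
		fmt.Fprintln(os.Stderr, err) // real errors still surface
		os.Exit(1)
	}
}
```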
```diff
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
 ## Troubleshooting

 Ollama on Windows stores files in a few different locations. You can view them in
-the explorer window by hitting `<Ctrl>+R` and type in:
+the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
     - *app.log* contains most resent logs from the GUI application
     - *server.log* contains the most recent server logs
```
```diff
@@ -12,9 +12,6 @@ func TestHumanNumber(t *testing.T) {
 	testCases := []testCase{
 		{0, "0"},
 		{999, "999"},
 		{1000, "1K"},
 		{1001, "1K"},
 		{1000000, "1M"},
 		{125000000, "125M"},
 		{500500000, "500.50M"},
```
```diff
@@ -305,10 +305,6 @@ func (b *testBackend) NewContext() ml.Context {
 	return &testContext{}
 }

-func (b *testBackend) SystemInfo() string {
-	return "not implemented"
-}
-
 type testContext struct{}

 func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
```

```diff
@@ -434,7 +430,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
 	panic("not implemented")
 }

-func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
+func (t *testTensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
 	panic("not implemented")
 }
```
main.go (14 changes)
```diff
@@ -2,8 +2,6 @@ package main

 import (
 	"context"
-	"os"
-	"os/signal"

 	"github.com/spf13/cobra"
```

```diff
@@ -11,15 +9,5 @@ import (
 )

 func main() {
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	sigChan := make(chan os.Signal, 1)
-	signal.Notify(sigChan, os.Interrupt)
-	go func() {
-		<-sigChan
-		cancel()
-	}()
-
-	cobra.CheckErr(cmd.NewCLI().ExecuteContext(ctx))
+	cobra.CheckErr(cmd.NewCLI().ExecuteContext(context.Background()))
 }
```
```diff
@@ -23,7 +23,6 @@ type Backend interface {
 	Config() Config
 	Get(name string) Tensor
 	NewContext() Context
-	SystemInfo() string
 }

 var backends = make(map[string]func(*os.File) (Backend, error))
```

```diff
@@ -44,6 +43,42 @@ func NewBackend(f *os.File) (Backend, error) {
 	return nil, fmt.Errorf("unsupported backend")
 }

+// RopeType specifies the type of RoPE (Rotary Position Embedding) to use, these types are implemented in the backend
+type RopeType int
+
+const (
+	RopeTypeStandard RopeType = iota
+	_ // not yet used
+	RopeTypeNeoX
+)
+
+// RopeConfig contains all configuration for the RoPE (Rotary Position Embedding) operation
+type RopeConfig struct {
+	// PositionIDs contains the position indices for each token in the sequence
+	// These indices are used to calculate the rotary embeddings
+	PositionIDs Tensor
+
+	// RopeFactors is an optional tensor containing pre-computed rotation factors
+	RopeFactors Tensor
+
+	// RopeDim specifies the dimension size for the rotary embeddings
+	RopeDim uint32
+
+	// RopeType indicates which RoPE variant to use (e.g. normal or neox)
+	RopeType RopeType
+
+	// OrigCtxLen stores the original context length the model was trained with
+	OrigCtxLen int
+
+	// RopeBase is the base value used in the frequency calculation
+	RopeBase float32
+
+	// RopeScale is a scaling factor applied to position indices
+	RopeScale float32
+
+	// YaRN parameters can be added here if they need to be configurable
+}
+
 type Context interface {
 	Zeros(dtype DType, shape ...int) Tensor
 	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
```

```diff
@@ -76,7 +111,7 @@ type Tensor interface {
 	Scale(ctx Context, s float64) Tensor

 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
+	RoPE(ctx Context, rc RopeConfig) Tensor

 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
```
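The skipped `iota` value in `RopeType` appears deliberate: it keeps `RopeTypeNeoX` at 2, in line with the rope mode values the ggml backend expects (note the direct `C.int(rc.RopeType)` pass-through in the ggml hunk below). Call sites migrate from the positional form to the struct form roughly as follows (an illustrative fragment; the variable names are placeholders):

```go
// Before: positional arguments, with the rope variant and original context
// length hard-coded inside the backend.
q = q.RoPE(ctx, positionIDs, ropeFactors, ropeDim, ropeBase, ropeScale)

// After: one RopeConfig value that can be built once and reused for the
// query, key, and cache-shift paths.
q = q.RoPE(ctx, ml.RopeConfig{
	PositionIDs: positionIDs,
	RopeFactors: ropeFactors, // optional; the ggml backend substitutes an empty tensor for nil
	RopeDim:     ropeDim,
	RopeType:    ml.RopeTypeStandard, // ml.RopeTypeNeoX for models such as qwen2
	OrigCtxLen:  origCtxLen,
	RopeBase:    ropeBase,
	RopeScale:   ropeScale,
})
```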
```diff
@@ -1,27 +1,11 @@
 package ggml

-/*
-#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
-#include <stdlib.h>
-#include <stdint.h>
-#include "ggml.h"
-#include "ggml-cpu.h"
-#include "ggml-backend.h"
-static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
-static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
-
-typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
-COMPILER inline get_compiler() {
-#if defined(__clang__)
-	return COMP_CLANG;
-#elif defined(__GNUC__)
-	return COMP_GCC;
-#else
-	return UNKNOWN_COMPILER;
-#endif
-}
-
-*/
+// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+// #include <stdlib.h>
+// #include <stdint.h>
+// #include "ggml.h"
+// #include "ggml-cpu.h"
+// #include "ggml-backend.h"
 import "C"

 import (
```

```diff
@@ -595,13 +579,9 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	}
 }

-const (
-	ropeTypeNorm C.int = iota
-)
-
-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
-	if ropeFactors == nil {
-		ropeFactors = &Tensor{}
+func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
+	if rc.RopeFactors == nil {
+		rc.RopeFactors = &Tensor{}
 	}

 	dequant := t.t
```

```diff
@@ -611,12 +591,15 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	return &Tensor{
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
-			C.int(ropeDim),
-			131072,       // YaRN n_ctx_train
-			ropeTypeNorm, // ROPE_TYPE_NORM
-			C.float(ropeBase),
-			C.float(ropeScale),
+			ctx.(*Context).ctx,
+			dequant,
+			rc.PositionIDs.(*Tensor).t,
+			rc.RopeFactors.(*Tensor).t,
+			C.int(rc.RopeDim),
+			C.int(rc.RopeType),
+			C.int(rc.OrigCtxLen),
+			C.float(rc.RopeBase),
+			C.float(rc.RopeScale),
 			0.,  // YaRN ext_factor
 			1.,  // YaRN attn_factor
 			32., // YaRN beta_fast
```

```diff
@@ -642,34 +625,3 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
 	}
 }
-
-func (b *Backend) SystemInfo() string {
-	var compiler string
-	switch C.get_compiler() {
-	case C.COMP_UNKNOWN:
-		compiler = "cgo(unknown_compiler)"
-	case C.COMP_GCC:
-		compiler = "cgo(gcc)"
-	case C.COMP_CLANG:
-		compiler = "cgo(clang)"
-	}
-
-	var s string
-	for i := range C.ggml_backend_reg_count() {
-		reg := C.ggml_backend_reg_get(i)
-		fName := C.CString("ggml_backend_get_features")
-		defer C.free(unsafe.Pointer(fName))
-		get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName)
-		if get_features_fn != nil {
-			s += C.GoString(C.ggml_backend_reg_name(reg))
-			s += " : "
-			for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
-				s += C.GoString(features.name)
-				s += " = "
-				s += C.GoString(features.value)
-				s += " | "
-			}
-		}
-	}
-	return s + compiler
-}
```
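Beyond the struct plumbing, the new call spells out each `ggml_rope_ext` argument on its own line, passing the rope mode (`rc.RopeType`) before the original training context length (`rc.OrigCtxLen`), which matches the parameter order ggml declares (`n_dims, mode, n_ctx_orig, freq_base, freq_scale, ...`). The old call appears to have passed the hard-coded `131072` ahead of `ropeTypeNorm`, i.e. in the opposite order, so the rewrite likely also fixes an argument swap. The trailing `0., 1., 32.` literals are the YaRN `ext_factor`, `attn_factor`, and `beta_fast` parameters, still hard-coded as before.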
```diff
@@ -21,7 +21,6 @@ import (
 	_ "github.com/ollama/ollama/ml/backend"
 )

-// Options contains the inputs for a model forward pass
 type Options struct {
 	Inputs    []int32
 	Positions []int32
```

```diff
@@ -35,13 +34,11 @@ type config struct {
 	Cache kvcache.Cache
 }

-// Base implements the common fields and methods for all models
 type Base struct {
 	b ml.Backend
 	config
 }

-// Backend returns the underlying backend that will run the model
 func (m *Base) Backend() ml.Backend {
 	return m.b
 }
```

```diff
@@ -50,7 +47,6 @@ func (m *Base) Config() config {
 	return m.config
 }

-// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, Options) (ml.Tensor, error)
```

```diff
@@ -60,7 +56,6 @@ type Model interface {

 var models = make(map[string]func(ml.Config) (Model, error))

-// Register registers a model constructor for the given architecture
 func Register(name string, f func(ml.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
```

```diff
@@ -69,9 +64,8 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 	models[name] = f
 }

-// New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string) (Model, error) {
-	r, err := os.Open(modelPath)
+func New(s string) (Model, error) {
+	r, err := os.Open(s)
 	if err != nil {
 		return nil, err
 	}
```
model/model_external_test.go (new file, 138 lines)
@@ -0,0 +1,138 @@
```go
// Package model_test provides external tests for the model package.
// This test file specifically tests the forward pass functionality on models.
// It is in a separate package (model_test) to avoid import cycles while still
// being able to test the public API of the model package.
package model_test

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"testing"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/sample"

	_ "github.com/ollama/ollama/model/models"
)

type modelTest struct {
	Prompt            string   `json:"prompt"`
	OutputContainsOne []string `json:"output_contains_one"`
}

func TestForwardSimple(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping in short mode")
	}

	// Read all JSON files from testdata/models
	files, err := os.ReadDir("testdata/models")
	if err != nil {
		t.Fatal(err)
	}

	for _, file := range files {
		if !strings.HasSuffix(file.Name(), ".json") {
			continue
		}

		jsonPath := filepath.Join("testdata/models", file.Name())
		ggufPath := filepath.Join("testdata/models", strings.TrimSuffix(file.Name(), ".json")+".gguf")

		// Skip if no corresponding .gguf file exists
		if _, err := os.Stat(ggufPath); err != nil {
			t.Logf("skipping %s: no corresponding GGUF file found", file.Name())
			continue
		}

		data, err := os.ReadFile(jsonPath)
		if err != nil {
			t.Fatal(err)
		}

		var test modelTest
		if err := json.Unmarshal(data, &test); err != nil {
			t.Fatal(err)
		}

		t.Run(strings.TrimSuffix(file.Name(), ".json"), func(t *testing.T) {
			m, err := model.New(ggufPath)
			if err != nil {
				t.Fatal(err)
			}

			m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)

			inputs, err := m.(model.TextProcessor).Encode(test.Prompt)
			if err != nil {
				t.Fatal(err)
			}

			var result []string
			for len(result) < 100 { // Limit to 100 tokens max
				options := model.Options{
					Inputs:    inputs,
					Positions: make([]int32, len(inputs)),
					Sequences: make([]int, len(inputs)),
					Outputs:   []int32{int32(len(inputs) - 1)},
				}
				for i := range options.Positions {
					options.Positions[i] = int32(i)
					options.Sequences[i] = 0
				}

				ctx := m.Backend().NewContext()

				modelOutput, err := model.Forward(ctx, m, options)
				if err != nil {
					ctx.Close()
					t.Fatal(fmt.Errorf("forward pass failed: %v", err))
				}

				f32s := modelOutput.Floats()
				logits := make([]float64, len(f32s))
				for i, f32 := range f32s {
					logits[i] = float64(f32)
				}

				token, err := sample.Sample(logits, sample.Greedy())
				if err != nil {
					ctx.Close()
					t.Fatal(fmt.Errorf("sampling failed: %v", err))
				}

				ctx.Close()

				// Greedy sampling: take the token with the highest logit
				nextToken := int32(token[0])
				if m.(model.TextProcessor).Is(nextToken, model.SpecialEOS) {
					break
				}

				piece, err := m.(model.TextProcessor).Decode([]int32{nextToken})
				if err != nil {
					t.Fatal(err)
				}

				result = append(result, piece)
				output := strings.Join(result, "")

				for _, expectedOutput := range test.OutputContainsOne {
					if strings.Contains(output, expectedOutput) {
						t.Logf("Test passed with output: %q (matched expected: %q)", output, expectedOutput)
						return
					}
				}

				// Maintain full context by appending new token
				inputs = append(inputs, nextToken)
			}

			t.Fatalf("Expected output containing one of %q but got: %q", test.OutputContainsOne, strings.Join(result, ""))
		})
	}
}
```
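With a matching `.gguf` weights file placed next to a JSON fixture (for example the `qwen2_5.json` added below), the test can be run with something like `go test ./model -run TestForwardSimple` (without `-short`); each fixture drives one greedy decode of up to 100 tokens and passes as soon as the generated text contains any of the `output_contains_one` strings.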
```diff
@@ -10,10 +10,10 @@ import (
 type Options struct {
-	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
-	hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32
+	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
+	ctxLen, hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale                 float32
+	ropeDim                                  uint32
 }

 type Model struct {
```

```diff
@@ -46,6 +46,7 @@ func New(c ml.Config) (model.Model, error) {
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+			ctxLen:     int(c.Uint("context_length")),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
 			ropeDim:    c.Uint("rope.dimension_count"),
```

```diff
@@ -67,14 +68,23 @@ type SelfAttention struct {
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	rc := ml.RopeConfig{
+		PositionIDs: positionIDs,
+		RopeFactors: opts.RopeFactors,
+		RopeDim:     opts.ropeDim,
+		RopeType:    ml.RopeTypeStandard,
+		OrigCtxLen:  opts.ctxLen,
+		RopeBase:    opts.ropeBase,
+		RopeScale:   opts.ropeScale,
+	}

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, rc)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, rc)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
```

```diff
@@ -99,7 +109,18 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
+	return key.RoPE(
+		ctx,
+		ml.RopeConfig{
+			PositionIDs: shift,
+			RopeFactors: m.Options.RopeFactors,
+			RopeDim:     m.Options.ropeDim,
+			RopeType:    ml.RopeTypeStandard,
+			OrigCtxLen:  m.Options.ctxLen,
+			RopeBase:    m.Options.ropeBase,
+			RopeScale:   m.Options.ropeScale,
+		},
+	), nil
 }

 type MLP struct {
```
```diff
@@ -19,14 +19,23 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	rc := ml.RopeConfig{
+		PositionIDs: positions,
+		RopeFactors: opts.RopeFactors,
+		RopeDim:     opts.ropeDim,
+		RopeType:    ml.RopeTypeStandard,
+		OrigCtxLen:  opts.ctxLen,
+		RopeBase:    opts.ropeBase,
+		RopeScale:   opts.ropeScale,
+	}

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	query = query.RoPE(ctx, rc)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	key = key.RoPE(ctx, rc)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
```

```diff
@@ -52,7 +61,18 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
-	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(
+		ctx,
+		ml.RopeConfig{
+			PositionIDs: shift,
+			RopeFactors: m.RopeFactors,
+			RopeDim:     m.ropeDim,
+			RopeType:    ml.RopeTypeStandard,
+			OrigCtxLen:  m.ctxLen,
+			RopeBase:    m.ropeBase,
+			RopeScale:   m.ropeScale,
+		},
+	), nil
 }

 type TextMLP struct {
```

```diff
@@ -189,9 +209,9 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, cr
 type TextModelOptions struct {
 	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`

-	hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32
+	ctxLen, hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale                 float32
+	ropeDim                                  uint32

 	crossAttentionLayers []uint32
 }
```
```diff
@@ -3,4 +3,5 @@ package models
 import (
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/mllama"
+	_ "github.com/ollama/ollama/model/models/qwen2"
 )
```
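The blank import is what wires the new architecture in: importing the package runs its `init`, which calls `model.Register("qwen2", New)` (see the bottom of the new file below), so `model.New` can construct the model when a GGUF file reports `qwen2` as its architecture.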
model/models/qwen2/model.go (new file, 222 lines)
@@ -0,0 +1,222 @@
|
||||
package qwen2
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||
contextLength int
|
||||
hiddenSize int
|
||||
numAttnHeads int
|
||||
numKVHeads int
|
||||
modelEpsilon float32
|
||||
ropeBaseFreq float32
|
||||
ropeFreqScale float32
|
||||
ropeDimensions uint32
|
||||
}
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
*Options
|
||||
}
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
m := &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
},
|
||||
),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: &Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numAttnHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
modelEpsilon: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
contextLength: int(c.Uint("context_length")),
|
||||
ropeBaseFreq: c.Float("rope.freq_base"),
|
||||
ropeFreqScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDimensions: c.Uint("rope.dimension_count", 64),
|
||||
},
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// Shift applies rotary position embeddings to the key tensor for causal attention caching
|
||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.RoPE(
|
||||
ctx,
|
||||
ml.RopeConfig{
|
||||
PositionIDs: shift,
|
||||
RopeFactors: m.Options.RopeFactors,
|
||||
RopeDim: m.Options.ropeDimensions,
|
||||
RopeType: ml.RopeTypeNeoX,
|
||||
OrigCtxLen: m.Options.contextLength,
|
||||
RopeBase: m.Options.ropeBaseFreq,
|
||||
RopeScale: m.Options.ropeFreqScale,
|
||||
},
|
||||
), nil
|
||||
}
|
||||
|
||||
// SelfAttention implements the multi-head self-attention mechanism
|
||||
// with separate projections for query, key, value and output transformations
|
||||
type SelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, inputPositions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
// Initialize dimensions and configuration
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDimension := opts.hiddenSize / opts.numAttnHeads
|
||||
ropeConfig := ml.RopeConfig{
|
||||
PositionIDs: inputPositions,
|
||||
RopeFactors: nil,
|
||||
RopeDim: opts.ropeDimensions,
|
||||
RopeType: ml.RopeTypeNeoX,
|
||||
OrigCtxLen: opts.contextLength,
|
||||
RopeBase: opts.ropeBaseFreq,
|
||||
RopeScale: opts.ropeFreqScale,
|
||||
}
|
||||
|
||||
// Project and reshape query states with rotary embeddings
|
||||
queryStates := sa.Query.Forward(ctx, hiddenState)
|
||||
queryStates = queryStates.Reshape(ctx, headDimension, opts.numAttnHeads, batchSize)
|
||||
queryStates = queryStates.RoPE(ctx, ropeConfig)
|
||||
|
||||
// Project and reshape key states with rotary embeddings
|
||||
keyStates := sa.Key.Forward(ctx, hiddenState)
|
||||
keyStates = keyStates.Reshape(ctx, headDimension, opts.numKVHeads, batchSize)
|
||||
keyStates = keyStates.RoPE(ctx, ropeConfig)
|
||||
|
||||
// Project and reshape value states
|
||||
valueStates := sa.Value.Forward(ctx, hiddenState)
|
||||
valueStates = valueStates.Reshape(ctx, headDimension, opts.numKVHeads, batchSize)
|
||||
|
||||
// Update and retrieve from KV cache
|
||||
cache.Put(ctx, keyStates, valueStates)
|
||||
keyStates, valueStates, attentionMask := cache.Get(ctx)
|
||||
|
||||
// Prepare tensors for attention computation
|
||||
queryStates = queryStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
keyStates = keyStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
valueStates = valueStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
// Apply scaling and attention mask to scores
|
||||
attentionScores := keyStates.MulmatFullPrec(ctx, queryStates)
|
||||
attentionScores = attentionScores.Scale(ctx, 1.0/math.Sqrt(float64(headDimension)))
|
||||
attentionScores = attentionScores.Add(ctx, attentionMask)
|
||||
// Compute scaled dot-product attention
|
||||
attentionProbs := attentionScores.Softmax(ctx)
|
||||
|
||||
// Apply attention weights and reshape
|
||||
weightedStates := valueStates.Mulmat(ctx, attentionProbs)
|
||||
weightedStates = weightedStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
weightedStates = weightedStates.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
// Project to output dimension
|
||||
return sa.Output.Forward(ctx, weightedStates)
|
||||
}
|
||||
|
||||
// MLP implements the feed-forward network component with SwiGLU activation
|
||||
type MLP struct {
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
|
||||
// Apply SwiGLU activation gating
|
||||
gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
|
||||
upProjection := mlp.Up.Forward(ctx, hiddenState)
|
||||
intermediateStates := gateActivation.Mul(ctx, upProjection)
|
||||
|
||||
// Project back to hidden dimension
|
||||
return mlp.Down.Forward(ctx, intermediateStates)
|
||||
}
|
||||
|
||||
// Layer represents a single transformer layer combining self-attention and feed-forward components
|
||||
type Layer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *SelfAttention
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *MLP
|
||||
}
|
||||
|
||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
// Self-attention branch with residual connection
|
||||
residual := hiddenState
|
||||
|
||||
normalizedAttention := l.AttentionNorm.Forward(ctx, hiddenState, opts.modelEpsilon)
|
||||
attentionOutput := l.SelfAttention.Forward(ctx, normalizedAttention, positionIDs, cache, opts)
|
||||
hiddenState = attentionOutput.Add(ctx, residual)
|
||||
|
||||
// Feed-forward branch with residual connection
|
||||
residual = hiddenState
|
||||
normalizedMLP := l.MLPNorm.Forward(ctx, hiddenState, opts.modelEpsilon)
|
||||
mlpOutput := l.MLP.Forward(ctx, normalizedMLP, opts)
|
||||
output := mlpOutput.Add(ctx, residual)
|
||||
|
||||
return output
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
|
||||
// Convert input tokens and positions to tensors
|
||||
inputTensor, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positionsTensor, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Initial token embedding
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)
|
||||
|
||||
// Process through transformer layers
|
||||
for i, layer := range m.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
|
||||
}
|
||||
|
||||
// Final layer normalization and output projection
|
||||
normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
|
||||
logits := m.Output.Forward(ctx, normalizedOutput)
|
||||
|
||||
// Extract requested output token positions
|
||||
outputsTensor, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return logits.Rows(ctx, outputsTensor), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("qwen2", New)
|
||||
}
|
||||
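`SelfAttention.Forward` above is a direct implementation of masked scaled dot-product attention; in the usual notation, with $d$ the head dimension (`hiddenSize / numAttnHeads`) and $M$ the causal mask returned by the KV cache:

```latex
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d}} + M\right)V
```

The `Permute`/`Contiguous` calls only rearrange the `(headDim, heads, tokens)` tensor layout so that the two `Mulmat` calls realize $QK^{\top}$ and the weighted sum over $V$.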
model/testdata/models/README.md (new file, vendored, 10 lines)
@@ -0,0 +1,10 @@
```markdown
# Test Model Directory

This directory is used for storing model files (like `.gguf` files) that are required to run the tests in `model_external_test.go`.

## Usage

- Place any model files you need for testing in this directory
- The test file will look for any model files here (e.g., `llama3.gguf`)
- All non-markdown files in this directory are git-ignored to prevent large model files from being committed to the repository
- Only `.md` files (like this README) will be tracked in git
```
model/testdata/models/qwen2_5.json (new file, vendored, 7 lines)
@@ -0,0 +1,7 @@
```json
{
    "prompt": "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\n",
    "output_contains_one": [
        "Hello",
        "Hi"
    ]
}
```
```diff
@@ -1,7 +1,6 @@
 package progress

 import (
-	"bufio"
 	"fmt"
 	"io"
 	"sync"
```

```diff
@@ -14,8 +13,7 @@ type State interface {
 type Progress struct {
 	mu sync.Mutex
-	// buffer output to minimize flickering on all terminals
-	w *bufio.Writer
+	w io.Writer

 	pos int
```

```diff
@@ -24,7 +22,7 @@ type Progress struct {
 }

 func NewProgress(w io.Writer) *Progress {
-	p := &Progress{w: bufio.NewWriter(w)}
+	p := &Progress{w: w}
 	go p.start()
 	return p
 }
```

```diff
@@ -49,29 +47,26 @@ func (p *Progress) stop() bool {
 func (p *Progress) Stop() bool {
 	stopped := p.stop()
 	if stopped {
-		fmt.Fprintln(p.w)
+		fmt.Fprint(p.w, "\n")
 	}

 	// show cursor
 	fmt.Fprint(p.w, "\033[?25h")
-	p.w.Flush()
 	return stopped
 }

 func (p *Progress) StopAndClear() bool {
+	fmt.Fprint(p.w, "\033[?25l")
+	defer fmt.Fprint(p.w, "\033[?25h")
+
 	stopped := p.stop()
 	if stopped {
 		// clear all progress lines
-		for range p.pos - 1 {
-			fmt.Fprint(p.w, "\033[A")
+		for i := range p.pos {
+			if i > 0 {
+				fmt.Fprint(p.w, "\033[A")
+			}
+			fmt.Fprint(p.w, "\033[2K\033[1G")
 		}
-
-		fmt.Fprint(p.w, "\033[2K", "\033[1G")
 	}

-	// show cursor
-	fmt.Fprint(p.w, "\033[?25h")
-	p.w.Flush()
 	return stopped
 }
```

```diff
@@ -86,31 +81,30 @@ func (p *Progress) render() {
 	p.mu.Lock()
 	defer p.mu.Unlock()

-	fmt.Fprint(p.w, "\033[?2026h")
-	defer fmt.Fprint(p.w, "\033[?2026l")
+	fmt.Fprint(p.w, "\033[?25l")
+	defer fmt.Fprint(p.w, "\033[?25h")

-	for range p.pos - 1 {
-		fmt.Fprint(p.w, "\033[A")
+	// clear already rendered progress lines
+	for i := range p.pos {
+		if i > 0 {
+			fmt.Fprint(p.w, "\033[A")
+		}
+		fmt.Fprint(p.w, "\033[2K\033[1G")
 	}

-	fmt.Fprint(p.w, "\033[1G")
-
 	// render progress lines
 	for i, state := range p.states {
-		fmt.Fprint(p.w, state.String(), "\033[K")
+		fmt.Fprint(p.w, state.String())
 		if i < len(p.states)-1 {
 			fmt.Fprint(p.w, "\n")
 		}
 	}

 	p.pos = len(p.states)
-	p.w.Flush()
 }

 func (p *Progress) start() {
 	p.ticker = time.NewTicker(100 * time.Millisecond)
-	// hide cursor
-	fmt.Fprint(p.w, "\033[?25l")
 	for range p.ticker.C {
 		p.render()
 	}
```
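The progress rewrite trades the old approach, a `bufio.Writer` plus the DEC synchronized-output escapes `ESC[?2026h`/`ESC[?2026l` (which not all terminals support), for plain unbuffered writes that hide the cursor during each redraw. The escapes it relies on are standard: `\033[A` moves the cursor up one line, `\033[2K` erases the current line, `\033[1G` returns to column 1, and `\033[?25l`/`\033[?25h` hide and show the cursor. A runnable toy redraw loop using the same sequences:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	fmt.Print("\033[?25l")       // hide cursor while redrawing
	defer fmt.Print("\033[?25h") // show it again on exit

	for n := 0; n <= 100; n += 25 {
		if n > 0 {
			fmt.Print("\033[A\033[A") // move up over the two lines rendered last tick
		}
		fmt.Printf("\033[2K\033[1Gdownload: %3d%%\n", n) // erase line, go to column 1, redraw
		fmt.Printf("\033[2K\033[1Gverify:   %3d%%\n", n/2)
		time.Sleep(200 * time.Millisecond)
	}
}
```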
```diff
@@ -813,8 +813,6 @@ func (s *Server) loadModel(
 		panic(err)
 	}

-	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
-
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
```

```diff
@@ -883,6 +881,7 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
+	slog.Info("starting ollama engine")
 	// TODO(jessegross): Some system info would be useful

 	server := &Server{
 		batchSize: *batchSize,
```
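These runner hunks are the consumer side of the `SystemInfo` removal earlier in the diff: with `Backend.SystemInfo()` gone from the `ml` interface and its ggml implementation, `loadModel` can no longer log it, and the `TODO(jessegross): Some system info would be useful` comment plus the new `starting ollama engine` log line stand in until a replacement exists.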