Compare commits


1 Commit

Author: ParthSareen
SHA1: 0de5bbd0fe
Message: sample: use json unmarshal for sampling params
Date: 2025-03-20 15:03:42 -04:00
52 changed files with 400 additions and 1295 deletions

View File

@@ -86,9 +86,9 @@ if(CMAKE_CUDA_COMPILER)
)
endif()
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a|1200|1201):xnack[+-]$"
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a):xnack[+-]$"
CACHE STRING
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a|1200|1201):xnack[+-]$\"."
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a):xnack[+-]$\"."
)
check_language(HIP)
@@ -97,7 +97,7 @@ if(CMAKE_HIP_COMPILER)
find_package(hip REQUIRED)
if(NOT AMDGPU_TARGETS)
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$")
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$")
elseif(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
list(FILTER AMDGPU_TARGETS EXCLUDE REGEX ${WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX})
endif()

View File

@@ -56,7 +56,7 @@
"name": "ROCm 6",
"inherits": [ "ROCm" ],
"cacheVariables": {
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
}
}
],

View File

@@ -394,7 +394,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
- [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance)
- [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
### Cloud
@@ -435,7 +434,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
### Apple Vision Pro
@@ -514,7 +512,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
### Mobile

View File

@@ -1,178 +0,0 @@
package benchmark
import (
"context"
"flag"
"fmt"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// Command line flags
var modelFlag string
func init() {
flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
flag.Lookup("m").DefValue = "model"
}
// modelName returns the model name from flags, failing the test if not set
func modelName(b *testing.B) string {
if modelFlag == "" {
b.Fatal("Error: -m flag is required for benchmark tests")
}
return modelFlag
}
type TestCase struct {
name string
prompt string
maxTokens int
}
// runGenerateBenchmark contains the common generate and metrics logic
func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
start := time.Now()
var ttft time.Duration
var metrics api.Metrics
err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
if ttft == 0 && resp.Response != "" {
ttft = time.Since(start)
}
if resp.Done {
metrics = resp.Metrics
}
return nil
})
// Report custom metrics as part of the benchmark results
b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
// Token throughput metrics
promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds()
genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds()
b.ReportMetric(promptThroughput, "prompt_tok/s")
b.ReportMetric(genThroughput, "gen_tok/s")
// Token counts
b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
if err != nil {
b.Fatal(err)
}
}
// BenchmarkColdStart runs benchmarks with model loading from cold state
func BenchmarkColdStart(b *testing.B) {
client := setup(b)
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
m := modelName(b)
for _, tt := range tests {
b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
ctx := context.Background()
// Set number of tokens as our throughput metric
b.SetBytes(int64(tt.maxTokens))
for b.Loop() {
b.StopTimer()
// Ensure model is unloaded before each iteration
unload(client, m, b)
b.StartTimer()
req := &api.GenerateRequest{
Model: m,
Prompt: tt.prompt,
Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
}
runGenerateBenchmark(b, ctx, client, req)
}
})
}
}
// BenchmarkWarmStart runs benchmarks with pre-loaded model
func BenchmarkWarmStart(b *testing.B) {
client := setup(b)
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
m := modelName(b)
for _, tt := range tests {
b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
ctx := context.Background()
// Pre-warm the model
warmup(client, m, tt.prompt, b)
// Set number of tokens as our throughput metric
b.SetBytes(int64(tt.maxTokens))
for b.Loop() {
req := &api.GenerateRequest{
Model: m,
Prompt: tt.prompt,
Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
}
runGenerateBenchmark(b, ctx, client, req)
}
})
}
}
// setup verifies server and model availability
func setup(b *testing.B) *api.Client {
client, err := api.ClientFromEnvironment()
if err != nil {
b.Fatal(err)
}
if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
b.Fatalf("Model unavailable: %v", err)
}
return client
}
// warmup ensures the model is loaded and warmed up
func warmup(client *api.Client, model string, prompt string, b *testing.B) {
for range 3 {
err := client.Generate(
context.Background(),
&api.GenerateRequest{
Model: model,
Prompt: prompt,
Options: map[string]interface{}{"num_predict": 50, "temperature": 0.1},
},
func(api.GenerateResponse) error { return nil },
)
if err != nil {
b.Logf("Error during model warm-up: %v", err)
}
}
}
// unload forces model unloading using KeepAlive: 0 parameter
func unload(client *api.Client, model string, b *testing.B) {
req := &api.GenerateRequest{
Model: model,
KeepAlive: &api.Duration{Duration: 0},
}
if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
b.Logf("Unload error: %v", err)
}
time.Sleep(1 * time.Second)
}

View File

@@ -703,8 +703,6 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
for _, k := range keys {
var v string
switch vData := resp.ModelInfo[k].(type) {
case bool:
v = fmt.Sprintf("%t", vData)
case string:
v = vData
case float64:

View File

@@ -87,8 +87,6 @@ func TestShowInfo(t *testing.T) {
ModelInfo: map[string]any{
"general.architecture": "test",
"general.parameter_count": float64(8_000_000_000),
"some.true_bool": true,
"some.false_bool": false,
"test.context_length": float64(1000),
"test.embedding_length": float64(11434),
},
@@ -113,8 +111,6 @@ func TestShowInfo(t *testing.T) {
Metadata
general.architecture test
general.parameter_count 8e+09
some.false_bool false
some.true_bool true
test.context_length 1000
test.embedding_length 11434

View File

@@ -196,10 +196,6 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
conv = &phi3Model{}
case "Qwen2ForCausalLM":
conv = &qwen2Model{}
case "Qwen2_5_VLForConditionalGeneration":
conv = &qwen25VLModel{}
case "Qwen2_5OmniModel":
conv = &qwen25OmniModel{}
case "BertModel":
conv = &bertModel{}
case "CohereForCausalLM":

View File

@@ -1,209 +0,0 @@
package convert
import (
"bytes"
"encoding/binary"
"io"
"log/slog"
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/x448/float16"
"github.com/ollama/ollama/fs/ggml"
)
type qwen25OmniModel struct {
ModelParameters
TalkerModel struct {
AudioEndTokenID uint32 `json:"audio_end_token_id"`
AudioStartTokenID uint32 `json:"audio_start_token_id"`
AudioTokenIndex uint32 `json:"audio_token_index"`
HeadDim uint32 `json:"head_dim"`
HiddenSize uint32 `json:"hidden_size"`
ImageTokenIndex uint32 `json:"image_token_index"`
IntermediateSize uint32 `json:"intermediate_size"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
MaxWindowLayers uint32 `json:"max_window_layers"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
HiddenLayers uint32 `json:"num_hidden_layers"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RMSNormEPS float32 `json:"rms_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
VideoTokenIndex uint32 `json:"video_token_index"`
VisionEndTokenID uint32 `json:"vision_end_token_id"`
VisionStartTokenID uint32 `json:"vision_start_token_id"`
} `json:"talker_config"`
ThinkerModel struct {
TextModel struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
HiddenLayers uint32 `json:"num_hidden_layers"`
RopeTheta float32 `json:"rope_theta"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RMSNormEPS float32 `json:"rms_norm_eps"`
} `json:"text_config"`
} `json:"thinker_config"`
VisionModel struct {
} `json:"vision_config"`
Token2WavModel struct {
} `json:"token2wav_config"`
}
var _ ModelConverter = (*qwen25OmniModel)(nil)
func (q *qwen25OmniModel) KV(t *Tokenizer) ggml.KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen25omni"
kv["qwen25omni.block_count"] = q.ThinkerModel.TextModel.HiddenLayers
kv["qwen25omni.context_length"] = q.ThinkerModel.TextModel.MaxPositionEmbeddings
kv["qwen25omni.embedding_length"] = q.ThinkerModel.TextModel.HiddenSize
kv["qwen25omni.feed_forward_length"] = q.ThinkerModel.TextModel.IntermediateSize
kv["qwen25omni.attention.head_count"] = q.ThinkerModel.TextModel.NumAttentionHeads
kv["qwen25omni.attention.head_count_kv"] = q.ThinkerModel.TextModel.NumKeyValueHeads
kv["qwen25omni.rope.freq_base"] = q.ThinkerModel.TextModel.RopeTheta
kv["qwen25omni.attention.layer_norm_rms_epsilon"] = q.ThinkerModel.TextModel.RMSNormEPS
return kv
}
func (q *qwen25OmniModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
if strings.HasSuffix(t.Name(), "patch_embed.proj.weight") {
var buf bytes.Buffer
t.WriteTo(&buf)
newTensors := splitPatchEmbed(buf, t.Kind(), t.Shape())
out = append(out, newTensors...)
} else {
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
}
return out
}
func splitPatchEmbed(buf bytes.Buffer, kind uint32, shape []uint64) []ggml.Tensor {
slog.Debug("patch stuff", "kind", kind, "shape", shape)
if kind != tensorKindF16 {
panic("tensor is of wrong type")
}
if len(shape) != 5 || (len(shape) == 5 && shape[2] != 2) {
panic("wrong sized tensor")
}
// determine the size of the tensor based on its shape
shapeToSize := func(s []int) int {
r := 1
for _, n := range s {
r *= int(n)
}
return r
}
// tensor.WithShape() wants []int
intShape := make([]int, len(shape))
for i, v := range shape {
intShape[i] = int(v)
}
u16s := make([]uint16, shapeToSize(intShape))
if err := binary.Read(&buf, binary.LittleEndian, u16s); err != nil {
panic("bad read")
}
f32s := make([]float32, len(u16s))
for i := range u16s {
f32s[i] = float16.Frombits(u16s[i]).Float32()
}
newTensors := []ggml.Tensor{}
getDataFromSlice := func(f32s []float32, shape []int, s []tensor.Slice) patchEmbed {
slog.Debug("getDataFromSlice", "num f32s", len(f32s), "shape", shape)
n := tensor.New(tensor.WithShape(shape...), tensor.WithBacking(f32s))
t, err := n.Slice(s...)
if err != nil {
panic(err)
}
ts, err := native.SelectF32(t.Materialize().(*tensor.Dense), 0)
if err != nil {
panic(err)
}
slog.Debug("first vals", "val 1", ts[0][0], "val 2", ts[0][1], "val 3", ts[0][2])
f16s := make(patchEmbed, shapeToSize(shape))
for r, row := range ts {
for c, col := range row {
f16s[r+c] = float16.Fromfloat32(col).Bits()
}
}
return f16s
}
p := getDataFromSlice(f32s, intShape, []tensor.Slice{nil, nil, tensor.S(0, 1, 1), nil, nil})
newTensors = append(newTensors, ggml.Tensor{
Name: "patch_embed.proj.0.weight",
Kind: kind,
Shape: append(shape[:2], shape[3:]...),
WriterTo: p,
})
p = getDataFromSlice(f32s, intShape, []tensor.Slice{nil, nil, tensor.S(1, 2, 1), nil, nil})
newTensors = append(newTensors, ggml.Tensor{
Name: "patch_embed.proj.1.weight",
Kind: kind,
Shape: append(shape[:2], shape[3:]...),
WriterTo: p,
})
return newTensors
}
type patchEmbed []uint16
func (t patchEmbed) WriteTo(w io.Writer) (int64, error) {
err := binary.Write(w, binary.LittleEndian, t)
return 0, err
}
func (p *qwen25OmniModel) Replacements() []string {
return []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"thinker.audio_tower.layers", "a.blk",
"thinker.visual.blocks", "v.blk",
"thinker.model.layers", "blk",
"talker.model.layers", "tlk.blk",
"token2wav.code2wav_bigvgan_model", "t2w.b",
"token2wav.code2wav_dit_model", "t2w.d",
"input_layernorm", "attn_norm",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.q_proj", "attn_q",
"self_attn.o_proj", "attn_output",
"mlp.down_proj", "ffn_down",
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm",
"model.norm", "output_norm",
}
}

View File

@@ -1,81 +0,0 @@
package convert
import (
"bytes"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
type qwen25VLModel struct {
ModelParameters
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
HiddenLayers uint32 `json:"num_hidden_layers"`
RopeTheta float32 `json:"rope_theta"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RMSNormEPS float32 `json:"rms_norm_eps"`
VisionModel struct {
} `json:"vision_config"`
}
var _ ModelConverter = (*qwen25VLModel)(nil)
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen25vl"
kv["qwen25vl.block_count"] = q.HiddenLayers
kv["qwen25vl.context_length"] = q.MaxPositionEmbeddings
kv["qwen25vl.embedding_length"] = q.HiddenSize
kv["qwen25vl.feed_forward_length"] = q.IntermediateSize
kv["qwen25vl.attention.head_count"] = q.NumAttentionHeads
kv["qwen25vl.attention.head_count_kv"] = q.NumKeyValueHeads
kv["qwen25vl.rope.freq_base"] = q.RopeTheta
kv["qwen25vl.attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
return kv
}
func (q *qwen25VLModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
if strings.HasSuffix(t.Name(), "patch_embed.proj.weight") {
var buf bytes.Buffer
t.WriteTo(&buf)
newTensors := splitPatchEmbed(buf, t.Kind(), t.Shape())
out = append(out, newTensors...)
} else {
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
}
return out
}
func (p *qwen25VLModel) Replacements() []string {
return []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"visual.blocks", "v.blk",
"input_layernorm", "attn_norm",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.q_proj", "attn_q",
"self_attn.o_proj", "attn_output",
"mlp.down_proj", "ffn_down",
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm",
"model.norm", "output_norm",
}
}

View File

@@ -558,10 +558,6 @@ Final response:
{
"model": "llama3.2",
"created_at": "2023-08-04T19:22:45.499127Z",
"message": {
"role": "assistant",
"content": ""
},
"done": true,
"total_duration": 4883583458,
"load_duration": 1334875,

View File

@@ -1,59 +0,0 @@
# Benchmark
Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
## When to use
Run these benchmarks when:
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups
## Prerequisites
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
## Usage and Examples
>[!NOTE]
>All commands must be run from the root directory of the Ollama project.
Basic syntax:
```bash
go test -bench=. ./benchmark/... -m $MODEL_NAME
```
Required flags:
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark
Optional flags:
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
Common usage patterns:
Single benchmark run with a model specified:
```bash
go test -bench=. ./benchmark/... -m llama3.3
```
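For repeated runs with a longer budget, the optional flags combine with the same command, for example `go test -bench=. ./benchmark/... -m llama3.3 -count 5 -timeout 30m` (the flag values here are illustrative).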
## Output metrics
The benchmark reports several key metrics:
- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed
Each benchmark runs two scenarios:
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory
Three prompt lengths are tested for each scenario:
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)

View File

@@ -20,13 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
```shell
OLLAMA_CONTEXT_LENGTH=8192 ollama serve
```
By default, Ollama uses a context window size of 2048 tokens. This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context length to 8K, use: `OLLAMA_CONTEXT_LENGTH=8192 ollama serve`.
To change this when using `ollama run`, use `/set parameter`:

View File

@@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log
On **Linux** systems with systemd, the logs can be found with this command:
```shell
journalctl -u ollama --no-pager --follow --pager-end
journalctl -u ollama --no-pager
```
When you run Ollama in a **container**, the logs go to stdout/stderr in the container:

View File

@@ -413,7 +413,7 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
}, offset, nil
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCount()
headsKV := f.KV().HeadCountKV()
@@ -426,10 +426,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
layers := f.Tensors().GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
kv = make([]uint64, f.KV().BlockCount())
for i := range kv {
kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
}
kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
switch f.KV().Architecture() {
case "llama":
@@ -463,14 +460,16 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
case "mllama":
var visionTokens, tiles uint64 = 1601, 4
crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers")
for i := range kv {
if slices.Contains(crossAttentionLayers, uint32(i)) {
kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
4 * // sizeof(float32)
visionTokens *
tiles
}
if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
kv = headsKV *
(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
(2* // sizeof(float16)
(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
context +
4* // sizeof(float32)
uint64(crossAttentionLayers.size)* // num cross attention layers
visionTokens*
tiles)
}
fullOffload = max(
@@ -506,20 +505,6 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
4*embeddingHeadsK*context*8+
embedding*embeddingHeadsK*heads*9/16,
)
// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
// engine. Gemma3 always uses the Ollama engine.
if f.KV().Architecture() == "gemma3" {
const gemma3GlobalCacheCount = 6
slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch
for i := range kv {
// Every 6th layer is a global layer, which is the full context size that has already been set. The other
// layers are the smaller local (sliding) layers.
if (i+1)%gemma3GlobalCacheCount != 0 {
kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
}
}
}
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),
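As a rough illustration of what the restored scalar formula computes, below is a minimal Go sketch using assumed llama-style values (32 layers, 8 KV heads, 128-dimensional K and V heads, a 2048-token context, and an f16 cache at 2 bytes per element); none of these numbers come from this change.

```go
package main

import "fmt"

func main() {
	// Assumed, illustrative model parameters (not taken from this diff).
	var (
		context         uint64  = 2048 // tokens of context
		blockCount      uint64  = 32   // transformer layers
		headsKV         uint64  = 8    // KV heads (GQA)
		embeddingHeadsK uint64  = 128  // per-head K dimension
		embeddingHeadsV uint64  = 128  // per-head V dimension
		bytesPerElement float64 = 2    // f16 cache
	)

	// Mirrors the restored expression:
	// kv = context * blockCount * (embeddingHeadsK+embeddingHeadsV) * headsKV * bytesPerElement
	kv := uint64(float64(context*blockCount*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	fmt.Printf("estimated KV cache: %d bytes (%.0f MiB)\n", kv, float64(kv)/(1<<20))
	// Prints: estimated KV cache: 268435456 bytes (256 MiB)
}
```

With these values the estimate is 256 MiB in total, which the memory-estimation change further down then spreads evenly across layers as `kv / f.KV().BlockCount()`.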

View File

@@ -43,13 +43,8 @@ type Cache interface {
// ** cache management **
// Init sets up runtime parameters.
// backend: Used to allocate cache data storage and execute management operations (such as defrag)
// dtype: The data type for storing cache entries
// maxSequences: The maximum number of sequences stored in the cache - across all batches
// capacity: The number of cache entries to store, per sequence
// maxBatch: The maximum number of tokens that can occur in a single batch
Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int)
// Init sets up runtime parameters
Init(backend ml.Backend, dtype ml.DType, capacity int32)
// Close closes the cache and frees resources associated with it
Close()
@@ -57,7 +52,7 @@ type Cache interface {
// StartForward is called before the start of the model's forward pass.
// For each token in the coming batch, there must be a corresponding
// entry in positions and seqs.
StartForward(ctx ml.Context, batch input.Batch) error
StartForward(ctx ml.Context, opts input.Options) error
// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
CopyPrefix(srcSeq, dstSeq int, len int32)
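A hypothetical caller of the reverted interface could wire a cache up as in the sketch below; the import paths, the 2048-entry capacity, and the example positions are assumptions for illustration, not part of this change.

```go
package kvcache_test

import (
	"github.com/ollama/ollama/kvcache" // assumed import paths
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// warmCache shows the reverted call shapes: Init takes a single capacity
// argument again, and StartForward takes an input.Options value.
func warmCache(backend ml.Backend, ctx ml.Context) error {
	cache := kvcache.NewCausalCache(nil)
	cache.Init(backend, ml.DTypeF16, 2048)
	return cache.StartForward(ctx, input.Options{
		Positions: []int32{0, 1, 2},
		Sequences: []int{0, 0, 0},
	})
}
```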

View File

@@ -20,6 +20,7 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
// The mask is of shape history size, batch size
type Causal struct {
DType ml.DType
Capacity int32
windowSize int32
opts CausalOptions
@@ -97,7 +98,7 @@ func NewSWACache(windowSize int32, shift shiftFn) *Causal {
}
}
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
@@ -118,16 +119,9 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
c.config.MaskDType = ml.DTypeF32
}
var cacheSize int
if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) {
cacheSize = maxSequences * capacity
} else {
cacheSize = (maxSequences * int(c.windowSize)) + maxBatch
}
cacheSize = roundUp(cacheSize, c.config.CachePadding)
c.cells = make([]cacheCell, cacheSize)
c.DType = dtype
c.Capacity = int32(roundUp(int(capacity), c.config.CachePadding))
c.cells = make([]cacheCell, c.Capacity)
c.cellRanges = make(map[int]cellRange)
c.backend = backend
}
@@ -146,14 +140,12 @@ func (c *Causal) Close() {
}
}
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error {
c.curBatchSize = len(batch.Positions)
c.curSequences = batch.Sequences
c.curPositions = batch.Positions
func (c *Causal) StartForward(ctx ml.Context, opts input.Options) error {
c.curBatchSize = len(opts.Positions)
c.curSequences = opts.Sequences
c.curPositions = opts.Positions
c.opts.Except = nil
c.updateSlidingWindow()
var err error
c.curLoc, err = c.findStartLoc()
if errors.Is(err, ErrKvCacheFull) {
@@ -165,8 +157,8 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error {
}
c.curCellRange = newRange()
for i, pos := range batch.Positions {
seq := batch.Sequences[i]
for i, pos := range opts.Positions {
seq := opts.Sequences[i]
c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
@@ -218,51 +210,7 @@ func (c *Causal) findStartLoc() (int, error) {
}
}
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
}
func (c *Causal) updateSlidingWindow() {
if c.windowSize == math.MaxInt32 {
return
}
// create a map of unique sequences to the lowest position in that sequence
lowestPos := make(map[int]int32)
for i := range c.curPositions {
seq := c.curSequences[i]
pos, ok := lowestPos[seq]
if !ok {
pos = c.curPositions[i]
} else if c.curPositions[i] < pos {
pos = c.curPositions[i]
}
lowestPos[seq] = pos
}
// delete any entries that are beyond the window of the oldest position in the sequence
for seq, pos := range lowestPos {
oldRange, ok := c.cellRanges[seq]
if !ok {
continue
}
newRange := newRange()
for i := oldRange.min; i <= oldRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
if c.cells[i].pos < pos-c.windowSize {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
} else {
newRange.min = min(newRange.min, i)
newRange.max = max(newRange.max, i)
}
}
}
c.cellRanges[seq] = newRange
}
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
}
func roundDown(length, pad int) int {
@@ -317,7 +265,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
return maskTensor, nil
}
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
func (c *Causal) moveCells(ctx ml.Context, src, dst, len int) {
for i, key := range c.keys {
if key == nil {
continue
@@ -327,8 +275,8 @@ func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
numKVHeads := key.Dim(1)
rowSize := key.Stride(2)
kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)
kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*len)
kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*len)
value := c.values[i]
var vSrcView, vDstView ml.Tensor
@@ -336,14 +284,14 @@ func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
vHeadDim := value.Dim(1)
elemSize := value.Stride(0)
vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
vSrcView = value.View(ctx, elemSize*src, len, int(c.Capacity)*elemSize, vHeadDim*numKVHeads)
vDstView = value.View(ctx, elemSize*dst, len, int(c.Capacity)*elemSize, vHeadDim*numKVHeads)
} else {
vHeadDim := value.Dim(0)
rowSize := value.Stride(2)
vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*len)
vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*len)
}
ctx.Forward(
@@ -373,8 +321,7 @@ func (c *Causal) defrag() {
ctx := c.backend.NewContext()
// For every move, 6 tensors are required per layer (2 views and a
// copy for each of k and v). We also need to refer to the original
// k and v cache tensors - once per layer, not per move.
// copy for each of k and v).
layers := 0
for _, key := range c.keys {
if key == nil {
@@ -383,7 +330,7 @@ func (c *Causal) defrag() {
layers++
}
maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
maxMoves := ctx.MaxGraphNodes() / (6 * layers)
moves := 0
var pendingSrc, pendingDst, pendingLen int
@@ -532,14 +479,14 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
}
if _, ok := c.keys[c.curLayer]; !ok {
c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, len(c.cells))
c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, int(c.Capacity))
}
if _, ok := c.values[c.curLayer]; !ok {
if c.config.PermutedV {
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, len(c.cells), vHeadDim, numKVHeads)
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, int(c.Capacity), vHeadDim, numKVHeads)
} else {
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, len(c.cells))
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, int(c.Capacity))
}
}
@@ -550,7 +497,7 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
elemSize := c.values[c.curLayer].Stride(0)
value = value.Permute(ctx, 1, 2, 0, 3)
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, int(c.Capacity)*elemSize, vHeadDim*numKVHeads)))
} else {
rowSize := c.values[c.curLayer].Stride(2)

View File

@@ -25,7 +25,7 @@ func TestStore(t *testing.T) {
cache := NewCausalCache(nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -58,11 +58,11 @@ func TestSWA(t *testing.T) {
cache := NewSWACache(1, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF32, 16)
tests := []testCase{
{
name: "FirstBatch",
name: "SlidingWindow",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
@@ -71,16 +71,6 @@ func TestSWA(t *testing.T) {
expectedShape: []int{1, 1, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
},
{
name: "SecondBatch",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{4, 5},
expected: []float32{5, 6, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))},
},
}
testCache(t, backend, cache, tests)
@@ -91,7 +81,7 @@ func TestSequences(t *testing.T) {
cache := NewCausalCache(nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -126,7 +116,7 @@ func TestRemove(t *testing.T) {
})
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -191,7 +181,7 @@ func TestDefrag(t *testing.T) {
})
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -239,7 +229,7 @@ func TestCopy(t *testing.T) {
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
cache.Init(backend, ml.DTypeF16, 16)
tests := []testCase{
{
@@ -280,7 +270,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
context := backend.NewContext()
defer context.Close()
err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs})
err := cache.StartForward(context, input.Options{Positions: test.pos, Sequences: test.seqs})
if err != nil {
panic(err)
}
@@ -362,6 +352,7 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
}
func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Output() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }
func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
@@ -462,7 +453,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
panic("not implemented")
}
func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, config ml.RoPEConfig) ml.Tensor {
func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim, ropeType uint32, base, scale float32) ml.Tensor {
panic("not implemented")
}

View File

@@ -49,7 +49,7 @@ func NewEncoderCache() *EncoderCache {
}
}
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
@@ -58,10 +58,6 @@ func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, ca
c.config = &config
}
if maxSequences > 1 {
panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences))
}
if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
}
@@ -83,10 +79,10 @@ func (c *EncoderCache) Close() {
}
}
func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error {
func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error {
// We work with the most recent image
if len(batch.Multimodal) > 0 {
c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
if len(opts.Multimodal) > 0 {
c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index]
}
return nil

View File

@@ -23,9 +23,9 @@ func NewWrapperCache(caches ...Cache) *WrapperCache {
}
}
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
for _, cache := range c.caches {
cache.Init(backend, dtype, maxSequences, capacity, maxBatch)
cache.Init(backend, dtype, capacity)
}
}
@@ -41,14 +41,14 @@ func (c *WrapperCache) Close() {
}
}
func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch) error {
func (c *WrapperCache) StartForward(ctx ml.Context, opts input.Options) error {
for i, cache := range c.caches {
err := cache.StartForward(ctx, batch)
err := cache.StartForward(ctx, opts)
if err != nil {
// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
for j := i - 1; j >= 0; j-- {
for k := range batch.Positions {
_ = c.caches[j].Remove(batch.Sequences[k], batch.Positions[k], math.MaxInt32)
for k := range opts.Positions {
_ = c.caches[j].Remove(opts.Sequences[k], opts.Positions[k], math.MaxInt32)
}
}
return err

View File

@@ -1,103 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Saman <saman.khatir@amd.com>
Date: Wed, 19 Mar 2025 14:02:26 -0700
Subject: [PATCH] add rdna4 support
---
ggml/src/ggml-cuda/common.cuh | 6 ++++--
ggml/src/ggml-cuda/mmq.cu | 2 +-
ggml/src/ggml-cuda/mmq.cuh | 4 ++--
ggml/src/ggml-cuda/mmvq.cu | 4 ++--
ggml/src/ggml-cuda/vendors/hip.h | 4 ++++
5 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index adf0d3ec..b24593fc 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -61,11 +61,13 @@
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
@@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
+#elif defined(RDNA3) || defined(RDNA4)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 10f2ebb1..933d945c 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
- return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+ return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 0451c65f..66ce2bc9 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 4fb466ca..23ae7abc 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 81964611..a62544b5 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -150,6 +150,10 @@
#define CDNA
#endif
+#if defined(__gfx1200__) || defined(__gfx1201__)
+#define RDNA4
+#endif
+
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3

View File

@@ -15,12 +15,12 @@ import (
)
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
estimate := EstimateGPULayers(gpus, f, projectors, opts)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
@@ -71,7 +71,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
@@ -137,19 +137,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
}
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct)
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
if len(kv) > 0 {
layerSize += kv[0]
}
var kvTotal uint64
for _, kvLayer := range kv {
kvTotal += kvLayer
}
// KV is proportional to the number of layers
layerSize += kv / f.KV().BlockCount()
if graphPartialOffload == 0 {
graphPartialOffload = f.KV().GQA() * kvTotal / 6
graphPartialOffload = f.KV().GQA() * kv / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
@@ -223,7 +217,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size()
layerSize += kv[i]
layerSize += kv / f.KV().BlockCount()
memoryWeights += blk.Size()
}
@@ -321,7 +315,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
layersRequested: opts.NumGPU,
layersModel: int(f.KV().BlockCount()) + 1,
availableList: availableList,
kv: kvTotal,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
@@ -380,7 +374,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
"total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(m.memoryWeights),
// memory of non-repeating layers

View File

@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
@@ -112,7 +112,7 @@ func TestEstimateGPULayers(t *testing.T) {
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64

View File

@@ -109,7 +109,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
gpus = discover.GetCPUInfo()
}
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
estimate := EstimateGPULayers(gpus, f, projectors, opts)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:

View File

@@ -2,7 +2,6 @@ package ml
import (
"bytes"
"context"
"encoding/binary"
"fmt"
"os"
@@ -61,10 +60,6 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// Progress is a callback function that allows reporting percentage completion
// of model loading
Progress func(float32)
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
@@ -81,9 +76,9 @@ type BackendParams struct {
FlashAttention bool
}
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
var backends = make(map[string]func(*os.File, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
@@ -91,9 +86,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
backends[name] = f
}
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
func NewBackend(f *os.File, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(ctx, f, params)
return backend(f, params)
}
return nil, fmt.Errorf("unsupported backend")
@@ -110,61 +105,16 @@ type Context interface {
MaxGraphNodes() int
Close()
// Input returns a context appropriate for creating tensors that are
// inputs to the model (which includes things like output locations)
// Input returns a context appropriate for creating input tensors
Input() Context
// Output returns a context appropriate for creating output tensors
Output() Context
// Layer returns a context appropriate for creating intermediate tensors
Layer(int) Context
}
// RopeType represents different RoPE (Rotary Position Embedding) implementation types
type RopeType int
// Available RoPE implementation types
const (
RopeTypeNormal RopeType = iota // Standard RoPE implementation
RopeTypeNeox // NeoX-style RoPE implementation
RopeTypeMRoPE // Multi-scale RoPE implementation
RopeTypeVision // Vision-specific RoPE implementation
)
type YarnConfig struct {
YarnCtxTrain int // Context size used during training (for YaRN scaling)
YarnExtFactor float32 // Extension factor for YaRN
YarnAttnFactor float32 // Attention scaling factor for YaRN
YarnBetaFast float32 // Fast decay parameter for YaRN
YarnBetaSlow float32 // Slow decay parameter for YaRN
}
// DefaultYarnConfig returns a default configuration for YaRN (Yet Another Recurrent Network)
func DefaultYarnConfig(nCtx int32) *YarnConfig {
return &YarnConfig{
YarnCtxTrain: int(nCtx),
YarnExtFactor: 0.0,
YarnAttnFactor: 1.0,
YarnBetaFast: 32.0,
YarnBetaSlow: 1.0,
}
}
// RoPEConfig holds configuration for Rotary Position Embedding
type RoPEConfig struct {
// Dim is the dimensionality for applying rotary embeddings
Dim uint32
// Type specifies the RoPE implementation variant
Type RopeType
// Base controls frequency decay for the embeddings
Base float32
// Scale allows scaling the effective context length
Scale float32
*YarnConfig
}
type Tensor interface {
Dim(n int) int
Stride(n int) int
@@ -188,7 +138,7 @@ type Tensor interface {
AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, config RoPEConfig) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
Tanh(ctx Context) Tensor
GELU(ctx Context) Tensor
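For illustration, a hypothetical caller of the reverted `RoPE` signature might look like the sketch below; the package, helper name, and head dimension are assumptions for the example, and the NeoX type value 2 matches the GGML constants shown later in this diff.

```go
package model

import "github.com/ollama/ollama/ml" // assumed import path

// ropeTypeNeox mirrors the GGML NeoX rope type constant (value 2).
const ropeTypeNeox uint32 = 2

// applyRoPE shows the reverted call shape: dim, rope type, base, and scale
// are passed directly instead of an ml.RoPEConfig value.
func applyRoPE(ctx ml.Context, q, positions ml.Tensor, headDim uint32) ml.Tensor {
	return q.RoPE(ctx, positions, nil, headDim, ropeTypeNeox, 10000.0, 1.0)
}
```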

View File

@@ -9,17 +9,15 @@ package ggml
import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"maps"
"os"
"runtime"
"slices"
"strconv"
"strings"
"sync/atomic"
"unicode"
"unsafe"
@@ -48,6 +46,9 @@ type Backend struct {
// input is the backend used for inputs
input *C.struct_ggml_backend_buffer_type
// output is the backend used for outputs
output *C.struct_ggml_backend_buffer_type
// layers is the backend used for repeating layers
layers map[int]*C.struct_ggml_backend_buffer_type
@@ -57,7 +58,7 @@ type Backend struct {
maxGraphNodes int
}
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
meta, n, err := fs.Decode(r, -1)
if err != nil {
return nil, err
@@ -296,16 +297,12 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
}
var doneBytes atomic.Uint64
totalBytes := uint64(n) - meta.Tensors().Offset
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
// concurrently read in tensor data. uses a section reader which is safe for concurrent reads
sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
var g errgroup.Group
for _, t := range meta.Tensors().Items() {
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
target := targets[t.Name][i]
for _, target := range targets[t.Name] {
g.Go(func() error {
if target == "" {
target = t.Name
}
@@ -315,43 +312,24 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
return fmt.Errorf("unassigned tensor: %s", t.Name)
}
tts[i] = tt
}
bts := C.malloc(C.size_t(t.Size()))
if bts == nil {
return errors.New("failed to allocate tensor buffer")
}
defer C.free(bts)
sr := io.NewSectionReader(r, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)
var s uint64
for s < t.Size() {
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
return err
buf := unsafe.Slice((*byte)(bts), t.Size())
n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), buf)
if err != nil || n != len(buf) {
return errors.New("read failed")
}
for _, tt := range tts {
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
}
s += uint64(n)
if params.Progress != nil {
done := doneBytes.Add(uint64(n))
params.Progress(float32(done) / float32(totalBytes))
}
}
return nil
})
C.ggml_backend_tensor_set(tt, bts, 0, C.size_t(t.Size()))
return nil
})
}
}
// start a goroutine to cancel the errgroup if the parent context is done
go func() {
<-ctx.Done()
g.Go(func() error {
return ctx.Err()
})
}()
if err := g.Wait(); err != nil {
return nil, err
}
@@ -397,7 +375,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
),
input: deviceBufferTypes[input.d],
input: deviceBufferTypes[input.d],
output: deviceBufferTypes[output.d],
layers: func() map[int]*C.struct_ggml_backend_buffer_type {
m := make(map[int]*C.struct_ggml_backend_buffer_type)
for i, layer := range layers {
@@ -478,6 +457,19 @@ func (c Context) Input() ml.Context {
return &c
}
func (c Context) Output() ml.Context {
if c.b.output != nil {
return &Context{
b: c.b,
ctx: c.ctx,
buft: c.b.output,
maxGraphNodes: c.maxGraphNodes,
}
}
return &c
}
func (c Context) Layer(i int) ml.Context {
if buft, ok := c.b.layers[i]; ok {
return &Context{
@@ -907,8 +899,6 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
}
}
// GGML RoPE types
// These are the types used in the C implementation of RoPE
const (
ropeTypeNorm C.int = 0
ropeTypeNeox C.int = 2
@@ -916,8 +906,7 @@ const (
ropeTypeVision C.int = 24
)
// RoPE applies Rotary Position Embeddings to the tensor
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, config ml.RoPEConfig) ml.Tensor {
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
if ropeFactors == nil {
ropeFactors = &Tensor{b: t.b}
}
@@ -927,41 +916,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, config
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
}
if config.YarnConfig == nil {
config.YarnConfig = ml.DefaultYarnConfig(131072) // 131072 is the default for LLaMA, so it is common at the time of writing
}
// Map Go RopeType to C implementation constants
var ropeTypeC C.int
switch config.Type {
case ml.RopeTypeNormal:
ropeTypeC = ropeTypeNorm
case ml.RopeTypeNeox:
ropeTypeC = ropeTypeNeox
case ml.RopeTypeMRoPE:
ropeTypeC = ropeTypeMrope
case ml.RopeTypeVision:
ropeTypeC = ropeTypeVision
default:
ropeTypeC = ropeTypeNorm
}
return &Tensor{
b: t.b,
t: C.ggml_rope_ext(
ctx.(*Context).ctx,
dequant,
positionIDs.(*Tensor).t,
ropeFactors.(*Tensor).t,
C.int(config.Dim),
ropeTypeC,
C.int(config.YarnCtxTrain),
C.float(config.Base),
C.float(config.Scale),
C.float(config.YarnExtFactor),
C.float(config.YarnAttnFactor),
C.float(config.YarnBetaFast),
C.float(config.YarnBetaSlow),
ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
C.int(ropeDim),
C.int(ropeType),
131072, // YaRN n_ctx_train
C.float(ropeBase),
C.float(ropeScale),
0., // YaRN ext_factor
1., // YaRN attn_factor
32., // YaRN beta_fast
1., // YaRN beta_slow
),
}
}

View File

@@ -61,13 +61,11 @@
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
@@ -388,7 +386,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
#elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;

View File

@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

View File

@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1)

View File

@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4)
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;

View File

@@ -150,10 +150,6 @@
#define CDNA
#endif
#if defined(__gfx1200__) || defined(__gfx1201__)
#define RDNA4
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3

View File

@@ -1,7 +1,5 @@
package input
import "github.com/ollama/ollama/ml"
// Input represents one token in the input stream
type Input struct {
// Token is a single element of text.
@@ -35,24 +33,11 @@ type MultimodalIndex struct {
Multimodal any
}
// Batch contains the inputs for a model forward pass
type Batch struct {
// Inputs is the input tokens, including placeholders for multimodal inputs.
Inputs ml.Tensor
// Multimodal is a set of multimodal embeddings previously created by
// EncodeMultimodal, along with an index into Inputs. Unused for text-only
// models or for batches without multimodal elements.
// Options contains the inputs for a model forward pass
type Options struct {
Inputs []int32
Multimodal []MultimodalIndex
// Positions is the position for each Input, relative to its sequence. Equal
// in length to Inputs.
Positions []int32
// Sequences is the sequence for each Input. Equal in length to Inputs.
Sequences []int
// Outputs are the set of indicies into Inputs for which output data should
// be returned.
Outputs []int32
Positions []int32
Sequences []int
Outputs []int32
}

View File

@@ -1,7 +1,6 @@
package model
import (
"context"
"errors"
"fmt"
_ "image/jpeg"
@@ -27,7 +26,7 @@ var ErrNoVisionModel = errors.New("this model is missing data required for image
// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
type Model interface {
Forward(ml.Context, input.Batch) (ml.Tensor, error)
Forward(ml.Context, input.Options) (ml.Tensor, error)
Backend() ml.Backend
Config() config
@@ -95,14 +94,14 @@ func Register(name string, f func(ml.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
func New(modelPath string, params ml.BackendParams) (Model, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
b, err := ml.NewBackend(ctx, r, params)
b, err := ml.NewBackend(r, params)
if err != nil {
return nil, err
}
@@ -281,30 +280,24 @@ func canNil(t reflect.Type) bool {
t.Kind() == reflect.Slice
}
func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Tensor, error) {
if len(batch.Positions) != len(batch.Sequences) {
return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(batch.Positions), len(batch.Sequences))
func Forward(ctx ml.Context, m Model, opts input.Options) (ml.Tensor, error) {
if len(opts.Positions) != len(opts.Sequences) {
return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(opts.Positions), len(opts.Sequences))
}
if len(batch.Positions) < 1 {
if len(opts.Positions) < 1 {
return nil, errors.New("batch size cannot be less than 1")
}
var err error
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return nil, err
}
cache := m.Config().Cache
if cache != nil {
err := cache.StartForward(ctx, batch)
err := cache.StartForward(ctx, opts)
if err != nil {
return nil, err
}
}
t, err := m.Forward(ctx, batch)
t, err := m.Forward(ctx, opts)
if err != nil {
return nil, err
}

View File

@@ -163,7 +163,7 @@ func TestGetTextProcessor(t *testing.T) {
type notTextProcessorModel struct{}
func (notTextProcessorModel) Forward(ml.Context, input.Batch) (ml.Tensor, error) {
func (notTextProcessorModel) Forward(ml.Context, input.Options) (ml.Tensor, error) {
panic("unimplemented")
}

View File

@@ -13,11 +13,10 @@ import (
type Options struct {
hiddenSize, numHeads, numKVHeads int
attnKeyLen, attnValLen int
eps float32
eps, ropeBase, ropeScale float32
attnLogitSoftcap float32
finalLogitSoftcap float32
largeModelScaling bool
ropeConfig ml.RoPEConfig
}
type Model struct {
@@ -56,15 +55,10 @@ func New(c ml.Config) (model.Model, error) {
attnKeyLen: int(c.Uint("attention.key_length")),
attnValLen: int(c.Uint("attention.value_length")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base", 10000.0),
ropeScale: c.Float("rope.freq_scale", 1.0),
attnLogitSoftcap: c.Float("attn_logit_softcapping"),
finalLogitSoftcap: c.Float("final_logit_softcapping"),
ropeConfig: ml.RoPEConfig{
Base: c.Float("rope.freq_base", 10000.0),
Scale: c.Float("rope.freq_scale", 1.0),
Dim: c.Uint("attention.key_length"),
Type: ml.RopeTypeNormal,
YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 131072))),
},
},
}
@@ -84,10 +78,11 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, nil, opts.ropeConfig)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -97,7 +92,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, nil, opts.ropeConfig)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -127,7 +122,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, nil, m.ropeConfig), nil
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
}
type MLP struct {
@@ -173,18 +168,23 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
return hiddenState.Add(ctx, residual)
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
if len(m.Layers) == gemma27BLayerCount {

View File

@@ -139,18 +139,23 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil
}
func init() {

View File

@@ -13,11 +13,9 @@ import (
type TextOptions struct {
hiddenSize, numHeads, numKVHeads int
attnKeyLen, attnValLen int
eps float32
eps, ropeScale float32
ropeLocalBase, ropeGlobalBase float32
largeModelScaling bool
ropeLocalConfig ml.RoPEConfig
ropeGlobalConfig ml.RoPEConfig
}
type TextModel struct {
@@ -58,27 +56,15 @@ func newTextModel(c ml.Config) *TextModel {
),
Layers: make([]TextLayer, numBlocks),
TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
attnKeyLen: int(c.Uint("attention.key_length", 256)),
attnValLen: int(c.Uint("attention.value_length", 256)),
eps: c.Float("attention.layer_norm_rms_epsilon", 1e-06),
ropeLocalConfig: ml.RoPEConfig{
Base: c.Float("rope.local.freq_base", 10000.0),
Scale: c.Float("rope.freq_scale", 1.0),
Dim: c.Uint("attention.key_length", 256),
Type: ml.RopeTypeNeox,
YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 131072))),
},
ropeGlobalConfig: ml.RoPEConfig{
Base: c.Float("rope.global.freq_base", 1000000.0),
Scale: c.Float("rope.freq_scale", 1.0),
Dim: c.Uint("attention.key_length", 256),
Type: ml.RopeTypeNeox,
YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 131072))),
},
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
attnKeyLen: int(c.Uint("attention.key_length", 256)),
attnValLen: int(c.Uint("attention.value_length", 256)),
eps: c.Float("attention.layer_norm_rms_epsilon", 1e-06),
ropeLocalBase: c.Float("rope.local.freq_base", 10000.0),
ropeGlobalBase: c.Float("rope.global.freq_base", 1000000.0),
ropeScale: c.Float("rope.freq_scale", 1.0),
},
}
@@ -100,16 +86,17 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
ropeConfig := opts.ropeLocalConfig
ropeBase := opts.ropeLocalBase
if (layer+1)%gemmaGlobalCacheCount == 0 {
ropeConfig = opts.ropeGlobalConfig
ropeBase = opts.ropeGlobalBase
}
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
q = q.RoPE(ctx, positionIDs, nil, ropeConfig)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -120,7 +107,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
k = k.RoPE(ctx, positionIDs, nil, ropeConfig)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -133,12 +120,12 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
ropeConfig := m.ropeLocalConfig
ropeBase := m.TextOptions.ropeLocalBase
if (layer+1)%gemmaGlobalCacheCount == 0 {
ropeConfig = m.ropeGlobalConfig
ropeBase = m.TextOptions.ropeGlobalBase
}
return key.RoPE(ctx, shift, nil, ropeConfig), nil
return key.RoPE(ctx, shift, nil, uint32(m.TextOptions.attnKeyLen), uint32(2), ropeBase, m.TextOptions.ropeScale), nil
}
type TextMLP struct {
@@ -184,13 +171,13 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,
return hiddenState.Add(ctx, residual)
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, opts input.Options, cache kvcache.Cache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextOptions.hiddenSize)))
// set image embeddings
var except []int
for _, image := range batch.Multimodal {
for _, image := range opts.Multimodal {
visionOutputs := image.Multimodal.(ml.Tensor)
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

View File

@@ -14,8 +14,8 @@ import (
type Options struct {
hiddenSize, numHeads, numKVHeads int
eps float32
ropeConfig ml.RoPEConfig
eps, ropeBase, ropeScale float32
ropeDim uint32
}
type Model struct {
@@ -54,13 +54,9 @@ func New(c ml.Config) (model.Model, error) {
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeConfig: ml.RoPEConfig{
Base: c.Float("rope.freq_base"),
Scale: c.Float("rope.freq_scale", 1),
Dim: c.Uint("rope.dimension_count"),
Type: ml.RopeTypeNormal,
YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 131072))),
},
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
},
}
@@ -80,14 +76,15 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeConfig)
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeConfig)
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -100,7 +97,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeConfig), nil
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
}
type MLP struct {
@@ -142,18 +139,23 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
return hiddenState.Add(ctx, residual)
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)

View File

@@ -135,27 +135,32 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
return inputs, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 {
images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor)
if len(opts.Multimodal) > 0 {
images := opts.Multimodal[len(opts.Multimodal)-1].Multimodal.([]ml.Tensor)
if len(images) > 0 {
crossAttentionStates = images[len(images)-1]
}
}
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
// TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
return m.TextModel.Forward(ctx, inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
}
func init() {

View File

@@ -20,14 +20,15 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeConfig)
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeConfig)
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -42,7 +43,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
// This will only get called for layers in the cache, which are just the self attention layers
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeConfig), nil
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
}
return key, nil
@@ -197,8 +198,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
type TextModelOptions struct {
hiddenSize, numHeads, numKVHeads int
eps float32
ropeConfig ml.RoPEConfig
eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []uint32
}
@@ -239,14 +240,10 @@ func newTextModel(c ml.Config) *TextModel {
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
ropeConfig: ml.RoPEConfig{
Base: c.Float("rope.freq_base"),
Scale: c.Float("rope.freq_scale", 1),
Dim: c.Uint("rope.dimension_count"),
Type: ml.RopeTypeNormal,
YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 131072))),
},
},
}
}

View File

@@ -31,10 +31,8 @@ type InputCache struct {
cache kvcache.Cache
}
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) {
numCtx := kvSize / int32(numSlots)
if numCtx < 1 {
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
if kvSize/int32(numSlots) < 1 {
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
}
@@ -46,11 +44,11 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots
cache := model.Config().Cache
if cache != nil {
cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), numSlots, int(numCtx), batchSize)
cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize)
}
return &InputCache{
numCtx: numCtx,
numCtx: kvSize / int32(numSlots),
enabled: cache != nil,
slots: slots,
multiUserCache: multiUserCache,
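The NewInputCache hunk above divides the total KV cache evenly across parallel slots and rejects configurations that leave a slot with zero entries. A small standalone sketch of that calculation, assuming only the kvSize/numSlots arithmetic shown in the diff:

// Sketch of the per-sequence context split in NewInputCache: the KV cache
// is divided evenly across parallel slots, and a split that leaves a slot
// with no entries is rejected up front.
package main

import "fmt"

func perSlotContext(kvSize int32, numSlots int) (int32, error) {
	numCtx := kvSize / int32(numSlots)
	if numCtx < 1 {
		return 0, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
	}
	return numCtx, nil
}

func main() {
	fmt.Println(perSlotContext(8192, 4)) // 2048 <nil>
	fmt.Println(perSlotContext(2, 4))    // 0 must have at least one kv cache entry per parallel sequence (kv: 2 parallel: 4)
}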

View File

@@ -348,8 +348,7 @@ func (s *Server) processBatch() error {
}
defer s.mu.Unlock()
var batchInputs []int32
var batch input.Batch
var options input.Options
for i, seq := range s.seqs {
if seq == nil {
@@ -396,17 +395,17 @@ func (s *Server) processBatch() error {
}
}
batchInputs = append(batchInputs, inp.Token)
options.Inputs = append(options.Inputs, inp.Token)
if inp.Multimodal != nil {
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
options.Multimodal = append(options.Multimodal, input.MultimodalIndex{Index: len(options.Inputs) - 1, Multimodal: inp.Multimodal})
}
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
batch.Sequences = append(batch.Sequences, seq.cache.Id)
options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
options.Sequences = append(options.Sequences, seq.cache.Id)
seq.iBatch = len(batch.Outputs)
seq.iBatch = len(options.Outputs)
if j+1 == len(seq.inputs) {
batch.Outputs = append(batch.Outputs, int32(len(batchInputs)-1))
options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
}
seq.pendingInputs = append(seq.pendingInputs, inp)
}
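The processBatch hunk above assembles a flat batch before a single Forward call: tokens, positions and sequence ids are appended in lockstep, multimodal data is indexed by its position in Inputs, and only a sequence's final token is marked as an output. Below is a hedged sketch of that accumulation; the Options field names mirror the diff, while seq and buildBatch are hypothetical stand-ins for the runner's internal types.

// Sketch of batch assembly as in processBatch: Inputs, Positions and
// Sequences grow together, and Outputs records the index of each
// sequence's last token within Inputs.
package main

import "fmt"

type MultimodalIndex struct {
	Index      int
	Multimodal any
}

type Options struct {
	Inputs     []int32
	Multimodal []MultimodalIndex
	Positions  []int32
	Sequences  []int
	Outputs    []int32
}

// seq is a hypothetical stand-in for the runner's sequence type.
type seq struct {
	id     int
	cached int     // tokens already in the KV cache
	tokens []int32 // tokens still to be processed
}

func buildBatch(seqs []seq) Options {
	var options Options
	for _, s := range seqs {
		for j, tok := range s.tokens {
			options.Inputs = append(options.Inputs, tok)
			options.Positions = append(options.Positions, int32(s.cached+j))
			options.Sequences = append(options.Sequences, s.id)
			if j+1 == len(s.tokens) {
				options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
			}
		}
	}
	return options
}

func main() {
	opts := buildBatch([]seq{
		{id: 0, cached: 2, tokens: []int32{11, 12}},
		{id: 1, cached: 0, tokens: []int32{21}},
	})
	fmt.Printf("%+v\n", opts)
	// {Inputs:[11 12 21] Multimodal:[] Positions:[2 3 0] Sequences:[0 0 1] Outputs:[1 2]}
}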
@@ -414,14 +413,14 @@ func (s *Server) processBatch() error {
seq.inputs = seq.inputs[len(seq.pendingInputs):]
}
if len(batchInputs) == 0 {
if len(options.Inputs) == 0 {
return nil
}
ctx := s.model.Backend().NewContext()
defer ctx.Close()
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
modelOutput, err := model.Forward(ctx, s.model, options)
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
}
@@ -461,7 +460,7 @@ func (s *Server) processBatch() error {
}
// sample a token
vocabSize := len(logits) / len(batch.Outputs)
vocabSize := len(logits) / len(options.Outputs)
token, err := seq.sampler.Sample(logits[seq.iBatch*vocabSize : (seq.iBatch+1)*vocabSize])
if err != nil {
@@ -562,14 +561,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
}
}
sampler := sample.NewSampler(
req.Options.Temperature,
req.Options.TopK,
req.Options.TopP,
req.Options.MinP,
req.Options.Seed,
grammar,
)
sampler := sample.NewSampler(req.Options, grammar)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.Options.NumPredict,
@@ -678,7 +670,6 @@ func (m *multiLPath) String() string {
}
func (s *Server) loadModel(
ctx context.Context,
mpath string,
params ml.BackendParams,
lpath multiLPath,
@@ -688,7 +679,7 @@ func (s *Server) loadModel(
multiUserCache bool,
) {
var err error
s.model, err = model.New(ctx, mpath, params)
s.model, err = model.New(mpath, params)
if err != nil {
panic(err)
}
@@ -700,7 +691,7 @@ func (s *Server) loadModel(
panic("loras are not yet implemented")
}
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, multiUserCache)
if err != nil {
panic(err)
}
@@ -784,9 +775,6 @@ func Execute(args []string) error {
}
params := ml.BackendParams{
Progress: func(progress float32) {
server.progress = progress
},
NumThreads: *threads,
NumGPULayers: *numGPULayers,
MainGPU: *mainGPU,
@@ -795,13 +783,13 @@ func Execute(args []string) error {
}
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
go server.loadModel(*mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
server.cond = sync.NewCond(&server.mu)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port)

View File

@@ -1,12 +1,14 @@
package sample
import (
"encoding/json"
"errors"
"math"
"math/rand/v2"
"slices"
"sync"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llama"
)
@@ -126,40 +128,65 @@ func (s *Sampler) sample(tokens []token) (token, error) {
return tokens[idx], nil
}
// TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
// SamplerParams contains the validated and normalized parameters for a sampler
type SamplerParams struct {
Temperature float32 `json:"temperature"`
TopK int `json:"top_k"`
TopP float32 `json:"top_p"`
MinP float32 `json:"min_p"`
Seed int `json:"seed"`
}
// UnmarshalJSON implements json.Unmarshaler to handle validation during JSON unmarshaling
func (p *SamplerParams) UnmarshalJSON(data []byte) error {
type rawParams SamplerParams
if err := json.Unmarshal(data, (*rawParams)(p)); err != nil {
return err
}
// Validate and normalize after unmarshaling
if p.Temperature < 0.0 {
p.Temperature = 0.0
}
if p.TopP < 0.0 {
p.TopP = 0.0
}
if p.TopP >= 1.0 {
p.TopP = 1.0
}
if p.MinP < 0.0 {
p.MinP = 0.0
}
if p.MinP >= 1.0 {
p.MinP = 1.0
}
return nil
}
// NewSampler creates a new sampler with the given options
func NewSampler(opts *api.Options, grammar *Grammar) Sampler {
var params SamplerParams
data, _ := json.Marshal(opts)
_ = json.Unmarshal(data, &params)
var rng *rand.Rand
if seed != -1 {
if params.Seed != -1 {
// PCG requires two parameters: sequence and stream
// Use original seed for sequence
sequence := uint64(seed)
sequence := uint64(params.Seed)
// Use golden ratio hash to generate statistically independent seeds
rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
}
if temperature < 0.0 {
temperature = 0.0
}
if topP < 0.0 {
topP = 0.0
}
if topP >= 1.0 {
topP = 1.0
}
if minP < 0.0 {
minP = 0.0
}
if minP >= 1.0 {
minP = 1.0
}
return Sampler{
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
topK: params.TopK,
topP: params.TopP,
minP: params.MinP,
temperature: params.Temperature,
grammar: grammar,
}
}
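A minimal sketch of the round trip the new NewSampler relies on: the caller's options are marshaled to JSON and decoded into SamplerParams, whose UnmarshalJSON clamps out-of-range values. SamplerParams and its clamping rules mirror the hunk; the map standing in for api.Options and the main function are illustrative only.

// Standalone sketch of the validation-on-unmarshal pattern from the hunk
// above: decode into an alias type (to avoid recursive UnmarshalJSON
// calls), then normalize the fields in place.
package main

import (
	"encoding/json"
	"fmt"
)

type SamplerParams struct {
	Temperature float32 `json:"temperature"`
	TopK        int     `json:"top_k"`
	TopP        float32 `json:"top_p"`
	MinP        float32 `json:"min_p"`
	Seed        int     `json:"seed"`
}

// UnmarshalJSON clamps negative values to 0 and probabilities above 1 to 1,
// matching the conditions in the diff.
func (p *SamplerParams) UnmarshalJSON(data []byte) error {
	type rawParams SamplerParams
	if err := json.Unmarshal(data, (*rawParams)(p)); err != nil {
		return err
	}
	if p.Temperature < 0.0 {
		p.Temperature = 0.0
	}
	if p.TopP < 0.0 {
		p.TopP = 0.0
	}
	if p.TopP >= 1.0 {
		p.TopP = 1.0
	}
	if p.MinP < 0.0 {
		p.MinP = 0.0
	}
	if p.MinP >= 1.0 {
		p.MinP = 1.0
	}
	return nil
}

func main() {
	// Any value with matching json tags works as the source; here a map
	// stands in for api.Options.
	opts := map[string]any{"temperature": -0.3, "top_k": 40, "top_p": 1.7, "min_p": 0.05, "seed": 42}
	data, _ := json.Marshal(opts)

	var params SamplerParams
	_ = json.Unmarshal(data, &params)
	fmt.Printf("%+v\n", params) // {Temperature:0 TopK:40 TopP:1 MinP:0.05 Seed:42}
}

The same hunk also seeds math/rand/v2's PCG generator with two values derived from one user seed (sequence and sequence^0x9E3779B9), so a fixed seed keeps sampling reproducible while the generator still receives two distinct parameters.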

View File

@@ -16,7 +16,7 @@ func BenchmarkWeightedSampler(b *testing.B) {
logits[i] = float32(rand.Float64()*10 - 5)
}
sampler := NewSampler(0.8, 0, 0, 0, 42, nil)
sampler := NewSampler(createSamplerOptions(0.8, 0, 0, 0, 42), nil)
b.ResetTimer()
for b.Loop() {
sampler.Sample(logits)
@@ -49,7 +49,7 @@ func BenchmarkWeightedSampler(b *testing.B) {
for _, tc := range configs {
b.Run("Config"+tc.name, func(b *testing.B) {
sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed, nil)
sampler := NewSampler(createSamplerOptions(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed), nil)
sampler.Sample(logits)
b.ResetTimer()
@@ -62,7 +62,7 @@ func BenchmarkWeightedSampler(b *testing.B) {
// Test with combined transforms separately - topK influences performance greatly
b.Run("TransformCombined", func(b *testing.B) {
sampler := NewSampler(0.8, 50, 0.9, 0.05, 42, nil)
sampler := NewSampler(createSamplerOptions(0.8, 50, 0.9, 0.05, 42), nil)
b.ResetTimer()
for b.Loop() {
@@ -81,7 +81,7 @@ func BenchmarkGreedySampler(b *testing.B) {
logits[i] = float32(rand.Float64()*10 - 5)
}
sampler := NewSampler(0, -1, 0, 0, -1, nil)
sampler := NewSampler(createSamplerOptions(0, -1, 0, 0, -1), nil)
b.ResetTimer()
for b.Loop() {

View File

@@ -4,11 +4,23 @@ import (
"math"
"math/rand/v2"
"testing"
"github.com/ollama/ollama/api"
)
func createSamplerOptions(temperature float32, topK int, topP float32, minP float32, seed int) *api.Options {
return &api.Options{
Temperature: temperature,
TopK: topK,
TopP: topP,
MinP: minP,
Seed: seed,
}
}
func TestWeighted(t *testing.T) {
logits := []float32{-10, 3, -10, -10}
sampler := NewSampler(0, 0, 0, 0, 0, nil)
sampler := NewSampler(createSamplerOptions(0, 0, 0, 0, 0), nil)
got, err := sampler.Sample(logits)
if err != nil {
t.Error(err)
@@ -20,7 +32,7 @@ func TestWeighted(t *testing.T) {
}
logits = []float32{-100, -10, 0, 10}
sampler = NewSampler(0, 0, 0, 0, 0, nil)
sampler = NewSampler(createSamplerOptions(0, 0, 0, 0, 0), nil)
got, err = sampler.Sample(logits)
if err != nil {
t.Error(err)
@@ -34,7 +46,7 @@ func TestWeighted(t *testing.T) {
// Test very high p
logits = []float32{1.0, 0.9999999999999999, 0.5, 0.1}
// Use extremely small topP to filter out all tokens
sampler = NewSampler(1.0, 0, 1e-10, 0, 0, nil)
sampler = NewSampler(createSamplerOptions(1.0, 0, 1e-10, 0, 0), nil)
got, err = sampler.Sample(logits)
if err != nil {
t.Error(err)
@@ -47,7 +59,7 @@ func TestWeighted(t *testing.T) {
}
logits = []float32{float32(math.NaN()), float32(math.NaN()), float32(math.NaN())}
sampler = NewSampler(1, 0, 0.95, 0.05, 0, nil)
sampler = NewSampler(createSamplerOptions(1, 0, 0.95, 0.05, 0), nil)
got, err = sampler.Sample(logits)
if err == nil {
t.Errorf("expected error, got %d", got)
@@ -57,8 +69,8 @@ func TestWeighted(t *testing.T) {
func BenchmarkSample(b *testing.B) {
samplers := map[string]Sampler{
"Greedy": NewSampler(0, 0, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
"Weighted": NewSampler(0.5, 10, 0.9, 0.2, -1, nil),
"Greedy": NewSampler(createSamplerOptions(0, 0, 0, 0, 0), nil), // Use NewSampler with temp=0 for greedy
"Weighted": NewSampler(createSamplerOptions(0.5, 10, 0.9, 0.2, -1), nil),
}
// Generate random logits for benchmarking

View File

@@ -29,9 +29,8 @@ import (
const maxRetries = 6
var (
errMaxRetriesExceeded = errors.New("max retries exceeded")
errPartStalled = errors.New("part stalled")
errMaxRedirectsExceeded = errors.New("maximum redirects exceeded (10) for directURL")
errMaxRetriesExceeded = errors.New("max retries exceeded")
errPartStalled = errors.New("part stalled")
)
var blobDownloadManager sync.Map
@@ -237,7 +236,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
if len(via) > 10 {
return errMaxRedirectsExceeded
return errors.New("maximum redirects exceeded (10) for directURL")
}
// if the hostname is the same, allow the redirect

View File

@@ -35,7 +35,6 @@ var (
errCapabilityCompletion = errors.New("completion")
errCapabilityTools = errors.New("tools")
errCapabilityInsert = errors.New("insert")
errInsecureProtocol = errors.New("insecure protocol http")
)
type Capability string
@@ -480,7 +479,7 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
fn(api.ProgressResponse{Status: "retrieving manifest"})
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
return errInsecureProtocol
return errors.New("insecure protocol http")
}
manifest, _, err := GetManifest(mp)
@@ -544,7 +543,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
}
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
return errInsecureProtocol
return errors.New("insecure protocol http")
}
fn(api.ProgressResponse{Status: "pulling manifest"})

View File

@@ -59,11 +59,6 @@ var (
// ErrCached is passed to [Trace.PushUpdate] when a layer already
// exists. It is a non-fatal error and is never returned by [Registry.Push].
ErrCached = errors.New("cached")
// ErrIncomplete is returned by [Registry.Pull] when a model pull was
// incomplete due to one or more layer download failures. Users that
// want specific errors should use [WithTrace].
ErrIncomplete = errors.New("incomplete")
)
// Defaults
@@ -276,19 +271,8 @@ func DefaultRegistry() (*Registry, error) {
func UserAgent() string {
buildinfo, _ := debug.ReadBuildInfo()
version := buildinfo.Main.Version
if version == "(devel)" {
// When using `go run .` the version is "(devel)". This is seen
// as an invalid version by ollama.com and so it defaults to
// "needs upgrade" for some requests, such as pulls. These
// checks can be skipped by using the special version "v0.0.0",
// so we set it to that here.
version = "v0.0.0"
}
return fmt.Sprintf("ollama/%s (%s %s) Go/%s",
version,
buildinfo.Main.Version,
runtime.GOARCH,
runtime.GOOS,
runtime.Version(),
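The UserAgent hunk above normalizes the module version reported by `go run .` ("(devel)"), which ollama.com treats as out of date, to the sentinel "v0.0.0" before building the User-Agent string. A small sketch of that normalization, with the surrounding buildinfo lookup omitted:

// Sketch of the "(devel)" normalization from the hunk above; the format
// string mirrors the diff, and the exact output depends on the host.
package main

import (
	"fmt"
	"runtime"
)

func userAgent(version string) string {
	if version == "(devel)" {
		version = "v0.0.0"
	}
	return fmt.Sprintf("ollama/%s (%s %s) Go/%s",
		version, runtime.GOARCH, runtime.GOOS, runtime.Version())
}

func main() {
	fmt.Println(userAgent("(devel)")) // e.g. ollama/v0.0.0 (amd64 linux) Go/go1.22.x
}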
@@ -434,14 +418,13 @@ func canRetry(err error) bool {
//
// It always calls update with a nil error.
type trackingReader struct {
l *Layer
r io.Reader
update func(l *Layer, n int64, err error)
r io.Reader
n *atomic.Int64
}
func (r *trackingReader) Read(p []byte) (n int, err error) {
n, err = r.r.Read(p)
r.update(r.l, int64(n), nil)
r.n.Add(int64(n))
return
}
@@ -479,50 +462,43 @@ func (r *Registry) Pull(ctx context.Context, name string) error {
// Send initial layer trace events to allow clients to have an
// understanding of work to be done before work starts.
var expected int64
t := traceFromContext(ctx)
for _, l := range layers {
t.update(l, 0, nil)
expected += l.Size
}
var received atomic.Int64
var g errgroup.Group
g.SetLimit(r.maxStreams())
for _, l := range layers {
info, err := c.Get(l.Digest)
if err == nil && info.Size == l.Size {
received.Add(l.Size)
t.update(l, l.Size, ErrCached)
continue
}
var wg sync.WaitGroup
chunked, err := c.Chunked(l.Digest, l.Size)
if err != nil {
t.update(l, 0, err)
continue
}
// TODO(bmizerany): fix this unbounded use of defer
defer chunked.Close()
var progress atomic.Int64
for cs, err := range r.chunksums(ctx, name, l) {
if err != nil {
// Chunksum stream interrupted. Note in trace
// log and let in-flight downloads complete.
// This will naturally trigger ErrIncomplete
// since received < expected bytes.
t.update(l, 0, err)
break
// Bad chunksums response, update tracing
// clients and then bail.
t.update(l, progress.Load(), err)
return err
}
wg.Add(1)
g.Go(func() (err error) {
defer func() {
if err == nil {
received.Add(cs.Chunk.Size())
} else {
if err != nil {
err = fmt.Errorf("error downloading %s: %w", cs.Digest.Short(), err)
}
wg.Done()
t.update(l, progress.Load(), err)
}()
req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil)
@@ -536,35 +512,25 @@ func (r *Registry) Pull(ctx context.Context, name string) error {
}
defer res.Body.Close()
body := &trackingReader{l: l, r: res.Body, update: t.update}
// Count bytes towards progress, as they
// arrive, so that our bytes piggyback other
// chunk updates on completion.
//
// This tactic is enough to show "smooth"
// progress given the current CLI client. In
// the near future, the server should report
// download rate since it knows better than a
// client that is measuring rate based on
// wall-clock time-since-last-update.
body := &trackingReader{r: res.Body, n: &progress}
return chunked.Put(cs.Chunk, cs.Digest, body)
})
}
// Close writer immediately after downloads finish, not at Pull
// exit. Using defer would keep file descriptors open until all
// layers complete, potentially exhausting system limits with
// many layers.
//
// The WaitGroup tracks when all chunks finish downloading,
// allowing precise writer closure in a background goroutine.
// Each layer briefly uses one extra goroutine while at most
// maxStreams()-1 chunks download in parallel.
//
// This caps file descriptors at maxStreams() instead of
// growing with layer count.
g.Go(func() error {
wg.Wait()
chunked.Close()
return nil
})
}
if err := g.Wait(); err != nil {
return err
}
if received.Load() != expected {
return fmt.Errorf("%w: received %d/%d", ErrIncomplete, received.Load(), expected)
}
md := blob.DigestFromBytes(m.Data)
if err := blob.PutBytes(c, md, m.Data); err != nil {
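The Pull hunk above bounds concurrency with an errgroup and closes each layer's chunked writer as soon as that layer's chunks finish, via a per-layer WaitGroup and one extra goroutine, instead of deferring every close to the end of Pull. Below is a hedged sketch of that cleanup pattern; downloadChunk, writer and the layer list are hypothetical stand-ins for the real chunk download and blob-cache types.

// Sketch of the per-layer writer-close pattern: chunk downloads run in a
// bounded errgroup, a WaitGroup counts this layer's chunks, and a helper
// goroutine closes the writer once they are done.
package main

import (
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

type writer struct{ name string }

func (w *writer) Close() error { fmt.Println("closed", w.name); return nil }

func downloadChunk(layer string, chunk int) error {
	fmt.Printf("downloaded %s chunk %d\n", layer, chunk)
	return nil
}

func main() {
	var g errgroup.Group
	g.SetLimit(4) // at most maxStreams() chunk downloads in flight

	for _, l := range []string{"layer-a", "layer-b"} {
		w := &writer{name: l}

		var wg sync.WaitGroup
		for chunk := 0; chunk < 3; chunk++ {
			wg.Add(1)
			g.Go(func() error {
				defer wg.Done()
				return downloadChunk(l, chunk)
			})
		}

		// Close this layer's writer once its own chunks are done, without
		// waiting for the other layers.
		g.Go(func() error {
			wg.Wait()
			return w.Close()
		})
	}

	if err := g.Wait(); err != nil {
		fmt.Println("pull failed:", err)
	}
}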
@@ -791,12 +757,15 @@ func (r *Registry) chunksums(ctx context.Context, name string, l *Layer) iter.Se
}
blobURL := res.Header.Get("Content-Location")
var size int64
s := bufio.NewScanner(res.Body)
s.Split(bufio.ScanWords)
for {
if !s.Scan() {
if s.Err() != nil {
yield(chunksum{}, s.Err())
} else if size != l.Size {
yield(chunksum{}, fmt.Errorf("size mismatch: layer size %d != sum of chunks %d", size, l.Size))
}
return
}
@@ -820,6 +789,12 @@ func (r *Registry) chunksums(ctx context.Context, name string, l *Layer) iter.Se
return
}
size += chunk.Size()
if size > l.Size {
yield(chunksum{}, fmt.Errorf("chunk size %d exceeds layer size %d", size, l.Size))
return
}
cs := chunksum{
URL: blobURL,
Chunk: chunk,
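The chunksums hunk above adds size bookkeeping: chunk sizes are summed as they stream in, the running total may never exceed the layer size, and at end of stream it must match exactly. A standalone sketch of that validation; chunk and layerSize are hypothetical stand-ins for the real Chunk and Layer types.

// Sketch of the running-total validation added in the hunk above.
package main

import "fmt"

type chunk struct{ start, end int64 }

func (c chunk) Size() int64 { return c.end - c.start + 1 }

func validateChunks(chunks []chunk, layerSize int64) error {
	var size int64
	for _, c := range chunks {
		size += c.Size()
		if size > layerSize {
			return fmt.Errorf("chunk size %d exceeds layer size %d", size, layerSize)
		}
	}
	if size != layerSize {
		return fmt.Errorf("size mismatch: layer size %d != sum of chunks %d", layerSize, size)
	}
	return nil
}

func main() {
	fmt.Println(validateChunks([]chunk{{0, 4}, {5, 9}}, 10)) // <nil>
	fmt.Println(validateChunks([]chunk{{0, 4}, {5, 9}}, 12)) // size mismatch: layer size 12 != sum of chunks 10
}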

View File

@@ -25,28 +25,6 @@ import (
"github.com/ollama/ollama/server/internal/testutil"
)
func ExampleRegistry_cancelOnFirstError() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ctx = WithTrace(ctx, &Trace{
Update: func(l *Layer, n int64, err error) {
if err != nil {
// Discontinue pulling layers if there is an
// error instead of continuing to pull more
// data.
cancel()
}
},
})
var r Registry
if err := r.Pull(ctx, "model"); err != nil {
// panic for demo purposes
panic(err)
}
}
func TestManifestMarshalJSON(t *testing.T) {
// All manifests should contain an "empty" config object.
var m Manifest
@@ -835,13 +813,8 @@ func TestPullChunksums(t *testing.T) {
)
err := rc.Pull(ctx, "test")
check(err)
wantReads := []int64{
0, // initial signaling of layer pull starting
3, // first chunk read
2, // second chunk read
}
if !slices.Equal(reads, wantReads) {
t.Errorf("reads = %v; want %v", reads, wantReads)
if !slices.Equal(reads, []int64{0, 3, 5}) {
t.Errorf("reads = %v; want %v", reads, []int64{0, 3, 5})
}
mw, err := rc.Resolve(t.Context(), "test")

View File

@@ -200,7 +200,7 @@ type params struct {
//
// Unfortunately, this API was designed to be a bit awkward. Stream is
// defined to default to true if not present, so we need a way to check
// if the client decisively set it to false. So, we use a pointer to a
// if the client decisively it to false. So, we use a pointer to a
// bool. Gross.
//
// Use [stream()] to get the correct value for this field.
@@ -280,17 +280,17 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
progress := make(map[*ollama.Layer]int64)
progressCopy := make(map[*ollama.Layer]int64, len(progress))
flushProgress := func() {
pushUpdate := func() {
defer maybeFlush()
// TODO(bmizerany): Flushing every layer in one update doesn't
// scale well. We could flush only the modified layers or track
// the full download. Needs further consideration, though it's
// fine for now.
// TODO(bmizerany): This scales poorly with more layers due to
// needing to flush out them all in one big update. We _could_
// just flush on the changed ones, or just track the whole
// download. Needs more thought. This is fine for now.
mu.Lock()
maps.Copy(progressCopy, progress)
mu.Unlock()
for l, n := range progressCopy {
for l, n := range progress {
enc.Encode(progressUpdateJSON{
Digest: l.Digest,
Total: l.Size,
@@ -298,26 +298,19 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
})
}
}
defer flushProgress()
t := time.NewTicker(1000 * time.Hour) // "unstarted" timer
t := time.NewTicker(time.Hour) // "unstarted" timer
start := sync.OnceFunc(func() {
flushProgress() // flush initial state
pushUpdate()
t.Reset(100 * time.Millisecond)
})
ctx := ollama.WithTrace(r.Context(), &ollama.Trace{
Update: func(l *ollama.Layer, n int64, err error) {
if n > 0 {
// Block flushing progress updates until every
// layer is accounted for. Clients depend on a
// complete model size to calculate progress
// correctly; if they use an incomplete total,
// progress indicators would erratically jump
// as new layers are registered.
start()
start() // flush initial state
}
mu.Lock()
progress[l] += n
progress[l] = n
mu.Unlock()
},
})
@@ -330,9 +323,9 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
for {
select {
case <-t.C:
flushProgress()
pushUpdate()
case err := <-done:
flushProgress()
pushUpdate()
if err != nil {
var status string
if errors.Is(err, ollama.ErrModelNotFound) {
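The handlePull hunk above uses an "unstarted" ticker: progress accumulates under a mutex, and the first byte that arrives arms the ticker (via sync.OnceFunc) so snapshots flush every 100ms until the pull completes. Below is a hedged, self-contained sketch of that pattern; the update callback, digest keys and timings stand in for the ollama.Trace wiring and are illustrative only.

// Sketch of the armed-on-first-byte ticker pattern from handlePull.
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var mu sync.Mutex
	progress := make(map[string]int64)

	flushProgress := func() {
		mu.Lock()
		snapshot := make(map[string]int64, len(progress))
		for k, v := range progress {
			snapshot[k] = v
		}
		mu.Unlock()
		fmt.Println("progress:", snapshot)
	}

	t := time.NewTicker(1000 * time.Hour) // "unstarted" until the first byte arrives
	defer t.Stop()
	start := sync.OnceFunc(func() {
		flushProgress() // flush initial state
		t.Reset(100 * time.Millisecond)
	})

	// update stands in for the Trace.Update callback in the hunk.
	update := func(digest string, n int64) {
		if n > 0 {
			start()
		}
		mu.Lock()
		progress[digest] += n
		mu.Unlock()
	}

	done := make(chan struct{})
	go func() {
		for i := 0; i < 5; i++ {
			update("sha256:abc", 1<<20)
			time.Sleep(60 * time.Millisecond)
		}
		close(done)
	}()

	for {
		select {
		case <-t.C:
			flushProgress()
		case <-done:
			flushProgress()
			return
		}
	}
}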

View File

@@ -31,10 +31,9 @@ const (
var (
ErrInvalidImageFormat = errors.New("invalid image format")
ErrInvalidDigestFormat = errors.New("invalid digest format")
ErrInvalidProtocol = errors.New("invalid protocol scheme")
ErrInsecureProtocol = errors.New("insecure protocol http")
ErrModelPathInvalid = errors.New("invalid model path")
ErrInvalidDigestFormat = errors.New("invalid digest format")
)
func ParseModelPath(name string) ModelPath {
@@ -74,6 +73,8 @@ func ParseModelPath(name string) ModelPath {
return mp
}
var errModelPathInvalid = errors.New("invalid model path")
func (mp ModelPath) GetNamespaceRepository() string {
return fmt.Sprintf("%s/%s", mp.Namespace, mp.Repository)
}

View File

@@ -777,7 +777,7 @@ func (s *Server) ShowHandler(c *gin.Context) {
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
name := model.ParseName(req.Model)
if !name.IsValid() {
return nil, ErrModelPathInvalid
return nil, errModelPathInvalid
}
name, err := getExistingName(name)
if err != nil {

View File

@@ -711,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return []discover.GpuInfo{g}
@@ -727,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
// Now try all the GPUs
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return sgl
@@ -750,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.Gp
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
@@ -825,7 +825,7 @@ func (s *Scheduler) expireRunner(model *Model) {
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx)
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil