Compare commits
1 Commits
v0.7.1
...
brucemacd/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a2cd7b48a |
@@ -51,8 +51,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
|
|||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
|
||||||
|
|
||||||
add_compile_definitions(NDEBUG)
|
|
||||||
|
|
||||||
set(GGML_CPU ON)
|
set(GGML_CPU ON)
|
||||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
|
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
|
||||||
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
|
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
|
||||||
|
|||||||
@@ -405,7 +405,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||||||
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
|
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
|
||||||
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
|
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
|
||||||
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
|
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
|
||||||
- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
|
|
||||||
|
|
||||||
### Cloud
|
### Cloud
|
||||||
|
|
||||||
|
|||||||
56
cmd/cmd.go
56
cmd/cmd.go
@@ -747,38 +747,11 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
|
|||||||
case float64:
|
case float64:
|
||||||
v = fmt.Sprintf("%g", vData)
|
v = fmt.Sprintf("%g", vData)
|
||||||
case []any:
|
case []any:
|
||||||
targetWidth := 10 // Small width where we are displaying the data in a column
|
n := 3
|
||||||
|
if len(vData) < n {
|
||||||
var itemsToShow int
|
n = len(vData)
|
||||||
totalWidth := 1 // Start with 1 for opening bracket
|
|
||||||
|
|
||||||
// Find how many we can fit
|
|
||||||
for i := range vData {
|
|
||||||
itemStr := fmt.Sprintf("%v", vData[i])
|
|
||||||
width := runewidth.StringWidth(itemStr)
|
|
||||||
|
|
||||||
// Add separator width (", ") for all items except the first
|
|
||||||
if i > 0 {
|
|
||||||
width += 2
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if adding this item would exceed our width limit
|
|
||||||
if totalWidth+width > targetWidth && i > 0 {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
totalWidth += width
|
|
||||||
itemsToShow++
|
|
||||||
}
|
|
||||||
|
|
||||||
// Format the output
|
|
||||||
if itemsToShow < len(vData) {
|
|
||||||
v = fmt.Sprintf("%v", vData[:itemsToShow])
|
|
||||||
v = strings.TrimSuffix(v, "]")
|
|
||||||
v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
|
|
||||||
} else {
|
|
||||||
v = fmt.Sprintf("%v", vData)
|
|
||||||
}
|
}
|
||||||
|
v = fmt.Sprintf("%v", vData[:n])
|
||||||
default:
|
default:
|
||||||
v = fmt.Sprintf("%T", vData)
|
v = fmt.Sprintf("%T", vData)
|
||||||
}
|
}
|
||||||
@@ -799,19 +772,10 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
|
|||||||
|
|
||||||
head := func(s string, n int) (rows [][]string) {
|
head := func(s string, n int) (rows [][]string) {
|
||||||
scanner := bufio.NewScanner(strings.NewReader(s))
|
scanner := bufio.NewScanner(strings.NewReader(s))
|
||||||
count := 0
|
for scanner.Scan() && (len(rows) < n || n < 0) {
|
||||||
for scanner.Scan() {
|
if text := scanner.Text(); text != "" {
|
||||||
text := strings.TrimSpace(scanner.Text())
|
rows = append(rows, []string{"", strings.TrimSpace(text)})
|
||||||
if text == "" {
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
count++
|
|
||||||
if n < 0 || count <= n {
|
|
||||||
rows = append(rows, []string{"", text})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if n >= 0 && count > n {
|
|
||||||
rows = append(rows, []string{"", "..."})
|
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -1236,11 +1200,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := client.Heartbeat(cmd.Context()); err != nil {
|
if err := client.Heartbeat(cmd.Context()); err != nil {
|
||||||
if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
|
if !strings.Contains(err.Error(), " refused") {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := startApp(cmd.Context(), client); err != nil {
|
if err := startApp(cmd.Context(), client); err != nil {
|
||||||
return fmt.Errorf("ollama server not responding - %w", err)
|
return errors.New("could not connect to ollama app, is it running?")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@@ -1318,7 +1282,7 @@ func NewCLI() *cobra.Command {
|
|||||||
}
|
}
|
||||||
|
|
||||||
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
||||||
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
|
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
|
||||||
|
|
||||||
showCmd := &cobra.Command{
|
showCmd := &cobra.Command{
|
||||||
Use: "show MODEL",
|
Use: "show MODEL",
|
||||||
|
|||||||
@@ -225,7 +225,6 @@ Weigh anchor!
|
|||||||
System
|
System
|
||||||
You are a pirate!
|
You are a pirate!
|
||||||
Ahoy, matey!
|
Ahoy, matey!
|
||||||
...
|
|
||||||
|
|
||||||
`
|
`
|
||||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||||
|
|||||||
@@ -4,27 +4,17 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unsafe"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"golang.org/x/sys/windows"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
Installer = "OllamaSetup.exe"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func startApp(ctx context.Context, client *api.Client) error {
|
func startApp(ctx context.Context, client *api.Client) error {
|
||||||
if len(isProcRunning(Installer)) > 0 {
|
// log.Printf("XXX Attempting to find and start ollama app")
|
||||||
return fmt.Errorf("upgrade in progress...")
|
|
||||||
}
|
|
||||||
AppName := "ollama app.exe"
|
AppName := "ollama app.exe"
|
||||||
exe, err := os.Executable()
|
exe, err := os.Executable()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -66,41 +56,3 @@ func startApp(ctx context.Context, client *api.Client) error {
|
|||||||
}
|
}
|
||||||
return waitForServer(ctx, client)
|
return waitForServer(ctx, client)
|
||||||
}
|
}
|
||||||
|
|
||||||
func isProcRunning(procName string) []uint32 {
|
|
||||||
pids := make([]uint32, 2048)
|
|
||||||
var ret uint32
|
|
||||||
if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
|
|
||||||
slog.Debug("failed to check for running installers", "error", err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
pids = pids[:ret]
|
|
||||||
var matches []uint32
|
|
||||||
for _, pid := range pids {
|
|
||||||
if pid == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
defer windows.CloseHandle(hProcess)
|
|
||||||
var module windows.Handle
|
|
||||||
var cbNeeded uint32
|
|
||||||
cb := (uint32)(unsafe.Sizeof(module))
|
|
||||||
if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
var sz uint32 = 1024 * 8
|
|
||||||
moduleName := make([]uint16, sz)
|
|
||||||
cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
|
|
||||||
if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
|
|
||||||
if strings.EqualFold(exeFile, procName) {
|
|
||||||
matches = append(matches, pid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return matches
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -53,11 +53,8 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, sv := range t.SpecialVocabulary {
|
for _, sv := range t.SpecialVocabulary {
|
||||||
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
|
|
||||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
||||||
if len(sv.IDs) > 0 {
|
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
|
||||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return kv
|
return kv
|
||||||
|
|||||||
@@ -139,8 +139,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
|
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
||||||
strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
|
|
||||||
if !p.skipRepack {
|
if !p.skipRepack {
|
||||||
t.SetRepacker(p.repack)
|
t.SetRepacker(p.repack)
|
||||||
}
|
}
|
||||||
@@ -182,9 +181,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
|
|||||||
}
|
}
|
||||||
|
|
||||||
var heads uint32
|
var heads uint32
|
||||||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
|
if strings.HasSuffix(name, "attn_q.weight") {
|
||||||
heads = p.NumAttentionHeads
|
heads = p.NumAttentionHeads
|
||||||
} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
|
} else if strings.HasSuffix(name, "attn_k.weight") {
|
||||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
||||||
|
|||||||
@@ -94,9 +94,7 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|||||||
var out []*ggml.Tensor
|
var out []*ggml.Tensor
|
||||||
var text []Tensor
|
var text []Tensor
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
|
if t.Name() == "v.position_embd.gate" {
|
||||||
text = append(text, t)
|
|
||||||
} else if t.Name() == "v.position_embd.gate" {
|
|
||||||
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
||||||
tt := t.Clone()
|
tt := t.Clone()
|
||||||
tt.SetRepacker(m.repack(name))
|
tt.SetRepacker(m.repack(name))
|
||||||
@@ -107,21 +105,23 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|||||||
WriterTo: tt,
|
WriterTo: tt,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
} else {
|
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
||||||
if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
t.SetRepacker(m.repack(t.Name()))
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
|
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
}
|
|
||||||
|
|
||||||
out = append(out, &ggml.Tensor{
|
out = append(out, &ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
WriterTo: t,
|
WriterTo: t,
|
||||||
})
|
})
|
||||||
|
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
||||||
|
out = append(out, &ggml.Tensor{
|
||||||
|
Name: t.Name(),
|
||||||
|
Kind: t.Kind(),
|
||||||
|
Shape: t.Shape(),
|
||||||
|
WriterTo: t,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
text = append(text, t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,35 +137,16 @@ func (m *mllamaModel) repack(name string) Repacker {
|
|||||||
|
|
||||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||||
|
|
||||||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
|
t, err = tensor.Tanh(t)
|
||||||
heads := m.VisionModel.AttentionHeads
|
if err != nil {
|
||||||
if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
return nil, err
|
||||||
return nil, err
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.T(0, 2, 1, 3); err != nil {
|
if name == "v.position_embd.gate" {
|
||||||
return nil, err
|
t, err = tensor.Sub(float32(1), t)
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.Reshape(dims...); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.Transpose(); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
t, err = tensor.Tanh(t)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if name == "v.position_embd.gate" {
|
|
||||||
t, err = tensor.Sub(float32(1), t)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
t = tensor.Materialize(t)
|
t = tensor.Materialize(t)
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
|
|||||||
}
|
}
|
||||||
t.Cleanup(func() { r.Close() })
|
t.Cleanup(func() { r.Close() })
|
||||||
|
|
||||||
m, err := ggml.Decode(r, -1)
|
m, _, err := ggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
|
|||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
m, err := ggml.Decode(r, -1)
|
m, _, err := ggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,7 +110,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
||||||
// noop
|
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
} else {
|
} else {
|
||||||
@@ -172,34 +171,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
|
|
||||||
} else if err != nil {
|
|
||||||
return nil, err
|
|
||||||
} else {
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
var p map[string]json.RawMessage
|
|
||||||
if err := json.NewDecoder(f).Decode(&p); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, st := range specialTokenTypes {
|
|
||||||
if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
|
|
||||||
var ids []int32
|
|
||||||
if err := json.Unmarshal(bts, &ids); err != nil {
|
|
||||||
// value is not a list so the existing ID is used
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
|
|
||||||
return sv.Type == st
|
|
||||||
}); i >= 0 {
|
|
||||||
t.SpecialVocabulary[i].IDs = ids
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return t, nil
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -309,9 +280,6 @@ type SpecialVocabulary struct {
|
|||||||
ID int
|
ID int
|
||||||
Content string
|
Content string
|
||||||
AddToken bool
|
AddToken bool
|
||||||
|
|
||||||
// IDs is populated by generation_config.json
|
|
||||||
IDs []int32
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sv SpecialVocabulary) Key() string {
|
func (sv SpecialVocabulary) Key() string {
|
||||||
|
|||||||
@@ -247,67 +247,6 @@ func TestParseTokenizer(t *testing.T) {
|
|||||||
Pre: "default",
|
Pre: "default",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "generation config eos token ids",
|
|
||||||
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
|
|
||||||
"tokenizer.json": strings.NewReader(`{
|
|
||||||
"added_tokens": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"content": "<bos>",
|
|
||||||
"special": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"content": "<eos>",
|
|
||||||
"special": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"content": "<eot>",
|
|
||||||
"special": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"content": "<eom>",
|
|
||||||
"special": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"model": {
|
|
||||||
"vocab": {
|
|
||||||
"<bos>": 0,
|
|
||||||
"<eos>": 1,
|
|
||||||
"<eot>": 2,
|
|
||||||
"<eom>": 3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}`),
|
|
||||||
"tokenizer_config.json": strings.NewReader(`{
|
|
||||||
"add_bos_token": true,
|
|
||||||
"add_eos_token": false,
|
|
||||||
"bos_token": "<bos>",
|
|
||||||
"eos_token": "<eos>"
|
|
||||||
}`),
|
|
||||||
"generation_config.json": strings.NewReader(`{
|
|
||||||
"bos_token_id": 0,
|
|
||||||
"eos_token_id": [1, 2, 3]
|
|
||||||
}`),
|
|
||||||
}),
|
|
||||||
specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
|
|
||||||
want: &Tokenizer{
|
|
||||||
Vocabulary: &Vocabulary{
|
|
||||||
Model: "gpt2",
|
|
||||||
Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
|
|
||||||
Scores: []float32{0, 1, 2, 3},
|
|
||||||
Types: []int32{3, 3, 3, 3},
|
|
||||||
},
|
|
||||||
SpecialVocabulary: []*SpecialVocabulary{
|
|
||||||
{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
|
|
||||||
{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
|
|
||||||
},
|
|
||||||
Pre: "default",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range cases {
|
for _, tt := range cases {
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"math"
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -15,7 +16,6 @@ import (
|
|||||||
type GGML struct {
|
type GGML struct {
|
||||||
container
|
container
|
||||||
model
|
model
|
||||||
Length int64
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type model interface {
|
type model interface {
|
||||||
@@ -387,12 +387,12 @@ func DetectContentType(b []byte) string {
|
|||||||
//
|
//
|
||||||
// It collects array values for arrays with a size less than or equal to
|
// It collects array values for arrays with a size less than or equal to
|
||||||
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
||||||
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||||
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
||||||
|
|
||||||
var magic uint32
|
var magic uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var c container
|
var c container
|
||||||
@@ -402,25 +402,24 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
|||||||
case FILE_MAGIC_GGUF_BE:
|
case FILE_MAGIC_GGUF_BE:
|
||||||
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
||||||
default:
|
default:
|
||||||
return nil, errors.New("invalid file magic")
|
return nil, 0, errors.New("invalid file magic")
|
||||||
}
|
}
|
||||||
|
|
||||||
model, err := c.Decode(rs)
|
model, err := c.Decode(rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// final model type
|
// final model type
|
||||||
return &GGML{
|
return &GGML{
|
||||||
container: c,
|
container: c,
|
||||||
model: model,
|
model: model,
|
||||||
Length: offset,
|
}, offset, nil
|
||||||
}, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||||
@@ -654,15 +653,24 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
case "qwen25vl":
|
case "qwen25vl":
|
||||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
||||||
|
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
|
||||||
|
temporalPatchSize := uint64(2)
|
||||||
|
|
||||||
numPatches := maxPixels / (patchSize * patchSize)
|
// Calculate max possible patches based on max_pixels
|
||||||
|
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
|
||||||
|
maxWidth := maxPixels / maxHeight
|
||||||
|
maxGridHeight := maxHeight / patchSize
|
||||||
|
maxGridWidth := maxWidth / patchSize
|
||||||
|
// Account for merged patches (2x2 grid)
|
||||||
|
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
|
||||||
|
|
||||||
|
// Calculate graph size based on typical operations in ProcessImage and createPatches
|
||||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
||||||
// Normalized pixels
|
// Normalized pixels
|
||||||
maxPixels*numChannels +
|
maxPixels*numChannels +
|
||||||
// Patches storage (numPatches * channels * patchSize^2)
|
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
|
||||||
numPatches*numChannels*patchSize*patchSize +
|
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
|
||||||
// Self-attention calculations
|
// Self-attention calculations (similar to other architectures)
|
||||||
numPatches*numPatches*headCount +
|
numPatches*numPatches*headCount +
|
||||||
// Additional buffer for processing
|
// Additional buffer for processing
|
||||||
embeddingLength*numPatches)
|
embeddingLength*numPatches)
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) {
|
|||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
ff, err := Decode(r, 0)
|
ff, _, err := Decode(r, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
|
|||||||
}
|
}
|
||||||
testCases := []testCase{
|
testCases := []testCase{
|
||||||
{
|
{
|
||||||
model: "qwen2.5vl",
|
model: "llava:7b",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
model: "llama3.2-vision",
|
model: "llama3.2-vision",
|
||||||
@@ -60,7 +60,6 @@ func TestVisionModels(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestIntegrationSplitBatch(t *testing.T) {
|
func TestIntegrationSplitBatch(t *testing.T) {
|
||||||
skipUnderMinVRAM(t, 6)
|
|
||||||
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
req := api.GenerateRequest{
|
req := api.GenerateRequest{
|
||||||
|
|||||||
@@ -211,9 +211,10 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
|
|||||||
c.curCellRange.max = len(c.cells) - 1
|
c.curCellRange.max = len(c.cells) - 1
|
||||||
}
|
}
|
||||||
|
|
||||||
c.curMask = c.buildMask(ctx)
|
var err error
|
||||||
|
c.curMask, err = c.buildMask(ctx)
|
||||||
|
|
||||||
return nil
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func newRange() cellRange {
|
func newRange() cellRange {
|
||||||
@@ -296,7 +297,7 @@ func roundUp(length, pad int) int {
|
|||||||
// Builds a mask of history x batch indicating whether for each token in the batch the
|
// Builds a mask of history x batch indicating whether for each token in the batch the
|
||||||
// token in the history should apply. This is based on both the sequence and causality (the
|
// token in the history should apply. This is based on both the sequence and causality (the
|
||||||
// position of the history is not ahead of the token in the batch).
|
// position of the history is not ahead of the token in the batch).
|
||||||
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||||
// Align and pad the two dimensions as required by the backend
|
// Align and pad the two dimensions as required by the backend
|
||||||
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
||||||
|
|
||||||
@@ -324,7 +325,10 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
|||||||
mask[i] = float32(math.Inf(-1))
|
mask[i] = float32(math.Inf(-1))
|
||||||
}
|
}
|
||||||
|
|
||||||
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if c.config.MaskDType != ml.DTypeF32 {
|
if c.config.MaskDType != ml.DTypeF32 {
|
||||||
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
||||||
@@ -332,7 +336,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
|||||||
maskTensor = out
|
maskTensor = out
|
||||||
}
|
}
|
||||||
|
|
||||||
return maskTensor
|
return maskTensor, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
||||||
@@ -487,7 +491,12 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
|
|||||||
if !slices.Equal(c.opts.Except, opts.Except) {
|
if !slices.Equal(c.opts.Except, opts.Except) {
|
||||||
c.opts = opts
|
c.opts = opts
|
||||||
if ctx != nil {
|
if ctx != nil {
|
||||||
c.curMask = c.buildMask(ctx)
|
var err error
|
||||||
|
c.curMask, err = c.buildMask(ctx)
|
||||||
|
if err != nil {
|
||||||
|
// This error should never occur because we have previously built a mask with the same shape
|
||||||
|
panic(fmt.Errorf("SetCausal: %w", err))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -643,7 +652,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
|
kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
for i, key := range c.keys {
|
for i, key := range c.keys {
|
||||||
if key == nil {
|
if key == nil {
|
||||||
|
|||||||
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
|
|||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor := context.FromFloatSlice(test.in, test.inShape...)
|
tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
out, _, mask := cache.Get(context)
|
out, _, mask := cache.Get(context)
|
||||||
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
// with window size 4, nothing has slid out of the window yet
|
// with window size 4, nothing has slid out of the window yet
|
||||||
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
// only the latest position has overlapping windows
|
// only the latest position has overlapping windows
|
||||||
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
|||||||
return c.Empty(dtype, shape...)
|
return c.Empty(dtype, shape...)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||||
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||||
|
|
||||||
copy(t.data, s)
|
copy(t.data, s)
|
||||||
|
|
||||||
return t
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||||
f := make([]float32, len(s))
|
f := make([]float32, len(s))
|
||||||
for i := range f {
|
for i := range f {
|
||||||
f[i] = float32(s[i])
|
f[i] = float32(s[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
out := c.FromFloatSlice(f, shape...)
|
out, _ := c.FromFloatSlice(f, shape...)
|
||||||
out.(*testTensor).dtype = ml.DTypeI32
|
out.(*testTensor).dtype = ml.DTypeI32
|
||||||
|
|
||||||
return out
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
|
|||||||
s = append(s, i)
|
s = append(s, i)
|
||||||
}
|
}
|
||||||
|
|
||||||
out := c.FromFloatSlice(s, len(s))
|
out, _ := c.FromFloatSlice(s, len(s))
|
||||||
out.(*testTensor).dtype = dtype
|
out.(*testTensor).dtype = dtype
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
|||||||
|
|
||||||
func (c *testContext) Compute(...ml.Tensor) {}
|
func (c *testContext) Compute(...ml.Tensor) {}
|
||||||
|
|
||||||
func (c *testContext) Reserve() {}
|
func (c *testContext) Reserve() error { return nil }
|
||||||
|
|
||||||
func (c *testContext) MaxGraphNodes() int {
|
func (c *testContext) MaxGraphNodes() int {
|
||||||
return 10
|
return 10
|
||||||
|
|||||||
@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
|
|||||||
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
|
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
|
||||||
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
||||||
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
||||||
cparams.penalty_present = C.float(params.PenaltyPresent)
|
cparams.penalty_present = C.float(params.PenaltyFreq)
|
||||||
cparams.seed = C.uint32_t(params.Seed)
|
cparams.seed = C.uint32_t(params.Seed)
|
||||||
|
|
||||||
grammar := C.CString(params.Grammar)
|
grammar := C.CString(params.Grammar)
|
||||||
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
|
|||||||
defer C.free(unsafe.Pointer(cStr))
|
defer C.free(unsafe.Pointer(cStr))
|
||||||
|
|
||||||
// Allocate buffer for grammar based on schema length but with upper bound
|
// Allocate buffer for grammar based on schema length but with upper bound
|
||||||
maxLen := max(32768, min(1024*1024, len(schema)*4))
|
maxLen := min(1024*1024, len(schema)*4)
|
||||||
buf := make([]byte, maxLen)
|
buf := make([]byte, maxLen)
|
||||||
|
|
||||||
// Call C function to convert schema to grammar
|
// Call C function to convert schema to grammar
|
||||||
@@ -602,7 +602,7 @@ type Grammar struct {
|
|||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
|
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
|
||||||
cGrammar := C.CString(grammar)
|
cGrammar := C.CString(grammar)
|
||||||
defer C.free(unsafe.Pointer(cGrammar))
|
defer C.free(unsafe.Pointer(cGrammar))
|
||||||
|
|
||||||
@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
|
|||||||
cEogTokens[i] = C.uint32_t(token)
|
cEogTokens[i] = C.uint32_t(token)
|
||||||
}
|
}
|
||||||
|
|
||||||
g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
|
g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
|
||||||
if g == nil {
|
if g == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,156 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jesse Gross <jesse@ollama.com>
|
|
||||||
Date: Fri, 18 Apr 2025 15:58:19 -0700
|
|
||||||
Subject: [PATCH] graph memory reporting on failure
|
|
||||||
|
|
||||||
---
|
|
||||||
ggml/include/ggml-alloc.h | 6 ++++++
|
|
||||||
ggml/include/ggml-backend.h | 6 ++++++
|
|
||||||
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
|
|
||||||
ggml/src/ggml-backend.cpp | 10 ++++++++++
|
|
||||||
4 files changed, 56 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
|
||||||
index 2cb150fd..781b1e10 100644
|
|
||||||
--- a/ggml/include/ggml-alloc.h
|
|
||||||
+++ b/ggml/include/ggml-alloc.h
|
|
||||||
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
||||||
|
|
||||||
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
|
||||||
|
|
||||||
+struct ggml_allocr_buffer_status {
|
|
||||||
+ size_t size;
|
|
||||||
+ bool allocated;
|
|
||||||
+};
|
|
||||||
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
|
||||||
+
|
|
||||||
// Utils
|
|
||||||
// Create a buffer and allocate all the tensors in a ggml_context
|
|
||||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
|
||||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
|
||||||
index 778927f6..74e46716 100644
|
|
||||||
--- a/ggml/include/ggml-backend.h
|
|
||||||
+++ b/ggml/include/ggml-backend.h
|
|
||||||
@@ -304,6 +304,12 @@ extern "C" {
|
|
||||||
|
|
||||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
|
|
||||||
+ struct ggml_backend_buffer_status {
|
|
||||||
+ size_t size;
|
|
||||||
+ bool allocated;
|
|
||||||
+ };
|
|
||||||
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
+
|
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
|
||||||
index 5fd379f6..04812990 100644
|
|
||||||
--- a/ggml/src/ggml-alloc.c
|
|
||||||
+++ b/ggml/src/ggml-alloc.c
|
|
||||||
@@ -364,6 +364,7 @@ struct node_alloc {
|
|
||||||
struct ggml_gallocr {
|
|
||||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
|
||||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
|
||||||
+ size_t *buffer_sizes; // [n_buffers]
|
|
||||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
|
||||||
int n_buffers;
|
|
||||||
|
|
||||||
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|
||||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
|
||||||
GGML_ASSERT(galloc->buffers != NULL);
|
|
||||||
|
|
||||||
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
|
||||||
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
|
|
||||||
+
|
|
||||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
|
||||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
|
||||||
|
|
||||||
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|
||||||
ggml_hash_set_free(&galloc->hash_set);
|
|
||||||
free(galloc->hash_values);
|
|
||||||
free(galloc->bufts);
|
|
||||||
+ free(galloc->buffer_sizes);
|
|
||||||
free(galloc->buffers);
|
|
||||||
free(galloc->buf_tallocs);
|
|
||||||
free(galloc->node_allocs);
|
|
||||||
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+ bool success = true;
|
|
||||||
+
|
|
||||||
// reallocate buffers if needed
|
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
||||||
// if the buffer type is used multiple times, we reuse the same buffer
|
|
||||||
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
|
||||||
- if (galloc->buffers[i] == NULL) {
|
|
||||||
+ if (galloc->buffers[i]) {
|
|
||||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
+ } else {
|
|
||||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
||||||
- return false;
|
|
||||||
+ galloc->buffer_sizes[i] = new_size;
|
|
||||||
+ success = false;
|
|
||||||
}
|
|
||||||
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
+ } else {
|
|
||||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
- return true;
|
|
||||||
+ return success;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
|
||||||
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
||||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
|
||||||
}
|
|
||||||
|
|
||||||
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
||||||
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
|
||||||
+
|
|
||||||
+ for (int i = 0; i < buffer_id; i++) {
|
|
||||||
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
|
|
||||||
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
|
|
||||||
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
|
|
||||||
+ // case and we still want to know the attempted size.
|
|
||||||
+
|
|
||||||
+ struct ggml_allocr_buffer_status status = {0, true};
|
|
||||||
+ return status;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
|
|
||||||
+ return status;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
// utils
|
|
||||||
|
|
||||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
|
||||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
|
||||||
index 0ce73a99..be335e8c 100644
|
|
||||||
--- a/ggml/src/ggml-backend.cpp
|
|
||||||
+++ b/ggml/src/ggml-backend.cpp
|
|
||||||
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
|
||||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
||||||
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
||||||
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
+
|
|
||||||
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
|
|
||||||
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
|
|
||||||
+
|
|
||||||
+ return status;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
@@ -1,9 +1,12 @@
|
|||||||
package llm
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"cmp"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"maps"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -82,11 +85,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
var graphOffload uint64
|
var graphOffload uint64
|
||||||
|
|
||||||
// Projectors loaded into GPU0 only
|
// Projectors loaded into GPU0 only
|
||||||
var llamaEngineProjectorWeights uint64
|
var projectorWeights uint64
|
||||||
|
var projectorGraph uint64
|
||||||
// Projectors loaded with output layer
|
|
||||||
var ollamaEngineProjectorWeights uint64
|
|
||||||
var ollamaEngineProjectorGraph uint64
|
|
||||||
|
|
||||||
// Conditional output size on GPU 0
|
// Conditional output size on GPU 0
|
||||||
var memoryLayerOutput uint64
|
var memoryLayerOutput uint64
|
||||||
@@ -111,23 +111,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
||||||
|
|
||||||
for _, projector := range projectors {
|
for _, projector := range projectors {
|
||||||
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
|
weight := projectorMemoryRequirements(projector)
|
||||||
|
projectorWeights += weight
|
||||||
|
|
||||||
// multimodal models require at least 2048 context
|
// multimodal models require at least 2048 context
|
||||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||||
}
|
}
|
||||||
if llamaEngineProjectorWeights == 0 {
|
if projectorWeights == 0 && projectorGraph == 0 {
|
||||||
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
|
projectorWeights, projectorGraph = f.VisionGraphSize()
|
||||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
layers := f.Tensors().GroupLayers()
|
layers := f.Tensors().GroupLayers()
|
||||||
// add one layer worth of memory as a buffer
|
// add one layer (chosing the max layer) worth of memory as a buffer
|
||||||
if blk0, ok := layers["blk.0"]; ok {
|
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
|
||||||
layerSize = blk0.Size()
|
return cmp.Compare(a.Size(), b.Size())
|
||||||
} else {
|
}).Size()
|
||||||
slog.Warn("model missing blk.0 layer size")
|
|
||||||
}
|
|
||||||
|
|
||||||
var kvct string
|
var kvct string
|
||||||
if envconfig.FlashAttention() &&
|
if envconfig.FlashAttention() &&
|
||||||
@@ -165,7 +163,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
graphFullOffload = graphPartialOffload
|
graphFullOffload = graphPartialOffload
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output layer handled at the end if we have space
|
|
||||||
if layer, ok := layers["output_norm"]; ok {
|
if layer, ok := layers["output_norm"]; ok {
|
||||||
memoryLayerOutput += layer.Size()
|
memoryLayerOutput += layer.Size()
|
||||||
}
|
}
|
||||||
@@ -175,7 +172,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
memoryLayerOutput += layer.Size()
|
memoryLayerOutput += layer.Size()
|
||||||
}
|
}
|
||||||
|
|
||||||
gpuZeroOverhead := llamaEngineProjectorWeights
|
// Output layer handled at the end if we have space
|
||||||
|
gpuZeroOverhead := projectorWeights + projectorGraph
|
||||||
|
|
||||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||||
var layerCount int
|
var layerCount int
|
||||||
@@ -218,8 +216,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
if len(gpusWithSpace) > 0 {
|
if len(gpusWithSpace) > 0 {
|
||||||
gpuZeroID = gpusWithSpace[0].i
|
gpuZeroID = gpusWithSpace[0].i
|
||||||
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
||||||
} else {
|
|
||||||
overflow += gpuZeroOverhead
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// For all the layers, find where they can fit on the GPU(s)
|
// For all the layers, find where they can fit on the GPU(s)
|
||||||
@@ -260,24 +256,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Determine if we need to consider output then find where it fits
|
// Determine if we need to consider output then find where it fits
|
||||||
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
|
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
||||||
if memoryLastLayer > 0 {
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
|
g := gpusWithSpace[layerCount%j]
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
g := gpusWithSpace[layerCount%j]
|
if g.g.FreeMemory > overhead+used+memoryLayerOutput {
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
gpuAllocations[g.i] += memoryLayerOutput
|
||||||
if g.g.FreeMemory > overhead+used+memoryLastLayer {
|
layerCounts[g.i]++
|
||||||
gpuAllocations[g.i] += memoryLastLayer
|
layerCount++
|
||||||
layerCounts[g.i]++
|
break
|
||||||
layerCount++
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if layerCount < int(f.KV().BlockCount())+1 {
|
if layerCount < int(f.KV().BlockCount())+1 {
|
||||||
fullyLoaded = false
|
fullyLoaded = false
|
||||||
overflow += memoryLastLayer
|
overflow += memoryLayerOutput
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,8 +328,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
memoryLayerOutput: memoryLayerOutput,
|
memoryLayerOutput: memoryLayerOutput,
|
||||||
graphFullOffload: graphFullOffload,
|
graphFullOffload: graphFullOffload,
|
||||||
graphPartialOffload: graphPartialOffload,
|
graphPartialOffload: graphPartialOffload,
|
||||||
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
|
projectorWeights: projectorWeights,
|
||||||
projectorGraph: ollamaEngineProjectorGraph,
|
projectorGraph: projectorGraph,
|
||||||
}
|
}
|
||||||
|
|
||||||
if gpus[0].Library == "cpu" {
|
if gpus[0].Library == "cpu" {
|
||||||
@@ -422,7 +415,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
|
|||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
ggml, err := ggml.Decode(file, 1024)
|
ggml, _, err := ggml.Decode(file, 1024)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
|||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
ggml, err := ggml.Decode(f, maxArraySize)
|
ggml, _, err := ggml.Decode(f, maxArraySize)
|
||||||
return ggml, err
|
return ggml, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
121
ml/backend.go
121
ml/backend.go
@@ -6,6 +6,7 @@ import (
|
|||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"os"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -14,11 +15,6 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Backend interface {
|
type Backend interface {
|
||||||
Load(ctx context.Context, progress func(float32)) error
|
|
||||||
|
|
||||||
// BackendMemory returns the memory allocations that were made for this model
|
|
||||||
BackendMemory() BackendMemory
|
|
||||||
|
|
||||||
Config() fs.Config
|
Config() fs.Config
|
||||||
Get(name string) Tensor
|
Get(name string) Tensor
|
||||||
NewContext() Context
|
NewContext() Context
|
||||||
@@ -56,6 +52,10 @@ type CacheConfig struct {
|
|||||||
|
|
||||||
// BackendParams controls how the backend loads and executes models
|
// BackendParams controls how the backend loads and executes models
|
||||||
type BackendParams struct {
|
type BackendParams struct {
|
||||||
|
// Progress is a callback function that allows reporting percentage completion
|
||||||
|
// of model loading
|
||||||
|
Progress func(float32)
|
||||||
|
|
||||||
// NumThreads sets the number of threads to use if running on the CPU
|
// NumThreads sets the number of threads to use if running on the CPU
|
||||||
NumThreads int
|
NumThreads int
|
||||||
|
|
||||||
@@ -72,87 +72,9 @@ type BackendParams struct {
|
|||||||
FlashAttention bool
|
FlashAttention bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// ErrNoMem is returned when panicing due to insufficient memory. It includes
|
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
|
||||||
// the attempted memory allocation.
|
|
||||||
type ErrNoMem struct {
|
|
||||||
BackendMemory
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e ErrNoMem) Error() string {
|
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
|
||||||
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
|
|
||||||
}
|
|
||||||
|
|
||||||
type AllocationStatus int
|
|
||||||
|
|
||||||
const (
|
|
||||||
// Unallocated memory - have not yet attempted to allocate
|
|
||||||
Unallocated AllocationStatus = iota
|
|
||||||
|
|
||||||
// Failed memory - tried to allocate the memory and did not succeed
|
|
||||||
Failed
|
|
||||||
|
|
||||||
// Allocated memory = tried and succeeded to allocate memory
|
|
||||||
Allocated
|
|
||||||
)
|
|
||||||
|
|
||||||
// Memory is the size of an allocation and whether it was successful.
|
|
||||||
type Memory struct {
|
|
||||||
Size uint64
|
|
||||||
Status AllocationStatus
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m Memory) String() string {
|
|
||||||
s := fmt.Sprint(m.Size)
|
|
||||||
|
|
||||||
switch m.Status {
|
|
||||||
case Unallocated:
|
|
||||||
s += "U"
|
|
||||||
case Failed:
|
|
||||||
s += "F"
|
|
||||||
case Allocated:
|
|
||||||
s += "A"
|
|
||||||
}
|
|
||||||
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeviceMemory provides a breakdown of the memory needed
|
|
||||||
// per device, such as a CPU or GPU.
|
|
||||||
type DeviceMemory struct {
|
|
||||||
// Name is the name of the device as labeled by the backend. It
|
|
||||||
// may not be persistent across instances of the runner.
|
|
||||||
Name string
|
|
||||||
|
|
||||||
// Weights is the per-layer memory needed for the model weights.
|
|
||||||
Weights []Memory
|
|
||||||
|
|
||||||
// Cache is the per-layer memory needed for the KV cache.
|
|
||||||
Cache []Memory
|
|
||||||
|
|
||||||
// Graph is the size of the compute graph. It is not per-layer.
|
|
||||||
Graph Memory
|
|
||||||
}
|
|
||||||
|
|
||||||
// BackendMemory provides the amount of memory required to load the model
|
|
||||||
// per device based on the BackendParams. In some cases, not all required
|
|
||||||
// allocations will be known at this point. However, the size of the most recent
|
|
||||||
// allocation is guaranteed to be provided so that if it failed, the caller can
|
|
||||||
// accommodate that to make forward progress.
|
|
||||||
type BackendMemory struct {
|
|
||||||
// InputsWeights are always located on the CPU and cannot be moved
|
|
||||||
InputWeights Memory
|
|
||||||
|
|
||||||
// CPU model components are located in system memory. This does not
|
|
||||||
// include unified memory allocated through the GPU.
|
|
||||||
CPU DeviceMemory
|
|
||||||
|
|
||||||
// GPU model components are located on one or more GPUs.
|
|
||||||
GPUs []DeviceMemory
|
|
||||||
}
|
|
||||||
|
|
||||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
|
||||||
|
|
||||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
|
||||||
if _, ok := backends[name]; ok {
|
if _, ok := backends[name]; ok {
|
||||||
panic("backend: backend already registered")
|
panic("backend: backend already registered")
|
||||||
}
|
}
|
||||||
@@ -160,9 +82,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
|
|||||||
backends[name] = f
|
backends[name] = f
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
|
||||||
if backend, ok := backends["ggml"]; ok {
|
if backend, ok := backends["ggml"]; ok {
|
||||||
return backend(modelPath, params)
|
return backend(ctx, f, params)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, fmt.Errorf("unsupported backend")
|
return nil, fmt.Errorf("unsupported backend")
|
||||||
@@ -171,8 +93,8 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
|||||||
type Context interface {
|
type Context interface {
|
||||||
Empty(dtype DType, shape ...int) Tensor
|
Empty(dtype DType, shape ...int) Tensor
|
||||||
Zeros(dtype DType, shape ...int) Tensor
|
Zeros(dtype DType, shape ...int) Tensor
|
||||||
FromFloatSlice(s []float32, shape ...int) Tensor
|
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
|
||||||
FromIntSlice(s []int32, shape ...int) Tensor
|
FromIntSlice(s []int32, shape ...int) (Tensor, error)
|
||||||
|
|
||||||
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
|
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
|
||||||
Arange(start, stop, step float32, dtype DType) Tensor
|
Arange(start, stop, step float32, dtype DType) Tensor
|
||||||
@@ -184,7 +106,7 @@ type Context interface {
|
|||||||
// graph, simply preallocates memory. Typically called with a
|
// graph, simply preallocates memory. Typically called with a
|
||||||
// worst case graph to ensure all resources are available for
|
// worst case graph to ensure all resources are available for
|
||||||
// for future inference.
|
// for future inference.
|
||||||
Reserve()
|
Reserve() error
|
||||||
|
|
||||||
MaxGraphNodes() int
|
MaxGraphNodes() int
|
||||||
Close()
|
Close()
|
||||||
@@ -197,6 +119,21 @@ type Context interface {
|
|||||||
Layer(int) Context
|
Layer(int) Context
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RopeOptions contains optional parameters for RoPE function
|
||||||
|
type RopeOptions struct {
|
||||||
|
OriginalContextLen uint32
|
||||||
|
}
|
||||||
|
|
||||||
|
// RopeOption defines a function that modifies RopeOpts
|
||||||
|
type RopeOption func(*RopeOptions)
|
||||||
|
|
||||||
|
// WithContextLen sets a custom context length
|
||||||
|
func WithContextLen(len uint32) RopeOption {
|
||||||
|
return func(opts *RopeOptions) {
|
||||||
|
opts.OriginalContextLen = len
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type Tensor interface {
|
type Tensor interface {
|
||||||
Dim(n int) int
|
Dim(n int) int
|
||||||
Stride(n int) int
|
Stride(n int) int
|
||||||
@@ -210,8 +147,6 @@ type Tensor interface {
|
|||||||
Neg(ctx Context) Tensor
|
Neg(ctx Context) Tensor
|
||||||
Add(ctx Context, t2 Tensor) Tensor
|
Add(ctx Context, t2 Tensor) Tensor
|
||||||
Mul(ctx Context, t2 Tensor) Tensor
|
Mul(ctx Context, t2 Tensor) Tensor
|
||||||
Div(ctx Context, t2 Tensor) Tensor
|
|
||||||
|
|
||||||
Mulmat(ctx Context, t2 Tensor) Tensor
|
Mulmat(ctx Context, t2 Tensor) Tensor
|
||||||
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
||||||
MulmatID(ctx Context, t2, ids Tensor) Tensor
|
MulmatID(ctx Context, t2, ids Tensor) Tensor
|
||||||
@@ -220,11 +155,11 @@ type Tensor interface {
|
|||||||
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
||||||
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
||||||
Scale(ctx Context, s float64) Tensor
|
Scale(ctx Context, s float64) Tensor
|
||||||
SumRows(ctx Context) Tensor
|
|
||||||
|
|
||||||
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
||||||
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||||
|
|
||||||
|
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
|
||||||
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||||
|
|
||||||
Sin(ctx Context) Tensor
|
Sin(ctx Context) Tensor
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import "C"
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
@@ -29,7 +30,6 @@ import (
|
|||||||
"github.com/ollama/ollama/logutil"
|
"github.com/ollama/ollama/logutil"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -44,15 +44,8 @@ func devices() []*C.struct_ggml_backend_device {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Backend struct {
|
type Backend struct {
|
||||||
// modelPath is the location of the model data
|
|
||||||
modelPath string
|
|
||||||
|
|
||||||
meta *fsggml.GGML
|
meta *fsggml.GGML
|
||||||
|
|
||||||
// tensorLoadTargets maps from the name of the tensor in the file
|
|
||||||
// to the name that is used by the model definition
|
|
||||||
tensorLoadTargets map[string][]string
|
|
||||||
|
|
||||||
sched *C.struct_ggml_backend_sched
|
sched *C.struct_ggml_backend_sched
|
||||||
schedBackends []*C.struct_ggml_backend
|
schedBackends []*C.struct_ggml_backend
|
||||||
schedBufts []*C.struct_ggml_backend_buffer_type
|
schedBufts []*C.struct_ggml_backend_buffer_type
|
||||||
@@ -65,26 +58,14 @@ type Backend struct {
|
|||||||
// layers is the backend used for repeating layers
|
// layers is the backend used for repeating layers
|
||||||
layers map[int]*C.struct_ggml_backend_buffer_type
|
layers map[int]*C.struct_ggml_backend_buffer_type
|
||||||
|
|
||||||
// requiredMemory is the cumulative memory allocations needed by the backend
|
|
||||||
requiredMemory *ml.BackendMemory
|
|
||||||
|
|
||||||
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
|
|
||||||
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
|
|
||||||
|
|
||||||
flashAttention bool
|
flashAttention bool
|
||||||
|
|
||||||
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
|
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
|
||||||
maxGraphNodes int
|
maxGraphNodes int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
|
||||||
r, err := os.Open(modelPath)
|
meta, n, err := fsggml.Decode(r, -1)
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer r.Close()
|
|
||||||
|
|
||||||
meta, err := fsggml.Decode(r, -1)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -99,9 +80,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
"num_key_values", len(meta.KV()),
|
"num_key_values", len(meta.KV()),
|
||||||
)
|
)
|
||||||
|
|
||||||
var requiredMemory ml.BackendMemory
|
|
||||||
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
|
|
||||||
|
|
||||||
type deviceBufferType struct {
|
type deviceBufferType struct {
|
||||||
d *C.struct_ggml_backend_device
|
d *C.struct_ggml_backend_device
|
||||||
bts []*C.struct_ggml_backend_buffer_type
|
bts []*C.struct_ggml_backend_buffer_type
|
||||||
@@ -122,8 +100,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
blocks := int(meta.KV().BlockCount())
|
|
||||||
|
|
||||||
// create list of buffer types for the cpu
|
// create list of buffer types for the cpu
|
||||||
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
|
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
|
||||||
for _, d := range append(accels, append(gpus, cpus...)...) {
|
for _, d := range append(accels, append(gpus, cpus...)...) {
|
||||||
@@ -131,27 +107,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||||
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||||
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
|
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
|
||||||
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
|
|
||||||
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
|
|
||||||
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
|
|
||||||
|
|
||||||
// create list of buffer types for each gpu
|
// create list of buffer types for each gpu
|
||||||
var gpuDeviceBufferTypes []deviceBufferType
|
var gpuDeviceBufferTypes []deviceBufferType
|
||||||
requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
|
for _, d := range gpus {
|
||||||
for i, d := range gpus {
|
|
||||||
bt := C.ggml_backend_dev_buffer_type(d)
|
bt := C.ggml_backend_dev_buffer_type(d)
|
||||||
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
|
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
|
||||||
d: d,
|
d: d,
|
||||||
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
|
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
|
||||||
})
|
})
|
||||||
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
|
|
||||||
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
|
|
||||||
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
|
|
||||||
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
useDefaultSplit := true
|
useDefaultSplit := true
|
||||||
@@ -190,6 +156,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
// inputs always use cpu
|
// inputs always use cpu
|
||||||
input := cpuDeviceBufferType
|
input := cpuDeviceBufferType
|
||||||
|
|
||||||
|
blocks := int(meta.KV().BlockCount())
|
||||||
|
|
||||||
// define a range of gpu layers. anything outside of this range is assigned to the cpu
|
// define a range of gpu layers. anything outside of this range is assigned to the cpu
|
||||||
gpuRangeStart := max(0, blocks-params.NumGPULayers)
|
gpuRangeStart := max(0, blocks-params.NumGPULayers)
|
||||||
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
|
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
|
||||||
@@ -230,7 +198,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
|
|
||||||
// contexts are shared by tensors of the same buffer type
|
// contexts are shared by tensors of the same buffer type
|
||||||
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
|
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
|
||||||
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
|
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
|
||||||
for _, bt := range bts {
|
for _, bt := range bts {
|
||||||
if _, ok := ctxs[bt]; !ok {
|
if _, ok := ctxs[bt]; !ok {
|
||||||
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
|
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
|
||||||
@@ -256,16 +224,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
C.ggml_set_name(tt, cname)
|
C.ggml_set_name(tt, cname)
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
|
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
|
||||||
|
|
||||||
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
|
|
||||||
if layer == -1 {
|
|
||||||
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
|
|
||||||
requiredMemory.InputWeights.Status = ml.Allocated
|
|
||||||
requiredMemory.InputWeights.Size += uint64(size)
|
|
||||||
} else {
|
|
||||||
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
|
|
||||||
}
|
|
||||||
|
|
||||||
//nolint:staticcheck // TODO: check if buffer type supports this tensor
|
//nolint:staticcheck // TODO: check if buffer type supports this tensor
|
||||||
return tt
|
return tt
|
||||||
}
|
}
|
||||||
@@ -287,22 +245,22 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
for _, t := range meta.Tensors().Items() {
|
for _, t := range meta.Tensors().Items() {
|
||||||
switch {
|
switch {
|
||||||
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
|
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
|
||||||
createTensor(tensor{source: t}, input.bts, -1)
|
createTensor(tensor{source: t}, input.bts)
|
||||||
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
||||||
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
|
createTensor(tensor{source: t, target: "output.weight"}, output.bts)
|
||||||
}
|
}
|
||||||
case contains(t.Name, "cls", "output", "output_norm"):
|
case contains(t.Name, "cls", "output", "output_norm"):
|
||||||
createTensor(tensor{source: t}, output.bts, blocks)
|
createTensor(tensor{source: t}, output.bts)
|
||||||
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
|
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
|
||||||
// TODO: assign vision tensors to the gpu if possible
|
// TODO: assign vision tensors to the gpu if possible
|
||||||
createTensor(tensor{source: t}, output.bts, blocks)
|
createTensor(tensor{source: t}, output.bts)
|
||||||
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
|
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
|
||||||
// these tensors should be repeated per layer
|
// these tensors should be repeated per layer
|
||||||
for i, layer := range layers {
|
for i, layer := range layers {
|
||||||
createTensor(tensor{
|
createTensor(tensor{
|
||||||
source: t,
|
source: t,
|
||||||
target: "blk." + strconv.Itoa(i) + "." + t.Name,
|
target: "blk." + strconv.Itoa(i) + "." + t.Name,
|
||||||
}, layer.bts, i)
|
}, layer.bts)
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
layerIndex := -1
|
layerIndex := -1
|
||||||
@@ -313,10 +271,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if layerIndex >= 0 {
|
if layerIndex >= 0 {
|
||||||
createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
|
createTensor(tensor{source: t}, layers[layerIndex].bts)
|
||||||
} else {
|
} else {
|
||||||
// load all other tensors on the cpu
|
// load all other tensors on the cpu
|
||||||
createTensor(tensor{source: t}, input.bts, -1)
|
createTensor(tensor{source: t}, input.bts)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -329,18 +287,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
|
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
|
||||||
for i := range btDeviceMemory[bt].Weights {
|
|
||||||
if btDeviceMemory[bt].Weights[i].Size != 0 {
|
|
||||||
if b != nil {
|
|
||||||
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
|
|
||||||
} else {
|
|
||||||
btDeviceMemory[bt].Weights[i].Status = ml.Failed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if b == nil {
|
if b == nil {
|
||||||
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
|
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
|
||||||
}
|
}
|
||||||
|
|
||||||
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
|
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
|
||||||
@@ -359,6 +307,73 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var doneBytes atomic.Uint64
|
||||||
|
totalBytes := uint64(n) - meta.Tensors().Offset
|
||||||
|
|
||||||
|
g, ctx := errgroup.WithContext(ctx)
|
||||||
|
g.SetLimit(runtime.GOMAXPROCS(0))
|
||||||
|
for _, t := range meta.Tensors().Items() {
|
||||||
|
t := t
|
||||||
|
g.Go(func() error {
|
||||||
|
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
|
||||||
|
for i := range tts {
|
||||||
|
target := targets[t.Name][i]
|
||||||
|
if target == "" {
|
||||||
|
target = t.Name
|
||||||
|
}
|
||||||
|
|
||||||
|
tt, ok := tensors[target]
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("unassigned tensor: %s", t.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
tts[i] = tt
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
|
||||||
|
// seeking around within an FD shared between all goroutines.
|
||||||
|
file, err := os.Open(r.Name())
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("file open error", "file", r.Name(), "error", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
|
||||||
|
bts := make([]byte, 128*format.KibiByte)
|
||||||
|
|
||||||
|
var s uint64
|
||||||
|
for s < t.Size() {
|
||||||
|
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("file read error", "file", r.Name(), "error", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tts {
|
||||||
|
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
s += uint64(n)
|
||||||
|
|
||||||
|
if params.Progress != nil {
|
||||||
|
done := doneBytes.Add(uint64(n))
|
||||||
|
params.Progress(float32(done) / float32(totalBytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := g.Wait(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// map devices to backend buffer types so new tensors can be assigned to the correct device
|
// map devices to backend buffer types so new tensors can be assigned to the correct device
|
||||||
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
|
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
|
||||||
|
|
||||||
@@ -382,11 +397,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
|
|
||||||
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
|
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
|
||||||
return &Backend{
|
return &Backend{
|
||||||
modelPath: modelPath,
|
flashAttention: params.FlashAttention,
|
||||||
flashAttention: params.FlashAttention,
|
meta: meta,
|
||||||
meta: meta,
|
tensors: tensors,
|
||||||
tensorLoadTargets: targets,
|
|
||||||
tensors: tensors,
|
|
||||||
sched: C.ggml_backend_sched_new(
|
sched: C.ggml_backend_sched_new(
|
||||||
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
|
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
|
||||||
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
||||||
@@ -405,9 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
return m
|
return m
|
||||||
}(),
|
}(),
|
||||||
requiredMemory: &requiredMemory,
|
maxGraphNodes: maxGraphNodes,
|
||||||
btDeviceMemory: btDeviceMemory,
|
|
||||||
maxGraphNodes: maxGraphNodes,
|
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -415,81 +426,6 @@ func init() {
|
|||||||
ml.RegisterBackend("ggml", New)
|
ml.RegisterBackend("ggml", New)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
|
|
||||||
var doneBytes atomic.Uint64
|
|
||||||
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
|
|
||||||
|
|
||||||
g, ctx := errgroup.WithContext(ctx)
|
|
||||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
|
||||||
for _, t := range b.meta.Tensors().Items() {
|
|
||||||
t := t
|
|
||||||
g.Go(func() error {
|
|
||||||
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
|
|
||||||
for i := range tts {
|
|
||||||
target := b.tensorLoadTargets[t.Name][i]
|
|
||||||
if target == "" {
|
|
||||||
target = t.Name
|
|
||||||
}
|
|
||||||
|
|
||||||
tt, ok := b.tensors[target]
|
|
||||||
if !ok {
|
|
||||||
return fmt.Errorf("unassigned tensor: %s", t.Name)
|
|
||||||
}
|
|
||||||
|
|
||||||
tts[i] = tt
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
|
|
||||||
// seeking around within an FD shared between all goroutines.
|
|
||||||
file, err := os.Open(b.modelPath)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("file open error", "file", b.modelPath, "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
|
|
||||||
bts := make([]byte, 128*format.KibiByte)
|
|
||||||
|
|
||||||
var s uint64
|
|
||||||
for s < t.Size() {
|
|
||||||
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
|
|
||||||
if err := ctx.Err(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("file read error", "file", b.modelPath, "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tts {
|
|
||||||
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
|
|
||||||
}
|
|
||||||
|
|
||||||
s += uint64(n)
|
|
||||||
|
|
||||||
if progress != nil {
|
|
||||||
done := doneBytes.Add(uint64(n))
|
|
||||||
progress(float32(done) / float32(totalBytes))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := g.Wait(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Backend) BackendMemory() ml.BackendMemory {
|
|
||||||
return *b.requiredMemory
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Backend) Config() fs.Config {
|
func (b *Backend) Config() fs.Config {
|
||||||
return b.meta.KV()
|
return b.meta.KV()
|
||||||
}
|
}
|
||||||
@@ -521,7 +457,6 @@ func (b *Backend) NewContextSize(n int) ml.Context {
|
|||||||
no_alloc: true,
|
no_alloc: true,
|
||||||
}),
|
}),
|
||||||
allocatedBuffers: &allocatedBuffers,
|
allocatedBuffers: &allocatedBuffers,
|
||||||
layer: -1,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -548,9 +483,6 @@ type Context struct {
|
|||||||
|
|
||||||
// maxGraphNodes is the maximum allowed number of graph nodes in this context
|
// maxGraphNodes is the maximum allowed number of graph nodes in this context
|
||||||
maxGraphNodes int
|
maxGraphNodes int
|
||||||
|
|
||||||
// layer is the graph layer that this context is allocating for - assumed to be cache
|
|
||||||
layer int
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Input() ml.Context {
|
func (c *Context) Input() ml.Context {
|
||||||
@@ -561,7 +493,6 @@ func (c *Context) Input() ml.Context {
|
|||||||
buft: c.b.input,
|
buft: c.b.input,
|
||||||
allocatedBuffers: c.allocatedBuffers,
|
allocatedBuffers: c.allocatedBuffers,
|
||||||
maxGraphNodes: c.maxGraphNodes,
|
maxGraphNodes: c.maxGraphNodes,
|
||||||
layer: -1,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -576,7 +507,6 @@ func (c *Context) Layer(i int) ml.Context {
|
|||||||
buft: buft,
|
buft: buft,
|
||||||
allocatedBuffers: c.allocatedBuffers,
|
allocatedBuffers: c.allocatedBuffers,
|
||||||
maxGraphNodes: c.maxGraphNodes,
|
maxGraphNodes: c.maxGraphNodes,
|
||||||
layer: i,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -614,34 +544,22 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Reserve() {
|
func (c *Context) Reserve() error {
|
||||||
reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
|
if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
|
||||||
|
C.ggml_backend_sched_reset(c.b.sched)
|
||||||
|
return errors.New("failed to reserve graph")
|
||||||
|
}
|
||||||
|
|
||||||
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
|
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
|
||||||
|
|
||||||
// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
|
|
||||||
for _, bt := range c.b.schedBufts {
|
|
||||||
c.b.btDeviceMemory[bt].Graph = ml.Memory{}
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := range c.b.schedBackends {
|
for i := range c.b.schedBackends {
|
||||||
bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
|
size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
|
||||||
|
|
||||||
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
|
|
||||||
graph.Size += uint64(bufferStatus.size)
|
|
||||||
if bufferStatus.allocated && graph.Status != ml.Failed {
|
|
||||||
graph.Status = ml.Allocated
|
|
||||||
} else {
|
|
||||||
graph.Status = ml.Failed
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
|
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
|
||||||
"size", format.HumanBytes2(uint64(bufferStatus.size)))
|
"size", format.HumanBytes2(uint64(size)))
|
||||||
}
|
}
|
||||||
|
|
||||||
if !reserved {
|
C.ggml_backend_sched_reset(c.b.sched)
|
||||||
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
|
|
||||||
}
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) MaxGraphNodes() int {
|
func (c *Context) MaxGraphNodes() int {
|
||||||
@@ -661,7 +579,7 @@ func pad(length, pad C.size_t) C.size_t {
|
|||||||
return ((length + pad - 1) / pad) * pad
|
return ((length + pad - 1) / pad) * pad
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
|
||||||
if c.buft == nil {
|
if c.buft == nil {
|
||||||
panic("set Input or Layer before creating tensors")
|
panic("set Input or Layer before creating tensors")
|
||||||
}
|
}
|
||||||
@@ -684,7 +602,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
|||||||
|
|
||||||
if len(shape) < 1 || shape[0] == 0 {
|
if len(shape) < 1 || shape[0] == 0 {
|
||||||
var shape C.int64_t = 0
|
var shape C.int64_t = 0
|
||||||
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
|
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
|
||||||
} else if len(shape) > 4 {
|
} else if len(shape) > 4 {
|
||||||
panic("unsupported number of dimensions")
|
panic("unsupported number of dimensions")
|
||||||
}
|
}
|
||||||
@@ -697,43 +615,40 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
|||||||
|
|
||||||
t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
|
t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
|
||||||
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
|
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
|
||||||
|
|
||||||
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
|
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
|
||||||
if c.layer >= 0 {
|
|
||||||
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
|
|
||||||
|
|
||||||
cache.Size += uint64(size)
|
|
||||||
if b != nil {
|
|
||||||
cache.Status = ml.Allocated
|
|
||||||
} else {
|
|
||||||
cache.Status = ml.Failed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if b == nil {
|
if b == nil {
|
||||||
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
|
return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
|
||||||
}
|
}
|
||||||
|
|
||||||
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
|
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
|
||||||
|
|
||||||
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
|
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
|
||||||
return &Tensor{b: c.b, t: t}
|
return &Tensor{b: c.b, t: t}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
return c.newTensor(dtype, shape)
|
t, err := c.newTensor(dtype, shape)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
t := c.newTensor(dtype, shape)
|
t, err := c.newTensor(dtype, shape)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
C.ggml_set_zero(t.(*Tensor).t)
|
C.ggml_set_zero(t.(*Tensor).t)
|
||||||
return t
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
func checkShape[S ~[]E, E any](s S, shape ...int) {
|
func checkShape[S ~[]E, E any](s S, shape ...int) error {
|
||||||
n := len(s)
|
n := len(s)
|
||||||
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
return
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, v := range shape {
|
for _, v := range shape {
|
||||||
@@ -741,32 +656,44 @@ func checkShape[S ~[]E, E any](s S, shape ...int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if n != 1 {
|
if n != 1 {
|
||||||
panic(fmt.Errorf("invalid shape: %v", shape))
|
return fmt.Errorf("invalid shape: %v", shape)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||||
checkShape(s, shape...)
|
if err := checkShape(s, shape...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
t := c.newTensor(ml.DTypeF32, shape)
|
t, err := c.newTensor(ml.DTypeF32, shape)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if len(s) > 0 {
|
if len(s) > 0 {
|
||||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||||
}
|
}
|
||||||
|
|
||||||
return t
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||||
checkShape(s, shape...)
|
if err := checkShape(s, shape...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
t := c.newTensor(ml.DTypeI32, shape)
|
t, err := c.newTensor(ml.DTypeI32, shape)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if len(s) > 0 {
|
if len(s) > 0 {
|
||||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||||
}
|
}
|
||||||
|
|
||||||
return t
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
@@ -784,7 +711,12 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
|||||||
arange = append(arange, int32(i))
|
arange = append(arange, int32(i))
|
||||||
}
|
}
|
||||||
|
|
||||||
return c.Input().FromIntSlice(arange, len(arange))
|
t, err := c.Input().FromIntSlice(arange, len(arange))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return t
|
||||||
default:
|
default:
|
||||||
panic("unsupported dtype for arange")
|
panic("unsupported dtype for arange")
|
||||||
}
|
}
|
||||||
@@ -935,13 +867,6 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
|
||||||
return &Tensor{
|
|
||||||
b: t.b,
|
|
||||||
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||||
return &Tensor{
|
return &Tensor{
|
||||||
b: t.b,
|
b: t.b,
|
||||||
@@ -990,8 +915,6 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
|
|||||||
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||||
if len(shape) != 4 {
|
if len(shape) != 4 {
|
||||||
panic("expected 4 dimensions")
|
panic("expected 4 dimensions")
|
||||||
} else if shape[3] != 0 {
|
|
||||||
panic("cuda does not support 4d tensors")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &Tensor{
|
return &Tensor{
|
||||||
@@ -1059,13 +982,6 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
|
|
||||||
return &Tensor{
|
|
||||||
b: t.b,
|
|
||||||
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
|
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
|
||||||
return &Tensor{
|
return &Tensor{
|
||||||
b: t.b,
|
b: t.b,
|
||||||
@@ -1137,15 +1053,28 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
|
const (
|
||||||
|
ropeTypeNorm C.int = 0
|
||||||
|
ropeTypeNeox C.int = 2
|
||||||
|
ropeTypeMrope C.int = 8
|
||||||
|
ropeTypeVision C.int = 24
|
||||||
|
)
|
||||||
|
|
||||||
|
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
|
||||||
// Default options
|
// Default options
|
||||||
opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
|
opts := &ml.RopeOptions{
|
||||||
|
OriginalContextLen: 131072,
|
||||||
|
}
|
||||||
|
|
||||||
// Apply any provided options
|
// Apply any provided options
|
||||||
for _, option := range options {
|
for _, option := range options {
|
||||||
option(opts)
|
option(opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ropeFactors == nil {
|
||||||
|
ropeFactors = &Tensor{b: t.b}
|
||||||
|
}
|
||||||
|
|
||||||
dequant := t.t
|
dequant := t.t
|
||||||
if C.ggml_is_quantized(t.t._type) {
|
if C.ggml_is_quantized(t.t._type) {
|
||||||
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
|
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
|
||||||
@@ -1156,11 +1085,11 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
|
|||||||
t: C.ggml_rope_ext(
|
t: C.ggml_rope_ext(
|
||||||
ctx.(*Context).ctx,
|
ctx.(*Context).ctx,
|
||||||
dequant,
|
dequant,
|
||||||
positions.(*Tensor).t,
|
positionIDs.(*Tensor).t,
|
||||||
opts.Factors.(*Tensor).t,
|
ropeFactors.(*Tensor).t,
|
||||||
C.int(ropeDim),
|
C.int(ropeDim),
|
||||||
C.int(opts.Type),
|
C.int(ropeType),
|
||||||
C.int(opts.OriginalContextLength),
|
C.int(opts.OriginalContextLen),
|
||||||
C.float(ropeBase),
|
C.float(ropeBase),
|
||||||
C.float(ropeScale),
|
C.float(ropeScale),
|
||||||
C.float(0.0),
|
C.float(0.0),
|
||||||
|
|||||||
6
ml/backend/ggml/ggml/include/ggml-alloc.h
vendored
6
ml/backend/ggml/ggml/include/ggml-alloc.h
vendored
@@ -66,12 +66,6 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
|
|||||||
|
|
||||||
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status {
|
|
||||||
size_t size;
|
|
||||||
bool allocated;
|
|
||||||
};
|
|
||||||
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
|
||||||
|
|
||||||
// Utils
|
// Utils
|
||||||
// Create a buffer and allocate all the tensors in a ggml_context
|
// Create a buffer and allocate all the tensors in a ggml_context
|
||||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||||
|
|||||||
6
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
6
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
@@ -304,12 +304,6 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
|
|
||||||
struct ggml_backend_buffer_status {
|
|
||||||
size_t size;
|
|
||||||
bool allocated;
|
|
||||||
};
|
|
||||||
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||||
|
|
||||||
|
|||||||
38
ml/backend/ggml/ggml/src/ggml-alloc.c
vendored
38
ml/backend/ggml/ggml/src/ggml-alloc.c
vendored
@@ -364,7 +364,6 @@ struct node_alloc {
|
|||||||
struct ggml_gallocr {
|
struct ggml_gallocr {
|
||||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
ggml_backend_buffer_t * buffers; // [n_buffers]
|
||||||
size_t *buffer_sizes; // [n_buffers]
|
|
||||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||||
int n_buffers;
|
int n_buffers;
|
||||||
|
|
||||||
@@ -388,9 +387,6 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|||||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
||||||
GGML_ASSERT(galloc->buffers != NULL);
|
GGML_ASSERT(galloc->buffers != NULL);
|
||||||
|
|
||||||
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
|
||||||
GGML_ASSERT(galloc->buffer_sizes != NULL);
|
|
||||||
|
|
||||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||||
|
|
||||||
@@ -457,7 +453,6 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|||||||
ggml_hash_set_free(&galloc->hash_set);
|
ggml_hash_set_free(&galloc->hash_set);
|
||||||
free(galloc->hash_values);
|
free(galloc->hash_values);
|
||||||
free(galloc->bufts);
|
free(galloc->bufts);
|
||||||
free(galloc->buffer_sizes);
|
|
||||||
free(galloc->buffers);
|
free(galloc->buffers);
|
||||||
free(galloc->buf_tallocs);
|
free(galloc->buf_tallocs);
|
||||||
free(galloc->node_allocs);
|
free(galloc->node_allocs);
|
||||||
@@ -753,8 +748,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = true;
|
|
||||||
|
|
||||||
// reallocate buffers if needed
|
// reallocate buffers if needed
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||||
// if the buffer type is used multiple times, we reuse the same buffer
|
// if the buffer type is used multiple times, we reuse the same buffer
|
||||||
@@ -776,20 +769,15 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||||
if (galloc->buffers[i]) {
|
if (galloc->buffers[i] == NULL) {
|
||||||
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
} else {
|
|
||||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||||
galloc->buffer_sizes[i] = new_size;
|
return false;
|
||||||
success = false;
|
|
||||||
}
|
}
|
||||||
} else {
|
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return success;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||||
@@ -946,24 +934,6 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|||||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
||||||
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
|
||||||
|
|
||||||
for (int i = 0; i < buffer_id; i++) {
|
|
||||||
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
|
|
||||||
// This buffer is the same as a previous one due to the same buffer type being used multiple times
|
|
||||||
// (See above.) However, we need a different check because multiple buffers might be NULL in our
|
|
||||||
// case and we still want to know the attempted size.
|
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status status = {0, true};
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||||
|
|||||||
10
ml/backend/ggml/ggml/src/ggml-backend.cpp
vendored
10
ml/backend/ggml/ggml/src/ggml-backend.cpp
vendored
@@ -1629,16 +1629,6 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
|||||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
|
|
||||||
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
|
|
||||||
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ package cpu
|
|||||||
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
|
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
|
||||||
// #cgo CXXFLAGS: -std=c++17
|
// #cgo CXXFLAGS: -std=c++17
|
||||||
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
|
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
|
||||||
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
|
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
|
||||||
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
|
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
|
||||||
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
|
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
|
||||||
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
|
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
|
||||||
|
|||||||
@@ -4,6 +4,6 @@ package metal
|
|||||||
|
|
||||||
//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
|
//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
|
||||||
|
|
||||||
// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
|
// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
|
||||||
// #cgo LDFLAGS: -framework Metal -framework MetalKit
|
// #cgo LDFLAGS: -framework Metal -framework MetalKit
|
||||||
import "C"
|
import "C"
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
// fast provides implementations of fast (fused) operations for increased performance.
|
|
||||||
package fast
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fastRoPE is an interface for tensors that support fast rotary positional embedding.
|
|
||||||
type fastRoPE interface {
|
|
||||||
RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
|
|
||||||
}
|
|
||||||
|
|
||||||
// RoPE applies rotary positional embedding to tensor `t`.
|
|
||||||
func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
|
|
||||||
if t, ok := t.(fastRoPE); ok {
|
|
||||||
return t.RoPE(ctx, positions, dim, base, scale, options...)
|
|
||||||
}
|
|
||||||
|
|
||||||
panic("RoPE not implemented for this tensor type")
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
package rope
|
|
||||||
|
|
||||||
import "github.com/ollama/ollama/ml"
|
|
||||||
|
|
||||||
// Options contains optional parameters for RoPE function
|
|
||||||
type Options struct {
|
|
||||||
OriginalContextLength int
|
|
||||||
Type int
|
|
||||||
Factors ml.Tensor
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithOriginalContextLength sets a custom context length
|
|
||||||
func WithOriginalContextLength(n int) func(*Options) {
|
|
||||||
return func(opts *Options) {
|
|
||||||
opts.OriginalContextLength = n
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithType sets RoPE type to NeoX
|
|
||||||
func WithTypeNeoX() func(*Options) {
|
|
||||||
return func(opts *Options) {
|
|
||||||
opts.Type = 2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithFactors sets custom rope factors
|
|
||||||
func WithFactors(factors ml.Tensor) func(*Options) {
|
|
||||||
return func(opts *Options) {
|
|
||||||
if factors != nil {
|
|
||||||
opts.Factors = factors
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -2,30 +2,16 @@ package input
|
|||||||
|
|
||||||
import "github.com/ollama/ollama/ml"
|
import "github.com/ollama/ollama/ml"
|
||||||
|
|
||||||
// Multimodal is a multimodal embedding or a component of one.
|
|
||||||
// For example, it could be a row of an image that can be processed
|
|
||||||
// independently.
|
|
||||||
type Multimodal struct {
|
|
||||||
// Tensor is the embedding data. Implementations may chose what to
|
|
||||||
// store here or it may be nil if not needed. However, any ml.Tensor
|
|
||||||
// objects must be stored here and not in Data.
|
|
||||||
Tensor ml.Tensor
|
|
||||||
|
|
||||||
// Data is implementation-specific opaque data, such as metadata on how
|
|
||||||
// to layout Tensor. It may be nil if not needed. It may also store larger
|
|
||||||
// objects such as complete images if they are to be processed later.
|
|
||||||
Data any
|
|
||||||
}
|
|
||||||
|
|
||||||
// Input represents one token in the input stream
|
// Input represents one token in the input stream
|
||||||
type Input struct {
|
type Input struct {
|
||||||
// Token is a single element of text.
|
// Token is a single element of text.
|
||||||
Token int32
|
Token int32
|
||||||
|
|
||||||
// Multimodal is represents a non-text element such as an
|
// Multimodal is opaque data representing a non-text
|
||||||
// image (or part of one if the image can be processed in pieces).
|
// element such as an image (or part of one if the image
|
||||||
// It may be used either together with Token or on its own.
|
// can be processed in pieces). It may be either together
|
||||||
Multimodal []Multimodal
|
// with Token or on its own.
|
||||||
|
Multimodal any
|
||||||
|
|
||||||
// MultimodalHash is a unique representation of the data
|
// MultimodalHash is a unique representation of the data
|
||||||
// stored in Multimodal, used for caching and comparing
|
// stored in Multimodal, used for caching and comparing
|
||||||
@@ -46,7 +32,7 @@ type Input struct {
|
|||||||
// Positions slice.
|
// Positions slice.
|
||||||
type MultimodalIndex struct {
|
type MultimodalIndex struct {
|
||||||
Index int
|
Index int
|
||||||
Multimodal []Multimodal
|
Multimodal any
|
||||||
}
|
}
|
||||||
|
|
||||||
// Batch contains the inputs for a model forward pass
|
// Batch contains the inputs for a model forward pass
|
||||||
|
|||||||
@@ -40,13 +40,12 @@ type MultimodalProcessor interface {
|
|||||||
// EncodeMultimodal processes a single input (such as an image) and
|
// EncodeMultimodal processes a single input (such as an image) and
|
||||||
// generates an output (typically an embedding) that can be used by the model.
|
// generates an output (typically an embedding) that can be used by the model.
|
||||||
//
|
//
|
||||||
// The return value is one or more tensors, each with optional model-specific
|
// The return value is most typically an ml.Tensor, however, different
|
||||||
// opaque metadata. Typically, the tensors might be views into an embedding
|
// type are possible, such as an object containing a tensor plus
|
||||||
// with each view representing a chunk of data that can be processed independently
|
// additional metadata, a slice of tensors or even just the original input.
|
||||||
// in different batches.
|
|
||||||
//
|
//
|
||||||
// The result may be cached by the runner.
|
// The result may be cached by the runner.
|
||||||
EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
|
EncodeMultimodal(ml.Context, []byte) (any, error)
|
||||||
|
|
||||||
// PostTokenize is called after tokenization to allow the model to edit the
|
// PostTokenize is called after tokenization to allow the model to edit the
|
||||||
// input stream to correctly arrange multimodal elements.
|
// input stream to correctly arrange multimodal elements.
|
||||||
@@ -98,8 +97,14 @@ func Register(name string, f func(fs.Config) (Model, error)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// New initializes a new model instance with the provided configuration based on the metadata in the model file
|
// New initializes a new model instance with the provided configuration based on the metadata in the model file
|
||||||
func New(modelPath string, params ml.BackendParams) (Model, error) {
|
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
|
||||||
b, err := ml.NewBackend(modelPath, params)
|
r, err := os.Open(modelPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
b, err := ml.NewBackend(ctx, r, params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -128,7 +133,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
meta, err := fsggml.Decode(r, -1)
|
meta, _, err := fsggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -287,7 +292,11 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
|
|||||||
return nil, errors.New("batch size cannot be less than 1")
|
return nil, errors.New("batch size cannot be less than 1")
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
|
var err error
|
||||||
|
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
cache := m.Config().Cache
|
cache := m.Config().Cache
|
||||||
if cache != nil {
|
if cache != nil {
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
"github.com/ollama/ollama/model"
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
@@ -45,13 +43,10 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
EOS: append(
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
@@ -85,10 +80,11 @@ type SelfAttention struct {
|
|||||||
|
|
||||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
|
ropeType := uint32(2)
|
||||||
|
|
||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
if opts.largeModelScaling {
|
if opts.largeModelScaling {
|
||||||
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
||||||
@@ -98,7 +94,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
|
|
||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
||||||
@@ -128,7 +124,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
|
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type MLP struct {
|
type MLP struct {
|
||||||
@@ -175,8 +171,15 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||||
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
|
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
|
||||||
|
|||||||
@@ -60,16 +60,12 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(1),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
EOT: int32(106),
|
||||||
[]int32{
|
AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
|
||||||
int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
|
||||||
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
|
|
||||||
},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
@@ -86,7 +82,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
return &m, nil
|
return &m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) == 0 {
|
if len(m.VisionModel.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -101,30 +97,33 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s,
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
|
||||||
m.ImageProcessor.imageSize,
|
m.ImageProcessor.imageSize,
|
||||||
m.ImageProcessor.imageSize,
|
m.ImageProcessor.imageSize,
|
||||||
m.ImageProcessor.numChannels,
|
m.ImageProcessor.numChannels,
|
||||||
)
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||||
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
|
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
|
||||||
return []input.Multimodal{{Tensor: visionOutputs}}, nil
|
return visionOutputs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
var result []input.Input
|
var result []input.Input
|
||||||
|
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if len(inp.Multimodal) == 0 {
|
if inp.Multimodal == nil {
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
} else {
|
} else {
|
||||||
inputMultimodal := inp.Multimodal[0].Tensor
|
inputMultimodal := inp.Multimodal.(ml.Tensor)
|
||||||
|
|
||||||
result = append(result,
|
result = append(result,
|
||||||
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
|
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
|
||||||
input.Input{Token: 255999}, // "<start_of_image>""
|
input.Input{Token: 255999}, // "<start_of_image>""
|
||||||
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
|
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
|
||||||
)
|
)
|
||||||
|
|
||||||
// add image token placeholders
|
// add image token placeholders
|
||||||
@@ -141,8 +140,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -75,6 +73,7 @@ type TextSelfAttention struct {
|
|||||||
|
|
||||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
|
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
|
ropeType := uint32(2)
|
||||||
|
|
||||||
ropeBase := opts.ropeLocalBase
|
ropeBase := opts.ropeLocalBase
|
||||||
if (layer+1)%gemmaGlobalCacheCount == 0 {
|
if (layer+1)%gemmaGlobalCacheCount == 0 {
|
||||||
@@ -84,7 +83,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
|
|||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
||||||
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
|
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
if opts.largeModelScaling {
|
if opts.largeModelScaling {
|
||||||
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
||||||
@@ -95,7 +94,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
|
|||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
||||||
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
|
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
||||||
@@ -113,7 +112,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
|
|||||||
ropeBase = m.TextConfig.ropeGlobalBase
|
ropeBase = m.TextConfig.ropeGlobalBase
|
||||||
}
|
}
|
||||||
|
|
||||||
return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
|
return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextMLP struct {
|
type TextMLP struct {
|
||||||
@@ -166,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
// set image embeddings
|
// set image embeddings
|
||||||
var except []int
|
var except []int
|
||||||
for _, image := range batch.Multimodal {
|
for _, image := range batch.Multimodal {
|
||||||
visionOutputs := image.Multimodal[0].Tensor
|
visionOutputs := image.Multimodal.(ml.Tensor)
|
||||||
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
||||||
|
|
||||||
for i := range visionOutputs.Dim(1) {
|
for i := range visionOutputs.Dim(1) {
|
||||||
|
|||||||
@@ -1,23 +1,22 @@
|
|||||||
package llama
|
package llama
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"cmp"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
"github.com/ollama/ollama/model"
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
hiddenSize, numHeads, numKVHeads int
|
||||||
headDim, ropeDim int
|
|
||||||
eps, ropeBase, ropeScale float32
|
eps, ropeBase, ropeScale float32
|
||||||
|
ropeDim uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
type Model struct {
|
type Model struct {
|
||||||
@@ -33,6 +32,10 @@ type Model struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
|
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
|
||||||
|
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
|
||||||
|
}
|
||||||
|
|
||||||
m := Model{
|
m := Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||||
@@ -40,13 +43,13 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
@@ -54,11 +57,10 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
headDim: int(c.Uint("attention.key_length")),
|
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
|
ropeDim: c.Uint("rope.dimension_count"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,31 +77,31 @@ type SelfAttention struct {
|
|||||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
headDim := opts.hiddenSize / opts.numHeads
|
||||||
ropeDim := cmp.Or(opts.ropeDim, headDim)
|
ropeType := uint32(0)
|
||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
|
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
key := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
|
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
value := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
|
|
||||||
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
|
||||||
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
|
||||||
|
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
|
return sa.Output.Forward(ctx, kqv)
|
||||||
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
|
|
||||||
return sa.Output.Forward(ctx, attention)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
|
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||||
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type MLP struct {
|
type MLP struct {
|
||||||
@@ -120,11 +122,11 @@ type Layer struct {
|
|||||||
MLP *MLP
|
MLP *MLP
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||||
residual := hiddenState
|
residual := hiddenState
|
||||||
|
|
||||||
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
|
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
|
||||||
|
|
||||||
// In the final layer (outputs != nil), optimize by pruning to just the token positions
|
// In the final layer (outputs != nil), optimize by pruning to just the token positions
|
||||||
// we need logits for.
|
// we need logits for.
|
||||||
@@ -142,19 +144,27 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tenso
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
for i, layer := range m.Layers {
|
||||||
m.Cache.SetLayer(i)
|
m.Cache.SetLayer(i)
|
||||||
|
|
||||||
var outputs ml.Tensor
|
var lastLayerOutputs ml.Tensor
|
||||||
if i == len(m.Layers)-1 {
|
if i == len(m.Layers)-1 {
|
||||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
lastLayerOutputs = outputs
|
||||||
}
|
}
|
||||||
|
|
||||||
hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
|
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
|
||||||
}
|
}
|
||||||
|
|
||||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -40,13 +41,13 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
@@ -62,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
return &m, nil
|
return &m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) < 1 {
|
if len(m.VisionModel.Layers) < 1 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -77,7 +78,10 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
|
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
|
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
|
||||||
|
|
||||||
@@ -88,86 +92,81 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
pixelValues := tilesLocal
|
pixelValues := tilesLocal
|
||||||
|
|
||||||
if len(pixelsGlobal) > 0 {
|
if len(pixelsGlobal) > 0 {
|
||||||
tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
|
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
|
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
|
||||||
}
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||||
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
|
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
|
||||||
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
|
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
|
||||||
|
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
|
||||||
var multimodal []input.Multimodal
|
|
||||||
aspectRatio := image.Point{ratioW, ratioH}
|
|
||||||
|
|
||||||
var offset int
|
|
||||||
patchesPerChunk := projectedOutputs.Dim(1)
|
|
||||||
if aspectRatio.Y*aspectRatio.X > 1 {
|
|
||||||
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
|
|
||||||
|
|
||||||
for range aspectRatio.Y {
|
|
||||||
for x := range aspectRatio.X {
|
|
||||||
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
|
|
||||||
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
|
|
||||||
patchesPerChunk)
|
|
||||||
var separator separator
|
|
||||||
if x < aspectRatio.X-1 {
|
|
||||||
separator.x = true // <|tile_x_separator|>
|
|
||||||
} else {
|
|
||||||
separator.y = true // <|tile_y_separator|>
|
|
||||||
}
|
|
||||||
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
|
|
||||||
offset += patchesPerChunk
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
|
|
||||||
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
|
|
||||||
patchesPerChunk)
|
|
||||||
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
|
|
||||||
|
|
||||||
return multimodal, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type separator struct {
|
type chunks struct {
|
||||||
x bool
|
*Model
|
||||||
y bool
|
ml.Tensor
|
||||||
|
aspectRatio image.Point
|
||||||
|
|
||||||
|
dataOnce sync.Once
|
||||||
|
data []float32
|
||||||
|
}
|
||||||
|
|
||||||
|
type chunk struct {
|
||||||
|
*chunks
|
||||||
|
s, n int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *chunk) floats() []float32 {
|
||||||
|
r.dataOnce.Do(func() {
|
||||||
|
temp := r.Backend().NewContext()
|
||||||
|
defer temp.Close()
|
||||||
|
temp.Forward(r.Tensor).Compute(r.Tensor)
|
||||||
|
r.data = r.Floats()
|
||||||
|
})
|
||||||
|
|
||||||
|
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
var result []input.Input
|
var result []input.Input
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if len(inp.Multimodal) == 0 {
|
if inp.Multimodal == nil {
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
t := inp.Multimodal.(*chunks)
|
||||||
var imageInputs []input.Input
|
var imageInputs []input.Input
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
|
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
|
||||||
|
|
||||||
for i, mm := range inp.Multimodal {
|
var offset int
|
||||||
patchesPerChunk := mm.Tensor.Dim(1)
|
patchesPerChunk := t.Dim(1)
|
||||||
|
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
|
||||||
|
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
|
||||||
|
|
||||||
if i < len(inp.Multimodal)-1 {
|
for range t.aspectRatio.Y {
|
||||||
separator := mm.Data.(*separator)
|
for x := range t.aspectRatio.X {
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
if x < t.aspectRatio.X-1 {
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
|
||||||
if separator.x {
|
}
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
|
offset += patchesPerChunk
|
||||||
}
|
}
|
||||||
if separator.y {
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
|
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
|
||||||
}
|
|
||||||
} else {
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
|
||||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||||
|
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
|
||||||
|
|
||||||
result = append(result, imageInputs...)
|
result = append(result, imageInputs...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,8 +174,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,8 +8,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -33,8 +31,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
|
|||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
|
|
||||||
if useRope {
|
if useRope {
|
||||||
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
|
||||||
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
|
||||||
}
|
}
|
||||||
|
|
||||||
if opts.useQKNorm {
|
if opts.useQKNorm {
|
||||||
@@ -82,7 +80,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
|
|||||||
|
|
||||||
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
|
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
|
||||||
for i := 1; i < opts.numExpertsUsed; i++ {
|
for i := 1; i < opts.numExpertsUsed; i++ {
|
||||||
nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
|
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
|
||||||
}
|
}
|
||||||
|
|
||||||
return nextStates
|
return nextStates
|
||||||
@@ -212,7 +210,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
||||||
|
|
||||||
for _, mi := range batch.Multimodal {
|
for _, mi := range batch.Multimodal {
|
||||||
img := mi.Multimodal[0].Tensor
|
f32s := mi.Multimodal.(*chunk).floats()
|
||||||
|
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,7 +226,11 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
|
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
|
var err error
|
||||||
|
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
for i, layer := range m.Layers {
|
||||||
@@ -248,5 +255,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
|
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -245,7 +245,10 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
|
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
|
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -31,26 +32,31 @@ var _ model.MultimodalProcessor = (*Model)(nil)
|
|||||||
var _ model.TextProcessor = (*Model)(nil)
|
var _ model.TextProcessor = (*Model)(nil)
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
|
textModel, err := NewTextModel(c)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
m := &Model{
|
m := &Model{
|
||||||
|
TextModel: textModel,
|
||||||
|
VisionModel: newVisionModel(c),
|
||||||
|
ImageProcessor: newImageProcessor(c),
|
||||||
|
MultiModalProjector: newMultiModalProjector(c),
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
TextModel: newTextModel(c),
|
|
||||||
VisionModel: newVisionModel(c),
|
|
||||||
ImageProcessor: newImageProcessor(c),
|
|
||||||
MultiModalProjector: newMultiModalProjector(c),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||||
@@ -99,7 +105,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) == 0 {
|
if len(m.VisionModel.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -114,20 +120,46 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||||
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
|
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
|
||||||
|
|
||||||
// split into patches to be sent to the text transformer
|
// split into patches to be sent to the text transformer
|
||||||
rows := make([]input.Multimodal, size.Y)
|
parent := imageFeatures{tensor: features}
|
||||||
|
rows := make([]*imageRow, size.Y)
|
||||||
for i := range rows {
|
for i := range rows {
|
||||||
rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
|
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rows, nil
|
return rows, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type imageFeatures struct {
|
||||||
|
tensor ml.Tensor
|
||||||
|
|
||||||
|
dataOnce sync.Once
|
||||||
|
data []float32
|
||||||
|
}
|
||||||
|
|
||||||
|
type imageRow struct {
|
||||||
|
parent *imageFeatures
|
||||||
|
s int
|
||||||
|
shape []int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *imageRow) data() []float32 {
|
||||||
|
n := 1
|
||||||
|
for _, s := range r.shape {
|
||||||
|
n *= s
|
||||||
|
}
|
||||||
|
|
||||||
|
return r.parent.data[r.s*n : (r.s+1)*n]
|
||||||
|
}
|
||||||
|
|
||||||
// PostTokenize arranges Mistral 3's inputs for the forward pass
|
// PostTokenize arranges Mistral 3's inputs for the forward pass
|
||||||
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
|
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
|
||||||
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
|
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
|
||||||
@@ -136,14 +168,15 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
var result []input.Input
|
var result []input.Input
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if len(inp.Multimodal) == 0 {
|
if inp.Multimodal == nil {
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
} else {
|
} else {
|
||||||
for i, row := range inp.Multimodal {
|
inputMultimodal := inp.Multimodal.([]*imageRow)
|
||||||
|
for i, row := range inputMultimodal {
|
||||||
// [IMG]
|
// [IMG]
|
||||||
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
|
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
|
||||||
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
|
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
|
||||||
if i == len(inp.Multimodal)-1 {
|
if i == len(inputMultimodal)-1 {
|
||||||
// [IMG_END]
|
// [IMG_END]
|
||||||
result = append(result, input.Input{Token: 13})
|
result = append(result, input.Input{Token: 13})
|
||||||
} else {
|
} else {
|
||||||
@@ -158,8 +191,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,24 +1,27 @@
|
|||||||
package mistral3
|
package mistral3
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"cmp"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
"github.com/ollama/ollama/model"
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
type TextOptions struct {
|
type TextOptions struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
hiddenSize, numHeads, numKVHeads, headDim int
|
||||||
headDim, ropeDim int
|
eps, ropeBase, ropeScale float32
|
||||||
eps, ropeBase, ropeScale float32
|
ropeDim uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextModel struct {
|
type TextModel struct {
|
||||||
|
model.Base
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||||
Layers []Layer `gguf:"blk"`
|
Layers []Layer `gguf:"blk"`
|
||||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||||
@@ -36,15 +39,19 @@ type SelfAttention struct {
|
|||||||
|
|
||||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
ropeType := uint32(0)
|
||||||
|
headDim := opts.headDim
|
||||||
|
if headDim == 0 {
|
||||||
|
headDim = opts.hiddenSize / opts.numHeads
|
||||||
|
}
|
||||||
|
|
||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
@@ -55,7 +62,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
|
return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type MLP struct {
|
type MLP struct {
|
||||||
@@ -102,7 +109,20 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
|
|
||||||
// image embeddings
|
// image embeddings
|
||||||
for _, image := range batch.Multimodal {
|
for _, image := range batch.Multimodal {
|
||||||
imageFeature := image.Multimodal[0].Tensor
|
row := image.Multimodal.(*imageRow)
|
||||||
|
row.parent.dataOnce.Do(func() {
|
||||||
|
// use a new, throwaway context so the image tensor is not added to the graph
|
||||||
|
temp := m.Backend().NewContext()
|
||||||
|
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
|
||||||
|
row.parent.data = row.parent.tensor.Floats()
|
||||||
|
temp.Close()
|
||||||
|
})
|
||||||
|
|
||||||
|
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
|
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,18 +141,24 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
return m.Output.Forward(ctx, hiddenState)
|
return m.Output.Forward(ctx, hiddenState)
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTextModel(c fs.Config) *TextModel {
|
func NewTextModel(c fs.Config) (*TextModel, error) {
|
||||||
return &TextModel{
|
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
|
||||||
|
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
|
||||||
|
}
|
||||||
|
|
||||||
|
textModel := &TextModel{
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
TextOptions: &TextOptions{
|
TextOptions: &TextOptions{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
headDim: int(c.Uint("attention.key_length")),
|
headDim: int(c.Uint("attention.key_length")),
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
|
ropeDim: c.Uint("rope.dimension_count"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return textModel, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,8 +110,15 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
|
h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
|
||||||
w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||||
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||||
@@ -144,7 +151,10 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
|
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
|
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
|
||||||
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
|
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
|
||||||
@@ -160,7 +170,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
|
|||||||
|
|
||||||
func newVisionModel(c fs.Config) *VisionModel {
|
func newVisionModel(c fs.Config) *VisionModel {
|
||||||
return &VisionModel{
|
return &VisionModel{
|
||||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
|
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
|
||||||
VisionModelOptions: &VisionModelOptions{
|
VisionModelOptions: &VisionModelOptions{
|
||||||
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
|
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
|
||||||
numHeads: int(c.Uint("vision.attention.head_count", 16)),
|
numHeads: int(c.Uint("vision.attention.head_count", 16)),
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ package mllama
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -38,13 +37,13 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
@@ -59,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
return &m, nil
|
return &m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
|
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -74,20 +73,21 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if ratio.numTiles() < m.maxNumTiles {
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
|
||||||
// Pad tiles to maxNumTiles
|
if err != nil {
|
||||||
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
|
return nil, err
|
||||||
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
|
pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
|
||||||
aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
|
||||||
|
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
|
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
|
||||||
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
||||||
projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
|
return m.Projector.Forward(ctx, crossAttentionStates), nil
|
||||||
|
|
||||||
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
@@ -103,11 +103,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
var crossAttentionStates ml.Tensor
|
var crossAttentionStates ml.Tensor
|
||||||
if len(batch.Multimodal) > 0 {
|
if len(batch.Multimodal) > 0 {
|
||||||
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
|
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
|
||||||
}
|
}
|
||||||
|
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: attention mask, cross attention mask
|
// TODO: attention mask, cross attention mask
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
|
||||||
|
|||||||
@@ -8,8 +8,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type TextSelfAttention struct {
|
type TextSelfAttention struct {
|
||||||
@@ -23,14 +21,15 @@ type TextSelfAttention struct {
|
|||||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
headDim := opts.hiddenSize / opts.numHeads
|
headDim := opts.hiddenSize / opts.numHeads
|
||||||
|
ropeType := uint32(0)
|
||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenState)
|
query := sa.Query.Forward(ctx, hiddenState)
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
key := sa.Key.Forward(ctx, hiddenState)
|
key := sa.Key.Forward(ctx, hiddenState)
|
||||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
value := sa.Value.Forward(ctx, hiddenState)
|
value := sa.Value.Forward(ctx, hiddenState)
|
||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
@@ -45,7 +44,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
|
|||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
// This will only get called for layers in the cache, which are just the self attention layers
|
// This will only get called for layers in the cache, which are just the self attention layers
|
||||||
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
|
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
|
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return key, nil
|
return key, nil
|
||||||
@@ -200,8 +199,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
|
|||||||
|
|
||||||
type TextModelOptions struct {
|
type TextModelOptions struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
hiddenSize, numHeads, numKVHeads int
|
||||||
ropeDim int
|
|
||||||
eps, ropeBase, ropeScale float32
|
eps, ropeBase, ropeScale float32
|
||||||
|
ropeDim uint32
|
||||||
|
|
||||||
crossAttentionLayers []int32
|
crossAttentionLayers []int32
|
||||||
}
|
}
|
||||||
@@ -241,10 +240,10 @@ func newTextModel(c fs.Config) *TextModel {
|
|||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
|
ropeDim: c.Uint("rope.dimension_count"),
|
||||||
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
|
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ type VisionSelfAttention struct {
|
|||||||
Key *nn.Linear `gguf:"attn_k"`
|
Key *nn.Linear `gguf:"attn_k"`
|
||||||
Value *nn.Linear `gguf:"attn_v"`
|
Value *nn.Linear `gguf:"attn_v"`
|
||||||
Output *nn.Linear `gguf:"attn_output"`
|
Output *nn.Linear `gguf:"attn_output"`
|
||||||
|
|
||||||
|
Gate ml.Tensor `gguf:"attn_gate"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||||
@@ -23,16 +25,27 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
|
|||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenState)
|
query := sa.Query.Forward(ctx, hiddenState)
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
||||||
|
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
|
|
||||||
key := sa.Key.Forward(ctx, hiddenState)
|
key := sa.Key.Forward(ctx, hiddenState)
|
||||||
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
||||||
|
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
|
|
||||||
value := sa.Value.Forward(ctx, hiddenState)
|
value := sa.Value.Forward(ctx, hiddenState)
|
||||||
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
||||||
|
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
|
scores := key.Mulmat(ctx, query)
|
||||||
|
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||||
|
scores = scores.Softmax(ctx)
|
||||||
|
|
||||||
|
attention := value.Mulmat(ctx, scores)
|
||||||
|
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
|
||||||
|
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
||||||
return sa.Output.Forward(ctx, attention)
|
|
||||||
|
hiddenState = sa.Output.Forward(ctx, attention)
|
||||||
|
return hiddenState
|
||||||
}
|
}
|
||||||
|
|
||||||
type VisionMLP struct {
|
type VisionMLP struct {
|
||||||
@@ -63,18 +76,21 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
|
|||||||
// self attention
|
// self attention
|
||||||
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||||
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
||||||
|
|
||||||
if e.AttentionGate != nil {
|
if e.AttentionGate != nil {
|
||||||
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
|
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
|
||||||
}
|
}
|
||||||
hiddenState = hiddenState.Add(ctx, residual)
|
hiddenState = hiddenState.Add(ctx, residual)
|
||||||
residual = hiddenState
|
residual = hiddenState
|
||||||
|
|
||||||
|
// feed forward
|
||||||
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||||
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
||||||
|
hiddenState = hiddenState.Add(ctx, residual)
|
||||||
if e.MLPGate != nil {
|
if e.MLPGate != nil {
|
||||||
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
|
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
|
||||||
}
|
}
|
||||||
hiddenState = hiddenState.Add(ctx, residual)
|
|
||||||
return hiddenState
|
return hiddenState
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,5 @@ import (
|
|||||||
_ "github.com/ollama/ollama/model/models/llama4"
|
_ "github.com/ollama/ollama/model/models/llama4"
|
||||||
_ "github.com/ollama/ollama/model/models/mistral3"
|
_ "github.com/ollama/ollama/model/models/mistral3"
|
||||||
_ "github.com/ollama/ollama/model/models/mllama"
|
_ "github.com/ollama/ollama/model/models/mllama"
|
||||||
_ "github.com/ollama/ollama/model/models/qwen2"
|
|
||||||
_ "github.com/ollama/ollama/model/models/qwen25vl"
|
_ "github.com/ollama/ollama/model/models/qwen25vl"
|
||||||
_ "github.com/ollama/ollama/model/models/qwen3"
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,164 +0,0 @@
|
|||||||
package qwen2
|
|
||||||
|
|
||||||
import (
|
|
||||||
"cmp"
|
|
||||||
"math"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
|
||||||
"github.com/ollama/ollama/kvcache"
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/ml/nn"
|
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Options struct {
|
|
||||||
hiddenSize, numHeads, numKVHeads int
|
|
||||||
headDim, ropeDim int
|
|
||||||
eps, ropeBase, ropeScale float32
|
|
||||||
}
|
|
||||||
|
|
||||||
type Attention struct {
|
|
||||||
Query *nn.Linear `gguf:"attn_q"`
|
|
||||||
Key *nn.Linear `gguf:"attn_k"`
|
|
||||||
Value *nn.Linear `gguf:"attn_v"`
|
|
||||||
Output *nn.Linear `gguf:"attn_output"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
batchSize := hiddenStates.Dim(1)
|
|
||||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
|
||||||
ropeDim := cmp.Or(opts.ropeDim, headDim)
|
|
||||||
|
|
||||||
query := attn.Query.Forward(ctx, hiddenStates)
|
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
|
||||||
|
|
||||||
key := attn.Key.Forward(ctx, hiddenStates)
|
|
||||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
|
||||||
|
|
||||||
value := attn.Value.Forward(ctx, hiddenStates)
|
|
||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
|
||||||
|
|
||||||
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
|
|
||||||
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
|
|
||||||
|
|
||||||
return attn.Output.Forward(ctx, attention)
|
|
||||||
}
|
|
||||||
|
|
||||||
type MLP struct {
|
|
||||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
|
||||||
Up *nn.Linear `gguf:"ffn_up"`
|
|
||||||
Down *nn.Linear `gguf:"ffn_down"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
|
|
||||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
|
||||||
return mlp.Down.Forward(ctx, hiddenStates)
|
|
||||||
}
|
|
||||||
|
|
||||||
type DecoderLayer struct {
|
|
||||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
|
||||||
Attention *Attention
|
|
||||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
|
||||||
MLP *MLP
|
|
||||||
}
|
|
||||||
|
|
||||||
func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
residual := hiddenStates
|
|
||||||
|
|
||||||
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
|
|
||||||
if outputs != nil {
|
|
||||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
|
||||||
residual = residual.Rows(ctx, outputs)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
||||||
residual = hiddenStates
|
|
||||||
|
|
||||||
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.MLP.Forward(ctx, hiddenStates)
|
|
||||||
return hiddenStates.Add(ctx, residual)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Model struct {
|
|
||||||
model.Base
|
|
||||||
model.BytePairEncoding
|
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
|
||||||
Layers []DecoderLayer `gguf:"blk"`
|
|
||||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
|
||||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
|
||||||
|
|
||||||
Options
|
|
||||||
}
|
|
||||||
|
|
||||||
// Forward implements model.Model.
|
|
||||||
func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
|
||||||
|
|
||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
|
||||||
m.Cache.SetLayer(i)
|
|
||||||
|
|
||||||
var outputs ml.Tensor
|
|
||||||
if i == len(m.Layers)-1 {
|
|
||||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
|
||||||
hiddenStates = m.Output.Forward(ctx, hiddenStates)
|
|
||||||
return hiddenStates, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
|
||||||
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
|
|
||||||
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
|
||||||
m := Model{
|
|
||||||
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
EOS: append(
|
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
|
||||||
),
|
|
||||||
Options: Options{
|
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
|
||||||
headDim: int(c.Uint("attention.key_length")),
|
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
|
||||||
return &m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
model.Register("qwen2", New)
|
|
||||||
}
|
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -34,13 +35,12 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
|
||||||
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
TextModel: NewTextModel(c),
|
TextModel: NewTextModel(c),
|
||||||
@@ -69,12 +69,15 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
|
|||||||
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
|
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
|
||||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
return pixelValues, grid, nil
|
return pixelValues, grid, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) == 0 {
|
if len(m.VisionModel.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -85,7 +88,31 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
}
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
|
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
|
||||||
return []input.Multimodal{{Tensor: visionOutputs}}, nil
|
return &chunks{Model: m, Tensor: visionOutputs}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type chunks struct {
|
||||||
|
*Model
|
||||||
|
ml.Tensor
|
||||||
|
|
||||||
|
dataOnce sync.Once
|
||||||
|
data []float32
|
||||||
|
}
|
||||||
|
|
||||||
|
type chunk struct {
|
||||||
|
*chunks
|
||||||
|
s, n int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *chunk) floats() []float32 {
|
||||||
|
r.dataOnce.Do(func() {
|
||||||
|
temp := r.Backend().NewContext()
|
||||||
|
defer temp.Close()
|
||||||
|
temp.Forward(r.Tensor).Compute(r.Tensor)
|
||||||
|
r.data = r.Floats()
|
||||||
|
})
|
||||||
|
|
||||||
|
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
|
||||||
}
|
}
|
||||||
|
|
||||||
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
|
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
|
||||||
@@ -115,15 +142,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
result = append(result, input.Input{Token: pre[i]})
|
result = append(result, input.Input{Token: pre[i]})
|
||||||
}
|
}
|
||||||
|
|
||||||
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
|
// This is an image token with multimodal data
|
||||||
|
chunksData := inp.Multimodal.(*chunks)
|
||||||
|
patchesPerChunk := chunksData.Dim(1)
|
||||||
|
|
||||||
// First add the vision start token
|
// First add the vision start token
|
||||||
result = append(result, input.Input{Token: visionStartToken})
|
result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
|
||||||
|
|
||||||
// Add the image token with the multimodal tensor data at the first position
|
// Add the image token with the multimodal tensor data at the first position
|
||||||
|
// Create a chunk with proper s and n values
|
||||||
result = append(result, input.Input{
|
result = append(result, input.Input{
|
||||||
Token: imageToken,
|
Token: imageToken,
|
||||||
Multimodal: inp.Multimodal,
|
Multimodal: &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
|
||||||
MultimodalHash: inp.MultimodalHash,
|
MultimodalHash: inp.MultimodalHash,
|
||||||
SameBatch: patchesPerChunk,
|
SameBatch: patchesPerChunk,
|
||||||
})
|
})
|
||||||
@@ -139,8 +169,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,15 +7,13 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
type TextOptions struct {
|
type TextOptions struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
ctxLen, hiddenSize, numHeads, numKVHeads int
|
||||||
ropeDim, originalContextLength int
|
eps, ropeBase, ropeScale float32
|
||||||
eps, ropeBase, ropeScale float32
|
ropeDim, defaultContextLen uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextModel struct {
|
type TextModel struct {
|
||||||
@@ -31,14 +29,15 @@ func NewTextModel(c fs.Config) *TextModel {
|
|||||||
m := TextModel{
|
m := TextModel{
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
TextOptions: &TextOptions{
|
TextOptions: &TextOptions{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
ctxLen: int(c.Uint("context_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
ropeDim: int(c.Uint("rope.dimension_count", 128)),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
originalContextLength: int(c.Uint("context_length", 128000)),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeDim: c.Uint("rope.dimension_count", 128),
|
||||||
|
defaultContextLen: c.Uint("context_length", 128000),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,11 +59,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
|
|
||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
|
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
|
||||||
|
|
||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
|
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
@@ -78,7 +77,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
|
|
||||||
// Shift applies rotary position embeddings to the key tensor for causal attention caching
|
// Shift applies rotary position embeddings to the key tensor for causal attention caching
|
||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
|
return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// MLP implements the feed-forward network component with SwiGLU activation
|
// MLP implements the feed-forward network component with SwiGLU activation
|
||||||
@@ -130,7 +129,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
||||||
|
|
||||||
for _, mi := range batch.Multimodal {
|
for _, mi := range batch.Multimodal {
|
||||||
img := mi.Multimodal[0].Tensor
|
f32s := mi.Multimodal.(*chunk).floats()
|
||||||
|
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package qwen25vl
|
package qwen25vl
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"slices"
|
"slices"
|
||||||
|
|
||||||
@@ -43,8 +44,10 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
|
mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
// Reshape to match [seqLength, seqLength, 1] for broadcasting
|
// Reshape to match [seqLength, seqLength, 1] for broadcasting
|
||||||
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
|
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
|
||||||
|
|
||||||
@@ -300,7 +303,10 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
t := ctx.Input().FromIntSlice(index, len(index))
|
t, err := ctx.Input().FromIntSlice(index, len(index))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
return t, bounds
|
return t, bounds
|
||||||
}
|
}
|
||||||
@@ -320,7 +326,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
|
|||||||
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
|
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
|
freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
|
||||||
|
}
|
||||||
|
|
||||||
// Create position coordinates (y,x pairs) for the grid
|
// Create position coordinates (y,x pairs) for the grid
|
||||||
// In PyTorch: Equivalent to generating position ids with torch.arange()
|
// In PyTorch: Equivalent to generating position ids with torch.arange()
|
||||||
@@ -330,7 +339,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
|
|||||||
coords = append(coords, int32(y), int32(x))
|
coords = append(coords, int32(y), int32(x))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
|
pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("failed to create tensor from positions: %w", err))
|
||||||
|
}
|
||||||
|
|
||||||
// Reshape and permute positions to match spatial merging pattern
|
// Reshape and permute positions to match spatial merging pattern
|
||||||
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
|
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
|
||||||
|
|||||||
@@ -1,233 +0,0 @@
|
|||||||
package qwen3
|
|
||||||
|
|
||||||
import (
|
|
||||||
"cmp"
|
|
||||||
"math"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
|
||||||
"github.com/ollama/ollama/kvcache"
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/ml/nn"
|
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Options struct {
|
|
||||||
hiddenSize, numHeads, numKVHeads int
|
|
||||||
eps float32
|
|
||||||
ropeBase, ropeScale float32
|
|
||||||
|
|
||||||
keyLength, valueLength int
|
|
||||||
|
|
||||||
numExperts, numExpertsUsed int
|
|
||||||
normTopKProb bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func (o Options) headDim() int {
|
|
||||||
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Attention struct {
|
|
||||||
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
|
|
||||||
Query *nn.Linear `gguf:"attn_q"`
|
|
||||||
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
|
|
||||||
Key *nn.Linear `gguf:"attn_k"`
|
|
||||||
Value *nn.Linear `gguf:"attn_v"`
|
|
||||||
Output *nn.Linear `gguf:"attn_output"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
batchSize := hiddenStates.Dim(1)
|
|
||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenStates)
|
|
||||||
key := sa.Key.Forward(ctx, hiddenStates)
|
|
||||||
value := sa.Value.Forward(ctx, hiddenStates)
|
|
||||||
|
|
||||||
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
|
|
||||||
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
|
||||||
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
|
||||||
|
|
||||||
query = sa.QueryNorm.Forward(ctx, query, opts.eps)
|
|
||||||
key = sa.KeyNorm.Forward(ctx, key, opts.eps)
|
|
||||||
|
|
||||||
query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
|
|
||||||
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
|
|
||||||
return sa.Output.Forward(ctx, attention)
|
|
||||||
}
|
|
||||||
|
|
||||||
type MLP interface {
|
|
||||||
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
|
|
||||||
}
|
|
||||||
|
|
||||||
type sparse struct {
|
|
||||||
Router *nn.Linear `gguf:"ffn_gate_inp"`
|
|
||||||
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
|
|
||||||
Up ml.Tensor `gguf:"ffn_up_exps.weight"`
|
|
||||||
Down ml.Tensor `gguf:"ffn_down_exps.weight"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
|
|
||||||
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
|
|
||||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
|
|
||||||
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
|
|
||||||
|
|
||||||
routingWeights := routerLogits.Softmax(ctx)
|
|
||||||
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
|
|
||||||
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
|
|
||||||
if opts.normTopKProb {
|
|
||||||
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
|
|
||||||
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
|
|
||||||
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
|
|
||||||
|
|
||||||
upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts)
|
|
||||||
|
|
||||||
hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts)
|
|
||||||
hiddenStates = hiddenStates.SILU(ctx)
|
|
||||||
hiddenStates = hiddenStates.Mul(ctx, upStates)
|
|
||||||
|
|
||||||
experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts)
|
|
||||||
experts = experts.Mul(ctx, routingWeights)
|
|
||||||
|
|
||||||
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
|
|
||||||
for i := 1; i < opts.numExpertsUsed; i++ {
|
|
||||||
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
|
|
||||||
}
|
|
||||||
|
|
||||||
return nextStates
|
|
||||||
}
|
|
||||||
|
|
||||||
type dense struct {
|
|
||||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
|
||||||
Up *nn.Linear `gguf:"ffn_up"`
|
|
||||||
Down *nn.Linear `gguf:"ffn_down"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
|
|
||||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
|
||||||
return mlp.Down.Forward(ctx, hiddenStates)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Layer struct {
|
|
||||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
|
||||||
*Attention
|
|
||||||
|
|
||||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
|
||||||
MLP
|
|
||||||
}
|
|
||||||
|
|
||||||
func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
residual := hiddenStates
|
|
||||||
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
|
|
||||||
|
|
||||||
if outputs != nil {
|
|
||||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
|
||||||
residual = residual.Rows(ctx, outputs)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
||||||
|
|
||||||
residual = hiddenStates
|
|
||||||
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
|
|
||||||
return hiddenStates.Add(ctx, residual)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Model struct {
|
|
||||||
model.Base
|
|
||||||
model.BytePairEncoding
|
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
|
||||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
|
||||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
|
||||||
|
|
||||||
Layers []Layer `gguf:"blk"`
|
|
||||||
|
|
||||||
*Options
|
|
||||||
}
|
|
||||||
|
|
||||||
// Forward implements model.Model.
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
|
||||||
|
|
||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
|
||||||
m.Cache.SetLayer(i)
|
|
||||||
|
|
||||||
var outputs ml.Tensor
|
|
||||||
if i == len(m.Layers)-1 {
|
|
||||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
|
||||||
return m.Output.Forward(ctx, hiddenStates), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
|
||||||
return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ model.Model = (*Model)(nil)
|
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
|
||||||
layers := make([]Layer, c.Uint("block_count"))
|
|
||||||
for i := range layers {
|
|
||||||
if c.String("general.architecture") == "qwen3moe" {
|
|
||||||
layers[i].MLP = &sparse{}
|
|
||||||
} else {
|
|
||||||
layers[i].MLP = &dense{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m := Model{
|
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
|
||||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
|
||||||
&model.Vocabulary{
|
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
EOS: append(
|
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
|
||||||
),
|
|
||||||
Layers: layers,
|
|
||||||
Options: &Options{
|
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
|
||||||
keyLength: int(c.Uint("attention.key_length")),
|
|
||||||
valueLength: int(c.Uint("attention.value_length")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
|
||||||
numExperts: int(c.Uint("expert_count")),
|
|
||||||
numExpertsUsed: int(c.Uint("expert_used_count")),
|
|
||||||
normTopKProb: c.Bool("norm_top_k_prob", true),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
|
||||||
return &m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
model.Register("qwen3", New)
|
|
||||||
model.Register("qwen3moe", New)
|
|
||||||
}
|
|
||||||
@@ -5,13 +5,116 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"iter"
|
"iter"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/dlclark/regexp2"
|
"github.com/dlclark/regexp2"
|
||||||
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
|
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
|
||||||
"github.com/ollama/ollama/logutil"
|
"github.com/ollama/ollama/logutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type Special int32
|
||||||
|
|
||||||
|
const (
|
||||||
|
SpecialBOS Special = iota
|
||||||
|
SpecialEOS
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
TOKEN_TYPE_NORMAL = iota + 1
|
||||||
|
TOKEN_TYPE_UNKNOWN
|
||||||
|
TOKEN_TYPE_CONTROL
|
||||||
|
TOKEN_TYPE_USER_DEFINED
|
||||||
|
TOKEN_TYPE_UNUSED
|
||||||
|
TOKEN_TYPE_BYTE
|
||||||
|
)
|
||||||
|
|
||||||
|
type TextProcessor interface {
|
||||||
|
Encode(s string, addSpecial bool) ([]int32, error)
|
||||||
|
Decode([]int32) (string, error)
|
||||||
|
Is(int32, Special) bool
|
||||||
|
Vocabulary() *Vocabulary
|
||||||
|
}
|
||||||
|
|
||||||
|
type Vocabulary struct {
|
||||||
|
Values []string
|
||||||
|
Types []int32
|
||||||
|
Scores []float32
|
||||||
|
Merges []string
|
||||||
|
|
||||||
|
BOS, EOS, EOT int32
|
||||||
|
AddBOS, AddEOS, AddEOT bool
|
||||||
|
|
||||||
|
specialOnce sync.Once
|
||||||
|
special []string
|
||||||
|
|
||||||
|
valuesOnce sync.Once
|
||||||
|
values map[string]int32
|
||||||
|
|
||||||
|
mergeOnce sync.Once
|
||||||
|
merge map[string]int32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Is(id int32, special Special) bool {
|
||||||
|
switch special {
|
||||||
|
case SpecialBOS:
|
||||||
|
return id == v.BOS
|
||||||
|
case SpecialEOS:
|
||||||
|
return id == v.EOS || id == v.EOT
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Encode(s string) int32 {
|
||||||
|
v.valuesOnce.Do(func() {
|
||||||
|
v.values = make(map[string]int32, len(v.Values))
|
||||||
|
for i, value := range v.Values {
|
||||||
|
v.values[value] = int32(i)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if id, ok := v.values[s]; ok {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Decode(id int32) string {
|
||||||
|
return v.Values[id]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) SpecialVocabulary() []string {
|
||||||
|
v.specialOnce.Do(func() {
|
||||||
|
for i := range v.Values {
|
||||||
|
if slices.Contains([]int{105, 106}, i) {
|
||||||
|
v.special = append(v.special, v.Values[i])
|
||||||
|
} else if v.Types[i] == TOKEN_TYPE_CONTROL {
|
||||||
|
v.special = append(v.special, v.Values[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
return v.special
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Merge(left, right string) int {
|
||||||
|
v.mergeOnce.Do(func() {
|
||||||
|
v.merge = make(map[string]int32, len(v.Merges))
|
||||||
|
for i, merge := range v.Merges {
|
||||||
|
v.merge[merge] = int32(i)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if id, ok := v.merge[left+" "+right]; ok {
|
||||||
|
return int(id)
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
type BytePairEncoding struct {
|
type BytePairEncoding struct {
|
||||||
pre *regexp2.Regexp
|
pre *regexp2.Regexp
|
||||||
vocab *Vocabulary
|
vocab *Vocabulary
|
||||||
@@ -201,12 +304,27 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
|
|
||||||
|
|
||||||
if addSpecial && len(ids) > 0 {
|
if addSpecial && len(ids) > 0 {
|
||||||
ids = bpe.vocab.addSpecials(ids)
|
if bpe.vocab.AddBOS {
|
||||||
|
if ids[0] == bpe.vocab.BOS {
|
||||||
|
slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
|
||||||
|
ids = append([]int32{bpe.vocab.BOS}, ids...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if bpe.vocab.AddEOS {
|
||||||
|
if ids[len(ids)-1] == bpe.vocab.EOS {
|
||||||
|
slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
|
||||||
|
ids = append(ids, bpe.vocab.EOS)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
|
||||||
return ids, nil
|
return ids, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -234,6 +352,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
|
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
|
||||||
return sb.String(), nil
|
return sb.String(), nil
|
||||||
}
|
}
|
||||||
@@ -182,12 +182,27 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
|
|
||||||
|
|
||||||
if addSpecial && len(ids) > 0 {
|
if addSpecial && len(ids) > 0 {
|
||||||
ids = spm.vocab.addSpecials(ids)
|
if spm.vocab.AddBOS {
|
||||||
|
if ids[0] == spm.vocab.BOS {
|
||||||
|
slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
|
||||||
|
ids = append([]int32{spm.vocab.BOS}, ids...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if spm.vocab.AddEOS {
|
||||||
|
if ids[len(ids)-1] == spm.vocab.EOS {
|
||||||
|
slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
|
||||||
|
ids = append(ids, spm.vocab.EOS)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
|
||||||
return ids, nil
|
return ids, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -246,6 +261,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
|
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
|
||||||
return sb.String(), nil
|
return sb.String(), nil
|
||||||
}
|
}
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
package model
|
|
||||||
|
|
||||||
const (
|
|
||||||
TOKEN_TYPE_NORMAL = iota + 1
|
|
||||||
TOKEN_TYPE_UNKNOWN
|
|
||||||
TOKEN_TYPE_CONTROL
|
|
||||||
TOKEN_TYPE_USER_DEFINED
|
|
||||||
TOKEN_TYPE_UNUSED
|
|
||||||
TOKEN_TYPE_BYTE
|
|
||||||
)
|
|
||||||
|
|
||||||
type TextProcessor interface {
|
|
||||||
Encode(s string, addSpecial bool) ([]int32, error)
|
|
||||||
Decode([]int32) (string, error)
|
|
||||||
Is(int32, Special) bool
|
|
||||||
Vocabulary() *Vocabulary
|
|
||||||
}
|
|
||||||
@@ -1,112 +0,0 @@
|
|||||||
package model
|
|
||||||
|
|
||||||
import (
|
|
||||||
"log/slog"
|
|
||||||
"slices"
|
|
||||||
"sync"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Special int32
|
|
||||||
|
|
||||||
const (
|
|
||||||
SpecialBOS Special = iota
|
|
||||||
SpecialEOS
|
|
||||||
)
|
|
||||||
|
|
||||||
type Vocabulary struct {
|
|
||||||
Values []string
|
|
||||||
Types []int32
|
|
||||||
Scores []float32
|
|
||||||
Merges []string
|
|
||||||
|
|
||||||
BOS, EOS []int32
|
|
||||||
AddBOS, AddEOS bool
|
|
||||||
|
|
||||||
specialOnce sync.Once
|
|
||||||
special []string
|
|
||||||
|
|
||||||
valuesOnce sync.Once
|
|
||||||
values map[string]int32
|
|
||||||
|
|
||||||
mergeOnce sync.Once
|
|
||||||
merge map[string]int32
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Is(id int32, special Special) bool {
|
|
||||||
switch special {
|
|
||||||
case SpecialBOS:
|
|
||||||
return slices.Contains(v.BOS, id)
|
|
||||||
case SpecialEOS:
|
|
||||||
return slices.Contains(v.EOS, id)
|
|
||||||
default:
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) addSpecials(ids []int32) []int32 {
|
|
||||||
if v.AddBOS && len(v.BOS) > 0 {
|
|
||||||
if slices.Contains(v.BOS, ids[0]) {
|
|
||||||
slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("adding bos token to prompt", "id", v.BOS)
|
|
||||||
ids = append([]int32{v.BOS[0]}, ids...)
|
|
||||||
}
|
|
||||||
|
|
||||||
if v.AddEOS && len(v.EOS) > 0 {
|
|
||||||
if slices.Contains(v.BOS, ids[len(ids)-1]) {
|
|
||||||
slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("adding eos token to prompt", "id", v.EOS)
|
|
||||||
ids = append(ids, v.EOS[0])
|
|
||||||
}
|
|
||||||
|
|
||||||
return ids
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Encode(s string) int32 {
|
|
||||||
v.valuesOnce.Do(func() {
|
|
||||||
v.values = make(map[string]int32, len(v.Values))
|
|
||||||
for i, value := range v.Values {
|
|
||||||
v.values[value] = int32(i)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
if id, ok := v.values[s]; ok {
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Decode(id int32) string {
|
|
||||||
return v.Values[id]
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) SpecialVocabulary() []string {
|
|
||||||
v.specialOnce.Do(func() {
|
|
||||||
for i := range v.Values {
|
|
||||||
if v.Types[i] == TOKEN_TYPE_CONTROL {
|
|
||||||
v.special = append(v.special, v.Values[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
return v.special
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Merge(left, right string) int {
|
|
||||||
v.mergeOnce.Do(func() {
|
|
||||||
v.merge = make(map[string]int32, len(v.Merges))
|
|
||||||
for i, merge := range v.Merges {
|
|
||||||
v.merge[merge] = int32(i)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
if id, ok := v.merge[left+" "+right]; ok {
|
|
||||||
return int(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
|
|||||||
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
||||||
"used", numPast, "remaining", len(prompt)-numPast)
|
"used", numPast, "remaining", len(prompt)-numPast)
|
||||||
|
|
||||||
slot.Inputs = prompt[:numPast]
|
|
||||||
prompt = prompt[numPast:]
|
prompt = prompt[numPast:]
|
||||||
|
slot.Inputs = slot.Inputs[:numPast]
|
||||||
|
|
||||||
return slot, prompt, nil
|
return slot, prompt, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
|
|||||||
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
||||||
"used", numPast, "remaining", int32(len(prompt))-numPast)
|
"used", numPast, "remaining", int32(len(prompt))-numPast)
|
||||||
|
|
||||||
slot.Inputs = prompt[:numPast]
|
|
||||||
prompt = prompt[numPast:]
|
prompt = prompt[numPast:]
|
||||||
|
slot.Inputs = slot.Inputs[:numPast]
|
||||||
|
|
||||||
return slot, prompt, nil
|
return slot, prompt, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package ollamarunner
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"image"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -11,6 +12,10 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestCountCommon(t *testing.T) {
|
func TestCountCommon(t *testing.T) {
|
||||||
|
imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
|
||||||
|
imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
|
||||||
|
imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
t1 []input.Input
|
t1 []input.Input
|
||||||
@@ -31,20 +36,20 @@ func TestCountCommon(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Image Prefix",
|
name: "Image Prefix",
|
||||||
t1: []input.Input{{MultimodalHash: 1}},
|
t1: []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
|
||||||
t2: []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
|
t2: []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
|
||||||
expected: 1,
|
expected: 1,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Mixed",
|
name: "Mixed",
|
||||||
t1: []input.Input{{Token: 1}, {MultimodalHash: 1}},
|
t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
|
||||||
t2: []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
|
t2: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
|
||||||
expected: 2,
|
expected: 2,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Mixed, Same Length",
|
name: "Mixed, Same Length",
|
||||||
t1: []input.Input{{Token: 1}, {MultimodalHash: 1}},
|
t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
|
||||||
t2: []input.Input{{Token: 1}, {MultimodalHash: 2}},
|
t2: []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
|
||||||
expected: 1,
|
expected: 1,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,113 +0,0 @@
|
|||||||
package ollamarunner
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Tensors can't be used across multiple compute graphs. This is a problem
|
|
||||||
// if a single embedding is split across batches using views since all of
|
|
||||||
// the views will have the same source tensor. We also don't want to
|
|
||||||
// recompute the entire embedding for each batch.
|
|
||||||
//
|
|
||||||
// To avoid this, we compute all of the tensors for the embedding on the
|
|
||||||
// first use and then store the result in system memory. When we need
|
|
||||||
// additional tensors, we recreate them from the stored data.
|
|
||||||
|
|
||||||
// multimodalEntry represents the embeddings of a single object (such
|
|
||||||
// as an image).
|
|
||||||
type multimodalEntry struct {
|
|
||||||
// mm is the original set of tensors created by EncodeMultimodal
|
|
||||||
mm []input.Multimodal
|
|
||||||
|
|
||||||
// data is the computed result of mm. Nil if not yet computed
|
|
||||||
data [][]float32
|
|
||||||
}
|
|
||||||
|
|
||||||
// multimodalStore maps from an individual tensor (of which there
|
|
||||||
// may be many in a single multimodal object) to its parent embedding
|
|
||||||
type multimodalStore map[ml.Tensor]*multimodalEntry
|
|
||||||
|
|
||||||
func newMultimodalStore() multimodalStore {
|
|
||||||
return make(multimodalStore)
|
|
||||||
}
|
|
||||||
|
|
||||||
// addMultimodal stores an embedding for later use in a compute graph
|
|
||||||
func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
|
|
||||||
entry := &multimodalEntry{mm: embedding}
|
|
||||||
|
|
||||||
for _, e := range embedding {
|
|
||||||
if e.Tensor != nil {
|
|
||||||
m[e.Tensor] = entry
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// getMultimodal takes a source set of tensors (which may contain a whole or
|
|
||||||
// parts of one or more images) and returns the equivalent that can be used in
|
|
||||||
// the current context
|
|
||||||
func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
|
|
||||||
out := make([]input.Multimodal, len(in))
|
|
||||||
for i := range out {
|
|
||||||
if in[i].Tensor != nil {
|
|
||||||
var err error
|
|
||||||
out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
out[i].Data = in[i].Data
|
|
||||||
}
|
|
||||||
|
|
||||||
return out, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
|
|
||||||
entry := m[in]
|
|
||||||
|
|
||||||
if entry.data == nil {
|
|
||||||
computeCtx := backend.NewContext()
|
|
||||||
defer computeCtx.Close()
|
|
||||||
|
|
||||||
var tensors []ml.Tensor
|
|
||||||
for _, t := range entry.mm {
|
|
||||||
if t.Tensor != nil {
|
|
||||||
tensors = append(tensors, t.Tensor)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(tensors) == 0 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
computeCtx.Forward(tensors...)
|
|
||||||
entry.data = make([][]float32, len(entry.mm))
|
|
||||||
|
|
||||||
if !reserve {
|
|
||||||
computeCtx.Compute(tensors...)
|
|
||||||
|
|
||||||
for i, t := range entry.mm {
|
|
||||||
if t.Tensor != nil {
|
|
||||||
entry.data[i] = t.Tensor.Floats()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
computeCtx.Reserve()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, t := range entry.mm {
|
|
||||||
if in == t.Tensor {
|
|
||||||
if !reserve {
|
|
||||||
return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
|
|
||||||
} else {
|
|
||||||
return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil, errors.New("multimodal tensor not found")
|
|
||||||
}
|
|
||||||
@@ -1,14 +1,12 @@
|
|||||||
package ollamarunner
|
package ollamarunner
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"hash/maphash"
|
"hash/maphash"
|
||||||
"image"
|
|
||||||
"log"
|
"log"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net"
|
"net"
|
||||||
@@ -22,7 +20,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"golang.org/x/image/bmp"
|
|
||||||
"golang.org/x/sync/semaphore"
|
"golang.org/x/sync/semaphore"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
@@ -43,9 +40,6 @@ type Sequence struct {
|
|||||||
// multimodal embeddings
|
// multimodal embeddings
|
||||||
ctxs []ml.Context
|
ctxs []ml.Context
|
||||||
|
|
||||||
// mmStore holds multimodal embeddings to mange memory and enable splitting across batches
|
|
||||||
mmStore multimodalStore
|
|
||||||
|
|
||||||
// batch index
|
// batch index
|
||||||
iBatch int
|
iBatch int
|
||||||
|
|
||||||
@@ -107,7 +101,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
|||||||
|
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
|
||||||
inputs, ctxs, mmStore, err := s.inputs(prompt, images)
|
inputs, ctxs, err := s.inputs(prompt, images)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to process inputs: %w", err)
|
return nil, fmt.Errorf("failed to process inputs: %w", err)
|
||||||
} else if len(inputs) == 0 {
|
} else if len(inputs) == 0 {
|
||||||
@@ -162,7 +156,6 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
|||||||
|
|
||||||
return &Sequence{
|
return &Sequence{
|
||||||
ctxs: ctxs,
|
ctxs: ctxs,
|
||||||
mmStore: mmStore,
|
|
||||||
inputs: inputs,
|
inputs: inputs,
|
||||||
numPromptInputs: len(inputs),
|
numPromptInputs: len(inputs),
|
||||||
startProcessingTime: startTime,
|
startProcessingTime: startTime,
|
||||||
@@ -181,10 +174,9 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
|||||||
// inputs processes the prompt and images into a list of inputs
|
// inputs processes the prompt and images into a list of inputs
|
||||||
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
||||||
// decoding images
|
// decoding images
|
||||||
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) {
|
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
|
||||||
var inputs []input.Input
|
var inputs []input.Input
|
||||||
var ctxs []ml.Context
|
var ctxs []ml.Context
|
||||||
var mmStore multimodalStore
|
|
||||||
|
|
||||||
var parts []string
|
var parts []string
|
||||||
var matches [][]string
|
var matches [][]string
|
||||||
@@ -195,7 +187,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
re := regexp.MustCompile(`\[img-(\d+)\]`)
|
re := regexp.MustCompile(`\[img-(\d+)\]`)
|
||||||
parts = re.Split(prompt, -1)
|
parts = re.Split(prompt, -1)
|
||||||
matches = re.FindAllStringSubmatch(prompt, -1)
|
matches = re.FindAllStringSubmatch(prompt, -1)
|
||||||
mmStore = newMultimodalStore()
|
|
||||||
} else {
|
} else {
|
||||||
parts = []string{prompt}
|
parts = []string{prompt}
|
||||||
}
|
}
|
||||||
@@ -205,7 +196,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
// text - tokenize
|
// text - tokenize
|
||||||
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
|
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
@@ -225,7 +216,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
}
|
}
|
||||||
|
|
||||||
if imageIndex < 0 {
|
if imageIndex < 0 {
|
||||||
return nil, nil, nil, fmt.Errorf("invalid image index: %d", n)
|
return nil, nil, fmt.Errorf("invalid image index: %d", n)
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx := s.model.Backend().NewContext()
|
ctx := s.model.Backend().NewContext()
|
||||||
@@ -233,15 +224,13 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
ctxs = append(ctxs, ctx)
|
ctxs = append(ctxs, ctx)
|
||||||
imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
|
imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
s.multimodalHash.Reset()
|
s.multimodalHash.Reset()
|
||||||
_, _ = s.multimodalHash.Write(images[imageIndex].Data)
|
_, _ = s.multimodalHash.Write(images[imageIndex].Data)
|
||||||
imageHash := s.multimodalHash.Sum64()
|
imageHash := s.multimodalHash.Sum64()
|
||||||
|
|
||||||
mmStore.addMultimodal(imageEmbeddings)
|
|
||||||
|
|
||||||
inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
|
inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
|
||||||
postTokenize = true
|
postTokenize = true
|
||||||
}
|
}
|
||||||
@@ -251,11 +240,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
var err error
|
var err error
|
||||||
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return inputs, ctxs, mmStore, nil
|
return inputs, ctxs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type Server struct {
|
type Server struct {
|
||||||
@@ -374,9 +363,6 @@ func (s *Server) processBatch() error {
|
|||||||
}
|
}
|
||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
ctx := s.model.Backend().NewContext()
|
|
||||||
defer ctx.Close()
|
|
||||||
|
|
||||||
var batchInputs []int32
|
var batchInputs []int32
|
||||||
var batch input.Batch
|
var batch input.Batch
|
||||||
|
|
||||||
@@ -447,11 +433,7 @@ func (s *Server) processBatch() error {
|
|||||||
|
|
||||||
batchInputs = append(batchInputs, inp.Token)
|
batchInputs = append(batchInputs, inp.Token)
|
||||||
if inp.Multimodal != nil {
|
if inp.Multimodal != nil {
|
||||||
mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false)
|
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
|
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
|
||||||
@@ -477,6 +459,9 @@ func (s *Server) processBatch() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx := s.model.Backend().NewContext()
|
||||||
|
defer ctx.Close()
|
||||||
|
|
||||||
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
|
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to decode batch: %w", err)
|
return fmt.Errorf("failed to decode batch: %w", err)
|
||||||
@@ -735,71 +720,12 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||||||
ctx := s.model.Backend().NewContext()
|
ctx := s.model.Backend().NewContext()
|
||||||
defer ctx.Close()
|
defer ctx.Close()
|
||||||
|
|
||||||
var err error
|
|
||||||
inputs := make([]input.Input, s.batchSize)
|
|
||||||
mmStore := newMultimodalStore()
|
|
||||||
|
|
||||||
// Multimodal strategy:
|
|
||||||
// - Encode a 2048x2048 image. This assumes that a single image of this
|
|
||||||
// size is sufficient to trigger the worst case. This is currently true
|
|
||||||
// because for existing models, only a single image fits in a batch.
|
|
||||||
// - Add the embedding to a full batch of tokens - this is necessary because
|
|
||||||
// the model may be looking for non-image data, such as <image> tags.
|
|
||||||
// - Run PostTokenize to execute any transformations between generated
|
|
||||||
// embeddings and what the forward pass expects.
|
|
||||||
// - The result may now be larger than a batch (images may not fit in a
|
|
||||||
// single batch), so trim based on what will fit and must be grouped together.
|
|
||||||
// - Fill out the rest of the space with text tokens.
|
|
||||||
if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
|
|
||||||
mmCtx := s.model.Backend().NewContext()
|
|
||||||
defer mmCtx.Close()
|
|
||||||
|
|
||||||
img := image.NewGray(image.Rect(0, 0, 2048, 2048))
|
|
||||||
var buf bytes.Buffer
|
|
||||||
bmp.Encode(&buf, img)
|
|
||||||
|
|
||||||
if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil {
|
|
||||||
mmStore.addMultimodal(inputs[0].Multimodal)
|
|
||||||
|
|
||||||
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, inp := range inputs {
|
|
||||||
minBatch := 1 + inp.SameBatch
|
|
||||||
if minBatch > s.batchSize {
|
|
||||||
inputs = inputs[i:min(i+minBatch, len(inputs))]
|
|
||||||
break
|
|
||||||
} else if i+minBatch > s.batchSize {
|
|
||||||
inputs = inputs[:i]
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(inputs) < s.batchSize {
|
|
||||||
newInputs := make([]input.Input, s.batchSize)
|
|
||||||
copy(newInputs, inputs)
|
|
||||||
inputs = newInputs
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var batch input.Batch
|
var batch input.Batch
|
||||||
|
|
||||||
batchInputs := make([]int32, len(inputs))
|
inputs := make([]int32, s.batchSize)
|
||||||
batch.Positions = make([]int32, len(inputs))
|
batch.Positions = make([]int32, len(inputs))
|
||||||
batch.Sequences = make([]int, len(inputs))
|
batch.Sequences = make([]int, len(inputs))
|
||||||
for i, inp := range inputs {
|
for i := range inputs {
|
||||||
batchInputs[i] = inp.Token
|
|
||||||
if inp.Multimodal != nil {
|
|
||||||
mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm})
|
|
||||||
}
|
|
||||||
|
|
||||||
batch.Positions[i] = int32(i)
|
batch.Positions[i] = int32(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -808,7 +734,11 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||||||
batch.Outputs[i] = int32(i)
|
batch.Outputs[i] = int32(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Inputs = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))
|
var err error
|
||||||
|
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
cache := s.model.Config().Cache
|
cache := s.model.Config().Cache
|
||||||
if cache != nil {
|
if cache != nil {
|
||||||
@@ -823,12 +753,16 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.Forward(t).Reserve()
|
err = ctx.Forward(t).Reserve()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) initModel(
|
func (s *Server) loadModel(
|
||||||
|
ctx context.Context,
|
||||||
mpath string,
|
mpath string,
|
||||||
params ml.BackendParams,
|
params ml.BackendParams,
|
||||||
lpath multiLPath,
|
lpath multiLPath,
|
||||||
@@ -836,21 +770,21 @@ func (s *Server) initModel(
|
|||||||
kvCacheType string,
|
kvCacheType string,
|
||||||
kvSize int,
|
kvSize int,
|
||||||
multiUserCache bool,
|
multiUserCache bool,
|
||||||
) error {
|
) {
|
||||||
var err error
|
var err error
|
||||||
s.model, err = model.New(mpath, params)
|
s.model, err = model.New(ctx, mpath, params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(jessegross): LoRA loading
|
// TODO(jessegross): LoRA loading
|
||||||
if lpath.String() != "" {
|
if lpath.String() != "" {
|
||||||
return errors.New("loras are not yet implemented")
|
panic("loras are not yet implemented")
|
||||||
}
|
}
|
||||||
|
|
||||||
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
|
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if !s.cache.enabled && parallel > 1 {
|
if !s.cache.enabled && parallel > 1 {
|
||||||
@@ -862,30 +796,7 @@ func (s *Server) initModel(
|
|||||||
s.seqs = make([]*Sequence, s.parallel)
|
s.seqs = make([]*Sequence, s.parallel)
|
||||||
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
||||||
|
|
||||||
return s.reserveWorstCaseGraph()
|
err = s.reserveWorstCaseGraph()
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Server) load(
|
|
||||||
ctx context.Context,
|
|
||||||
mpath string,
|
|
||||||
params ml.BackendParams,
|
|
||||||
lpath multiLPath,
|
|
||||||
parallel int,
|
|
||||||
kvCacheType string,
|
|
||||||
kvSize int,
|
|
||||||
multiUserCache bool,
|
|
||||||
) {
|
|
||||||
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
|
|
||||||
|
|
||||||
err = s.model.Backend().Load(ctx,
|
|
||||||
func(progress float32) {
|
|
||||||
s.progress = progress
|
|
||||||
})
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
@@ -929,14 +840,9 @@ func Execute(args []string) error {
|
|||||||
status: llm.ServerStatusLoadingModel,
|
status: llm.ServerStatusLoadingModel,
|
||||||
}
|
}
|
||||||
|
|
||||||
server.cond = sync.NewCond(&server.mu)
|
|
||||||
server.ready.Add(1)
|
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
// TODO(jessegross): Parameters that need to be implemented:
|
// TODO(jessegross): Parameters that need to be implemented:
|
||||||
// no-mmap
|
// no-mmap
|
||||||
|
// mlock
|
||||||
|
|
||||||
var tensorSplitFloats []float32
|
var tensorSplitFloats []float32
|
||||||
if *tensorSplit != "" {
|
if *tensorSplit != "" {
|
||||||
@@ -949,6 +855,9 @@ func Execute(args []string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
params := ml.BackendParams{
|
params := ml.BackendParams{
|
||||||
|
Progress: func(progress float32) {
|
||||||
|
server.progress = progress
|
||||||
|
},
|
||||||
NumThreads: *threads,
|
NumThreads: *threads,
|
||||||
NumGPULayers: *numGPULayers,
|
NumGPULayers: *numGPULayers,
|
||||||
MainGPU: *mainGPU,
|
MainGPU: *mainGPU,
|
||||||
@@ -956,7 +865,14 @@ func Execute(args []string) error {
|
|||||||
FlashAttention: *flashAttention,
|
FlashAttention: *flashAttention,
|
||||||
}
|
}
|
||||||
|
|
||||||
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
|
server.ready.Add(1)
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
|
||||||
|
|
||||||
|
server.cond = sync.NewCond(&server.mu)
|
||||||
|
|
||||||
go server.run(ctx)
|
go server.run(ctx)
|
||||||
|
|
||||||
addr := "127.0.0.1:" + strconv.Itoa(*port)
|
addr := "127.0.0.1:" + strconv.Itoa(*port)
|
||||||
|
|||||||
218
runner/ollamarunner/runner_test.go
Normal file
218
runner/ollamarunner/runner_test.go
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
package ollamarunner
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/fs"
|
||||||
|
"github.com/ollama/ollama/ml"
|
||||||
|
"github.com/ollama/ollama/model"
|
||||||
|
"github.com/ollama/ollama/model/input"
|
||||||
|
"github.com/ollama/ollama/sample"
|
||||||
|
"golang.org/x/sync/semaphore"
|
||||||
|
)
|
||||||
|
|
||||||
|
// testBackend implements ml.Backend with minimal functionality required for tests.
|
||||||
|
type testBackend struct{}
|
||||||
|
|
||||||
|
func (b *testBackend) Config() fs.Config { return testConfig{} }
|
||||||
|
func (b *testBackend) Get(string) ml.Tensor { return nil }
|
||||||
|
func (b *testBackend) NewContext() ml.Context { return &testContext{} }
|
||||||
|
func (b *testBackend) NewContextSize(int) ml.Context { return &testContext{} }
|
||||||
|
|
||||||
|
// testConfig is a stub implementation of fs.Config used by testBackend.
|
||||||
|
type testConfig struct{}
|
||||||
|
|
||||||
|
func (testConfig) Architecture() string { return "" }
|
||||||
|
func (testConfig) String(string, ...string) string { return "" }
|
||||||
|
func (testConfig) Uint(string, ...uint32) uint32 { return 0 }
|
||||||
|
func (testConfig) Float(string, ...float32) float32 { return 0 }
|
||||||
|
func (testConfig) Bool(string, ...bool) bool { return false }
|
||||||
|
func (testConfig) Strings(string, ...[]string) []string { return nil }
|
||||||
|
func (testConfig) Ints(string, ...[]int32) []int32 { return nil }
|
||||||
|
func (testConfig) Floats(string, ...[]float32) []float32 { return nil }
|
||||||
|
|
||||||
|
type testContext struct{}
|
||||||
|
|
||||||
|
func (c *testContext) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
|
sz := 1
|
||||||
|
for _, s := range shape {
|
||||||
|
sz *= s
|
||||||
|
}
|
||||||
|
return &testTensor{dtype: dtype, data: make([]float32, sz), shape: shape}
|
||||||
|
}
|
||||||
|
func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor { return c.Empty(dtype, shape...) }
|
||||||
|
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||||
|
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||||
|
copy(t.data, s)
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||||
|
f := make([]float32, len(s))
|
||||||
|
for i, v := range s {
|
||||||
|
f[i] = float32(v)
|
||||||
|
}
|
||||||
|
out, _ := c.FromFloatSlice(f, shape...)
|
||||||
|
out.(*testTensor).dtype = ml.DTypeI32
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
|
return c.Empty(dtype, int((stop-start)/step))
|
||||||
|
}
|
||||||
|
func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
||||||
|
func (c *testContext) Compute(...ml.Tensor) {}
|
||||||
|
func (c *testContext) Reserve() error { return nil }
|
||||||
|
func (c *testContext) MaxGraphNodes() int { return 0 }
|
||||||
|
func (c *testContext) Close() {}
|
||||||
|
func (c *testContext) Input() ml.Context { return c }
|
||||||
|
func (c *testContext) Layer(int) ml.Context { return c }
|
||||||
|
|
||||||
|
type testTensor struct {
|
||||||
|
ml.Tensor
|
||||||
|
dtype ml.DType
|
||||||
|
data []float32
|
||||||
|
shape []int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *testTensor) Dim(n int) int { return t.shape[n] }
|
||||||
|
func (t *testTensor) Stride(n int) int { return 0 }
|
||||||
|
func (t *testTensor) Shape() []int { return t.shape }
|
||||||
|
func (t *testTensor) DType() ml.DType { return t.dtype }
|
||||||
|
func (t *testTensor) Bytes() []byte { return nil }
|
||||||
|
func (t *testTensor) Floats() []float32 {
|
||||||
|
out := make([]float32, len(t.data))
|
||||||
|
copy(out, t.data)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
func (t *testTensor) Neg(ctx ml.Context) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||||
|
out, _ := ctx.(*testContext).FromFloatSlice(nil, len(t.data))
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
func (t *testTensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) Softmax(ctx ml.Context) ml.Tensor { return nil }
|
||||||
|
// LayerNorm is a stub satisfying ml.Tensor; it returns nil.
func (t *testTensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, e float32) ml.Tensor {
	return nil
}

// View allocates an empty tensor of the requested shape on the test
// context; the byte offset is ignored.
func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	return ctx.(*testContext).Empty(t.dtype, shape...)
}

// Copy writes this tensor's data into dest, which must be a *testTensor,
// and returns nil rather than the destination tensor.
func (t *testTensor) Copy(ctx ml.Context, dest ml.Tensor) ml.Tensor {
	copy(dest.(*testTensor).data, t.data)
	return nil
}
|
||||||
|
|
||||||
|
// fakeModel implements model.Model and model.TextProcessor with canned
// per-call logits so batch processing can be driven deterministically.
type fakeModel struct {
	model.Base

	decode  map[int32]string // token id -> decoded text fragment
	logits  [][]float32      // logits returned by successive Forward calls
	call    int              // number of Forward invocations so far
	backend ml.Backend       // lazily created by Backend()
}
|
||||||
|
|
||||||
|
func (f *fakeModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
|
idx := f.call
|
||||||
|
if idx >= len(f.logits) {
|
||||||
|
idx = len(f.logits) - 1
|
||||||
|
}
|
||||||
|
f.call++
|
||||||
|
return ctx.FromFloatSlice(f.logits[idx], len(f.logits[idx]))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeModel) Backend() ml.Backend {
|
||||||
|
if f.backend == nil {
|
||||||
|
f.backend = &testBackend{}
|
||||||
|
}
|
||||||
|
return f.backend
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeModel) Encode(string, bool) ([]int32, error) { return nil, nil }
|
||||||
|
func (f *fakeModel) Decode(ids []int32) (string, error) {
|
||||||
|
var s string
|
||||||
|
for _, id := range ids {
|
||||||
|
s += f.decode[id]
|
||||||
|
}
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
func (f *fakeModel) Is(id int32, sp model.Special) bool { return false }
|
||||||
|
func (f *fakeModel) Vocabulary() *model.Vocabulary { return &model.Vocabulary{} }
|
||||||
|
|
||||||
|
var _ model.Model = (*fakeModel)(nil)
|
||||||
|
var _ model.TextProcessor = (*fakeModel)(nil)
|
||||||
|
|
||||||
|
func TestProcessBatchUnicode(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
decode map[int32]string
|
||||||
|
logits [][]float32
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "emoji",
|
||||||
|
decode: map[int32]string{0: "A", 1: "😀", 2: "👍", 3: "!"},
|
||||||
|
logits: [][]float32{{10, 0, 0, 0}, {0, 10, 0, 0}, {0, 0, 10, 0}, {0, 0, 0, 10}},
|
||||||
|
want: "A😀👍!",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ascii",
|
||||||
|
decode: map[int32]string{0: "H", 1: "e", 2: "y"},
|
||||||
|
logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
|
||||||
|
want: "Hey",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multibyte",
|
||||||
|
decode: map[int32]string{0: "世", 1: "界", 2: "😊"},
|
||||||
|
logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
|
||||||
|
want: "世界😊",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
m := &fakeModel{decode: tt.decode, logits: tt.logits}
|
||||||
|
|
||||||
|
s := &Server{model: m, batchSize: 1, parallel: 1}
|
||||||
|
s.cache = &InputCache{enabled: true, slots: []InputCacheSlot{{Id: 0}}, numCtx: 10}
|
||||||
|
s.seqs = make([]*Sequence, 1)
|
||||||
|
s.seqsSem = semaphore.NewWeighted(1)
|
||||||
|
if err := s.seqsSem.Acquire(context.Background(), 1); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
s.cond = sync.NewCond(&s.mu)
|
||||||
|
|
||||||
|
seq := &Sequence{
|
||||||
|
inputs: []input.Input{{Token: 0}},
|
||||||
|
cache: &s.cache.slots[0],
|
||||||
|
responses: make(chan string, 10),
|
||||||
|
quit: make(chan bool, 1),
|
||||||
|
numPredict: len(tt.logits),
|
||||||
|
sampler: sample.NewSampler(0, 0, 0, 0, 0, nil),
|
||||||
|
embedding: make(chan []float32, 1),
|
||||||
|
}
|
||||||
|
s.seqs[0] = seq
|
||||||
|
|
||||||
|
for {
|
||||||
|
if err := s.processBatch(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if s.seqs[0] == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var result string
|
||||||
|
for r := range seq.responses {
|
||||||
|
result += r
|
||||||
|
}
|
||||||
|
|
||||||
|
if result != tt.want {
|
||||||
|
t.Fatalf("got %q want %q", result, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa
|
|||||||
vocabIds[i] = uint32(i)
|
vocabIds[i] = uint32(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS)
|
grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)})
|
||||||
if grammar == nil {
|
if grammar == nil {
|
||||||
return nil, errors.New("sample: failed to initialize grammar")
|
return nil, errors.New("sample: failed to initialize grammar")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
|
|||||||
}
|
}
|
||||||
defer bin.Close()
|
defer bin.Close()
|
||||||
|
|
||||||
f, err := ggml.Decode(bin, -1)
|
f, _, err := ggml.Decode(bin, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -430,7 +430,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
|
|||||||
fnWrap := func(n uint64) {
|
fnWrap := func(n uint64) {
|
||||||
done := doneBytes.Add(n)
|
done := doneBytes.Add(n)
|
||||||
progress := float32(done) / float32(totalBytes)
|
progress := float32(done) / float32(totalBytes)
|
||||||
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
|
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
|
||||||
}
|
}
|
||||||
ftype, err := ggml.ParseFileType(quantizeType)
|
ftype, err := ggml.ParseFileType(quantizeType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -467,7 +467,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
f, err := ggml.Decode(temp, 1024)
|
f, _, err := ggml.Decode(temp, 1024)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
|
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -501,26 +501,47 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
|
|||||||
return nil, errOnlyGGUFSupported
|
return nil, errOnlyGGUFSupported
|
||||||
}
|
}
|
||||||
|
|
||||||
f, err := ggml.Decode(blob, -1)
|
stat, err := blob.Stat()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
mediatype := "application/vnd.ollama.image.model"
|
var offset int64
|
||||||
if f.KV().Kind() == "adapter" {
|
for offset < stat.Size() {
|
||||||
mediatype = "application/vnd.ollama.image.adapter"
|
f, n, err := ggml.Decode(blob, 1024)
|
||||||
} else if (f.KV().Uint("block_count") == 0 && f.KV().Uint("vision.block_count") > 0) || f.KV().Kind() == "projector" {
|
if errors.Is(err, io.EOF) {
|
||||||
// if a model has vision.block_count but not block_count, it is a standalone vision model
|
break
|
||||||
mediatype = "application/vnd.ollama.image.projector"
|
} else if err != nil {
|
||||||
}
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
layer, err := NewLayerFromLayer(digest, mediatype, blob.Name())
|
mediatype := "application/vnd.ollama.image.model"
|
||||||
if err != nil {
|
if f.KV().Kind() == "adapter" {
|
||||||
slog.Debug("could not create new layer from layer", "error", err)
|
mediatype = "application/vnd.ollama.image.adapter"
|
||||||
return nil, err
|
} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
|
||||||
}
|
mediatype = "application/vnd.ollama.image.projector"
|
||||||
|
}
|
||||||
|
|
||||||
layers = append(layers, &layerGGML{layer, f})
|
var layer Layer
|
||||||
|
if digest != "" && n == stat.Size() && offset == 0 {
|
||||||
|
layer, err = NewLayerFromLayer(digest, mediatype, blob.Name())
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug("could not create new layer from layer", "error", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
|
||||||
|
if layer.Digest == "" {
|
||||||
|
layer, err = NewLayer(io.NewSectionReader(blob, offset, n), mediatype)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
layers = append(layers, &layerGGML{layer, f})
|
||||||
|
offset = n
|
||||||
|
}
|
||||||
|
|
||||||
return detectChatTemplate(layers)
|
return detectChatTemplate(layers)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
f, err := ggml.Decode(r, 1024)
|
f, _, err := ggml.Decode(r, 1024)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
|
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
|
||||||
capabilities = append(capabilities, model.CapabilityEmbedding)
|
capabilities = append(capabilities, model.CapabilityEmbedding)
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
|||||||
}
|
}
|
||||||
defer blob.Close()
|
defer blob.Close()
|
||||||
|
|
||||||
f, err := ggml.Decode(blob, -1)
|
f, _, err := ggml.Decode(blob, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -120,30 +120,14 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
|
|||||||
|
|
||||||
if newType.IsQuantized() {
|
if newType.IsQuantized() {
|
||||||
nx := shape[0]
|
nx := shape[0]
|
||||||
|
ny := uint64(1)
|
||||||
|
if len(shape) > 1 {
|
||||||
|
ny = shape[1]
|
||||||
|
}
|
||||||
qk_k := newType.BlockSize()
|
qk_k := newType.BlockSize()
|
||||||
|
|
||||||
// Check if first dimension is divisible by block size
|
|
||||||
if nx%qk_k != 0 {
|
if nx%qk_k != 0 {
|
||||||
// Store the original type for logging
|
slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
|
||||||
originalType := newType
|
newType = fsggml.TensorTypeF16
|
||||||
|
|
||||||
// Select appropriate fallback based on original type
|
|
||||||
switch newType {
|
|
||||||
case fsggml.TensorTypeQ4_K:
|
|
||||||
newType = fsggml.TensorTypeQ5_0
|
|
||||||
case fsggml.TensorTypeQ5_K:
|
|
||||||
newType = fsggml.TensorTypeQ5_1
|
|
||||||
case fsggml.TensorTypeQ6_K:
|
|
||||||
newType = fsggml.TensorTypeQ8_0
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final check - if still incompatible, fall back to F16
|
|
||||||
if nx%newType.BlockSize() != 0 {
|
|
||||||
newType = fsggml.TensorTypeF16
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
|
|
||||||
nx, qk_k, originalType.String(), newType.String()))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return newType
|
return newType
|
||||||
|
|||||||
@@ -271,7 +271,7 @@ func TestQuantizeModel(t *testing.T) {
|
|||||||
t.Fatal(err.Error())
|
t.Fatal(err.Error())
|
||||||
}
|
}
|
||||||
defer fp.Close()
|
defer fp.Close()
|
||||||
meta, err := fsggml.Decode(fp, -1)
|
meta, _, err := fsggml.Decode(fp, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err.Error())
|
t.Fatal(err.Error())
|
||||||
}
|
}
|
||||||
@@ -303,7 +303,7 @@ func TestQuantizeModel(t *testing.T) {
|
|||||||
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
||||||
}
|
}
|
||||||
defer fpNew.Close()
|
defer fpNew.Close()
|
||||||
newMeta, err := fsggml.Decode(fpNew, -1)
|
newMeta, _, err := fsggml.Decode(fpNew, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -387,17 +387,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
|
|||||||
s.loadedMu.Unlock()
|
s.loadedMu.Unlock()
|
||||||
runner.refMu.Unlock()
|
runner.refMu.Unlock()
|
||||||
slog.Debug("duplicate expired event, ignoring", "runner", runner)
|
slog.Debug("duplicate expired event, ignoring", "runner", runner)
|
||||||
} else if runner.pid != runnerToUnload.pid {
|
|
||||||
// If the pids do not match, we likely had multiple load
|
|
||||||
// failures for the same model in quick succession due to
|
|
||||||
// request context canceled and are draining the queue of
|
|
||||||
// events. Ensure the orphaned runner is properly shut down, but
|
|
||||||
// do not delete the mismatched loaded runner, or wait for VRAM
|
|
||||||
// convergence.
|
|
||||||
slog.Debug("orphaned runner shutting down", "orphan", runner, "loaded", runnerToUnload)
|
|
||||||
runner.unload()
|
|
||||||
s.loadedMu.Unlock()
|
|
||||||
runner.refMu.Unlock()
|
|
||||||
} else {
|
} else {
|
||||||
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
|
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
|
||||||
finished := runner.waitForVRAMRecovery()
|
finished := runner.waitForVRAMRecovery()
|
||||||
|
|||||||
Reference in New Issue
Block a user