int: harden server lifecycle (#12835 )

This should reduce zombie server processes during integration runs.
tests: fix embeddinggemma integration test (#12830 )
2025-10-29 11:50:56 -07:00 · 2025-10-29 11:07:28 -07:00 · 2025-10-29 11:03:43 -07:00 · 2025-10-28 23:25:48 -07:00 · 2025-10-28 19:17:54 -07:00 · 2025-10-28 19:09:07 -07:00
49 changed files with 4077 additions and 912 deletions
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -198,6 +198,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &qwen2Model{}
 	case "Qwen2_5_VLForConditionalGeneration":
 		conv = &qwen25VLModel{}
+	case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
+		conv = &qwen3VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
--- a/convert/convert_qwen3.go
+++ b/convert/convert_qwen3.go
@@ -0,0 +1,157 @@
+package convert
+
+import (
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+type qwen3Model struct { // qwen3 text-model hyperparameters, tagged with the JSON keys they are read from
+	ModelParameters
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	HeadDim               uint32  `json:"head_dim"`
+	NumExperts            uint32  `json:"num_experts"` // a value > 0 makes KV use the "moe" architecture and write expert keys
+	NumExpertsPerToken    uint32  `json:"num_experts_per_tok"`
+	NormTopkProb          bool    `json:"norm_topk_prob"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RopeScaling           struct {
+		Type                          string     `json:"type"` // KV recognizes "", "yarn", "mrope" and "default"
+		Factor                        ropeFactor `json:"factor"`
+		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
+		MropeSection                  []int32    `json:"mrope_section"`
+	} `json:"rope_scaling"`
+	RMSNormEPS float32 `json:"rms_norm_eps"`
+}
+
+// KV implements ModelConverter: it extends ModelParameters.KV(t) with the qwen3 text-model entries.
+func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
+	arch := "qwen3"
+	if q.NumExperts > 0 {
+		arch += "moe" // models with experts are written as "qwen3moe"
+	}
+
+	kv := q.ModelParameters.KV(t)
+	kv["general.architecture"] = arch
+	kv["block_count"] = q.HiddenLayers
+	kv["context_length"] = q.MaxPositionEmbeddings
+	kv["embedding_length"] = q.HiddenSize
+	kv["feed_forward_length"] = q.IntermediateSize
+	kv["attention.head_count"] = q.NumAttentionHeads
+	kv["attention.head_count_kv"] = q.NumKeyValueHeads
+	kv["attention.key_length"] = q.HeadDim
+	kv["attention.value_length"] = q.HeadDim
+	// Expert routing entries are only written for models that declare experts.
+	if q.NumExperts > 0 {
+		kv["expert_count"] = q.NumExperts
+		kv["expert_used_count"] = q.NumExpertsPerToken
+		kv["norm_top_k_prob"] = q.NormTopkProb
+	}
+
+	kv["rope.freq_base"] = q.RopeTheta
+	kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
+	// KV cannot return an error, so an unrecognized type panics; the message names the offending type.
+	switch q.RopeScaling.Type {
+	case "":
+		// no scaling
+	case "yarn":
+		kv["rope.scaling.type"] = q.RopeScaling.Type
+		kv["rope.scaling.factor"] = q.RopeScaling.Factor
+	case "mrope", "default":
+		kv["rope.mrope_section"] = q.RopeScaling.MropeSection
+	default:
+		panic("unknown rope scaling type: " + q.RopeScaling.Type)
+	}
+	return kv
+}
+
+// Tensors implements ModelConverter: expert tensors are split and/or transposed, everything else passes through as-is.
+func (q *qwen3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+
+	// TODO: handle split experts
+
+	for _, t := range ts {
+		switch {
+		case strings.Contains(t.Name(), "ffn_gate_up_exps"): // fused gate/up expert tensor
+			afterFunc := func(t tensor.Tensor) (tensor.Tensor, error) { return tensor.Transpose(t, 0, 2, 1) }
+			for t := range splitDim(t, 2,
+				split{Replacer: strings.NewReplacer("gate_up", "gate"), afterFunc: afterFunc},
+				split{Replacer: strings.NewReplacer("gate_up", "up"), afterFunc: afterFunc},
+			) {
+				t.Shape[1], t.Shape[2] = t.Shape[2], t.Shape[1] // the reported shape gets dims 1 and 2 swapped too
+				out = append(out, t)
+			}
+		case strings.Contains(t.Name(), "ffn_down_exps"): // down expert tensor: name kept, dims 1 and 2 exchanged
+			shape := slices.Clone(t.Shape())
+			shape[1], shape[2] = shape[2], shape[1]
+			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+				dims := make([]int, len(shape))
+				for i := range shape {
+					dims[i] = int(shape[i])
+				}
+				// Wrap the raw values in a tensor of the repacker's shape so dims 1 and 2 of the data can be exchanged.
+				var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+				tt, err := tensor.Transpose(tt, 0, 2, 1)
+				if err != nil {
+					return nil, err
+				}
+
+				// flatten tensor so it can be written as a vector
+				if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
+					return nil, err
+				}
+
+				return native.VectorF32(tt.(*tensor.Dense))
+			})
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    shape,
+				WriterTo: t,
+			})
+		default:
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		}
+	}
+
+	return out
+}
+
+// Replacements implements ModelConverter; the list is laid out as (checkpoint name, converted name) string pairs.
+func (q *qwen3Model) Replacements() []string {
+	return []string{
+		"lm_head", "output",
+		"model.embed_tokens", "token_embd",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.k_norm", "attn_k_norm",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.q_norm", "attn_q_norm",
+		"self_attn.o_proj", "attn_output",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"mlp.gate.weight", "ffn_gate_inp.weight",
+		"mlp.experts.down_proj", "ffn_down_exps.weight",
+		"mlp.experts.gate_up_proj", "ffn_gate_up_exps.weight",
+		"post_attention_layernorm", "ffn_norm",
+		"model.norm", "output_norm",
+	}
+}
+
+var _ ModelConverter = (*qwen3Model)(nil) // compile-time check that *qwen3Model satisfies ModelConverter
--- a/convert/convert_qwen3vl.go
+++ b/convert/convert_qwen3vl.go
@@ -0,0 +1,116 @@
+package convert
+
+import (
+	"cmp"
+	"encoding/json"
+	"io/fs"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type qwen3VLModel struct { // Qwen3-VL config: the qwen3 text fields under "text_config" plus vision settings
+	qwen3Model `json:"text_config"`
+	// VisionModel carries the "vision_config" tag; parseMore also unmarshals preprocessor_config.json into it.
+	VisionModel struct {
+		Depth                  uint32  `json:"depth"`
+		HiddenSize             uint32  `json:"hidden_size"`
+		NumHeads               uint32  `json:"num_heads"`
+		InChannels             uint32  `json:"in_channels"`
+		PatchSize              uint32  `json:"patch_size"`
+		SpatialMergeSize       uint32  `json:"spatial_merge_size"`
+		WindowSize             uint32  `json:"window_size"`
+		RMSNormEps             float32 `json:"layer_norm_epsilon"`
+		RopeTheta              float32 `json:"rope_theta"`
+		TemporalPatchSize      uint32  `json:"temporal_patch_size"`
+		DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
+
+		Size struct {
+			ShortestEdge uint32 `json:"shortest_edge"`
+			LongestEdge  uint32 `json:"longest_edge"`
+		} `json:"size"`
+
+		ImageMean []float32 `json:"image_mean"`
+		ImageStd  []float32 `json:"image_std"`
+	} `json:"vision_config"`
+}
+
+func (m *qwen3VLModel) parseMore(fsys fs.FS) error { // loads preprocessor_config.json from fsys into m.VisionModel
+	bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
+	if err != nil {
+		return err // a missing or unreadable file is returned to the caller unchanged
+	}
+	// Unmarshal straight into VisionModel: keys matching its JSON tags are filled in, other keys are ignored.
+	return json.Unmarshal(bts, &m.VisionModel)
+}
+
+func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV { // qwen3Model.KV plus the vision.* entries
+	kv := m.qwen3Model.KV(t)
+
+	arch := "qwen3vl"
+	if m.NumExperts > 0 {
+		arch += "moe"
+	}
+	// override architecture
+	kv["general.architecture"] = arch
+	// cmp.Or falls back to its second argument when the parsed value is the zero value.
+	kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32)
+	kv["vision.embedding_length"] = m.VisionModel.HiddenSize
+	kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16)
+	kv["vision.num_channels"] = m.VisionModel.InChannels
+	kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14)
+	kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2)
+	kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6)
+	kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4)
+	kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2)
+	kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes
+
+	kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge
+	kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge
+
+	kv["vision.image_mean"] = m.VisionModel.ImageMean
+	kv["vision.image_std"] = m.VisionModel.ImageStd
+
+	return kv
+}
+
+func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor { // handles vision-specific tensors; the rest go to qwen3Model.Tensors
+	var rest []Tensor
+	var out []*ggml.Tensor
+	for _, t := range ts {
+		switch {
+		case strings.Contains(t.Name(), "attn_qkv"): // fused QKV: split along dim 0 into attn_q, attn_k and attn_v
+			out = append(out, slices.Collect(splitDim(t, 0,
+				split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
+				split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
+				split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
+			))...)
+		case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"):
+			shape := t.Shape()
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    append([]uint64{shape[0] * shape[1]}, shape[2:]...), // merge the first two dims into one
+				WriterTo: t,
+			})
+		default:
+			rest = append(rest, t)
+		}
+	}
+	// Tensors converted by the embedded qwen3Model come first, followed by the ones built above.
+	return append(m.qwen3Model.Tensors(rest), out...)
+}
+
+func (m *qwen3VLModel) Replacements() []string { // the qwen3Model list followed by the Qwen3-VL specific pairs
+	return append(
+		m.qwen3Model.Replacements(),
+		"model.language_", "",
+		"model.visual", "v",
+		"patch_embed.proj", "patch_embed",
+		"blocks", "blk",
+		"attn.qkv", "attn_qkv",
+		"attn.proj", "attn_out",
+		"deepstack_merger_list", "deepstack_merger",
+	)
+}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -19,8 +19,8 @@ type split struct {
 	dim    int
 	slices []tensor.Slice

-	// fn is an optional function to apply to the tensor after slicing
-	fn func(tensor.Tensor) (tensor.Tensor, error)
+	// afterFunc is an optional function to apply to the tensor after slicing
+	afterFunc func(tensor.Tensor) (tensor.Tensor, error)
 }

 // splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
@@ -54,8 +54,8 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {

 				tt = tensor.Materialize(tt)

-				if split.fn != nil {
-					tt, err = split.fn(tt)
+				if split.afterFunc != nil {
+					tt, err = split.afterFunc(tt)
 					if err != nil {
 						return nil, err
 					}
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -432,7 +432,7 @@ func TestSplitDim(t *testing.T) {
 		t.Run("split with transpose", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 1,
 				split{Replacer: strings.NewReplacer("a", "x")},
-				split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
+				split{Replacer: strings.NewReplacer("b", "y"), afterFunc: func(tt tensor.Tensor) (tensor.Tensor, error) {
 					return tensor.Transpose(tt, 1, 0)
 				}},
 			))
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -117,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.

 		// In the second pass, we more deeply initialize the GPUs to weed out devices that
 		// aren't supported by a given library.  We run this phase in parallel to speed up discovery.
-		slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
+		slog.Debug("evaluating which if any devices to filter out", "initial_count", len(devices))
 		ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
 		defer cancel()
 		var wg sync.WaitGroup
@@ -129,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 			if devices[i].Library == "Metal" {
 				continue
 			}
-			slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
+			slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
 			wg.Add(1)
 			go func(i int) {
 				defer wg.Done()
@@ -155,6 +155,12 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 					envVar:           id,  // Filter to just this one GPU
 				}
 				if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
+					slog.Debug("filtering device which didn't fully initialize",
+						"id", devices[i].ID,
+						"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
+						"pci_id", devices[i].PCIID,
+						"library", devices[i].Library,
+					)
 					needsDelete[i] = true
 				} else {
 					supportedMu.Lock()
@@ -170,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 			}(i)
 		}
 		wg.Wait()
-		logutil.Trace("supported GPU library combinations", "supported", supported)
+		logutil.Trace("supported GPU library combinations before filtering", "supported", supported)

 		filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)

@@ -372,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
 			}
 			if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] {
 				needsDelete[i] = true
-				slog.Debug("dropping Vulkan duplicate by PCI ID",
-					"vulkan_id", devices[i].ID,
-					"vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
+				slog.Debug("filtering device with duplicate PCI ID",
+					"id", devices[i].ID,
+					"library", devices[i].Library,
+					"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
 					"pci_id", devices[i].PCIID,
-					"kept_library", devices[j].Library,
 					"kept_id", devices[j].ID,
+					"kept_library", devices[j].Library,
 				)
 				break
 			}
@@ -422,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need
 			}
 			for dev, i := range byLibDirs[libDir] {
 				if _, found := byLibDirs[newest][dev]; found {
+					slog.Debug("filtering device with overlapping libraries",
+						"id", dev,
+						"library", libDir,
+						"delete_index", i,
+						"kept_library", newest,
+					)
 					needsDelete[i] = true
 				}
 			}
--- a/discover/types.go
+++ b/discover/types.go
@@ -3,6 +3,7 @@ package discover
 import (
 	"log/slog"
 	"path/filepath"
+	"sort"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -26,6 +27,7 @@ type CPU struct {
 }

 func LogDetails(devices []ml.DeviceInfo) {
+	sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference
 	for _, dev := range devices {
 		var libs []string
 		for _, dir := range dev.LibraryPath {
@@ -39,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) {
 		}
 		slog.Info("inference compute",
 			"id", dev.ID,
+			"filtered_id", dev.FilteredID,
 			"library", dev.Library,
 			"compute", dev.Compute(),
 			"name", dev.Name,
--- a/docs/api.md
+++ b/docs/api.md
--- a/docs/api/index.mdx
+++ b/docs/api/index.mdx
@@ -1,5 +1,5 @@
 ---
-title: "Introduction"
+title: Introduction
 ---

-Ollama's API allows you to run and interact with models programatically.
+Ollama's API allows you to run and interact with models programmatically.
@@ -44,4 +44,4 @@ Several community-maintained libraries are available for Ollama. For a full list

 ## Versioning

-Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).
+Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).
--- a/docs/benchmark.mdx
+++ b/docs/benchmark.mdx
@@ -1,71 +0,0 @@
---
-title: Benchmark
---
-
-Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
-
-## When to use
-
-Run these benchmarks when:
-
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups
-
-## Prerequisites
-
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
-
-## Usage and Examples
-
-<Note>
-  All commands must be run from the root directory of the Ollama project.
-</Note>
-
-Basic syntax:
-
-```bash
-go test -bench=. ./benchmark/... -m $MODEL_NAME
-```
-
-Required flags:
-
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark
-
-Optional flags:
-
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
-
-Common usage patterns:
-
-Single benchmark run with a model specified:
-
-```bash
-go test -bench=. ./benchmark/... -m llama3.3
-```
-
-## Output metrics
-
-The benchmark reports several key metrics:
-
- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed
-
-Each benchmark runs two scenarios:
-
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory
-
-Three prompt lengths are tested for each scenario:
-
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)
--- a/docs/cloud.mdx
+++ b/docs/cloud.mdx
@@ -17,6 +17,7 @@ Ollama currently supports the following cloud models, with more coming soon:
 - `kimi-k2:1t-cloud`
 - `qwen3-coder:480b-cloud`
 - `glm-4.6:cloud`
+- `minimax-m2:cloud`

 ### Running Cloud models

--- a/docs/docs.json
+++ b/docs/docs.json
@@ -58,7 +58,11 @@
  "redirects": [
    {
      "source": "/openai",
-      "destination": "/api/openai"
+      "destination": "/api/openai-compatibility"
+    },
+    {
+      "source": "/api/openai",
+      "destination": "/api/openai-compatibility"
    }
  ],
  "navigation": {
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -0,0 +1,3 @@
+# Troubleshooting
+
+For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -242,13 +242,13 @@ func (kv KV) OllamaEngineRequired() bool {
 	return slices.Contains([]string{
 		"gemma3",
 		"gemma3n",
-		"mistral3",
-		"qwen3",
-		"qwen3moe",
+		"gptoss", "gpt-oss",
 		"llama4",
+		"mistral3",
 		"mllama",
 		"qwen25vl",
-		"gptoss", "gpt-oss",
+		"qwen3", "qwen3moe",
+		"qwen3vl", "qwen3vlmoe",
 	}, kv.Architecture())
 }

--- a/integration/README.md
+++ b/integration/README.md
@@ -7,7 +7,7 @@ By default, these tests are disabled so `go test ./...` will exercise only unit

 The integration tests have 2 modes of operating.

-1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
+1. By default, on Unix systems, they will start the server on a random port, run the tests, and then shut down the server. On Windows, you must ALWAYS run the server on `OLLAMA_HOST` for the tests to work.
 2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote based on your `OLLAMA_HOST` environment variable

 > [!IMPORTANT]
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -4,9 +4,7 @@ package integration

 import (
 	"context"
-	"errors"
 	"math"
-	"strings"
 	"testing"
 	"time"

@@ -16,6 +14,10 @@ import (

 func dotProduct[V float32 | float64](v1, v2 []V) V {
 	var result V = 0
+	if len(v1) != len(v2) {
+		return result
+	}
+
 	for i := 0; i < len(v1); i++ {
 		result += v1[i] * v2[i]
 	}
@@ -31,9 +33,115 @@ func magnitude[V float32 | float64](v []V) V {
 }

 func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
+	mag1 := magnitude(v1)
+	mag2 := magnitude(v2)
+	// Guard the division below: if either magnitude is zero, report a similarity of 0.
+	if mag1 == 0 || mag2 == 0 {
+		return 0
+	}
+	// Reuse mag1 and mag2 instead of computing both magnitudes a second time.
-	return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
+	return dotProduct(v1, v2) / (mag1 * mag2)
 }

+func euclideanDistance[V float32 | float64](v1, v2 []V) V { // straight-line (L2) distance between v1 and v2
+	if len(v1) != len(v2) {
+		return V(math.Inf(1)) // mismatched lengths are reported as +Inf rather than as an error
+	}
+	// Accumulate squared differences in V; the square root is taken in float64 and converted back.
+	var sum V = 0
+	for i := 0; i < len(v1); i++ {
+		diff := v1[i] - v2[i]
+		sum += diff * diff
+	}
+
+	return V(math.Sqrt(float64(sum)))
+}
+
+func manhattanDistance[V float32 | float64](v1, v2 []V) V { // sum of absolute element-wise differences (L1 distance)
+	if len(v1) != len(v2) {
+		return V(math.Inf(1)) // mismatched lengths are reported as +Inf rather than as an error
+	}
+	// math.Abs works on float64, so each difference is widened and then converted back to V.
+	var sum V = 0
+	for i := 0; i < len(v1); i++ {
+		sum += V(math.Abs(float64(v1[i] - v2[i])))
+	}
+
+	return sum
+}
+
+func TestEmbedCosineDistanceCorrelation(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	// Every model in libraryEmbedModels gets its own subtest over the same text triples.
+	for _, model := range libraryEmbedModels {
+		t.Run(model, func(t *testing.T) {
+			testCases := []struct {
+				a string // anchor text
+				b string // text expected to be more similar to a than c is
+				c string // text expected to be less similar to a
+			}{
+				{"cat", "kitten", "dog"},
+				{"king", "queen", "baron"},
+				{"paris", "london", "vancouver"},
+				{"The cat is sleeping on the sofa", "A feline is sleeping on the couch", "Quantum physics is complex"},
+				{"I love programming in python", "Coding in python brings me joy", "Pizza is delicious"},
+				{"Machine learning is fascinating", "Artificial intelligence is amazing", "I need to buy groceries"},
+				{"The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a sleepy dog", "The weather is warm and sunny today"},
+			}
+
+			for _, tc := range testCases {
+				testEmbed := make(map[string][]float32)
+				strs := []string{tc.a, tc.b, tc.c}
+
+				req := api.EmbedRequest{
+					Model:     model,
+					Input:     strs,
+					KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				}
+
+				resp, err := embedTestHelper(ctx, client, t, req)
+				if err != nil {
+					t.Fatal(err)
+				}
+				// Key each embedding by its input string; assumes the response preserves input order — TODO confirm.
+				for cnt, v := range resp.Embeddings {
+					testEmbed[strs[cnt]] = v
+				}
+
+				// Calculate cosine similarities
+				cosAB := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.b])
+				cosAC := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.c])
+
+				// Calculate distances
+				distAB := euclideanDistance(testEmbed[tc.a], testEmbed[tc.b])
+				distAC := euclideanDistance(testEmbed[tc.a], testEmbed[tc.c])
+
+				manhattanAB := manhattanDistance(testEmbed[tc.a], testEmbed[tc.b])
+				manhattanAC := manhattanDistance(testEmbed[tc.a], testEmbed[tc.c])
+
+				// Consistency check: if cosAB > cosAC, then distances should be smaller
+				if cosAB > cosAC {
+					if distAB >= distAC {
+						t.Errorf("Euclidean distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but distAB=%f >= distAC=%f",
+							model, tc.a, tc.b, tc.c, cosAB, cosAC, distAB, distAC)
+					}
+
+					if manhattanAB >= manhattanAC {
+						t.Errorf("Manhattan distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but manhattanAB=%f >= manhattanAC=%f",
+							model, tc.a, tc.b, tc.c, cosAB, cosAC, manhattanAB, manhattanAC)
+					}
+				} else { // cosAB <= cosAC: b was not ranked closer to a than c, which the test treats as a failure
+					t.Errorf("Cosine Similarity inconsistency (%s): cosinSim(%s, %s) < cosinSim(%s, %s)",
+						model, tc.a, tc.b, tc.a, tc.c)
+				}
+			}
+		})
+	}
+}
+
 func TestAllMiniLMEmbeddings(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
@@ -301,216 +409,3 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req

 	return client.Embed(ctx, &req)
 }
-
-func TestEmbedTruncation(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	t.Run("single input token count", func(t *testing.T) {
-		req := api.EmbedRequest{
-			Model: "all-minilm",
-			Input: "why is the sky blue?",
-		}
-
-		res, err := embedTestHelper(ctx, client, t, req)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if res.PromptEvalCount <= 0 {
-			t.Fatalf("expected positive token count, got %d", res.PromptEvalCount)
-		}
-	})
-
-	t.Run("batch parallel token counting", func(t *testing.T) {
-		req := api.EmbedRequest{
-			Model: "all-minilm",
-			Input: []string{"cat", "dog and mouse", "bird"},
-		}
-
-		res, err := embedTestHelper(ctx, client, t, req)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if len(res.Embeddings) != 3 {
-			t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
-		}
-
-		if res.PromptEvalCount <= 0 {
-			t.Fatalf("expected positive token count, got %d", res.PromptEvalCount)
-		}
-	})
-
-	t.Run("truncation single input", func(t *testing.T) {
-		truncTrue := true
-		longInput := strings.Repeat("word ", 100)
-
-		req := api.EmbedRequest{
-			Model:    "all-minilm",
-			Input:    longInput,
-			Truncate: &truncTrue,
-			Options:  map[string]any{"num_ctx": 50},
-		}
-
-		res, err := embedTestHelper(ctx, client, t, req)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if res.PromptEvalCount > 50 {
-			t.Fatalf("expected tokens <= 50 after truncation, got %d", res.PromptEvalCount)
-		}
-
-		if res.PromptEvalCount == 0 {
-			t.Fatal("expected non-zero token count after truncation")
-		}
-	})
-
-	t.Run("truncation batch", func(t *testing.T) {
-		truncTrue := true
-		req := api.EmbedRequest{
-			Model:    "all-minilm",
-			Input:    []string{"short", strings.Repeat("long ", 100), "medium text"},
-			Truncate: &truncTrue,
-			Options:  map[string]any{"num_ctx": 30},
-		}
-
-		res, err := embedTestHelper(ctx, client, t, req)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if len(res.Embeddings) != 3 {
-			t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
-		}
-
-		if res.PromptEvalCount > 90 {
-			t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
-		}
-	})
-
-	t.Run("truncate false error", func(t *testing.T) {
-		truncFalse := false
-		req := api.EmbedRequest{
-			Model:    "all-minilm",
-			Input:    strings.Repeat("word ", 100),
-			Truncate: &truncFalse,
-			Options:  map[string]any{"num_ctx": 10},
-		}
-
-		_, err := embedTestHelper(ctx, client, t, req)
-		if err == nil {
-			t.Fatal("expected error when truncate=false with long input")
-		}
-
-		if !strings.Contains(err.Error(), "exceeds maximum context length") {
-			t.Fatalf("expected context length error, got: %v", err)
-		}
-	})
-
-	t.Run("runner token count accuracy", func(t *testing.T) {
-		baseline := api.EmbedRequest{Model: "all-minilm", Input: "test"}
-		baseRes, err := embedTestHelper(ctx, client, t, baseline)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		batch := api.EmbedRequest{
-			Model: "all-minilm",
-			Input: []string{"test", "test", "test"},
-		}
-		batchRes, err := embedTestHelper(ctx, client, t, batch)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		expectedCount := baseRes.PromptEvalCount * 3
-		if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
-			t.Fatalf("expected ~%d tokens (3 × %d), got %d",
-				expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
-		}
-	})
-}
-
-// TestEmbedStatusCode tests that errors from the embedding endpoint
-// properly preserve their HTTP status codes when returned to the client.
-// This test specifically checks the error handling path in EmbedHandler
-// where api.StatusError errors should maintain their original status code.
-func TestEmbedStatusCode(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	// Pull the model if needed
-	if err := PullIfMissing(ctx, client, "all-minilm"); err != nil {
-		t.Fatal(err)
-	}
-
-	t.Run("truncation error status code", func(t *testing.T) {
-		truncFalse := false
-		longInput := strings.Repeat("word ", 100)
-
-		req := api.EmbedRequest{
-			Model:    "all-minilm",
-			Input:    longInput,
-			Truncate: &truncFalse,
-			Options:  map[string]any{"num_ctx": 10},
-		}
-
-		_, err := embedTestHelper(ctx, client, t, req)
-		if err == nil {
-			t.Fatal("expected error when truncate=false with long input")
-		}
-
-		// Check that it's a StatusError with the correct status code
-		var statusErr api.StatusError
-		if !errors.As(err, &statusErr) {
-			t.Fatalf("expected api.StatusError, got %T: %v", err, err)
-		}
-
-		// The error should be a 4xx client error (likely 400 Bad Request)
-		// not a 500 Internal Server Error
-		if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
-			t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
-		}
-
-		// Verify the error message is meaningful
-		if !strings.Contains(err.Error(), "context length") {
-			t.Errorf("expected error message to mention context length, got: %v", err)
-		}
-	})
-
-	t.Run("batch truncation error status code", func(t *testing.T) {
-		truncFalse := false
-		req := api.EmbedRequest{
-			Model: "all-minilm",
-			Input: []string{
-				"short input",
-				strings.Repeat("very long input ", 100),
-				"another short input",
-			},
-			Truncate: &truncFalse,
-			Options:  map[string]any{"num_ctx": 10},
-		}
-
-		_, err := embedTestHelper(ctx, client, t, req)
-		if err == nil {
-			t.Fatal("expected error when one input exceeds context with truncate=false")
-		}
-
-		// Check that it's a StatusError with the correct status code
-		var statusErr api.StatusError
-		if !errors.As(err, &statusErr) {
-			t.Fatalf("expected api.StatusError, got %T: %v", err, err)
-		}
-
-		// The error should be a 4xx client error, not a 500 Internal Server Error
-		if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
-			t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
-		}
-	})
-}
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -26,6 +26,13 @@ func TestVisionModels(t *testing.T) {
 		{
 			model: "gemma3",
 		},
+		{
+			model: "qwen3-vl:8b",
+		},
+		{
+			// Qwen 3 VL mixture of experts
+			model: "qwen3-vl:30b",
+		},
 	}

 	for _, v := range testCases {
--- a/integration/testdata/embed.json
+++ b/integration/testdata/embed.json
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -248,12 +248,14 @@ var (
 		"zephyr",
 	}
 	libraryEmbedModels = []string{
+		"qwen3-embedding",
+		"embeddinggemma",
+		"nomic-embed-text",
 		"all-minilm",
 		"bge-large",
 		"bge-m3",
 		"granite-embedding",
 		"mxbai-embed-large",
-		"nomic-embed-text",
 		"paraphrase-multilingual",
 		"snowflake-arctic-embed",
 		"snowflake-arctic-embed2",
@@ -321,7 +323,7 @@ func GetTestEndpoint() (*api.Client, string) {
 		}
 	}

-	if os.Getenv("OLLAMA_TEST_EXISTING") == "" && port == defaultPort {
+	if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" && port == defaultPort {
 		port = FindPort()
 	}

@@ -335,15 +337,20 @@ func GetTestEndpoint() (*api.Client, string) {
 		http.DefaultClient), fmt.Sprintf("%s:%s", host, port)
 }

-var serverMutex sync.Mutex
-var serverReady bool
-var serverLogFile string
+// Server lifecycle management
+var (
+	serverMutex sync.Mutex   // locked by startServer around the serverReady check and server launch
+	serverReady bool         // when true, startServer returns without launching anything
+	serverLog   bytes.Buffer // receives the server's stdout and stderr; reset before each launch
+	serverDone  chan int     // recreated by startServer before each launch
+	serverCmd   *exec.Cmd    // the "serve" command built by startServer
+)

 func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
 	// Make sure the server has been built
 	CLIName, err := filepath.Abs("../ollama")
 	if err != nil {
-		return err
+		return fmt.Errorf("failed to get absolute path: %w", err)
 	}

 	if runtime.GOOS == "windows" {
@@ -351,72 +358,42 @@ func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
 	}
 	_, err = os.Stat(CLIName)
 	if err != nil {
-		return fmt.Errorf("CLI missing, did you forget to build first?  %w", err)
+		return fmt.Errorf("CLI missing, did you forget to 'go build .' first?  %w", err)
 	}
 	serverMutex.Lock()
 	defer serverMutex.Unlock()
 	if serverReady {
 		return nil
 	}
+	serverDone = make(chan int)
+	serverLog.Reset()

 	if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
 		slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
 		t.Setenv("OLLAMA_HOST", ollamaHost)
 	}

-	logDir := t.TempDir()
-	slog.Info("starting server", "url", ollamaHost)
-	done, err := SpawnServer(ctx, "../ollama", logDir)
-	if err != nil {
-		return fmt.Errorf("failed to start server: %w", err)
-	}
-
+	serverCmd = exec.Command(CLIName, "serve")
+	serverCmd.Stderr = &serverLog
+	serverCmd.Stdout = &serverLog
 	go func() {
-		<-ctx.Done()
-		serverMutex.Lock()
-		defer serverMutex.Unlock()
-		exitCode := <-done
-		if exitCode > 0 {
-			slog.Warn("server failure", "exit", exitCode)
-		}
-		serverReady = false
-	}()
-
-	// TODO wait only long enough for the server to be responsive...
-	time.Sleep(500 * time.Millisecond)
-
-	serverReady = true
-	return nil
-}
-
-func SpawnServer(ctx context.Context, command, logDir string) (chan int, error) {
-	done := make(chan int)
-	fp, err := os.CreateTemp(logDir, "ollama-server-*.log")
-	if err != nil {
-		return nil, fmt.Errorf("failed to create log file: %w", err)
-	}
-	serverLogFile = fp.Name()
-
-	cmd := exec.CommandContext(ctx, command, "serve")
-	cmd.Stderr = fp
-	cmd.Stdout = fp
-
-	go func() {
-		slog.Info("starting server...")
-		if err := cmd.Run(); err != nil {
-			// "signal: killed" expected
+		slog.Info("starting server", "url", ollamaHost)
+		if err := serverCmd.Run(); err != nil {
+			// "signal: killed" expected during normal shutdown
 			if !strings.Contains(err.Error(), "signal") {
 				slog.Info("failed to run server", "error", err)
 			}
 		}
 		var code int
-		if cmd.ProcessState != nil {
-			code = cmd.ProcessState.ExitCode()
+		if serverCmd.ProcessState != nil {
+			code = serverCmd.ProcessState.ExitCode()
 		}
 		slog.Info("server exited")
-		done <- code
+		serverDone <- code
 	}()
-	return done, nil
+
+	serverReady = true
+	return nil
 }

 func PullIfMissing(ctx context.Context, client *api.Client, modelName string) error {
@@ -477,52 +454,65 @@ var serverProcMutex sync.Mutex
 // Starts the server if needed
 func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, string, func()) {
 	client, testEndpoint := GetTestEndpoint()
-	if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-		serverProcMutex.Lock()
-		if err := startServer(t, ctx, testEndpoint); err != nil {
+	cleanup := func() {}
+	if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" {
+		var err error
+		err = startServer(t, ctx, testEndpoint)
+		if err != nil {
 			t.Fatal(err)
 		}
+		cleanup = func() {
+			serverMutex.Lock()
+			defer serverMutex.Unlock()
+			serverReady = false
+
+			slog.Info("shutting down server")
+			serverCmd.Process.Signal(os.Interrupt)
+			slog.Info("waiting for server to exit")
+			<-serverDone
+			slog.Info("terminate complete")
+
+			if t.Failed() {
+				slog.Warn("SERVER LOG FOLLOWS")
+				io.Copy(os.Stderr, &serverLog)
+				slog.Warn("END OF SERVER")
+			}
+			slog.Info("cleanup complete", "failed", t.Failed())
+		}
 	}
 	// Make sure server is online and healthy before returning
-	listCtx, cancel := context.WithDeadlineCause(
-		ctx,
-		time.Now().Add(120*time.Second),
-		fmt.Errorf("list models took too long"),
-	)
-	defer cancel()
-	models, err := client.ListRunning(listCtx)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(models.Models) > 0 {
-		names := make([]string, len(models.Models))
-		for i, m := range models.Models {
-			names[i] = m.Name
+	for {
+		select {
+		case <-ctx.Done():
+			t.Fatalf("context done before server ready: %v", ctx.Err())
+			break
+		default:
 		}
-		slog.Info("currently loaded", "models", names)
+		listCtx, cancel := context.WithDeadlineCause(
+			ctx,
+			time.Now().Add(10*time.Second),
+			fmt.Errorf("list models took too long"),
+		)
+		models, err := client.ListRunning(listCtx)
+		cancel() // release this attempt's deadline immediately; a defer inside the retry loop would not run until the function returns
+		if err != nil {
+			if runtime.GOOS == "windows" {
+				t.Fatalf("did you forget to start the server: %v", err)
+			}
+			time.Sleep(10 * time.Millisecond)
+			continue
+		}
+		if len(models.Models) > 0 {
+			names := make([]string, len(models.Models))
+			for i, m := range models.Models {
+				names[i] = m.Name
+			}
+			slog.Info("currently loaded", "models", names)
+		}
+		break
 	}

-	return client, testEndpoint, func() {
-		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-			defer serverProcMutex.Unlock()
-			if t.Failed() {
-				fp, err := os.Open(serverLogFile)
-				if err != nil {
-					slog.Error("failed to open server log", "logfile", serverLogFile, "error", err)
-					return
-				}
-				defer fp.Close()
-				data, err := io.ReadAll(fp)
-				if err != nil {
-					slog.Error("failed to read server log", "logfile", serverLogFile, "error", err)
-					return
-				}
-				slog.Warn("SERVER LOG FOLLOWS")
-				os.Stderr.Write(data)
-				slog.Warn("END OF SERVER")
-			}
-		}
-	}
+	return client, testEndpoint, cleanup
 }

 func ChatTestHelper(ctx context.Context, t *testing.T, req api.ChatRequest, anyResp []string) {
--- a/llama/patches/0026-GPU-discovery-enhancements.patch
+++ b/llama/patches/0026-GPU-discovery-enhancements.patch
@@ -5,24 +5,33 @@ Subject: [PATCH] GPU discovery enhancements

 Expose more information about the devices through backend props, and leverage
 management libraries for more accurate VRAM usage reporting if available.
+
+vulkan: get GPU ID (ollama v0.11.5)
+
+Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
+
+Vulkan PCI and Memory
+
+fix vulkan PCI ID and ID handling
 ---
- ggml/include/ggml-backend.h        |  11 +
- ggml/src/CMakeLists.txt            |   2 +
- ggml/src/ggml-cuda/ggml-cuda.cu    |  74 +++++
- ggml/src/ggml-cuda/vendors/hip.h   |   3 +
- ggml/src/ggml-impl.h               |   8 +
- ggml/src/ggml-metal/ggml-metal.cpp |   2 +
- ggml/src/mem_hip.cpp               | 449 +++++++++++++++++++++++++++++
- ggml/src/mem_nvml.cpp              | 209 ++++++++++++++
- 8 files changed, 758 insertions(+)
+ ggml/include/ggml-backend.h          |   8 +
+ ggml/src/CMakeLists.txt              |   2 +
+ ggml/src/ggml-cuda/ggml-cuda.cu      |  65 ++++
+ ggml/src/ggml-cuda/vendors/hip.h     |   3 +
+ ggml/src/ggml-impl.h                 |   8 +
+ ggml/src/ggml-metal/ggml-metal.cpp   |   2 +
+ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++--
+ ggml/src/mem_hip.cpp                 | 452 +++++++++++++++++++++++++++
+ ggml/src/mem_nvml.cpp                | 209 +++++++++++++
+ 9 files changed, 931 insertions(+), 30 deletions(-)
 create mode 100644 ggml/src/mem_hip.cpp
 create mode 100644 ggml/src/mem_nvml.cpp

 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index ba181d09d..094fc3c82 100644
+index ba181d09d..809835243 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
-@@ -169,6 +169,17 @@ extern "C" {
+@@ -169,6 +169,14 @@ extern "C" {
         const char * device_id;
         // device capabilities
         struct ggml_backend_dev_caps caps;
@@ -31,9 +40,6 @@ index ba181d09d..094fc3c82 100644
 +        int compute_major;
 +        int compute_minor;
 +        int integrated;
-+        int pci_bus_id;
-+        int pci_device_id;
-+        int pci_domain_id;
 +        const char *library;
 +        // number with which the devices are accessed (Vulkan)
 +        const char *numeric_id;
@@ -54,7 +60,7 @@ index 0609c6503..aefe43bdd 100644
 
 target_include_directories(ggml-base PRIVATE .)
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 87c6c34a4..816597d2f 100644
+index 87c6c34a4..b075a18be 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -86,7 +92,7 @@ index 87c6c34a4..816597d2f 100644
         GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
                         id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
                         ggml_cuda_parse_uuid(prop, id).c_str());
-@@ -3484,6 +3499,14 @@ struct ggml_backend_cuda_device_context {
+@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context {
     std::string description;
     std::string pci_bus_id;
     std::string id;
@@ -95,22 +101,19 @@ index 87c6c34a4..816597d2f 100644
 +    int driver_major;
 +    int driver_minor;
 +    int integrated;
-+    int pciBusID;
-+    int pciDeviceID;
-+    int pciDomainID;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -3504,6 +3527,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     ggml_cuda_set_device(ctx->device);
 +
 +#if defined(GGML_USE_HIP)
 +    if (ggml_hip_mgmt_init() == 0) {
-+        int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
 +        if (status == 0) {
-+            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
 +            ggml_hip_mgmt_release();
 +            return;
 +        }
@@ -120,7 +123,7 @@ index 87c6c34a4..816597d2f 100644
 +    if (ggml_nvml_init() == 0) {
 +        int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
 +        if (status == 0) {
-+            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
 +            ggml_nvml_release();
 +            return;
 +        }
@@ -130,7 +133,7 @@ index 87c6c34a4..816597d2f 100644
     CUDA_CHECK(cudaMemGetInfo(free, total));
 }
 
-@@ -3512,6 +3557,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
     return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
@@ -138,7 +141,7 @@ index 87c6c34a4..816597d2f 100644
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
 
-@@ -3525,6 +3571,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     // If you need the memory data, call ggml_backend_dev_memory() explicitly.
     props->memory_total = props->memory_free = 0;
 
@@ -153,15 +156,12 @@ index 87c6c34a4..816597d2f 100644
 +    props->driver_major = ctx->driver_major;
 +    props->driver_minor = ctx->driver_minor;
 +    props->integrated = ctx->integrated;
-+    props->pci_bus_id = ctx->pciBusID;
-+    props->pci_device_id = ctx->pciDeviceID;
-+    props->pci_domain_id = ctx->pciDomainID;
 +    props->library = GGML_CUDA_NAME;
 +
     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
     bool events = false;
-@@ -4087,6 +4149,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -169,7 +169,7 @@ index 87c6c34a4..816597d2f 100644
 
             for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
-@@ -4102,6 +4165,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                 dev_ctx->pci_bus_id = pci_bus_id;
 
@@ -181,9 +181,6 @@ index 87c6c34a4..816597d2f 100644
 +                dev_ctx->driver_major = driverVersion / 1000;
 +                dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
 +                dev_ctx->integrated = prop.integrated;
-+                dev_ctx->pciBusID = prop.pciBusID;
-+                dev_ctx->pciDeviceID = prop.pciDeviceID;
-+                dev_ctx->pciDomainID = prop.pciDomainID;
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface   = */ ggml_backend_cuda_device_interface,
                     /* .reg     = */ &reg,
@@ -209,7 +206,7 @@ index 1f06be80e..2f9ef2dc0 100644
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
-index d0fb3bcca..80597b6ea 100644
+index d0fb3bcca..b63edd0c1 100644
 --- a/ggml/src/ggml-impl.h
 +++ b/ggml/src/ggml-impl.h
@@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
@@ -221,7 +218,7 @@ index d0fb3bcca..80597b6ea 100644
 +GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
 +GGML_API void ggml_nvml_release();
 +GGML_API int ggml_hip_mgmt_init();
-+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
 +GGML_API void ggml_hip_mgmt_release();
 +
 #ifdef __cplusplus
@@ -247,12 +244,319 @@ index f2ff9f322..f356e4a0a 100644
     props->caps = {
         /* .async                 = */ true,
         /* .host_buffer           = */ false,
+diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+index ed83236f4..0bbcecd01 100644
+--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+@@ -231,6 +231,7 @@ class vk_memory_logger;
+ #endif
+ class vk_perf_logger;
+ static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static std::string ggml_vk_get_device_id(int device);
+ 
+ static constexpr uint32_t mul_mat_vec_max_cols = 8;
+ static constexpr uint32_t p021_max_gqa_ratio = 8;
+@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
+     snprintf(description, description_size, "%s", props.deviceName.data());
+ }
+ 
+static std::string ggml_vk_get_device_id(int device) {
+    ggml_vk_instance_init();
+
+    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+    vk::PhysicalDeviceProperties2 props;
+    vk::PhysicalDeviceIDProperties deviceIDProps;
+    props.pNext = &deviceIDProps;
+    devices[device].getProperties2(&props);
+
+    const auto& uuid = deviceIDProps.deviceUUID;
+    char id[64];
+    snprintf(id, sizeof(id),
+        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        uuid[0], uuid[1], uuid[2], uuid[3],
+        uuid[4], uuid[5],
+        uuid[6], uuid[7],
+        uuid[8], uuid[9],
+        uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
+    );
+    return std::string(id);
+}
+
+ // backend interface
+ 
+ #define UNUSED GGML_UNUSED
+@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
+     ggml_vk_get_device_description(dev_idx, description, description_size);
+ }
+ 
+-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
+std::string ggml_backend_vk_get_device_id(int device) {
+     GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+-    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
+    int dev_idx = vk_instance.device_indices[device];
+    return ggml_vk_get_device_id(dev_idx);
+}
+
+//////////////////////////
+
+struct ggml_backend_vk_device_context {
+    size_t device;
+    std::string name;
+    std::string description;
+    bool is_integrated_gpu;
+    // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
+    std::string pci_id;
+    std::string id;
+    std::string uuid;
+    std::string numeric_id;
+    int major;
+    int minor;
+    int driver_major;
+    int driver_minor;
+};
+
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
+    GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
+    GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
+
+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
+ 
+-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+-    vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
+-    vk::PhysicalDeviceMemoryProperties2 memprops = {};
+-    bool membudget_supported = vk_instance.device_supports_membudget[device];
+    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+    vk::PhysicalDeviceProperties2 props2;
+    vkdev.getProperties2(&props2);
+ 
+-    if (membudget_supported) {
+-        memprops.pNext = &budgetprops;
+    if (!ctx->is_integrated_gpu)
+    {
+        // Use vendor specific management libraries for best VRAM reporting if available
+        switch (props2.properties.vendorID) {
+        case VK_VENDOR_ID_AMD:
+            if (ggml_hip_mgmt_init() == 0) {
+                int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
+                if (status == 0) {
+                    GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
+                    ggml_hip_mgmt_release();
+                    return;
+                }
+                ggml_hip_mgmt_release();
+            }
+            break;
+        case VK_VENDOR_ID_NVIDIA:
+            if (ggml_nvml_init() == 0) {
+                int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
+                if (status == 0) {
+                    GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
+                    ggml_nvml_release();
+                    return;
+                }
+                ggml_nvml_release();
+            }
+            break;
+        }
+     }
+-    vkdev.getMemoryProperties2(&memprops);
+    // else fallback to memory budget if supported
+ 
+-    for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
+-        const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
+    *total = 0;
+    *free = 0;
+    vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
+    vk::PhysicalDeviceMemoryProperties2 memprops2;
+    memprops2.pNext = &mem_budget_props;
+    vkdev.getMemoryProperties2(&memprops2);
+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
+        } else if (ctx->is_integrated_gpu) {
+            // Include shared memory on iGPUs
+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
+        }
+    }
+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+            *free += mem_budget_props.heapBudget[i];
+        } else if (ctx->is_integrated_gpu) {
+            *free += mem_budget_props.heapBudget[i];
+        }
+    }
+    if (*total > 0 && *free > 0) {
+        return;
+    } else if (*total > 0) {
+        *free = *total;
+        return;
+    }
+ 
+    // else just report the physical memory
+    for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
+         if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+             *total = heap.size;
+-
+-            if (membudget_supported && i < budgetprops.heapUsage.size()) {
+-                *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
+-            } else {
+-                *free = heap.size;
+-            }
+            *free = heap.size;
+             break;
+         }
+     }
+@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+         }
+     }
+ 
+    vk::PhysicalDeviceProperties2 props2;
+     if (!ext_support) {
+-        return "";
+        device.getProperties2(&props2);
+        if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
+            return "";
+        }
+        // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
+     }
+ 
+     vk::PhysicalDeviceProperties2 props = {};
+@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+ 
+     char pci_bus_id[16] = {};
+     snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+    if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
+        return "";
+    }
+ 
+     return std::string(pci_bus_id);
+ }
+ 
+-//////////////////////////
+-
+-struct ggml_backend_vk_device_context {
+-    size_t device;
+-    std::string name;
+-    std::string description;
+-    bool is_integrated_gpu;
+-    std::string pci_bus_id;
+-};
+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
+    if (id.empty()) return false;
+    unsigned int d = 0, b = 0, dev = 0, func = 0;
+    // Expected format: dddd:bb:dd.f (all hex)
+    int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
+    if (n < 4) return false;
+    if (domain) *domain = (int) d;
+    if (bus) *bus = (int) b;
+    if (device) *device = (int) dev;
+    return true;
+}
+ 
+ static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
+     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
+     return ctx->description.c_str();
+ }
+ 
+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->id.c_str();
+}
+
+ static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+-    ggml_backend_vk_get_device_memory(ctx->device, free, total);
+    ggml_backend_vk_get_device_memory(ctx, free, total);
+ }
+ 
+ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
+@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+ 
+     props->name        = ggml_backend_vk_device_get_name(dev);
+     props->description = ggml_backend_vk_device_get_description(dev);
+    props->id          = ggml_backend_vk_device_get_id(dev);
+     props->type        = ggml_backend_vk_device_get_type(dev);
+-    props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
+    props->device_id   = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
+     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
+     props->caps = {
+         /* .async                 = */ false,
+@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+         /* .buffer_from_host_ptr  = */ false,
+         /* .events                = */ false,
+     };
+
+    props->compute_major = ctx->major;
+    props->compute_minor = ctx->minor;
+    props->driver_major = ctx->driver_major;
+    props->driver_minor = ctx->driver_minor;
+    props->integrated = ctx->is_integrated_gpu;
+    props->library = GGML_VK_NAME;
+    props->numeric_id = ctx->numeric_id.c_str();
+ }
+ 
+ static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
+@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+         static std::mutex mutex;
+         std::lock_guard<std::mutex> lock(mutex);
+         if (!initialized) {
+            std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
+
+             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
+                 char desc[256];
+@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+                 ctx->name = GGML_VK_NAME + std::to_string(i);
+                 ctx->description = desc;
+                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
+-                ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+                ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
+                ctx->id = ggml_backend_vk_get_device_id(i);
+                 devices.push_back(new ggml_backend_device {
+                     /* .iface   = */ ggml_backend_vk_device_i,
+                     /* .reg     = */ reg,
+                     /* .context = */ ctx,
+                 });
+
+                // Gather additional information about the device
+                int dev_idx = vk_instance.device_indices[i];
+                vk::PhysicalDeviceProperties props1;
+                vk_devices[dev_idx].getProperties(&props1);
+                vk::PhysicalDeviceProperties2 props2;
+                vk::PhysicalDeviceIDProperties device_id_props;
+                vk::PhysicalDevicePCIBusInfoPropertiesEXT  pci_bus_props;
+                vk::PhysicalDeviceDriverProperties driver_props;
+                props2.pNext = &device_id_props;
+                device_id_props.pNext = &pci_bus_props;
+                pci_bus_props.pNext = &driver_props;
+                vk_devices[dev_idx].getProperties2(&props2);
+                std::ostringstream oss;
+                oss << std::hex << std::setfill('0');
+                int byteIdx = 0;
+                for (int i = 0; i < 16; ++i, ++byteIdx) {
+                    oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
+                    if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
+                        oss << '-';
+                    }
+                }
+                ctx->uuid = oss.str();
+                ctx->major = 0;
+                ctx->minor = 0;
+                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
+                ctx->driver_major = 0;
+                ctx->driver_minor = 0;
+                ctx->numeric_id = std::to_string(i);
+             }
+             initialized = true;
+         }
 diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
 new file mode 100644
-index 000000000..8ef19b8cf
+index 000000000..5a7f5d465
 --- /dev/null
 +++ b/ggml/src/mem_hip.cpp
-@@ -0,0 +1,449 @@
+@@ -0,0 +1,452 @@
 +#include "ggml.h"
 +
 +#ifdef _WIN32
@@ -586,7 +890,7 @@ index 000000000..8ef19b8cf
 +    if (gpus != NULL) gpus->pVtbl->Release(gpus); \
 +    if (gpu != NULL) gpu->pVtbl->Release(gpu)
 +
-+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
 +    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
 +    if (adlx.handle == NULL) {
 +        GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -598,9 +902,13 @@ index 000000000..8ef19b8cf
 +    IADLXGPU* gpu = NULL;
 +    IADLXGPUMetrics *gpuMetrics = NULL;
 +    ADLX_RESULT status;
-+    // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs 
-+    adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
 +
+    uint32_t pci_domain, pci_bus, pci_device, pci_function;
+    if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
+        // TODO - parse other formats?
+        GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
+        return ADLX_NOT_FOUND;
+    }
 +    status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
 +    if (ADLX_FAILED(status)) {
 +        GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -623,16 +931,15 @@ index 000000000..8ef19b8cf
 +            GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
 +            continue;
 +        }
-+        adlx_int id;
-+        status = gpu->pVtbl->UniqueId(gpu, &id);
+        adlx_int uniqueID;
+        status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
 +        if (ADLX_FAILED(status)) {
 +            GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
 +            gpu->pVtbl->Release(gpu);
 +            gpu = NULL;
 +            continue;
 +        }
-+        if (id != target) {
-+            GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+        if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
 +            gpu->pVtbl->Release(gpu);
 +            gpu = NULL;
 +            continue;
@@ -695,7 +1002,7 @@ index 000000000..8ef19b8cf
 +    return -1;
 +}
 +void ggml_hip_mgmt_release() {}
-+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
 +    return -1;
 +}
 +
--- a/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch
+++ b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs
 1 file changed, 68 insertions(+), 3 deletions(-)

 diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
-index c9073cef..f473a2a2 100644
+index c9073cef0..f473a2a2c 100644
 --- a/ggml/src/mem_nvml.cpp
 +++ b/ggml/src/mem_nvml.cpp
@@ -13,6 +13,7 @@
--- a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch
+++ b/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch
@@ -1,95 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Xiaodong Ye <xiaodong.ye@mthreads.com>
-Date: Mon, 18 Aug 2025 12:48:07 +0800
-Subject: [PATCH] vulkan: get GPU ID (ollama v0.11.5)
-
-Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 ++++++++++++++++++++++++++++
- 1 file changed, 37 insertions(+)
-
-diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 061cd078..adea7783 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -11588,6 +11588,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
-     snprintf(description, description_size, "%s", props.deviceName.data());
- }
-
-+static std::string ggml_vk_get_device_id(int device) {
-+    ggml_vk_instance_init();
-+
-+    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
-+
-+    vk::PhysicalDeviceProperties2 props;
-+    vk::PhysicalDeviceIDProperties deviceIDProps;
-+    props.pNext = &deviceIDProps;
-+    devices[device].getProperties2(&props);
-+
-+    const auto& uuid = deviceIDProps.deviceUUID;
-+    char id[64];
-+    snprintf(id, sizeof(id),
-+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-+        uuid[0], uuid[1], uuid[2], uuid[3],
-+        uuid[4], uuid[5],
-+        uuid[6], uuid[7],
-+        uuid[8], uuid[9],
-+        uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
-+    );
-+    return std::string(id);
-+}
-+
- // backend interface
-
- #define UNUSED GGML_UNUSED
-@@ -12394,6 +12417,12 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
-     ggml_vk_get_device_description(dev_idx, description, description_size);
- }
-
-+std::string ggml_backend_vk_get_device_id(int device) {
-+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-+    int dev_idx = vk_instance.device_indices[device];
-+    return ggml_vk_get_device_id(dev_idx);
-+}
-+
- void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-     GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-     GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-@@ -12481,6 +12510,7 @@ struct ggml_backend_vk_device_context {
-     std::string description;
-     bool is_integrated_gpu;
-     std::string pci_bus_id;
-+    std::string id;
- };
-
- static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
-@@ -12493,6 +12523,11 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
-     return ctx->description.c_str();
- }
-
-+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
-+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-+    return ctx->id.c_str();
-+}
-+
- static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
-     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
-     ggml_backend_vk_get_device_memory(ctx->device, free, total);
-@@ -12519,6 +12554,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
-
-     props->name        = ggml_backend_vk_device_get_name(dev);
-     props->description = ggml_backend_vk_device_get_description(dev);
-+    props->id          = ggml_backend_vk_device_get_id(dev);
-     props->type        = ggml_backend_vk_device_get_type(dev);
-     props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
-     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
-@@ -12965,6 +13001,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-                 ctx->description = desc;
-                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
-                 ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
-+                ctx->id = ggml_backend_vk_get_device_id(i);
-                 devices.push_back(new ggml_backend_device {
-                     /* .iface   = */ ggml_backend_vk_device_i,
-                     /* .reg     = */ reg,
-- 
-2.51.0
--- a/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
+++ b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
@@ -28,7 +28,7 @@ Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
 1 file changed, 9 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 6a278b5e9..87941f872 100644
+index b075a18be..d62f412d6 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
--- a/llama/patches/0028-vulkan-pci-and-memory.patch
+++ b/llama/patches/0028-vulkan-pci-and-memory.patch
@@ -1,254 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen <daniel@ollama.com>
-Date:   Fri Sep 5 08:25:03 2025 -0700
-Subject: [PATCH] Vulkan PCI and Memory
-
---
- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 176 ++++++++++++++++++++++-----
- 1 file changed, 145 insertions(+), 31 deletions(-)
-
-diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index adea7783..fb7204ce 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -12423,31 +12423,99 @@ std::string ggml_backend_vk_get_device_id(int device) {
-     return ggml_vk_get_device_id(dev_idx);
- }
- 
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-+//////////////////////////
-+
-+struct ggml_backend_vk_device_context {
-+    size_t device;
-+    std::string name;
-+    std::string description;
-+    bool is_integrated_gpu;
-+    // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
-+    std::string pci_id;
-+    std::string id;
-+    std::string uuid;
-+    int major;
-+    int minor;
-+    int driver_major;
-+    int driver_minor;
-+    int pci_bus_id;
-+    int pci_device_id;
-+    int pci_domain_id;
-+};
-+
-+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
-+    GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
-+    GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
-+
-+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
- 
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
-    vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
-    vk::PhysicalDeviceMemoryProperties2 memprops = {};
-    bool membudget_supported = vk_instance.device_supports_membudget[device];
-+    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
-+    vk::PhysicalDeviceProperties2 props2;
-+    vkdev.getProperties2(&props2);
- 
-    if (membudget_supported) {
-        memprops.pNext = &budgetprops;
-+    if (!ctx->is_integrated_gpu)
-+    {
-+        // Use vendor specific management libraries for best VRAM reporting if available
-+        switch (props2.properties.vendorID) {
-+        case VK_VENDOR_ID_AMD:
-+            if (ggml_hip_mgmt_init() == 0) {
-+                int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
-+                if (status == 0) {
-+                    GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
-+                    ggml_hip_mgmt_release();
-+                    return;
-+                }
-+                ggml_hip_mgmt_release();
-+            }
-+            break;
-+        case VK_VENDOR_ID_NVIDIA:
-+            if (ggml_nvml_init() == 0) {
-+                int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
-+                if (status == 0) {
-+                    GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
-+                    ggml_nvml_release();
-+                    return;
-+                }
-+                ggml_nvml_release();
-+            }
-+            break;
-+        }
-     }
-    vkdev.getMemoryProperties2(&memprops);
-+    // else fallback to memory budget if supported
- 
-    for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
-        const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
-+    *total = 0;
-+    *free = 0;
-+    vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
-+    vk::PhysicalDeviceMemoryProperties2 memprops2;
-+    memprops2.pNext = &mem_budget_props;
-+    vkdev.getMemoryProperties2(&memprops2);
-+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
-+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
-+        } else if (ctx->is_integrated_gpu) {
-+            // Include shared memory on iGPUs
-+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
-+        }
-+    }
-+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
-+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-+            *free += mem_budget_props.heapBudget[i];
-+        } else if (ctx->is_integrated_gpu) {
-+            *free += mem_budget_props.heapBudget[i];
-+        }
-+    }
-+    if (*total > 0 && *free > 0) {
-+        return;
-+    } else if (*total > 0) {
-+        *free = *total;
-+        return;
-+    }
- 
-+    // else just report the physical memory
-+    for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
-         if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-             *total = heap.size;
-
-            if (membudget_supported && i < budgetprops.heapUsage.size()) {
-                *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
-            } else {
-                *free = heap.size;
-            }
-+            *free = heap.size;
-             break;
-         }
-     }
-@@ -12502,16 +12570,17 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
-     return std::string(pci_bus_id);
- }
- 
-//////////////////////////
-
-struct ggml_backend_vk_device_context {
-    size_t device;
-    std::string name;
-    std::string description;
-    bool is_integrated_gpu;
-    std::string pci_bus_id;
-    std::string id;
-};
-+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
-+    if (id.empty()) return false;
-+    unsigned int d = 0, b = 0, dev = 0, func = 0;
-+    // Expected format: dddd:bb:dd.f (all hex)
-+    int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
-+    if (n < 4) return false;
-+    if (domain) *domain = (int) d;
-+    if (bus) *bus = (int) b;
-+    if (device) *device = (int) dev;
-+    return true;
-+}
- 
- static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
-     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-@@ -12530,7 +12599,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
- 
- static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
-     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
-    ggml_backend_vk_get_device_memory(ctx->device, free, total);
-+    ggml_backend_vk_get_device_memory(ctx, free, total);
- }
- 
- static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
-@@ -12556,7 +12625,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
-     props->description = ggml_backend_vk_device_get_description(dev);
-     props->id          = ggml_backend_vk_device_get_id(dev);
-     props->type        = ggml_backend_vk_device_get_type(dev);
-    props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
-+    props->device_id   = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
-     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
-     props->caps = {
-         /* .async                 = */ false,
-@@ -12564,6 +12633,17 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
-         /* .buffer_from_host_ptr  = */ false,
-         /* .events                = */ false,
-     };
-+
-+    props->compute_major = ctx->major;
-+    props->compute_minor = ctx->minor;
-+    props->driver_major = ctx->driver_major;
-+    props->driver_minor = ctx->driver_minor;
-+    props->integrated = ctx->is_integrated_gpu;
-+    props->pci_bus_id = ctx->pci_bus_id;
-+    props->pci_device_id = ctx->pci_device_id;
-+    props->pci_domain_id = ctx->pci_domain_id;
-+    props->library = GGML_VK_NAME;
-+    props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
- }
- 
- static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
-@@ -12992,6 +13071,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-         static std::mutex mutex;
-         std::lock_guard<std::mutex> lock(mutex);
-         if (!initialized) {
-+            std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
-+
-             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
-                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
-                 char desc[256];
-@@ -13000,13 +13081,46 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-                 ctx->name = GGML_VK_NAME + std::to_string(i);
-                 ctx->description = desc;
-                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
-                ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
-+                ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
-                 ctx->id = ggml_backend_vk_get_device_id(i);
-                 devices.push_back(new ggml_backend_device {
-                     /* .iface   = */ ggml_backend_vk_device_i,
-                     /* .reg     = */ reg,
-                     /* .context = */ ctx,
-                 });
-+
-+                // Gather additional information about the device
-+                int dev_idx = vk_instance.device_indices[i];
-+                vk::PhysicalDeviceProperties props1;
-+                vk_devices[dev_idx].getProperties(&props1);
-+                vk::PhysicalDeviceProperties2 props2;
-+                vk::PhysicalDeviceIDProperties device_id_props;
-+                vk::PhysicalDevicePCIBusInfoPropertiesEXT  pci_bus_props;
-+                vk::PhysicalDeviceDriverProperties driver_props;
-+                props2.pNext = &device_id_props;
-+                device_id_props.pNext = &pci_bus_props;
-+                pci_bus_props.pNext = &driver_props;
-+                vk_devices[dev_idx].getProperties2(&props2);
-+                std::ostringstream oss;
-+                oss << std::hex << std::setfill('0');
-+                oss << "GPU-";
-+                int byteIdx = 0;
-+                for (int i = 0; i < 16; ++i, ++byteIdx) {
-+                    oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
-+                    if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
-+                        oss << '-';
-+                    }
-+                }
-+                ctx->uuid = oss.str();
-+                ctx->pci_bus_id = pci_bus_props.pciBus;
-+                ctx->pci_device_id = pci_bus_props.pciDevice;
-+                ctx->pci_domain_id = pci_bus_props.pciDomain;
-+                ctx->id = std::to_string(i);
-+                ctx->major = 0;
-+                ctx->minor = 0;
-+                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
-+                ctx->driver_major = 0;
-+                ctx->driver_minor = 0;
-             }
-             initialized = true;
-         }
-- 
-2.51.0
--- a/llama/patches/0029-report-LoadLibrary-failures.patch
+++ b/llama/patches/0029-report-LoadLibrary-failures.patch
--- a/llm/server.go
+++ b/llm/server.go
@@ -69,7 +69,7 @@ type LlamaServer interface {
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-	Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error)
+	Embedding(ctx context.Context, input string) ([]float32, error)
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
@@ -1545,16 +1545,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 }

 type EmbeddingRequest struct {
-	Content  string `json:"content"`
-	Truncate bool   `json:"truncate"`
+	Content string `json:"content"`
 }

 type EmbeddingResponse struct {
-	Embedding       []float32 `json:"embedding"`
-	PromptEvalCount int       `json:"prompt_eval_count"`
+	Embedding []float32 `json:"embedding"`
 }

-func (s *llmServer) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
+func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
 	logutil.Trace("embedding request", "input", input)

 	if err := s.sem.Acquire(ctx, 1); err != nil {
@@ -1563,54 +1561,51 @@ func (s *llmServer) Embedding(ctx context.Context, input string, truncate bool)
 		} else {
 			slog.Error("Failed to acquire semaphore", "error", err)
 		}
-		return nil, 0, err
+		return nil, err
 	}
 	defer s.sem.Release(1)

 	// Make sure the server is ready
 	status, err := s.getServerStatusRetry(ctx)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	} else if status != ServerStatusReady {
-		return nil, 0, fmt.Errorf("unexpected server status: %s", status)
+		return nil, fmt.Errorf("unexpected server status: %s", status)
 	}

-	data, err := json.Marshal(EmbeddingRequest{Content: input, Truncate: truncate})
+	data, err := json.Marshal(EmbeddingRequest{Content: input})
 	if err != nil {
-		return nil, 0, fmt.Errorf("error marshaling embed data: %w", err)
+		return nil, fmt.Errorf("error marshaling embed data: %w", err)
 	}

 	r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
 	if err != nil {
-		return nil, 0, fmt.Errorf("error creating embed request: %w", err)
+		return nil, fmt.Errorf("error creating embed request: %w", err)
 	}
 	r.Header.Set("Content-Type", "application/json")

 	resp, err := http.DefaultClient.Do(r)
 	if err != nil {
-		return nil, 0, fmt.Errorf("do embedding request: %w", err)
+		return nil, fmt.Errorf("do embedding request: %w", err)
 	}
 	defer resp.Body.Close()

 	body, err := io.ReadAll(resp.Body)
 	if err != nil {
-		return nil, 0, fmt.Errorf("error reading embed response: %w", err)
+		return nil, fmt.Errorf("error reading embed response: %w", err)
 	}

 	if resp.StatusCode >= 400 {
 		log.Printf("llm embedding error: %s", body)
-		return nil, 0, api.StatusError{
-			StatusCode:   resp.StatusCode,
-			ErrorMessage: string(body),
-		}
+		return nil, fmt.Errorf("%s", body)
 	}

 	var e EmbeddingResponse
 	if err := json.Unmarshal(body, &e); err != nil {
-		return nil, 0, fmt.Errorf("unmarshal tokenize response: %w", err)
+		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}

-	return e.Embedding, e.PromptEvalCount, nil
+	return e.Embedding, nil
 }

 type TokenizeRequest struct {
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -161,6 +161,7 @@ type Tensor interface {

 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
+	Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor

 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
 		if props.library != nil {
 			info.Library = C.GoString(props.library)
 		}
-		info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
+		if props.device_id != nil {
+			info.PCIID = C.GoString(props.device_id)
+		}
 		info.LibraryPath = ggml.LibPaths()
 		if props.numeric_id != nil {
 			info.FilteredID = C.GoString(props.numeric_id)
@@ -1180,6 +1182,10 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
 }

 func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
+	if slices.Contains(shape, -1) {
+		inferShape(t, shape)
+	}
+
 	switch len(shape) {
 	case 0:
 		return &Tensor{
@@ -1322,7 +1328,43 @@ func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

+// inferShape updates shape in place, replacing a single -1 dimension with the
+// size implied by the input tensor's element count and the other dimensions.
+func inferShape(t *Tensor, shape []int) {
+	total := 1
+	for _, dim := range t.Shape() {
+		total *= dim
+	}
+
+	dim := -1
+	for i := range shape {
+		switch shape[i] {
+		case -1:
+			if dim != -1 {
+				panic("only one dimension can be inferred")
+			}
+			dim = i
+		case 0:
+			panic("dimension cannot be zero")
+		default:
+			if total%shape[i] != 0 {
+				panic("cannot infer dimension")
+			}
+
+			total /= shape[i]
+		}
+	}
+
+	if dim != -1 {
+		shape[dim] = total
+	}
+}
+
 func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
+	if slices.Contains(shape, -1) {
+		inferShape(t, shape)
+	}
+
 	switch len(shape) {
 	case 1:
 		return &Tensor{
@@ -1535,6 +1577,16 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 	}
 }

+func (t *Tensor) Conv3D(ctx ml.Context, t2 ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
+	var tt ml.Tensor = &Tensor{
+		b: t.b,
+		t: C.ggml_conv_3d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int64_t(c), C.int(s0), C.int(s1), C.int(s2), C.int(p0), C.int(p1), C.int(p2), C.int(d0), C.int(d1), C.int(d2)),
+	}
+
+	tt = tt.Reshape(ctx, t.Dim(3)/c, t2.Dim(3)/c)
+	return tt
+}
+
 func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
 	return &Tensor{
 		b: t.b,
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -174,9 +174,6 @@ extern "C" {
        int compute_major;
        int compute_minor;
        int integrated;
-        int pci_bus_id;
-        int pci_device_id;
-        int pci_domain_id;
        const char *library;
        // number with which the devices are accessed (Vulkan)
        const char *numeric_id;
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context {
    int driver_major;
    int driver_minor;
    int integrated;
-    int pciBusID;
-    int pciDeviceID;
-    int pciDomainID;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *

 #if defined(GGML_USE_HIP)
    if (ggml_hip_mgmt_init() == 0) {
-        int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
        if (status == 0) {
-            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
            ggml_hip_mgmt_release();
            return;
        }
@@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
    if (ggml_nvml_init() == 0) {
        int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
        if (status == 0) {
-            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
            ggml_nvml_release();
            return;
        }
@@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
    props->driver_major = ctx->driver_major;
    props->driver_minor = ctx->driver_minor;
    props->integrated = ctx->integrated;
-    props->pci_bus_id = ctx->pciBusID;
-    props->pci_device_id = ctx->pciDeviceID;
-    props->pci_domain_id = ctx->pciDomainID;
    props->library = GGML_CUDA_NAME;

    bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                dev_ctx->driver_major = driverVersion / 1000;
                dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
                dev_ctx->integrated = prop.integrated;
-                dev_ctx->pciBusID = prop.pciBusID;
-                dev_ctx->pciDeviceID = prop.pciDeviceID;
-                dev_ctx->pciDomainID = prop.pciDomainID;
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
                    /* .reg     = */ &reg,
--- a/ml/backend/ggml/ggml/src/ggml-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-impl.h
@@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init();
 GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
 GGML_API void ggml_nvml_release();
 GGML_API int ggml_hip_mgmt_init();
-GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
 GGML_API void ggml_hip_mgmt_release();

 #ifdef __cplusplus
--- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -231,6 +231,7 @@ class vk_memory_logger;
 #endif
 class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static std::string ggml_vk_get_device_id(int device);

 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) {
    const auto& uuid = deviceIDProps.deviceUUID;
    char id[64];
    snprintf(id, sizeof(id),
-        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        uuid[0], uuid[1], uuid[2], uuid[3],
        uuid[4], uuid[5],
        uuid[6], uuid[7],
@@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context {
    std::string pci_id;
    std::string id;
    std::string uuid;
+    std::string numeric_id;
    int major;
    int minor;
    int driver_major;
    int driver_minor;
-    int pci_bus_id;
-    int pci_device_id;
-    int pci_domain_id;
 };

 void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
@@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
        switch (props2.properties.vendorID) {
        case VK_VENDOR_ID_AMD:
            if (ggml_hip_mgmt_init() == 0) {
-                int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+                int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
                if (status == 0) {
-                    GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                    GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
                    ggml_hip_mgmt_release();
                    return;
                }
@@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
            if (ggml_nvml_init() == 0) {
                int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
                if (status == 0) {
-                    GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                    GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
                    ggml_nvml_release();
                    return;
                }
@@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
        }
    }

+    vk::PhysicalDeviceProperties2 props2;
    if (!ext_support) {
-        return "";
+        device.getProperties2(&props2);
+        if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
+            return "";
+        }
+        // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
    }

    vk::PhysicalDeviceProperties2 props = {};
@@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {

    char pci_bus_id[16] = {};
    snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+    if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
+        return "";
+    }

    return std::string(pci_bus_id);
 }
@@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
    props->driver_major = ctx->driver_major;
    props->driver_minor = ctx->driver_minor;
    props->integrated = ctx->is_integrated_gpu;
-    props->pci_bus_id = ctx->pci_bus_id;
-    props->pci_device_id = ctx->pci_device_id;
-    props->pci_domain_id = ctx->pci_domain_id;
    props->library = GGML_VK_NAME;
-    props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
+    props->numeric_id = ctx->numeric_id.c_str();
 }

 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                vk_devices[dev_idx].getProperties2(&props2);
                std::ostringstream oss;
                oss << std::hex << std::setfill('0');
-                oss << "GPU-";
                int byteIdx = 0;
                for (int i = 0; i < 16; ++i, ++byteIdx) {
                    oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
@@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                    }
                }
                ctx->uuid = oss.str();
-                ctx->pci_bus_id = pci_bus_props.pciBus;
-                ctx->pci_device_id = pci_bus_props.pciDevice;
-                ctx->pci_domain_id = pci_bus_props.pciDomain;
-                ctx->id = std::to_string(i);
                ctx->major = 0;
                ctx->minor = 0;
                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
                ctx->driver_major = 0;
                ctx->driver_minor = 0;
+                ctx->numeric_id = std::to_string(i);
            }
            initialized = true;
        }
--- a/ml/backend/ggml/ggml/src/mem_hip.cpp
+++ b/ml/backend/ggml/ggml/src/mem_hip.cpp
@@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() {
    if (gpus != NULL) gpus->pVtbl->Release(gpus); \
    if (gpu != NULL) gpu->pVtbl->Release(gpu)

-int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
    if (adlx.handle == NULL) {
        GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
    IADLXGPU* gpu = NULL;
    IADLXGPUMetrics *gpuMetrics = NULL;
    ADLX_RESULT status;
-    // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs 
-    adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);

+    uint32_t pci_domain, pci_bus, pci_device, pci_function;
+    if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
+        // TODO - parse other formats?
+        GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
+        return ADLX_NOT_FOUND;
+    }
    status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
    if (ADLX_FAILED(status)) {
        GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
            GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
            continue;
        }
-        adlx_int id;
-        status = gpu->pVtbl->UniqueId(gpu, &id);
+        adlx_int uniqueID;
+        status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
        if (ADLX_FAILED(status)) {
            GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
            gpu->pVtbl->Release(gpu);
            gpu = NULL;
            continue;
        }
-        if (id != target) {
-            GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+        if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
            gpu->pVtbl->Release(gpu);
            gpu = NULL;
            continue;
@@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() {
    return -1;
 }
 void ggml_hip_mgmt_release() {}
-int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
    return -1;
 }

--- a/ml/backend/ggml/ggml_test.go
+++ b/ml/backend/ggml/ggml_test.go
@@ -0,0 +1,126 @@
+package ggml
+
+import (
+	"errors"
+	"os"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
+)
+
+// setup writes a GGUF file holding only general.architecture, loads it as a backend and returns an input context closed via tb.Cleanup.
+func setup(tb testing.TB) ml.Context {
+	tb.Helper()
+
+	tmp, err := os.CreateTemp(tb.TempDir(), "*.bin")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer tmp.Close()
+
+	kv := ggml.KV{"general.architecture": "test"}
+	if err := ggml.WriteGGUF(tmp, kv, nil); err != nil {
+		tb.Fatal(err)
+	}
+
+	backend, err := ml.NewBackend(tmp.Name(), ml.BackendParams{})
+	if err != nil {
+		tb.Fatal(err)
+	}
+
+	inputCtx := backend.NewContext().Input()
+	tb.Cleanup(func() {
+		inputCtx.Close()
+		backend.Close()
+	})
+	return inputCtx
+}
+
+// TestInferShape checks that inferShape resolves a single -1 dimension in place and panics with the expected message otherwise.
+func TestInferShape(t *testing.T) {
+	cases := []struct {
+		name  string
+		input []int
+		want  []int
+		err   error
+	}{
+		{
+			name:  "no inferred shape",
+			input: []int{2, 3, 4},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "infer begin",
+			input: []int{-1, 3, 4},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "infer mid",
+			input: []int{2, -1, 4},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "infer end",
+			input: []int{2, 3, -1},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "too many inferred dims",
+			input: []int{-1, 3, -1},
+			err:   errors.New("only one dimension can be inferred"),
+		},
+		{
+			name:  "infer gather",
+			input: []int{2, -1},
+			want:  []int{2, 12},
+		},
+		{
+			name:  "infer gather all",
+			input: []int{-1},
+			want:  []int{24},
+		},
+		{
+			name:  "infer split",
+			input: []int{2, -1, 3, 2},
+			want:  []int{2, 2, 3, 2},
+		},
+		{
+			name:  "indivisible infer",
+			input: []int{2, -1, 2, 4},
+			err:   errors.New("cannot infer dimension"),
+		},
+		{
+			name:  "infer zero dim",
+			input: []int{2, 0, 4},
+			err:   errors.New("dimension cannot be zero"),
+		},
+	}
+
+	tensor, ok := setup(t).Empty(ml.DTypeF32, 2, 3, 4).(*Tensor)
+	if !ok {
+		t.Fatal("expected *Tensor")
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			defer func() {
+				r := recover()
+				switch msg, isString := r.(string); {
+				case r == nil && tt.err != nil:
+					t.Errorf("expected panic but did not get one: %v", tt.err)
+				case r != nil && tt.err == nil:
+					t.Errorf("unexpected panic: %v", r)
+				case r != nil && (!isString || msg != tt.err.Error()):
+					t.Errorf("expected panic %q but got %v", tt.err.Error(), r)
+				}
+			}()
+
+			inferShape(tensor, tt.input)
+			if diff := cmp.Diff(tt.want, tt.input); diff != "" {
+				t.Errorf("%s: shape mismatch (-want +got):\n%s", tt.name, diff)
+			}
+		})
+	}
+}
--- a/ml/device.go
+++ b/ml/device.go
@@ -391,6 +391,10 @@ func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison {
 	if a.PCIID != b.PCIID {
 		return UniqueDevice
 	}
+	// If PCIID is empty, we have to use ID + library for uniqueness
+	if a.PCIID == "" && a.DeviceID != b.DeviceID {
+		return UniqueDevice
+	}
 	if a.Library == b.Library {
 		return SameBackendDevice
 	}
@@ -454,13 +458,13 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
 	var envVar string
 	switch d.Library {
 	case "ROCm":
+		// ROCm must be filtered as it can crash the runner on unsupported devices
 		envVar = "ROCR_VISIBLE_DEVICES"
 		if runtime.GOOS != "linux" {
 			envVar = "HIP_VISIBLE_DEVICES"
 		}
-	case "Vulkan":
-		envVar = "GGML_VK_VISIBLE_DEVICES"
 	default:
+		// CUDA and Vulkan are not filtered via env var, but via scheduling decisions
 		return
 	}
 	v, existing := env[envVar]
--- a/ml/nn/convolution.go
+++ b/ml/nn/convolution.go
@@ -4,8 +4,27 @@ import "github.com/ollama/ollama/ml"

 type Conv2D struct {
 	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
 }

 func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
-	return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
+	t = m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
+	if m.Bias != nil {
+		// Bias shape is (out_channels,) while t shape is (width, height, out_channels, batch)
+		t = t.Add(ctx, m.Bias.Reshape(ctx, 1, 1, -1))
+	}
+	return t
+}
+
+type Conv3D struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+func (m *Conv3D) Forward(ctx ml.Context, t ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
+	t = m.Weight.Conv3D(ctx, t, c, s0, s1, s2, p0, p1, p2, d0, d1, d2)
+	if m.Bias != nil {
+		t = t.Add(ctx, m.Bias)
+	}
+	return t
 }
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -14,4 +14,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
+	_ "github.com/ollama/ollama/model/models/qwen3vl"
 )
--- a/model/models/qwen3/model.go
+++ b/model/models/qwen3/model.go
@@ -3,6 +3,7 @@ package qwen3
 import (
 	"cmp"
 	"math"
+	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -210,7 +211,7 @@ var _ model.Model = (*Model)(nil)
 func New(c fs.Config) (model.Model, error) {
 	layers := make([]Layer, c.Uint("block_count"))
 	for i := range layers {
-		if c.String("general.architecture") == "qwen3moe" {
+		if strings.HasSuffix(c.String("general.architecture"), "moe") {
 			layers[i].MLP = &sparse{}
 		} else {
 			layers[i].MLP = &dense{}
--- a/model/models/qwen3vl/imageprocessor.go
+++ b/model/models/qwen3vl/imageprocessor.go
@@ -0,0 +1,194 @@
+package qwen3vl
+
+import (
+	"fmt"
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor contains configuration for the Qwen 3 VL image processing
+type ImageProcessor struct {
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	mergeSize         int
+	shortestEdge      int // despite the name, SmartResize treats this as the minimum resized area (height*width)
+	longestEdge       int // despite the name, SmartResize treats this as the maximum resized area (height*width)
+	factor            int // patchSize * mergeSize; SmartResize returns multiples of it
+	rescaleFactor     float32
+	imageMean         []float32
+	imageStd          []float32
+}
+
+// newImageProcessor reads the vision.* config keys with the fallbacks given below; temporalPatchSize, longestEdge and rescaleFactor are fixed here.
+func newImageProcessor(c fs.Config) ImageProcessor {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+
+	return ImageProcessor{
+		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		patchSize:         patchSize,
+		temporalPatchSize: 2,
+		mergeSize:         mergeSize,
+		shortestEdge:      int(c.Uint("vision.shortest_edge", 64<<10)),
+		// FIXME(mxyng): the model defined longest edge (16M) is too large for the default
+		// context length of 8K and will panic. Adjusting to 2M for now.
+		// longestEdge:   int(c.Uint("vision.longest_edge", 16<<20)),
+		longestEdge:   2 << 20,
+		factor:        patchSize * mergeSize,
+		rescaleFactor: 1.0 / 255.0,
+		imageMean:     c.Floats("vision.image_mean", imageproc.ImageNetStandardMean[:]),
+		imageStd:      c.Floats("vision.image_std", imageproc.ImageNetStandardSTD[:]),
+	}
+}
+
+// SmartResize returns (height, width) rounded to multiples of p.factor, rescaled when the rounded area is above p.longestEdge or below p.shortestEdge; it panics if a side is smaller than p.factor or the aspect ratio exceeds 200.
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+	if height < factor || width < factor {
+		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
+	} else if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
+		// Divide as floats: integer division truncated ratios such as 200.9 to 200 and let them through.
+		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
+	}
+
+	round := func(x float64) int { return int(math.RoundToEven(x)) }
+
+	hBar := round(float64(height)/float64(factor)) * factor
+	wBar := round(float64(width)/float64(factor)) * factor
+
+	if hBar*wBar > p.longestEdge {
+		beta := math.Sqrt(float64(height*width) / float64(p.longestEdge))
+
+		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
+		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
+	} else if hBar*wBar < p.shortestEdge {
+		beta := math.Sqrt(float64(p.shortestEdge) / float64(height*width))
+
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+type Grid struct {
+	Height   int // patch rows; ProcessImage sets resized height / patchSize
+	Width    int // patch columns; ProcessImage sets resized width / patchSize
+	Temporal int // ProcessImage always sets 1 (single image)
+}
+
+// ProcessImage resizes and normalizes img and returns its patches as a (patchDim, numPatches) tensor together with the patch grid.
+func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) {
+	// Return an error instead of an index-out-of-range panic when mean/std hold fewer than three values.
+	if len(p.imageMean) < 3 || len(p.imageStd) < 3 {
+		return nil, nil, fmt.Errorf("image mean and std need 3 values, got %d and %d", len(p.imageMean), len(p.imageStd))
+	}
+
+	origWidth := img.Bounds().Dx()
+	origHeight := img.Bounds().Dy()
+
+	// SmartResize takes and returns (height, width), in that order.
+	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
+	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
+
+	normalizedPixels := imageproc.Normalize(
+		resizedImg,
+		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
+		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		true, // rescale
+		true, // channelFirst
+	)
+
+	// The grid is counted in patches; a single image has one temporal step.
+	grid := &Grid{
+		Height:   resizedHeight / p.patchSize,
+		Width:    resizedWidth / p.patchSize,
+		Temporal: 1,
+	}
+
+	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create patches: %w", err)
+	}
+
+	patchDim := p.numChannels * p.temporalPatchSize * p.patchSize * p.patchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+
+	return ctx.Input().FromFloats(patches, patchDim, numPatches), grid, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) { // the returned error is always nil
+	channels := p.numChannels
+	patchSize := p.patchSize
+	mergeSize := p.mergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	// result holds numPatches patches of patchDim floats each
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
+
+	result := make([]float32, numPatches*patchDim)
+	patchIndex := 0
+
+	// The same input pixels are used for every temporal grid step
+	for range grid.Temporal {
+		for h := 0; h < grid.Height; h += mergeSize {
+			for w := 0; w < grid.Width; w += mergeSize {
+				// Emit the mergeSize x mergeSize patches of one merge window consecutively
+				for mh := range mergeSize {
+					for mw := range mergeSize {
+						baseOffset := patchIndex * patchDim
+
+						// Extract patch data for first temporal frame
+						for c := range channels {
+							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+
+							for py := range patchSize {
+								for px := range patchSize {
+									// Calculate source pixel coordinates
+									y := (h+mh)*patchSize + py
+									x := (w+mw)*patchSize + px
+
+									// Source index in input tensor (CHW format)
+									srcIdx := c*height*width + y*width + x
+
+									// Destination index in first temporal frame
+									dstIdx := channelOffset + (py * patchSize) + px
+
+									if srcIdx < len(pixels) && dstIdx < len(result) {
+										result[dstIdx] = pixels[srcIdx]
+									}
+								}
+							}
+						}
+
+						// Copy first temporal frame to all other frames
+						if temporalPatchSize > 1 {
+							for c := range channels {
+								channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+								firstFrameOffset := channelOffset
+								frameSize := patchSize * patchSize
+
+								// Copy first frame to all other frames
+								for tp := 1; tp < temporalPatchSize; tp++ {
+									currentFrameOffset := channelOffset + (tp * frameSize)
+									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
+										result[firstFrameOffset:firstFrameOffset+frameSize])
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil
+}
--- a/model/models/qwen3vl/model.go
+++ b/model/models/qwen3vl/model.go
@@ -0,0 +1,204 @@
+package qwen3vl
+
+import (
+	"bytes"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Model struct {
+	model.Base
+	model.TextProcessor
+
+	*TextModel
+	*VisionModel `gguf:"v"`
+
+	ImageProcessor
+
+	positionCache []int32 // one position per token, rebuilt by PostTokenize and indexed by batch position in Forward
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	pixelValues, grid, err := m.ProcessImage(ctx, img)
+	if err != nil {
+		return nil, err
+	}
+
+	// Run the vision model; entry 0 carries its main output and the grid, each deepstack embed follows as its own entry
+	visionOutputs, deepstackVisualEmbeds := m.VisionModel.Forward(ctx, pixelValues, grid)
+	mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
+	for i := range deepstackVisualEmbeds {
+		mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
+	}
+
+	return mm, nil
+}
+
+var (
+	tokenVision      int32 = 151655
+	tokenVisionStart int32 = 151652
+	tokenVisionEnd   int32 = 151653
+)
+
+type modelInput struct {
+	*input.Input
+	position int32
+}
+
+// PostTokenize expands each image input into start, vision and end tokens and records one position per emitted token in m.positionCache
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+	m.positionCache = m.positionCache[:0]
+	return slices.Collect(func(yield func(*input.Input) bool) {
+		for i := range inputs {
+			s := []modelInput{{Input: inputs[i]}}
+			if mm := inputs[i].Multimodal; mm != nil {
+				t := mm[0].Tensor
+				s = slices.Repeat([]modelInput{
+					{
+						position: int32(i + 1),
+						Input:    &input.Input{Token: tokenVision},
+					},
+				}, t.Dim(1)+1+1) // t.Dim(1) vision tokens plus the start and end markers set below
+
+				s[0] = modelInput{
+					Input:    &input.Input{Token: tokenVisionStart},
+					position: int32(i),
+				}
+
+				s[len(s)-1] = modelInput{
+					Input:    &input.Input{Token: tokenVisionEnd},
+					position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1), // NOTE(review): unchecked assertion, panics if Data is not *Grid
+				}
+
+				s[1] = modelInput{
+					Input: &input.Input{
+						Token:          tokenVision,
+						Multimodal:     inputs[i].Multimodal,
+						MultimodalHash: inputs[i].MultimodalHash,
+						SameBatch:      t.Dim(1),
+					},
+					position: int32(i + 1),
+				}
+			}
+
+			for _, e := range s {
+				position := e.position
+				if position == 0 && len(m.positionCache) > 0 { // a zero position falls back to the previous position + 1
+					position = m.positionCache[len(m.positionCache)-1] + 1
+				}
+
+				m.positionCache = append(m.positionCache, position)
+				if !yield(e.Input) {
+					return
+				}
+			}
+		}
+	}), nil
+}
+
+// Forward embeds the batch, splices in vision outputs and deepstack embeds, builds three rows of positions and runs the text layers.
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions)))
+	for i, id := range batch.Positions {
+		if id < int32(len(m.positionCache)) {
+			id = m.positionCache[id]
+		} else if len(m.positionCache) > 0 {
+			id = id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
+		}
+		positionSlice[0][i], positionSlice[1][i], positionSlice[2][i] = id, id, id
+	}
+
+	hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
+
+	var deepstackVisualEmbeds []ml.Tensor
+	for _, mi := range batch.Multimodal {
+		visionOutputs := mi.Multimodal[0].Tensor
+		ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+
+		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
+			for i := range visionOutputs.Dim(1) {
+				w := grid.Width / m.spatialMergeSize
+				positionSlice[1][mi.Index+i] += int32(i / w)
+				positionSlice[2][mi.Index+i] += int32(i % w)
+			}
+		}
+
+		// Allocate each buffer once per batch so a later image does not discard the embeds copied in for an earlier one.
+		for i, mm := range mi.Multimodal[1:] {
+			if i >= len(deepstackVisualEmbeds) {
+				deepstackVisualEmbeds = append(deepstackVisualEmbeds, ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...))
+			}
+			ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
+		}
+	}
+
+	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice))
+	cos, sin := m.rotaryEmbedding(ctx, positions)
+	for i, layer := range m.TextModel.Layers {
+		if m.Cache != nil {
+			m.Cache.SetLayer(i)
+		}
+
+		var outputs ml.Tensor
+		if i == len(m.TextModel.Layers)-1 {
+			outputs = batch.Outputs
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options)
+		if i < len(deepstackVisualEmbeds) {
+			hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
+		}
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, 1e-06)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+func New(c fs.Config) (model.Model, error) {
+	m := Model{
+		TextProcessor: model.NewBytePairEncoding(
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		),
+		TextModel:      newTextModel(c),
+		VisionModel:    newVisionModel(c),
+		ImageProcessor: newImageProcessor(c),
+	}
+
+	m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) {
+		m.positionCache = nil
+		return nil, kvcache.ErrNotSupported
+	})
+	return &m, nil
+}
+
+func init() {
+	model.Register("qwen3vl", New)
+	model.Register("qwen3vlmoe", New)
+}
--- a/model/models/qwen3vl/model_text.go
+++ b/model/models/qwen3vl/model_text.go
@@ -0,0 +1,229 @@
+package qwen3vl
+
+import (
+	"cmp"
+	"math"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type TextOptions struct {
+	hiddenSize,
+	numHeads,
+	numKVHeads,
+	keyLength,
+	valueLength int
+
+	eps,
+	ropeBase,
+	ropeScale float32
+	mropeSections []int
+
+	numExperts, numExpertsUsed int
+	normTopKProb               bool
+
+	inverseFrequenciesCache []float32
+}
+
+func (o TextOptions) headDim() int {
+	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) // cmp.Or yields the first non-zero argument
+}
+
+type TextAttention struct {
+	Query     *nn.Linear  `gguf:"attn_q"`
+	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
+	Key       *nn.Linear  `gguf:"attn_k"`
+	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
+	Value     *nn.Linear  `gguf:"attn_v"`
+	Output    *nn.Linear  `gguf:"attn_output"`
+}
+
+// Forward projects hiddenStates to per-head queries, keys and values, normalizes and rotates q/k, attends and applies Output.
+func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	headDim, tokens := opts.headDim(), hiddenStates.Dim(1)
+
+	q := sa.Query.Forward(ctx, hiddenStates)
+	k := sa.Key.Forward(ctx, hiddenStates)
+	v := sa.Value.Forward(ctx, hiddenStates)
+
+	q = q.Reshape(ctx, headDim, opts.numHeads, tokens)
+	k = k.Reshape(ctx, headDim, opts.numKVHeads, tokens)
+	v = v.Reshape(ctx, headDim, opts.numKVHeads, tokens)
+
+	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
+	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
+	q = applyRotaryPositionalEmbedding(ctx, q, cos, sin)
+	k = applyRotaryPositionalEmbedding(ctx, k, cos, sin)
+
+	out := nn.Attention(ctx, q, k, v, 1./math.Sqrt(float64(headDim)), cache)
+	out = out.Reshape(ctx, out.Dim(0)*out.Dim(1), tokens)
+	return sa.Output.Forward(ctx, out)
+}
+
+type TextMLP interface {
+	Forward(ml.Context, ml.Tensor, *TextOptions) ml.Tensor
+}
+
+type sparse struct {
+	Router *nn.Linear      `gguf:"ffn_gate_inp"`
+	Gate   *nn.LinearBatch `gguf:"ffn_gate_exps"`
+	Up     *nn.LinearBatch `gguf:"ffn_up_exps"`
+	Down   *nn.LinearBatch `gguf:"ffn_down_exps"`
+}
+
+// Forward routes each token to opts.numExpertsUsed experts and returns the sum of their outputs scaled by the routing weights.
+func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
+	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
+	routerLogits := mlp.Router.Forward(ctx, hiddenStates)
+	// Softmax the router logits, take the top numExpertsUsed experts per token and gather their weights.
+	routingWeights := routerLogits.Softmax(ctx)
+	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
+	if opts.normTopKProb {
+		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
+		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
+		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
+	}
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
+	// Gate, Up and Down are evaluated only for the selected experts.
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates, selectedExperts))
+
+	experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
+	experts = experts.Mul(ctx, routingWeights)
+	// Add up the numExpertsUsed slices along dim 1 of the weighted expert outputs.
+	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
+	for i := 1; i < opts.numExpertsUsed; i++ {
+		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
+	}
+	return nextStates
+}
+
+type dense struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *TextOptions) ml.Tensor {
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates)) // Gate and Up both take the layer input
+	return mlp.Down.Forward(ctx, hiddenStates) // the options argument is unused; it is part of the TextMLP signature
+}
+
+type TextLayer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	*TextAttention
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	TextMLP
+}
+
+// Forward applies normalized attention and the normalized MLP to hiddenStates, each followed by a residual add.
+func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts)
+	// When outputs is set, keep only the selected rows of both branches before the residual add.
+	if outputs != nil {
+		hiddenStates = hiddenStates.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.TextMLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type TextModel struct {
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	Layers []TextLayer `gguf:"blk"`
+
+	Options *TextOptions
+}
+
+// rotaryEmbedding returns the cos and sin of the position frequencies; inverse frequencies are computed on first use and cached in Options.
+func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) {
+	positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1))
+	if len(m.Options.inverseFrequenciesCache) == 0 {
+		m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2)
+		for i := range m.Options.inverseFrequenciesCache {
+			frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim())))
+			m.Options.inverseFrequenciesCache[i] = 1 / frequency
+		}
+	}
+
+	inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache))
+	positions = positions.Cast(ctx, ml.DTypeF32)
+	frequencies := inverseFrequencies.Mulmat(ctx, positions)
+
+	interleaved := frequencies.View(ctx,
+		0, frequencies.Dim(0),
+		frequencies.Stride(1), frequencies.Dim(1),
+	)
+	// For i = 1, 2: copy mropeSections[i] elements, every third one along dim 0 starting at element i, from slice i of dim 2 into interleaved.
+	for _, i := range []int{1, 2} {
+		args := []int{
+			i * frequencies.Stride(0), 1,
+			3 * frequencies.Stride(0), m.Options.mropeSections[i],
+			frequencies.Stride(1), frequencies.Dim(1),
+		}
+
+		ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...).
+			Copy(ctx, interleaved.View(ctx, args[0], args[1:]...)))
+	}
+	// Duplicate along dim 0 and insert a singleton dim 1 before taking cos and sin.
+	interleaved = interleaved.Concat(ctx, interleaved, 0)
+	interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2))
+	return interleaved.Cos(ctx), interleaved.Sin(ctx)
+}
+
+var _ model.Model = (*Model)(nil)
+
+// newTextModel builds the text layers and options from config; architectures ending in "moe" get the sparse MLP, all others the dense one.
+func newTextModel(c fs.Config) *TextModel {
+	useExperts := strings.HasSuffix(c.String("general.architecture"), "moe")
+	layers := make([]TextLayer, c.Uint("block_count"))
+	for i := range layers {
+		if useExperts {
+			layers[i].TextMLP = &sparse{}
+		} else {
+			layers[i].TextMLP = &dense{}
+		}
+	}
+
+	return &TextModel{
+		Layers: layers,
+		Options: &TextOptions{
+			hiddenSize:     int(c.Uint("embedding_length")),
+			numHeads:       int(c.Uint("attention.head_count")),
+			numKVHeads:     int(c.Uint("attention.head_count_kv")),
+			keyLength:      int(c.Uint("attention.key_length")),
+			valueLength:    int(c.Uint("attention.value_length")),
+			eps:            c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:       c.Float("rope.freq_base"),
+			ropeScale:      c.Float("rope.scaling.factor", 1),
+			numExperts:     int(c.Uint("expert_count")),
+			numExpertsUsed: int(c.Uint("expert_used_count")),
+			normTopKProb:   c.Bool("norm_top_k_prob", true),
+			mropeSections: slices.Collect(func(yield func(int) bool) {
+				for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
+					if !yield(int(section)) {
+						return
+					}
+				}
+			}),
+		},
+	}
+}
--- a/model/models/qwen3vl/model_vision.go
+++ b/model/models/qwen3vl/model_vision.go
@@ -0,0 +1,268 @@
+package qwen3vl
+
+import (
+	"iter"
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+// VisionAttention holds the four linear projections of a vision attention
+// block: query, key and value projections plus the output projection. The
+// gguf tags name the tensors each projection is loaded from.
+type VisionAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_out"`
+}
+
+// rotateHalf splits t in two along dimension 0 and returns the concatenation
+// of the negated second half followed by the first half, i.e. (-x2, x1).
+//
+// Both halves are taken as strided views of t; the second half is made
+// contiguous before it is scaled.
+// NOTE(review): the views address four dimensions (Dim(0)..Dim(3)), while the
+// visible callers pass tensors reshaped to three — presumably Dim(3) is 1
+// there; confirm against the ml.Tensor contract.
+func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
+	// x1: first Dim(0)/2 elements of dimension 0, starting at offset 0.
+	x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
+	// x2: the remaining half, starting Dim(0)/2 elements into dimension 0.
+	x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
+	return x2.Scale(ctx, -1).Concat(ctx, x1, 0)
+}
+
+// applyRotaryPositionalEmbedding rotates t by the supplied cos/sin tables:
+// the result is t*cos + rotateHalf(t)*sin.
+func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
+	scaled := t.Mul(ctx, cos)
+	rotated := rotateHalf(ctx, t).Mul(ctx, sin)
+	return scaled.Add(ctx, rotated)
+}
+
+// Forward runs attention over hiddenStates.
+//
+// Query, key and value are each projected and reshaped to
+// (headDim, numHeads, n), where n is dimension 1 of the projection. Query and
+// key are additionally rotated with the cos/sin position tables. Attention is
+// scaled by headDim^-0.5, and the result is flattened back to hiddenSize
+// before the output projection is applied.
+func (sa *VisionAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
+	// splitHeads applies one projection and separates the result into heads.
+	splitHeads := func(proj *nn.Linear) ml.Tensor {
+		t := proj.Forward(ctx, hiddenStates)
+		return t.Reshape(ctx, opts.headDim(), opts.numHeads, t.Dim(1))
+	}
+
+	q := applyRotaryPositionalEmbedding(ctx, splitHeads(sa.Query), cos, sin)
+	k := applyRotaryPositionalEmbedding(ctx, splitHeads(sa.Key), cos, sin)
+	v := splitHeads(sa.Value)
+
+	scale := math.Pow(float64(opts.headDim()), -0.5)
+	out := nn.Attention(ctx, q, k, v, scale, nil)
+	out = out.Reshape(ctx, opts.hiddenSize, out.Dim(2))
+	return sa.Output.Forward(ctx, out)
+}
+
+// VisionMLP is the two-layer feed-forward block of a vision encoder layer:
+// FC1 is applied first and FC2 second (see Forward).
+type VisionMLP struct {
+	FC1 *nn.Linear `gguf:"linear_fc1"`
+	FC2 *nn.Linear `gguf:"linear_fc2"`
+}
+
+// Forward computes FC2(GELU(FC1(hiddenStates))). opts is accepted for
+// signature parity with the other vision blocks and is not read.
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts VisionOptions) ml.Tensor {
+	expanded := mlp.FC1.Forward(ctx, hiddenStates)
+	activated := expanded.GELU(ctx)
+	return mlp.FC2.Forward(ctx, activated)
+}
+
+// VisionEncoderLayer is one vision encoder block: a layer norm followed by
+// attention, then a second layer norm followed by an MLP, each wrapped in a
+// residual connection (see Forward).
+type VisionEncoderLayer struct {
+	Norm1     *nn.LayerNorm `gguf:"norm1"`
+	Attention *VisionAttention
+	Norm2     *nn.LayerNorm `gguf:"norm2"`
+	MLP       *VisionMLP    `gguf:"mlp"`
+}
+
+// Forward applies the encoder layer to hiddenStates.
+//
+// Both sub-blocks normalize their input first and add the un-normalized input
+// back afterwards: x + Attention(Norm1(x)), then y + MLP(Norm2(y)).
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
+	// Attention sub-block with residual connection.
+	attended := e.Norm1.Forward(ctx, hiddenStates, opts.eps)
+	attended = e.Attention.Forward(ctx, attended, cos, sin, opts)
+	attended = attended.Add(ctx, hiddenStates)
+
+	// Feed-forward sub-block with residual connection.
+	output := e.Norm2.Forward(ctx, attended, opts.eps)
+	output = e.MLP.Forward(ctx, output, opts)
+	return output.Add(ctx, attended)
+}
+
+// VisionOptions carries the vision tower hyperparameters that newVisionModel
+// reads from the model config.
+type VisionOptions struct {
+	// Integer sizes. gridPerSide is the side length of the square position
+	// embedding table (the integer square root of the configured number of
+	// positional embeddings).
+	hiddenSize,
+	numHeads,
+	patchSize,
+	numChannels,
+	spatialMergeSize,
+	temporalPatchSize,
+	gridPerSide int
+
+	// eps is passed to the layer norms; ropeTheta is the base used when
+	// building the rotary frequency table.
+	eps,
+	ropeTheta float32
+
+	// deepstackVisualIndexes lists the encoder layer indexes whose output is
+	// additionally passed through the matching deepstack merger.
+	deepstackVisualIndexes []int32
+	mropeSections          []int
+}
+
+// headDim reports the width of a single attention head: hiddenSize divided
+// (with integer division) by numHeads.
+func (o VisionOptions) headDim() int {
+	perHead := o.hiddenSize / o.numHeads
+	return perHead
+}
+
+// VisionPatchMerger combines groups of spatialMergeSize*spatialMergeSize
+// patch embeddings into one vector and projects it through a layer norm and
+// a two-layer MLP (see Forward).
+type VisionPatchMerger struct {
+	Norm *nn.LayerNorm `gguf:"norm"`
+	FC1  *nn.Linear    `gguf:"linear_fc1"`
+	FC2  *nn.Linear    `gguf:"linear_fc2"`
+}
+
+// Forward merges visionOutputs into rows of
+// hiddenSize*spatialMergeSize*spatialMergeSize elements and projects them
+// with FC2(GELU(FC1(x))).
+//
+// postshuffleNorm selects where the layer norm is applied: when true the
+// input is reshaped to the merged width before normalization, otherwise it is
+// normalized at its incoming shape and reshaped afterwards.
+func (m *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, postshuffleNorm bool, opts VisionOptions) ml.Tensor {
+	mergedSize := opts.hiddenSize * opts.spatialMergeSize * opts.spatialMergeSize
+	merge := func(t ml.Tensor) ml.Tensor {
+		return t.Reshape(ctx, mergedSize, -1)
+	}
+
+	if postshuffleNorm {
+		visionOutputs = merge(visionOutputs)
+	}
+
+	normed := merge(m.Norm.Forward(ctx, visionOutputs, opts.eps))
+	activated := m.FC1.Forward(ctx, normed).GELU(ctx)
+	return m.FC2.Forward(ctx, activated)
+}
+
+// VisionPositionEmbedding wraps the learned position embedding table that
+// Forward samples and blends for each patch of the input grid.
+type VisionPositionEmbedding struct {
+	PositionEmbedding *nn.Embedding `gguf:"pos_embed"`
+}
+
+// makeSlice2D returns a sequence of n0 freshly allocated, zero-filled slices
+// of length n1. Collecting the sequence yields an n0 x n1 matrix whose rows
+// do not share backing storage.
+func makeSlice2D[T int32 | float32](n0, n1 int) iter.Seq[[]T] {
+	return func(yield func([]T) bool) {
+		for row := 0; row < n0; row++ {
+			fresh := make([]T, n1)
+			if keepGoing := yield(fresh); !keepGoing {
+				break
+			}
+		}
+	}
+}
+
+// Forward adds interpolated position embeddings to hiddenStates.
+//
+// The learned table is treated as a gridPerSide x gridPerSide square. Each
+// cell of the grid.Height x grid.Width patch grid is mapped onto that square
+// with evenly spaced coordinates, and its embedding is the bilinear blend of
+// the four surrounding table entries. The blended embeddings are regrouped by
+// spatialMergeSize (the Reshape/Permute at the end) and added to hiddenStates.
+//
+// A grid dimension of 1 maps to coordinate 0 rather than dividing by zero.
+func (m *VisionPositionEmbedding) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts VisionOptions) ml.Tensor {
+	// Row k of each matrix holds corner k of every grid cell, in the order
+	// (floorY, floorX), (floorY, ceilX), (ceilY, floorX), (ceilY, ceilX).
+	indexSlice := slices.Collect(makeSlice2D[int32](4, grid.Height*grid.Width))
+	weightSlice := slices.Collect(makeSlice2D[float32](4, grid.Height*grid.Width))
+
+	// step returns the spacing between n evenly spaced samples that cover
+	// [0, gridPerSide-1]. With a single sample the spacing is zero; dividing
+	// by n-1 there would produce Inf, NaN coordinates and undefined indexes.
+	step := func(n int) float32 {
+		if n <= 1 {
+			return 0
+		}
+		return float32(opts.gridPerSide-1) / float32(n-1)
+	}
+	stepHeight, stepWidth := step(grid.Height), step(grid.Width)
+
+	var i int
+	for h := range grid.Height {
+		for w := range grid.Width {
+			y, x := float32(h)*stepHeight, float32(w)*stepWidth
+
+			// Truncation is a floor here because y and x are non-negative.
+			floorY, floorX := int32(y), int32(x)
+			// Clamp so the upper neighbour never leaves the table.
+			ceilY, ceilX := min(floorY+1, int32(opts.gridPerSide-1)), min(floorX+1, int32(opts.gridPerSide-1))
+
+			indexSlice[0][i] = floorY*int32(opts.gridPerSide) + floorX
+			indexSlice[1][i] = floorY*int32(opts.gridPerSide) + ceilX
+			indexSlice[2][i] = ceilY*int32(opts.gridPerSide) + floorX
+			indexSlice[3][i] = ceilY*int32(opts.gridPerSide) + ceilX
+
+			// Bilinear weights from the fractional parts of y and x.
+			dy, dx := y-float32(floorY), x-float32(floorX)
+			weightSlice[0][i] = (1 - dy) * (1 - dx)
+			weightSlice[1][i] = (1 - dy) * dx
+			weightSlice[2][i] = dy * (1 - dx)
+			weightSlice[3][i] = dy * dx
+
+			i++
+		}
+	}
+
+	indices := ctx.Input().FromInts(slices.Concat(indexSlice...), grid.Height*grid.Width*4)
+	weights := ctx.Input().FromFloats(slices.Concat(weightSlice...), 1, grid.Height*grid.Width*4)
+
+	n := hiddenStates.Dim(0)
+	positionEmbeds := m.PositionEmbedding.Forward(ctx, indices)
+	positionEmbeds = positionEmbeds.Mul(ctx, weights)
+	positionEmbeds = positionEmbeds.Reshape(ctx, n, -1, 4)
+
+	// Sum the four weighted corner embeddings for every grid cell.
+	positionEmbeds = positionEmbeds.View(ctx, 0, n, positionEmbeds.Stride(1), grid.Height*grid.Width).
+		Add(ctx, positionEmbeds.View(ctx, 1*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
+		Add(ctx, positionEmbeds.View(ctx, 2*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
+		Add(ctx, positionEmbeds.View(ctx, 3*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width))
+
+	positionEmbeds = positionEmbeds.Reshape(ctx, -1, grid.Width/opts.spatialMergeSize, opts.spatialMergeSize, grid.Height/opts.spatialMergeSize)
+	positionEmbeds = positionEmbeds.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, n, -1)
+	return hiddenStates.Add(ctx, positionEmbeds)
+}
+
+// VisionModel is the vision tower: a 3D convolutional patch embedding, an
+// interpolated position embedding, a stack of encoder layers, a final patch
+// merger and one deepstack merger per entry of deepstackVisualIndexes.
+// VisionOptions is embedded so the hyperparameters are reachable directly.
+type VisionModel struct {
+	PatchEmbedding    *nn.Conv3D `gguf:"patch_embed"`
+	PositionEmbedding *VisionPositionEmbedding
+	Layers            []VisionEncoderLayer `gguf:"blk"`
+	PatchMerger       *VisionPatchMerger   `gguf:"merger"`
+	DeepstackMerger   []*VisionPatchMerger `gguf:"deepstack_merger"`
+
+	VisionOptions
+}
+
+// positions builds the rotary cos and sin tables for every patch in grid and
+// returns them in that order.
+func (m *VisionModel) positions(ctx ml.Context, grid *Grid) (_, _ ml.Tensor) {
+	// Emit the (y, x) coordinate pair of every grid cell, row by row.
+	indices := ctx.Input().FromInts(slices.Collect(func(yield func(int32) bool) {
+		for y := range grid.Height {
+			for x := range grid.Width {
+				if !yield(int32(y)) {
+					return
+				}
+				if !yield(int32(x)) {
+					return
+				}
+			}
+		}
+	}), grid.Width*grid.Height*2)
+
+	// Regroup the coordinates by spatialMergeSize, then flatten them again.
+	indices = indices.Reshape(ctx, -1, grid.Width/m.spatialMergeSize, m.spatialMergeSize, grid.Height/m.spatialMergeSize)
+	indices = indices.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	indices = indices.Reshape(ctx, -1)
+
+	halfDim := m.headDim() / 2
+	maxGrid := max(grid.Height, grid.Width)
+	// Frequency table with one row per coordinate value i in [0, maxGrid):
+	// entry j of a row is i / ropeTheta^(2j/halfDim), for j in [0, halfDim/2).
+	frequencies := ctx.Input().FromFloats(slices.Collect(func(yield func(float32) bool) {
+		ropeTheta := float64(m.ropeTheta)
+		for i := range maxGrid {
+			for j := range halfDim / 2 {
+				if !yield(float32(i) / float32(math.Pow(ropeTheta, float64(j*2)/float64(halfDim)))) {
+					return
+				}
+			}
+		}
+	}), halfDim/2, maxGrid)
+
+	// Look up the frequency row of every coordinate, lay the result out as
+	// (halfDim, 1, -1) and duplicate it along dimension 0 before taking
+	// cos and sin.
+	embeds := frequencies.Rows(ctx, indices)
+	embeds = embeds.Reshape(ctx, halfDim, 1, -1)
+	embeds = embeds.Concat(ctx, embeds, 0)
+	return embeds.Cos(ctx), embeds.Sin(ctx)
+}
+
+// Forward runs the vision tower on pixelValues for the given patch grid.
+//
+// It returns the merged output of the final encoder layer together with one
+// deepstack tensor per entry of deepstackVisualIndexes, each produced by the
+// matching DeepstackMerger from the output of the listed encoder layer.
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) (ml.Tensor, []ml.Tensor) {
+	patches := pixelValues.Reshape(ctx, m.patchSize, m.patchSize, m.temporalPatchSize, -1)
+	hiddenStates := m.PatchEmbedding.Forward(ctx, patches, m.numChannels, m.patchSize, m.patchSize, m.temporalPatchSize, 0, 0, 0, 1, 1, 1)
+	hiddenStates = m.PositionEmbedding.Forward(ctx, hiddenStates, grid, m.VisionOptions)
+
+	cos, sin := m.positions(ctx, grid)
+
+	deepstackStates := make([]ml.Tensor, len(m.deepstackVisualIndexes))
+	for layerIndex := range m.Layers {
+		hiddenStates = m.Layers[layerIndex].Forward(ctx, hiddenStates, cos, sin, m.VisionOptions)
+
+		// Only layers listed in deepstackVisualIndexes feed a deepstack merger.
+		slot := slices.Index(m.deepstackVisualIndexes, int32(layerIndex))
+		if slot < 0 {
+			continue
+		}
+		deepstackStates[slot] = m.DeepstackMerger[slot].Forward(ctx, hiddenStates, true, m.VisionOptions)
+	}
+
+	merged := m.PatchMerger.Forward(ctx, hiddenStates, false, m.VisionOptions)
+	return merged, deepstackStates
+}
+
+// newVisionModel builds the vision tower from the model config. Every option
+// except the deepstack indexes falls back to the default given here when the
+// key is missing. One deepstack merger slot is allocated per configured
+// deepstack index.
+func newVisionModel(c fs.Config) *VisionModel {
+	deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
+
+	// The config stores the sections as int32; widen each one to int.
+	var mropeSections []int
+	for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
+		mropeSections = append(mropeSections, int(section))
+	}
+
+	// The position table is square, so its side is the root of its size.
+	numPositions := c.Uint("vision.num_positional_embeddings", 2304)
+
+	options := VisionOptions{
+		hiddenSize:             int(c.Uint("vision.embedding_length", 1280)),
+		numHeads:               int(c.Uint("vision.attention.head_count", 16)),
+		patchSize:              int(c.Uint("vision.patch_size", 14)),
+		numChannels:            int(c.Uint("vision.num_channels", 3)),
+		eps:                    c.Float("vision.attention.layer_norm_epsilon", 1e-6),
+		ropeTheta:              c.Float("vision.rope.freq_base", 10000.0),
+		spatialMergeSize:       int(c.Uint("vision.spatial_merge_size", 2)),
+		temporalPatchSize:      int(c.Uint("vision.temporal_patch_size", 2)),
+		gridPerSide:            int(math.Sqrt(float64(numPositions))),
+		mropeSections:          mropeSections,
+		deepstackVisualIndexes: deepstackVisualIndexes,
+	}
+
+	return &VisionModel{
+		Layers:          make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
+		DeepstackMerger: make([]*VisionPatchMerger, len(deepstackVisualIndexes)),
+		VisionOptions:   options,
+	}
+}
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -709,13 +709,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {

 	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{
 		embedding: true,
-		truncate:  req.Truncate,
+
+		// TODO (jmorganca): this should be provided by the server via the
+		// request options and truncated here in the runner, instead of relying on
+		// the server's truncate logic
+		truncate: true,
 	})
 	if err != nil {
-		if errors.Is(err, errorInputTooLong) {
-			http.Error(w, err.Error(), http.StatusBadRequest)
-			return
-		}
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
 		return
 	}
@@ -758,8 +758,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 	embedding := <-seq.embedding

 	if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
-		Embedding:       embedding,
-		PromptEvalCount: seq.numPromptInputs,
+		Embedding: embedding,
 	}); err != nil {
 		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 	}
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -235,15 +235,28 @@ func countCommonPrefix(a []*input.Input, b []*input.Input) int32 {
 	return count
 }

-// TODO(jessegross): If we need to reprocess the inputs we should ensure that
-// we don't split up a SameBatch
-func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
-	targetFree := (c.numCtx - numKeep) / 2
-	targetFree = max(targetFree, 1)
+// ShiftDiscard computes how many inputs can be discarded from the cache. Inputs in the same batch
+// are discarded together.
+func (c *InputCache) ShiftDiscard(inputs []*input.Input, numKeep int32) int32 {
+	targetFree := max((c.numCtx-numKeep)/2, 1)
+	currentFree := c.numCtx - int32(len(inputs))

-	currentFree := c.numCtx - inputLen
+	var discard, sameBatch int32
+	for _, input := range inputs[numKeep:] {
+		if sameBatch <= 0 && currentFree >= targetFree {
+			break
+		}

-	return max(targetFree-currentFree, 0)
+		sameBatch--
+		currentFree++
+		discard++
+
+		if input.SameBatch > 0 {
+			sameBatch = int32(input.SameBatch)
+		}
+	}
+
+	return discard
 }

 type ErrReprocessInputs struct {
@@ -264,7 +277,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
 	}

 	inputLen := int32(len(slot.Inputs))
-	discard := c.ShiftDiscard(inputLen, numKeep)
+	discard := c.ShiftDiscard(slot.Inputs, numKeep)

 	if discard <= 0 {
 		return nil
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -3,6 +3,7 @@ package ollamarunner
 import (
 	"errors"
 	"fmt"
+	"slices"
 	"testing"
 	"time"

@@ -238,59 +239,137 @@ func TestShiftDiscard(t *testing.T) {
 		name     string
 		numCtx   int32
 		numKeep  int32
-		inputLen int32
+		inputs   []*input.Input
 		expected int32
 	}{
 		{
 			name:     "Shift",
 			numCtx:   2048,
 			numKeep:  5,
-			inputLen: 2048,
+			inputs:   slices.Repeat([]*input.Input{{}}, 2048),
 			expected: 1021,
 		},
 		{
 			name:     "Max Keep",
 			numCtx:   2048,
 			numKeep:  2047,
-			inputLen: 2048,
+			inputs:   slices.Repeat([]*input.Input{{}}, 2048),
 			expected: 1,
 		},
 		{
 			name:     "No Keep",
 			numCtx:   2048,
 			numKeep:  0,
-			inputLen: 2048,
+			inputs:   slices.Repeat([]*input.Input{{}}, 2048),
 			expected: 1024,
 		},
 		{
 			name:     "Truncate",
 			numCtx:   2048,
 			numKeep:  5,
-			inputLen: 5000,
+			inputs:   slices.Repeat([]*input.Input{{}}, 5000),
 			expected: 3973,
 		},
 		{
 			name:     "Truncate Keep",
 			numCtx:   2048,
 			numKeep:  2047,
-			inputLen: 5000,
+			inputs:   slices.Repeat([]*input.Input{{}}, 5000),
 			expected: 2953,
 		},
 		{
 			name:     "No Op",
 			numCtx:   2048,
 			numKeep:  5,
-			inputLen: 512,
+			inputs:   slices.Repeat([]*input.Input{{}}, 512),
 			expected: 0,
 		},
+		{
+			name:    "Same Batch",
+			numCtx:  2048,
+			numKeep: 5,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for range 1024 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+
+				if !yield(&input.Input{SameBatch: 512 - 1}) {
+					return
+				}
+
+				for range 2048 - 1024 - 1 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+			}),
+			expected: 1531,
+		},
+		{
+			name:    "Same Batch Near Start",
+			numCtx:  2048,
+			numKeep: 5,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for range 10 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+
+				if !yield(&input.Input{SameBatch: 512 - 1}) {
+					return
+				}
+
+				for range 2048 - 10 - 1 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+			}),
+			expected: 1021,
+		},
+		{
+			name:   "Consecutive Same Batch",
+			numCtx: 32,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for i := range 32 {
+					input := input.Input{}
+					if i%10 == 0 {
+						input.SameBatch = 10 - 1
+					}
+					if !yield(&input) {
+						return
+					}
+				}
+			}),
+			expected: 20,
+		},
+		{
+			name:   "Overlapping Same Batch",
+			numCtx: 32,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for i := range 32 {
+					input := input.Input{}
+					if slices.Contains([]int{4, 8, 14}, i) {
+						input.SameBatch = 10 - 1
+					}
+					if !yield(&input) {
+						return
+					}
+				}
+			}),
+			expected: 24,
+		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			c := InputCache{numCtx: tt.numCtx}
-			result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
+			result := c.ShiftDiscard(tt.inputs, tt.numKeep)
 			if result != tt.expected {
-				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
+				t.Errorf("shiftDiscard(ctx: %v, keep: %v inputs: %v): have %v; want %v", tt.numCtx, tt.numKeep, len(tt.inputs), result, tt.expected)
 			}
 		})
 	}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -214,7 +214,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
 		parts = []string{prompt}
 	}

-	postTokenize := false
 	for i, part := range parts {
 		// text - tokenize
 		tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
@@ -257,11 +256,10 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
 			mmStore.addMultimodal(imageEmbeddings)

 			inputs = append(inputs, &input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
-			postTokenize = true
 		}
 	}

-	if visionModel && postTokenize {
+	if visionModel {
 		var err error
 		inputs, err = multimodalProcessor.PostTokenize(inputs)
 		if err != nil {
@@ -948,13 +946,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "application/json")
 	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{
 		embedding: true,
-		truncate:  req.Truncate,
+
+		// TODO (jmorganca): this should be provided by the server via the
+		// request options and truncated here in the runner, instead of relying on
+		// the server's truncate logic
+		truncate: true,
 	})
 	if err != nil {
-		if errors.Is(err, errorInputTooLong) {
-			http.Error(w, err.Error(), http.StatusBadRequest)
-			return
-		}
 		http.Error(w, fmt.Sprintf("failed to create new sequence: %v", err), http.StatusInternalServerError)
 		return
 	}
@@ -995,8 +993,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 	}

 	if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
-		Embedding:       <-seq.embedding,
-		PromptEvalCount: seq.numPromptInputs,
+		Embedding: <-seq.embedding,
 	}); err != nil {
 		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 	}
--- a/server/routes.go
+++ b/server/routes.go
@@ -21,7 +21,6 @@ import (
 	"os/signal"
 	"slices"
 	"strings"
-	"sync/atomic"
 	"syscall"
 	"time"

@@ -143,7 +142,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C

 	// This model is much more capable with a larger context, so set that
 	// unless it would penalize performance too much
-	if !s.lowVRAM && slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
+	if !s.lowVRAM && slices.Contains([]string{
+		"gptoss", "gpt-oss",
+		"qwen3vl", "qwen3vlmoe",
+	}, model.Config.ModelFamily) {
 		opts.NumCtx = max(opts.NumCtx, 8192)
 	}

@@ -660,7 +662,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

-	r, _, _, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
+	r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
 	if err != nil {
 		handleScheduleError(c, req.Model, err)
 		return
@@ -673,12 +675,61 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

+	kvData, _, err := getModelData(m.ModelPath, false)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	var count int
+	for i, s := range input {
+		tokens, err := r.Tokenize(c.Request.Context(), s)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+
+		ctxLen := min(opts.NumCtx, int(kvData.ContextLength()))
+		if len(tokens) > ctxLen {
+			if !truncate {
+				c.JSON(http.StatusBadRequest, gin.H{"error": "input exceeds maximum context length"})
+				return
+			}
+
+			if bos := kvData.Uint("tokenizer.ggml.bos_token_id"); tokens[0] != int(bos) && kvData.Bool("add_bos_token", true) {
+				ctxLen--
+			}
+
+			if eos := kvData.Uint("tokenizer.ggml.eos_token_id"); tokens[len(tokens)-1] != int(eos) && kvData.Bool("add_eos_token", true) {
+				ctxLen--
+			}
+
+			slog.Info("", "ctxLen", ctxLen, "tokenCount", len(tokens))
+			if ctxLen <= 0 {
+				// return error if the truncated input would be empty or just special tokens
+				c.JSON(http.StatusBadRequest, gin.H{"error": "input after truncation exceeds maximum context length"})
+				return
+			}
+
+			tokens = tokens[:ctxLen]
+
+			s, err = r.Detokenize(c.Request.Context(), tokens)
+			if err != nil {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
+			}
+		}
+
+		count += len(tokens)
+
+		input[i] = s
+	}
+
 	var g errgroup.Group
 	embeddings := make([][]float32, len(input))
-	var totalTokens uint64
 	for i, text := range input {
 		g.Go(func() error {
-			embedding, tokenCount, err := r.Embedding(c.Request.Context(), text, truncate)
+			embedding, err := r.Embedding(c.Request.Context(), text)
 			if err != nil {
 				return err
 			}
@@ -688,18 +739,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 				embedding = normalize(embedding[:req.Dimensions])
 			}
 			embeddings[i] = embedding
-			atomic.AddUint64(&totalTokens, uint64(tokenCount))
 			return nil
 		})
 	}

 	if err := g.Wait(); err != nil {
-		var serr api.StatusError
-		if errors.As(err, &serr) {
-			c.AbortWithStatusJSON(serr.StatusCode, gin.H{"error": strings.TrimSpace(serr.ErrorMessage)})
-		} else {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
-		}
+		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
 		return
 	}

@@ -708,7 +753,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		Embeddings:      embeddings,
 		TotalDuration:   time.Since(checkpointStart),
 		LoadDuration:    checkpointLoaded.Sub(checkpointStart),
-		PromptEvalCount: int(totalTokens),
+		PromptEvalCount: count,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -754,7 +799,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding, _, err := r.Embedding(c.Request.Context(), req.Prompt, true)
+	embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
 		return
--- a/server/sched.go
+++ b/server/sched.go
@@ -390,11 +390,11 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
 		numParallel = 1
 	}

-	// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+	// `mllama`, `qwen3vl`, and `qwen3vlmoe` are snowflakes and use an encoder cache which cannot be used with num_parallel > 1
 	// ref: https://github.com/ollama/ollama/issues/4165
-	if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
+	if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe"}, req.model.Config.ModelFamily) && numParallel != 1 {
 		numParallel = 1
-		slog.Warn("mllama does not currently support parallel requests")
+		slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
 	}

 	sessionDuration := envconfig.KeepAlive()
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -780,8 +780,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn
 	return s.completionResp
 }

-func (s *mockLlm) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
-	return s.embeddingResp, 0, s.embeddingRespErr
+func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
+	return s.embeddingResp, s.embeddingRespErr
 }

 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
Author	SHA1	Message	Date
Daniel Hiltgen	c88647104d	int: harden server lifecycle (#12835 ) this should reduce zombies during integration runs	2025-10-29 11:50:56 -07:00
Patrick Devine	05aff4a4f1	tests: fix embeddinggemma integration test (#12830 )	2025-10-29 11:07:28 -07:00
Michael Yang	0d140bd1af	fix: conv2d bias (#12834 )	2025-10-29 11:03:43 -07:00
Jeffrey Morgan	93e45f0f0d	docs: temporarily restore api.md and cleanup docs paths (#12818 )	2025-10-28 23:25:48 -07:00
Jeffrey Morgan	a342160803	docs: fix root api documentation page (#12813 )	2025-10-28 19:17:54 -07:00
Jeffrey Morgan	f6c29409dc	docs: add new cloud model + fix openai redirect (#12812 )	2025-10-28 19:09:07 -07:00
Michael Yang	7d25b9e194	feat(model): add qwen3vl (#12665 )	2025-10-28 17:39:47 -07:00
Patrick Devine	36d64fb531	embed: add distance correlation test for library embed models (#12796 )	2025-10-28 16:57:27 -07:00
Parth Sareen	d828517e78	docs: update readme and links (#12809 )	2025-10-28 16:20:02 -07:00
Daniel Hiltgen	14977a9350	Fix vulkan PCI ID and ID handling (#12775 ) * Fix vulkan PCI ID and ID handling Intel GPUs may not report PCI IDs which was leading to incorrect overlap detection. Switch to using the existing PCI IDs, however AMD GPUs claim not to report PCI IDs, but actually do, so try anyway, as this is required for ADLX to find the GPUs on Windows. Numeric IDs lead to scheduling problems, so this also switches Vulkan to use UUID based IDs. The GPU discovery patches have been squashed into a single patch to simplify future rebases. * review comments	2025-10-28 15:15:35 -07:00
Patrick Devine	29f63f37c8	Revert "server: Consolidate embedding truncation in runner (#12730 )" (#12810 ) This reverts commit `5d347f6d6f`.	2025-10-28 14:49:14 -07:00