Compare commits

..

11 Commits

Author SHA1 Message Date
Daniel Hiltgen
c88647104d int: harden server lifecycle (#12835)
this should reduce zombies during integration runs
2025-10-29 11:50:56 -07:00
Patrick Devine
05aff4a4f1 tests: fix embeddinggemma integration test (#12830) 2025-10-29 11:07:28 -07:00
Michael Yang
0d140bd1af fix: conv2d bias (#12834) 2025-10-29 11:03:43 -07:00
Jeffrey Morgan
93e45f0f0d docs: temporarily restore api.md and cleanup docs paths (#12818) 2025-10-28 23:25:48 -07:00
Jeffrey Morgan
a342160803 docs: fix root api documentation page (#12813) 2025-10-28 19:17:54 -07:00
Jeffrey Morgan
f6c29409dc docs: add new cloud model + fix openai redirect (#12812) 2025-10-28 19:09:07 -07:00
Michael Yang
7d25b9e194 feat(model): add qwen3vl (#12665) 2025-10-28 17:39:47 -07:00
Patrick Devine
36d64fb531 embed: add distance correlation test for library embed models (#12796) 2025-10-28 16:57:27 -07:00
Parth Sareen
d828517e78 docs: update readme and links (#12809) 2025-10-28 16:20:02 -07:00
Daniel Hiltgen
14977a9350 Fix vulkan PCI ID and ID handling (#12775)
* Fix vulkan PCI ID and ID handling

Intel GPUs may not report PCI IDs which was leading to incorrect overlap
detection.  Switch to using the existing PCI IDs, however AMD GPUs claim not to
report PCI IDs, but actually do, so try anyway, as this is required for ADLX to
find the GPUs on Windows. Numeric IDs lead to scheduling problems, so this also
switches Vulkan to use UUID based IDs. The GPU discovery patches have been
squashed into a single patch to simplify future rebases.

* review comments
2025-10-28 15:15:35 -07:00
Patrick Devine
29f63f37c8 Revert "server: Consolidate embedding truncation in runner (#12730)" (#12810)
This reverts commit 5d347f6d6f.
2025-10-28 14:49:14 -07:00
49 changed files with 4077 additions and 912 deletions

View File

@@ -198,6 +198,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
conv = &qwen2Model{}
case "Qwen2_5_VLForConditionalGeneration":
conv = &qwen25VLModel{}
case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
conv = &qwen3VLModel{}
case "BertModel":
conv = &bertModel{}
case "CohereForCausalLM":

157
convert/convert_qwen3.go Normal file
View File

@@ -0,0 +1,157 @@
package convert
import (
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
)
// qwen3Model carries the configuration values used to convert a Qwen3 model.
// Fields are populated through the JSON tags below, and the KV method copies
// them into the converted model's key/value metadata.
type qwen3Model struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
HiddenLayers uint32 `json:"num_hidden_layers"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
// HeadDim is written to both attention.key_length and attention.value_length by KV.
HeadDim uint32 `json:"head_dim"`
// NumExperts greater than zero makes KV report the "moe" architecture variant
// and emit the expert related keys.
NumExperts uint32 `json:"num_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
NormTopkProb bool `json:"norm_topk_prob"`
RopeTheta float32 `json:"rope_theta"`
RopeScaling struct {
// Type selects the rope handling in KV: "" (none), "yarn", "mrope" or
// "default"; any other value makes KV panic.
Type string `json:"type"`
// Factor is only written to the metadata for the "yarn" type.
Factor ropeFactor `json:"factor"`
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
// MropeSection is only written to the metadata for the "mrope" and "default" types.
MropeSection []int32 `json:"mrope_section"`
} `json:"rope_scaling"`
RMSNormEPS float32 `json:"rms_norm_eps"`
}
// KV implements ModelConverter.
//
// It starts from the metadata produced by the embedded ModelParameters and adds
// the Qwen3 specific keys. When NumExperts is non-zero the architecture is
// reported as "qwen3moe" and the expert keys are included. It panics if the
// configured rope scaling type is not one of "", "yarn", "mrope" or "default".
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
	hasExperts := q.NumExperts > 0

	kv := q.ModelParameters.KV(t)
	if hasExperts {
		kv["general.architecture"] = "qwen3moe"
	} else {
		kv["general.architecture"] = "qwen3"
	}

	// Model dimensions.
	kv["block_count"] = q.HiddenLayers
	kv["context_length"] = q.MaxPositionEmbeddings
	kv["embedding_length"] = q.HiddenSize
	kv["feed_forward_length"] = q.IntermediateSize

	// Attention layout; key and value heads share the same length.
	kv["attention.head_count"] = q.NumAttentionHeads
	kv["attention.head_count_kv"] = q.NumKeyValueHeads
	kv["attention.key_length"] = q.HeadDim
	kv["attention.value_length"] = q.HeadDim
	kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS

	if hasExperts {
		kv["expert_count"] = q.NumExperts
		kv["expert_used_count"] = q.NumExpertsPerToken
		kv["norm_top_k_prob"] = q.NormTopkProb
	}

	kv["rope.freq_base"] = q.RopeTheta

	ropeType := q.RopeScaling.Type
	if ropeType == "yarn" {
		kv["rope.scaling.type"] = ropeType
		kv["rope.scaling.factor"] = q.RopeScaling.Factor
	} else if ropeType == "mrope" || ropeType == "default" {
		kv["rope.mrope_section"] = q.RopeScaling.MropeSection
	} else if ropeType != "" {
		// An empty type means no scaling; anything else is unsupported.
		panic("unknown rope scaling type")
	}

	return kv
}
// Tensors implements ModelConverter.
//
// Tensors whose name contains "ffn_gate_up_exps" are split along dimension 2
// into separate "gate" and "up" tensors, each with axes permuted to (0, 2, 1).
// Tensors whose name contains "ffn_down_exps" get the same axis permutation
// applied through a repacker. Every other tensor is passed through unchanged.
func (q *qwen3Model) Tensors(ts []Tensor) []*ggml.Tensor {
	// swapAxes permutes the axes of a split tensor to (0, 2, 1).
	swapAxes := func(in tensor.Tensor) (tensor.Tensor, error) {
		return tensor.Transpose(in, 0, 2, 1)
	}

	var converted []*ggml.Tensor

	// TODO: handle split experts
	for _, src := range ts {
		if strings.Contains(src.Name(), "ffn_gate_up_exps") {
			halves := splitDim(src, 2,
				split{Replacer: strings.NewReplacer("gate_up", "gate"), afterFunc: swapAxes},
				split{Replacer: strings.NewReplacer("gate_up", "up"), afterFunc: swapAxes},
			)
			for half := range halves {
				// Keep the reported shape in step with the permuted data.
				half.Shape[1], half.Shape[2] = half.Shape[2], half.Shape[1]
				converted = append(converted, half)
			}
			continue
		}

		if !strings.Contains(src.Name(), "ffn_down_exps") {
			converted = append(converted, &ggml.Tensor{
				Name:     src.Name(),
				Kind:     src.Kind(),
				Shape:    src.Shape(),
				WriterTo: src,
			})
			continue
		}

		// Work on a copy so the source tensor's own shape slice is not modified.
		swapped := slices.Clone(src.Shape())
		swapped[1], swapped[2] = swapped[2], swapped[1]

		src.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
			dims := make([]int, len(shape))
			for i, d := range shape {
				dims[i] = int(d)
			}

			dense := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
			permuted, err := tensor.Transpose(dense, 0, 2, 1)
			if err != nil {
				return nil, err
			}

			// flatten tensor so it can be written as a vector
			if err := permuted.Reshape(permuted.Shape().TotalSize()); err != nil {
				return nil, err
			}

			return native.VectorF32(permuted.(*tensor.Dense))
		})

		converted = append(converted, &ggml.Tensor{
			Name:     src.Name(),
			Kind:     src.Kind(),
			Shape:    swapped,
			WriterTo: src,
		})
	}

	return converted
}
// Replacements implements ModelConverter.
//
// The result is a flat list of old/new name pairs (old, new, old, new, ...),
// returned in the order the pairs are listed below. A fresh slice is built on
// every call.
func (q *qwen3Model) Replacements() []string {
	pairs := [...][2]string{
		{"lm_head", "output"},
		{"model.embed_tokens", "token_embd"},
		{"model.layers", "blk"},
		{"input_layernorm", "attn_norm"},
		{"self_attn.k_proj", "attn_k"},
		{"self_attn.k_norm", "attn_k_norm"},
		{"self_attn.v_proj", "attn_v"},
		{"self_attn.q_proj", "attn_q"},
		{"self_attn.q_norm", "attn_q_norm"},
		{"self_attn.o_proj", "attn_output"},
		{"mlp.down_proj", "ffn_down"},
		{"mlp.gate_proj", "ffn_gate"},
		{"mlp.up_proj", "ffn_up"},
		{"mlp.gate.weight", "ffn_gate_inp.weight"},
		{"mlp.experts.down_proj", "ffn_down_exps.weight"},
		{"mlp.experts.gate_up_proj", "ffn_gate_up_exps.weight"},
		{"post_attention_layernorm", "ffn_norm"},
		{"model.norm", "output_norm"},
	}

	flat := make([]string, 0, 2*len(pairs))
	for _, pair := range pairs {
		flat = append(flat, pair[0], pair[1])
	}
	return flat
}
var _ ModelConverter = (*qwen3Model)(nil)

116
convert/convert_qwen3vl.go Normal file
View File

@@ -0,0 +1,116 @@
package convert
import (
"cmp"
"encoding/json"
"io/fs"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
// qwen3VLModel carries the configuration values used to convert a Qwen3-VL
// model. The text model settings are read from the "text_config" object into
// the embedded qwen3Model, and the vision settings from "vision_config".
// parseMore additionally unmarshals preprocessor_config.json into VisionModel.
type qwen3VLModel struct {
qwen3Model `json:"text_config"`
VisionModel struct {
// Depth, NumHeads, PatchSize, SpatialMergeSize, RMSNormEps, RopeTheta and
// TemporalPatchSize are replaced by defaults in KV when they are zero.
Depth uint32 `json:"depth"`
HiddenSize uint32 `json:"hidden_size"`
NumHeads uint32 `json:"num_heads"`
InChannels uint32 `json:"in_channels"`
PatchSize uint32 `json:"patch_size"`
SpatialMergeSize uint32 `json:"spatial_merge_size"`
WindowSize uint32 `json:"window_size"`
// RMSNormEps is read from the "layer_norm_epsilon" key.
RMSNormEps float32 `json:"layer_norm_epsilon"`
RopeTheta float32 `json:"rope_theta"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
Size struct {
ShortestEdge uint32 `json:"shortest_edge"`
LongestEdge uint32 `json:"longest_edge"`
} `json:"size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
} `json:"vision_config"`
}
// parseMore reads preprocessor_config.json from fsys and unmarshals it into
// m.VisionModel. It returns the read error if the file cannot be read,
// otherwise the result of decoding the JSON.
func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
	const configName = "preprocessor_config.json"

	raw, err := fs.ReadFile(fsys, configName)
	if err == nil {
		err = json.Unmarshal(raw, &m.VisionModel)
	}
	return err
}
// KV returns the metadata of the embedded text model with the architecture
// overridden to "qwen3vl" (or "qwen3vlmoe" when NumExperts is non-zero) and the
// vision.* keys added. Several vision values fall back to a fixed default when
// the configured value is zero.
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
	kv := m.qwen3Model.KV(t)

	// override architecture
	if m.NumExperts > 0 {
		kv["general.architecture"] = "qwen3vlmoe"
	} else {
		kv["general.architecture"] = "qwen3vl"
	}

	vision := &m.VisionModel
	visionKV := map[string]any{
		// cmp.Or yields the configured value unless it is zero.
		"vision.block_count":                  cmp.Or(vision.Depth, 32),
		"vision.embedding_length":             vision.HiddenSize,
		"vision.attention.head_count":         cmp.Or(vision.NumHeads, 16),
		"vision.num_channels":                 vision.InChannels,
		"vision.patch_size":                   cmp.Or(vision.PatchSize, 14),
		"vision.spatial_merge_size":           cmp.Or(vision.SpatialMergeSize, 2),
		"vision.attention.layer_norm_epsilon": cmp.Or(vision.RMSNormEps, 1e-6),
		"vision.rope.freq_base":               cmp.Or(vision.RopeTheta, 1e4),
		"vision.temporal_patch_size":          cmp.Or(vision.TemporalPatchSize, 2),
		"vision.deepstack_visual_indexes":     vision.DeepstackVisualIndexes,
		"vision.shortest_edge":                vision.Size.ShortestEdge,
		"vision.longest_edge":                 vision.Size.LongestEdge,
		"vision.image_mean":                   vision.ImageMean,
		"vision.image_std":                    vision.ImageStd,
	}
	for key, value := range visionKV {
		kv[key] = value
	}

	return kv
}
// Tensors converts the model's tensors. Tensors whose name contains "attn_qkv"
// are split along dimension 0 into separate q, k and v tensors. A "patch_embed"
// tensor whose name ends in "weight" has its first two dimensions merged into
// one. All remaining tensors are handed to the embedded qwen3Model, whose
// output comes first in the returned slice.
func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var (
		passthrough []Tensor
		handled     []*ggml.Tensor
	)

	for _, src := range ts {
		name := src.Name()

		if strings.Contains(name, "attn_qkv") {
			parts := slices.Collect(splitDim(src, 0,
				split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
				split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
				split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
			))
			handled = append(handled, parts...)
			continue
		}

		if strings.Contains(name, "patch_embed") && strings.HasSuffix(name, "weight") {
			dims := src.Shape()
			// Fold the first two dimensions together and keep the rest.
			merged := append([]uint64{dims[0] * dims[1]}, dims[2:]...)
			handled = append(handled, &ggml.Tensor{
				Name:     src.Name(),
				Kind:     src.Kind(),
				Shape:    merged,
				WriterTo: src,
			})
			continue
		}

		passthrough = append(passthrough, src)
	}

	return append(m.qwen3Model.Tensors(passthrough), handled...)
}
// Replacements returns the old/new name pairs of the embedded qwen3Model
// followed by the pairs specific to the vision-language model.
func (m *qwen3VLModel) Replacements() []string {
	visionPairs := []string{
		"model.language_", "",
		"model.visual", "v",
		"patch_embed.proj", "patch_embed",
		"blocks", "blk",
		"attn.qkv", "attn_qkv",
		"attn.proj", "attn_out",
		"deepstack_merger_list", "deepstack_merger",
	}

	textPairs := m.qwen3Model.Replacements()
	return append(textPairs, visionPairs...)
}

View File

@@ -19,8 +19,8 @@ type split struct {
dim int
slices []tensor.Slice
// fn is an optional function to apply to the tensor after slicing
fn func(tensor.Tensor) (tensor.Tensor, error)
// afterFunc is an optional function to apply to the tensor after slicing
afterFunc func(tensor.Tensor) (tensor.Tensor, error)
}
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
@@ -54,8 +54,8 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
tt = tensor.Materialize(tt)
if split.fn != nil {
tt, err = split.fn(tt)
if split.afterFunc != nil {
tt, err = split.afterFunc(tt)
if err != nil {
return nil, err
}

View File

@@ -432,7 +432,7 @@ func TestSplitDim(t *testing.T) {
t.Run("split with transpose", func(t *testing.T) {
next, stop := iter.Pull(splitDim(&r, 1,
split{Replacer: strings.NewReplacer("a", "x")},
split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
split{Replacer: strings.NewReplacer("b", "y"), afterFunc: func(tt tensor.Tensor) (tensor.Tensor, error) {
return tensor.Transpose(tt, 1, 0)
}},
))

View File

@@ -117,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
// In the second pass, we more deeply initialize the GPUs to weed out devices that
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
var wg sync.WaitGroup
@@ -129,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
if devices[i].Library == "Metal" {
continue
}
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
wg.Add(1)
go func(i int) {
defer wg.Done()
@@ -155,6 +155,12 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
envVar: id, // Filter to just this one GPU
}
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize",
"id", devices[i].ID,
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
"pci_id", devices[i].PCIID,
"library", devices[i].Library,
)
needsDelete[i] = true
} else {
supportedMu.Lock()
@@ -170,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
}(i)
}
wg.Wait()
logutil.Trace("supported GPU library combinations", "supported", supported)
logutil.Trace("supported GPU library combinations before filtering", "supported", supported)
filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)
@@ -372,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
}
if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] {
needsDelete[i] = true
slog.Debug("dropping Vulkan duplicate by PCI ID",
"vulkan_id", devices[i].ID,
"vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
slog.Debug("filtering device with duplicate PCI ID",
"id", devices[i].ID,
"library", devices[i].Library,
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
"pci_id", devices[i].PCIID,
"kept_library", devices[j].Library,
"kept_id", devices[j].ID,
"kept_library", devices[j].Library,
)
break
}
@@ -422,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need
}
for dev, i := range byLibDirs[libDir] {
if _, found := byLibDirs[newest][dev]; found {
slog.Debug("filtering device with overlapping libraries",
"id", dev,
"library", libDir,
"delete_index", i,
"kept_library", newest,
)
needsDelete[i] = true
}
}

View File

@@ -3,6 +3,7 @@ package discover
import (
"log/slog"
"path/filepath"
"sort"
"strings"
"github.com/ollama/ollama/format"
@@ -26,6 +27,7 @@ type CPU struct {
}
func LogDetails(devices []ml.DeviceInfo) {
sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference
for _, dev := range devices {
var libs []string
for _, dir := range dev.LibraryPath {
@@ -39,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) {
}
slog.Info("inference compute",
"id", dev.ID,
"filtered_id", dev.FilteredID,
"library", dev.Library,
"compute", dev.Compute(),
"name", dev.Name,

1869
docs/api.md Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
---
title: "Introduction"
title: Introduction
---
Ollama's API allows you to run and interact with models programmatically.
@@ -44,4 +44,4 @@ Several community-maintained libraries are available for Ollama. For a full list
## Versioning
Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).
Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).

View File

@@ -1,71 +0,0 @@
---
title: Benchmark
---
Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
## When to use
Run these benchmarks when:
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups
## Prerequisites
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
## Usage and Examples
<Note>
All commands must be run from the root directory of the Ollama project.
</Note>
Basic syntax:
```bash
go test -bench=. ./benchmark/... -m $MODEL_NAME
```
Required flags:
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark
Optional flags:
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
Common usage patterns:
Single benchmark run with a model specified:
```bash
go test -bench=. ./benchmark/... -m llama3.3
```
## Output metrics
The benchmark reports several key metrics:
- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed
Each benchmark runs two scenarios:
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory
Three prompt lengths are tested for each scenario:
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)

View File

@@ -17,6 +17,7 @@ Ollama currently supports the following cloud models, with more coming soon:
- `kimi-k2:1t-cloud`
- `qwen3-coder:480b-cloud`
- `glm-4.6:cloud`
- `minimax-m2:cloud`
### Running Cloud models

View File

@@ -58,7 +58,11 @@
"redirects": [
{
"source": "/openai",
"destination": "/api/openai"
"destination": "/api/openai-compatibility"
},
{
"source": "/api/openai",
"destination": "/api/openai-compatibility"
}
],
"navigation": {

3
docs/troubleshooting.md Normal file
View File

@@ -0,0 +1,3 @@
# Troubleshooting
For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)

View File

@@ -242,13 +242,13 @@ func (kv KV) OllamaEngineRequired() bool {
return slices.Contains([]string{
"gemma3",
"gemma3n",
"mistral3",
"qwen3",
"qwen3moe",
"gptoss", "gpt-oss",
"llama4",
"mistral3",
"mllama",
"qwen25vl",
"gptoss", "gpt-oss",
"qwen3", "qwen3moe",
"qwen3vl", "qwen3vlmoe",
}, kv.Architecture())
}

View File

@@ -7,7 +7,7 @@ By default, these tests are disabled so `go test ./...` will exercise only unit
The integration tests have 2 modes of operating.
1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
1. By default, on Unix systems, they will start the server on a random port, run the tests, and then shut down the server. On Windows, you must ALWAYS run the server on OLLAMA_HOST for the tests to work.
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote based on your `OLLAMA_HOST` environment variable
> [!IMPORTANT]

View File

@@ -4,9 +4,7 @@ package integration
import (
"context"
"errors"
"math"
"strings"
"testing"
"time"
@@ -16,6 +14,10 @@ import (
func dotProduct[V float32 | float64](v1, v2 []V) V {
var result V = 0
if len(v1) != len(v2) {
return result
}
for i := 0; i < len(v1); i++ {
result += v1[i] * v2[i]
}
@@ -31,9 +33,115 @@ func magnitude[V float32 | float64](v []V) V {
}
// cosineSimilarity returns the dot product of v1 and v2 divided by the product
// of their magnitudes. It returns 0 when either vector has zero magnitude, so
// the division is never performed with a zero denominator.
func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
	mag1 := magnitude(v1)
	mag2 := magnitude(v2)

	if mag1 == 0 || mag2 == 0 {
		return 0
	}

	// Reuse the magnitudes computed for the zero check instead of walking both
	// vectors a second time.
	return dotProduct(v1, v2) / (mag1 * mag2)
}
// euclideanDistance returns the square root of the summed squared differences
// between corresponding elements of v1 and v2. Vectors of different lengths
// are reported as positive infinity.
func euclideanDistance[V float32 | float64](v1, v2 []V) V {
	if len(v1) != len(v2) {
		return V(math.Inf(1))
	}

	var squared V
	for i, a := range v1 {
		delta := a - v2[i]
		squared += delta * delta
	}

	return V(math.Sqrt(float64(squared)))
}
// manhattanDistance returns the sum of the absolute differences between
// corresponding elements of v1 and v2. Vectors of different lengths are
// reported as positive infinity.
func manhattanDistance[V float32 | float64](v1, v2 []V) V {
	if len(v1) != len(v2) {
		return V(math.Inf(1))
	}

	var total V
	for i := range v1 {
		gap := float64(v1[i] - v2[i])
		total += V(math.Abs(gap))
	}

	return total
}
// TestEmbedCosineDistanceCorrelation embeds triples of inputs (a, b, c) with
// every model in libraryEmbedModels, where b is expected to be closer to a than
// c is. For each triple it checks that the cosine similarity of (a, b) exceeds
// that of (a, c), and that the Euclidean and Manhattan distances agree with
// that ordering.
func TestEmbedCosineDistanceCorrelation(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	for _, model := range libraryEmbedModels {
		t.Run(model, func(t *testing.T) {
			testCases := []struct {
				a string
				b string
				c string
			}{
				{"cat", "kitten", "dog"},
				{"king", "queen", "baron"},
				{"paris", "london", "vancouver"},
				{"The cat is sleeping on the sofa", "A feline is sleeping on the couch", "Quantum physics is complex"},
				{"I love programming in python", "Coding in python brings me joy", "Pizza is delicious"},
				{"Machine learning is fascinating", "Artificial intelligence is amazing", "I need to buy groceries"},
				{"The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a sleepy dog", "The weather is warm and sunny today"},
			}

			for _, tc := range testCases {
				strs := []string{tc.a, tc.b, tc.c}

				req := api.EmbedRequest{
					Model:     model,
					Input:     strs,
					KeepAlive: &api.Duration{Duration: 10 * time.Second},
				}

				resp, err := embedTestHelper(ctx, client, t, req)
				if err != nil {
					t.Fatal(err)
				}

				// Each embedding is matched to its input by position, so the
				// counts must agree: extra embeddings would index past strs,
				// and missing ones would leave nil vectors that surface as a
				// misleading similarity failure below.
				if len(resp.Embeddings) != len(strs) {
					t.Fatalf("expected %d embeddings, got %d", len(strs), len(resp.Embeddings))
				}

				testEmbed := make(map[string][]float32, len(strs))
				for cnt, v := range resp.Embeddings {
					testEmbed[strs[cnt]] = v
				}

				// Calculate cosine similarities
				cosAB := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.b])
				cosAC := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.c])

				// Calculate distances
				distAB := euclideanDistance(testEmbed[tc.a], testEmbed[tc.b])
				distAC := euclideanDistance(testEmbed[tc.a], testEmbed[tc.c])

				manhattanAB := manhattanDistance(testEmbed[tc.a], testEmbed[tc.b])
				manhattanAC := manhattanDistance(testEmbed[tc.a], testEmbed[tc.c])

				// Consistency check: if cosAB > cosAC, then distances should be smaller
				if cosAB > cosAC {
					if distAB >= distAC {
						t.Errorf("Euclidean distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but distAB=%f >= distAC=%f",
							model, tc.a, tc.b, tc.c, cosAB, cosAC, distAB, distAC)
					}

					if manhattanAB >= manhattanAC {
						t.Errorf("Manhattan distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but manhattanAB=%f >= manhattanAC=%f",
							model, tc.a, tc.b, tc.c, cosAB, cosAC, manhattanAB, manhattanAC)
					}
				} else {
					t.Errorf("Cosine Similarity inconsistency (%s): cosinSim(%s, %s) < cosinSim(%s, %s)",
						model, tc.a, tc.b, tc.a, tc.c)
				}
			}
		})
	}
}
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
@@ -301,216 +409,3 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req
return client.Embed(ctx, &req)
}
func TestEmbedTruncation(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
t.Run("single input token count", func(t *testing.T) {
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatal(err)
}
if res.PromptEvalCount <= 0 {
t.Fatalf("expected positive token count, got %d", res.PromptEvalCount)
}
})
t.Run("batch parallel token counting", func(t *testing.T) {
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"cat", "dog and mouse", "bird"},
}
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatal(err)
}
if len(res.Embeddings) != 3 {
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
}
if res.PromptEvalCount <= 0 {
t.Fatalf("expected positive token count, got %d", res.PromptEvalCount)
}
})
t.Run("truncation single input", func(t *testing.T) {
truncTrue := true
longInput := strings.Repeat("word ", 100)
req := api.EmbedRequest{
Model: "all-minilm",
Input: longInput,
Truncate: &truncTrue,
Options: map[string]any{"num_ctx": 50},
}
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatal(err)
}
if res.PromptEvalCount > 50 {
t.Fatalf("expected tokens <= 50 after truncation, got %d", res.PromptEvalCount)
}
if res.PromptEvalCount == 0 {
t.Fatal("expected non-zero token count after truncation")
}
})
t.Run("truncation batch", func(t *testing.T) {
truncTrue := true
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"short", strings.Repeat("long ", 100), "medium text"},
Truncate: &truncTrue,
Options: map[string]any{"num_ctx": 30},
}
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatal(err)
}
if len(res.Embeddings) != 3 {
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
}
if res.PromptEvalCount > 90 {
t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
}
})
t.Run("truncate false error", func(t *testing.T) {
truncFalse := false
req := api.EmbedRequest{
Model: "all-minilm",
Input: strings.Repeat("word ", 100),
Truncate: &truncFalse,
Options: map[string]any{"num_ctx": 10},
}
_, err := embedTestHelper(ctx, client, t, req)
if err == nil {
t.Fatal("expected error when truncate=false with long input")
}
if !strings.Contains(err.Error(), "exceeds maximum context length") {
t.Fatalf("expected context length error, got: %v", err)
}
})
t.Run("runner token count accuracy", func(t *testing.T) {
baseline := api.EmbedRequest{Model: "all-minilm", Input: "test"}
baseRes, err := embedTestHelper(ctx, client, t, baseline)
if err != nil {
t.Fatal(err)
}
batch := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"test", "test", "test"},
}
batchRes, err := embedTestHelper(ctx, client, t, batch)
if err != nil {
t.Fatal(err)
}
expectedCount := baseRes.PromptEvalCount * 3
if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
t.Fatalf("expected ~%d tokens (3 × %d), got %d",
expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
}
})
}
// TestEmbedStatusCode tests that errors from the embedding endpoint
// properly preserve their HTTP status codes when returned to the client.
// This test specifically checks the error handling path in EmbedHandler
// where api.StatusError errors should maintain their original status code.
func TestEmbedStatusCode(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Pull the model if needed
if err := PullIfMissing(ctx, client, "all-minilm"); err != nil {
t.Fatal(err)
}
t.Run("truncation error status code", func(t *testing.T) {
truncFalse := false
longInput := strings.Repeat("word ", 100)
req := api.EmbedRequest{
Model: "all-minilm",
Input: longInput,
Truncate: &truncFalse,
Options: map[string]any{"num_ctx": 10},
}
_, err := embedTestHelper(ctx, client, t, req)
if err == nil {
t.Fatal("expected error when truncate=false with long input")
}
// Check that it's a StatusError with the correct status code
var statusErr api.StatusError
if !errors.As(err, &statusErr) {
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
}
// The error should be a 4xx client error (likely 400 Bad Request)
// not a 500 Internal Server Error
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
}
// Verify the error message is meaningful
if !strings.Contains(err.Error(), "context length") {
t.Errorf("expected error message to mention context length, got: %v", err)
}
})
t.Run("batch truncation error status code", func(t *testing.T) {
truncFalse := false
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{
"short input",
strings.Repeat("very long input ", 100),
"another short input",
},
Truncate: &truncFalse,
Options: map[string]any{"num_ctx": 10},
}
_, err := embedTestHelper(ctx, client, t, req)
if err == nil {
t.Fatal("expected error when one input exceeds context with truncate=false")
}
// Check that it's a StatusError with the correct status code
var statusErr api.StatusError
if !errors.As(err, &statusErr) {
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
}
// The error should be a 4xx client error, not a 500 Internal Server Error
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
}
})
}

View File

@@ -26,6 +26,13 @@ func TestVisionModels(t *testing.T) {
{
model: "gemma3",
},
{
model: "qwen3-vl:8b",
},
{
// Qwen 3 VL mixture of experts
model: "qwen3-vl:30b",
},
}
for _, v := range testCases {

File diff suppressed because one or more lines are too long

View File

@@ -248,12 +248,14 @@ var (
"zephyr",
}
libraryEmbedModels = []string{
"qwen3-embedding",
"embeddinggemma",
"nomic-embed-text",
"all-minilm",
"bge-large",
"bge-m3",
"granite-embedding",
"mxbai-embed-large",
"nomic-embed-text",
"paraphrase-multilingual",
"snowflake-arctic-embed",
"snowflake-arctic-embed2",
@@ -321,7 +323,7 @@ func GetTestEndpoint() (*api.Client, string) {
}
}
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && port == defaultPort {
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" && port == defaultPort {
port = FindPort()
}
@@ -335,15 +337,20 @@ func GetTestEndpoint() (*api.Client, string) {
http.DefaultClient), fmt.Sprintf("%s:%s", host, port)
}
var serverMutex sync.Mutex
var serverReady bool
var serverLogFile string
// Server lifecycle management
var (
serverMutex sync.Mutex
serverReady bool
serverLog bytes.Buffer
serverDone chan int
serverCmd *exec.Cmd
)
func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
// Make sure the server has been built
CLIName, err := filepath.Abs("../ollama")
if err != nil {
return err
return fmt.Errorf("failed to get absolute path: %w", err)
}
if runtime.GOOS == "windows" {
@@ -351,72 +358,42 @@ func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
}
_, err = os.Stat(CLIName)
if err != nil {
return fmt.Errorf("CLI missing, did you forget to build first? %w", err)
return fmt.Errorf("CLI missing, did you forget to 'go build .' first? %w", err)
}
serverMutex.Lock()
defer serverMutex.Unlock()
if serverReady {
return nil
}
serverDone = make(chan int)
serverLog.Reset()
if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
t.Setenv("OLLAMA_HOST", ollamaHost)
}
logDir := t.TempDir()
slog.Info("starting server", "url", ollamaHost)
done, err := SpawnServer(ctx, "../ollama", logDir)
if err != nil {
return fmt.Errorf("failed to start server: %w", err)
}
serverCmd = exec.Command(CLIName, "serve")
serverCmd.Stderr = &serverLog
serverCmd.Stdout = &serverLog
go func() {
<-ctx.Done()
serverMutex.Lock()
defer serverMutex.Unlock()
exitCode := <-done
if exitCode > 0 {
slog.Warn("server failure", "exit", exitCode)
}
serverReady = false
}()
// TODO wait only long enough for the server to be responsive...
time.Sleep(500 * time.Millisecond)
serverReady = true
return nil
}
func SpawnServer(ctx context.Context, command, logDir string) (chan int, error) {
done := make(chan int)
fp, err := os.CreateTemp(logDir, "ollama-server-*.log")
if err != nil {
return nil, fmt.Errorf("failed to create log file: %w", err)
}
serverLogFile = fp.Name()
cmd := exec.CommandContext(ctx, command, "serve")
cmd.Stderr = fp
cmd.Stdout = fp
go func() {
slog.Info("starting server...")
if err := cmd.Run(); err != nil {
// "signal: killed" expected
slog.Info("starting server", "url", ollamaHost)
if err := serverCmd.Run(); err != nil {
// "signal: killed" expected during normal shutdown
if !strings.Contains(err.Error(), "signal") {
slog.Info("failed to run server", "error", err)
}
}
var code int
if cmd.ProcessState != nil {
code = cmd.ProcessState.ExitCode()
if serverCmd.ProcessState != nil {
code = serverCmd.ProcessState.ExitCode()
}
slog.Info("server exited")
done <- code
serverDone <- code
}()
return done, nil
serverReady = true
return nil
}
func PullIfMissing(ctx context.Context, client *api.Client, modelName string) error {
@@ -477,52 +454,65 @@ var serverProcMutex sync.Mutex
// Starts the server if needed
func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, string, func()) {
client, testEndpoint := GetTestEndpoint()
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
serverProcMutex.Lock()
if err := startServer(t, ctx, testEndpoint); err != nil {
cleanup := func() {}
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" {
var err error
err = startServer(t, ctx, testEndpoint)
if err != nil {
t.Fatal(err)
}
cleanup = func() {
serverMutex.Lock()
defer serverMutex.Unlock()
serverReady = false
slog.Info("shutting down server")
serverCmd.Process.Signal(os.Interrupt)
slog.Info("waiting for server to exit")
<-serverDone
slog.Info("terminate complete")
if t.Failed() {
slog.Warn("SERVER LOG FOLLOWS")
io.Copy(os.Stderr, &serverLog)
slog.Warn("END OF SERVER")
}
slog.Info("cleanup complete", "failed", t.Failed())
}
}
// Make sure server is online and healthy before returning
listCtx, cancel := context.WithDeadlineCause(
ctx,
time.Now().Add(120*time.Second),
fmt.Errorf("list models took too long"),
)
defer cancel()
models, err := client.ListRunning(listCtx)
if err != nil {
t.Fatal(err)
}
if len(models.Models) > 0 {
names := make([]string, len(models.Models))
for i, m := range models.Models {
names[i] = m.Name
for {
select {
case <-ctx.Done():
t.Fatalf("context done before server ready: %v", ctx.Err())
break
default:
}
slog.Info("currently loaded", "models", names)
listCtx, cancel := context.WithDeadlineCause(
ctx,
time.Now().Add(10*time.Second),
fmt.Errorf("list models took too long"),
)
defer cancel()
models, err := client.ListRunning(listCtx)
if err != nil {
if runtime.GOOS == "windows" {
t.Fatalf("did you forget to start the server: %v", err)
}
time.Sleep(10 * time.Millisecond)
continue
}
if len(models.Models) > 0 {
names := make([]string, len(models.Models))
for i, m := range models.Models {
names[i] = m.Name
}
slog.Info("currently loaded", "models", names)
}
break
}
return client, testEndpoint, func() {
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
defer serverProcMutex.Unlock()
if t.Failed() {
fp, err := os.Open(serverLogFile)
if err != nil {
slog.Error("failed to open server log", "logfile", serverLogFile, "error", err)
return
}
defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", serverLogFile, "error", err)
return
}
slog.Warn("SERVER LOG FOLLOWS")
os.Stderr.Write(data)
slog.Warn("END OF SERVER")
}
}
}
return client, testEndpoint, cleanup
}
func ChatTestHelper(ctx context.Context, t *testing.T, req api.ChatRequest, anyResp []string) {

View File

@@ -5,24 +5,33 @@ Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available.
vulkan: get GPU ID (ollama v0.11.5)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Vulkan PCI and Memory
fix vulkan PCI ID and ID handling
---
ggml/include/ggml-backend.h | 11 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 74 +++++
ggml/src/ggml-cuda/vendors/hip.h | 3 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 209 ++++++++++++++
8 files changed, 758 insertions(+)
ggml/include/ggml-backend.h | 8 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 65 ++++
ggml/src/ggml-cuda/vendors/hip.h | 3 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++--
ggml/src/mem_hip.cpp | 452 +++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 209 +++++++++++++
9 files changed, 931 insertions(+), 30 deletions(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index ba181d09d..094fc3c82 100644
index ba181d09d..809835243 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -169,6 +169,17 @@ extern "C" {
@@ -169,6 +169,14 @@ extern "C" {
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
@@ -31,9 +40,6 @@ index ba181d09d..094fc3c82 100644
+ int compute_major;
+ int compute_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+ const char *library;
+ // number with which the devices are accessed (Vulkan)
+ const char *numeric_id;
@@ -54,7 +60,7 @@ index 0609c6503..aefe43bdd 100644
target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 87c6c34a4..816597d2f 100644
index 87c6c34a4..b075a18be 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -86,7 +92,7 @@ index 87c6c34a4..816597d2f 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
@@ -3484,6 +3499,14 @@ struct ggml_backend_cuda_device_context {
@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
std::string id;
@@ -95,22 +101,19 @@ index 87c6c34a4..816597d2f 100644
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pciBusID;
+ int pciDeviceID;
+ int pciDomainID;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3504,6 +3527,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
+ ggml_hip_mgmt_release();
+ return;
+ }
@@ -120,7 +123,7 @@ index 87c6c34a4..816597d2f 100644
+ if (ggml_nvml_init() == 0) {
+ int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
+ ggml_nvml_release();
+ return;
+ }
@@ -130,7 +133,7 @@ index 87c6c34a4..816597d2f 100644
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3512,6 +3557,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
@@ -138,7 +141,7 @@ index 87c6c34a4..816597d2f 100644
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
@@ -3525,6 +3571,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
@@ -153,15 +156,12 @@ index 87c6c34a4..816597d2f 100644
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME;
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -4087,6 +4149,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -169,7 +169,7 @@ index 87c6c34a4..816597d2f 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4102,6 +4165,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;
@@ -181,9 +181,6 @@ index 87c6c34a4..816597d2f 100644
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
@@ -209,7 +206,7 @@ index 1f06be80e..2f9ef2dc0 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index d0fb3bcca..80597b6ea 100644
index d0fb3bcca..b63edd0c1 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
@@ -221,7 +218,7 @@ index d0fb3bcca..80597b6ea 100644
+GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
+GGML_API void ggml_nvml_release();
+GGML_API int ggml_hip_mgmt_init();
+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
+GGML_API void ggml_hip_mgmt_release();
+
#ifdef __cplusplus
@@ -247,12 +244,319 @@ index f2ff9f322..f356e4a0a 100644
props->caps = {
/* .async = */ true,
/* .host_buffer = */ false,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ed83236f4..0bbcecd01 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -231,6 +231,7 @@ class vk_memory_logger;
#endif
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static std::string ggml_vk_get_device_id(int device);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
snprintf(description, description_size, "%s", props.deviceName.data());
}
+static std::string ggml_vk_get_device_id(int device) {
+ ggml_vk_instance_init();
+
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+ vk::PhysicalDeviceProperties2 props;
+ vk::PhysicalDeviceIDProperties deviceIDProps;
+ props.pNext = &deviceIDProps;
+ devices[device].getProperties2(&props);
+
+ const auto& uuid = deviceIDProps.deviceUUID;
+ char id[64];
+ snprintf(id, sizeof(id),
+ "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ uuid[0], uuid[1], uuid[2], uuid[3],
+ uuid[4], uuid[5],
+ uuid[6], uuid[7],
+ uuid[8], uuid[9],
+ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
+ );
+ return std::string(id);
+}
+
// backend interface
#define UNUSED GGML_UNUSED
@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
ggml_vk_get_device_description(dev_idx, description, description_size);
}
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
+std::string ggml_backend_vk_get_device_id(int device) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
+ int dev_idx = vk_instance.device_indices[device];
+ return ggml_vk_get_device_id(dev_idx);
+}
+
+//////////////////////////
+
+struct ggml_backend_vk_device_context {
+ size_t device;
+ std::string name;
+ std::string description;
+ bool is_integrated_gpu;
+ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
+ std::string pci_id;
+ std::string id;
+ std::string uuid;
+ std::string numeric_id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+};
+
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
+
+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
- vk::PhysicalDeviceMemoryProperties2 memprops = {};
- bool membudget_supported = vk_instance.device_supports_membudget[device];
+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+ vk::PhysicalDeviceProperties2 props2;
+ vkdev.getProperties2(&props2);
- if (membudget_supported) {
- memprops.pNext = &budgetprops;
+ if (!ctx->is_integrated_gpu)
+ {
+ // Use vendor specific management libraries for best VRAM reporting if available
+ switch (props2.properties.vendorID) {
+ case VK_VENDOR_ID_AMD:
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
+ ggml_hip_mgmt_release();
+ return;
+ }
+ ggml_hip_mgmt_release();
+ }
+ break;
+ case VK_VENDOR_ID_NVIDIA:
+ if (ggml_nvml_init() == 0) {
+ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
+ ggml_nvml_release();
+ return;
+ }
+ ggml_nvml_release();
+ }
+ break;
+ }
}
- vkdev.getMemoryProperties2(&memprops);
+ // else fallback to memory budget if supported
- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
+ *total = 0;
+ *free = 0;
+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
+ vk::PhysicalDeviceMemoryProperties2 memprops2;
+ memprops2.pNext = &mem_budget_props;
+ vkdev.getMemoryProperties2(&memprops2);
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
+ } else if (ctx->is_integrated_gpu) {
+ // Include shared memory on iGPUs
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
+ }
+ }
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+ *free += mem_budget_props.heapBudget[i];
+ } else if (ctx->is_integrated_gpu) {
+ *free += mem_budget_props.heapBudget[i];
+ }
+ }
+ if (*total > 0 && *free > 0) {
+ return;
+ } else if (*total > 0) {
+ *free = *total;
+ return;
+ }
+ // else just report the physical memory
+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
-
- if (membudget_supported && i < budgetprops.heapUsage.size()) {
- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
- } else {
- *free = heap.size;
- }
+ *free = heap.size;
break;
}
}
@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
}
}
+ vk::PhysicalDeviceProperties2 props2;
if (!ext_support) {
- return "";
+ device.getProperties2(&props2);
+ if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
+ return "";
+ }
+ // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
}
vk::PhysicalDeviceProperties2 props = {};
@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+ if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
+ return "";
+ }
return std::string(pci_bus_id);
}
-//////////////////////////
-
-struct ggml_backend_vk_device_context {
- size_t device;
- std::string name;
- std::string description;
- bool is_integrated_gpu;
- std::string pci_bus_id;
-};
+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
+ if (id.empty()) return false;
+ unsigned int d = 0, b = 0, dev = 0, func = 0;
+ // Expected format: dddd:bb:dd.f (all hex)
+ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
+ if (n < 4) return false;
+ if (domain) *domain = (int) d;
+ if (bus) *bus = (int) b;
+ if (device) *device = (int) dev;
+ return true;
+}
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
return ctx->description.c_str();
}
+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+ return ctx->id.c_str();
+}
+
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
- ggml_backend_vk_get_device_memory(ctx->device, free, total);
+ ggml_backend_vk_get_device_memory(ctx, free, total);
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
+ props->id = ggml_backend_vk_device_get_id(dev);
props->type = ggml_backend_vk_device_get_type(dev);
- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
+ props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
+
+ props->compute_major = ctx->major;
+ props->compute_minor = ctx->minor;
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->is_integrated_gpu;
+ props->library = GGML_VK_NAME;
+ props->numeric_id = ctx->numeric_id.c_str();
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
+ std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
+
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
ctx->name = GGML_VK_NAME + std::to_string(i);
ctx->description = desc;
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
+ ctx->id = ggml_backend_vk_get_device_id(i);
devices.push_back(new ggml_backend_device {
/* .iface = */ ggml_backend_vk_device_i,
/* .reg = */ reg,
/* .context = */ ctx,
});
+
+ // Gather additional information about the device
+ int dev_idx = vk_instance.device_indices[i];
+ vk::PhysicalDeviceProperties props1;
+ vk_devices[dev_idx].getProperties(&props1);
+ vk::PhysicalDeviceProperties2 props2;
+ vk::PhysicalDeviceIDProperties device_id_props;
+ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props;
+ vk::PhysicalDeviceDriverProperties driver_props;
+ props2.pNext = &device_id_props;
+ device_id_props.pNext = &pci_bus_props;
+ pci_bus_props.pNext = &driver_props;
+ vk_devices[dev_idx].getProperties2(&props2);
+ std::ostringstream oss;
+ oss << std::hex << std::setfill('0');
+ int byteIdx = 0;
+ for (int i = 0; i < 16; ++i, ++byteIdx) {
+ oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
+ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
+ oss << '-';
+ }
+ }
+ ctx->uuid = oss.str();
+ ctx->major = 0;
+ ctx->minor = 0;
+ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
+ ctx->driver_major = 0;
+ ctx->driver_minor = 0;
+ ctx->numeric_id = std::to_string(i);
}
initialized = true;
}
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
index 000000000..5a7f5d465
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@
@@ -0,0 +1,452 @@
+#include "ggml.h"
+
+#ifdef _WIN32
@@ -586,7 +890,7 @@ index 000000000..8ef19b8cf
+ if (gpus != NULL) gpus->pVtbl->Release(gpus); \
+ if (gpu != NULL) gpu->pVtbl->Release(gpu)
+
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle == NULL) {
+ GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -598,9 +902,13 @@ index 000000000..8ef19b8cf
+ IADLXGPU* gpu = NULL;
+ IADLXGPUMetrics *gpuMetrics = NULL;
+ ADLX_RESULT status;
+ // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
+ adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
+
+ uint32_t pci_domain, pci_bus, pci_device, pci_function;
+ if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
+ // TODO - parse other formats?
+ GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
+ return ADLX_NOT_FOUND;
+ }
+ status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -623,16 +931,15 @@ index 000000000..8ef19b8cf
+ GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
+ continue;
+ }
+ adlx_int id;
+ status = gpu->pVtbl->UniqueId(gpu, &id);
+ adlx_int uniqueID;
+ status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
+ gpu->pVtbl->Release(gpu);
+ gpu = NULL;
+ continue;
+ }
+ if (id != target) {
+ GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+ if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
+ gpu->pVtbl->Release(gpu);
+ gpu = NULL;
+ continue;
@@ -695,7 +1002,7 @@ index 000000000..8ef19b8cf
+ return -1;
+}
+void ggml_hip_mgmt_release() {}
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
+ return -1;
+}
+

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs
1 file changed, 68 insertions(+), 3 deletions(-)
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
index c9073cef..f473a2a2 100644
index c9073cef0..f473a2a2c 100644
--- a/ggml/src/mem_nvml.cpp
+++ b/ggml/src/mem_nvml.cpp
@@ -13,6 +13,7 @@

View File

@@ -1,95 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Xiaodong Ye <xiaodong.ye@mthreads.com>
Date: Mon, 18 Aug 2025 12:48:07 +0800
Subject: [PATCH] vulkan: get GPU ID (ollama v0.11.5)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 ++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 061cd078..adea7783 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11588,6 +11588,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
snprintf(description, description_size, "%s", props.deviceName.data());
}
+static std::string ggml_vk_get_device_id(int device) {
+ ggml_vk_instance_init();
+
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+ vk::PhysicalDeviceProperties2 props;
+ vk::PhysicalDeviceIDProperties deviceIDProps;
+ props.pNext = &deviceIDProps;
+ devices[device].getProperties2(&props);
+
+ const auto& uuid = deviceIDProps.deviceUUID;
+ char id[64];
+ snprintf(id, sizeof(id),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ uuid[0], uuid[1], uuid[2], uuid[3],
+ uuid[4], uuid[5],
+ uuid[6], uuid[7],
+ uuid[8], uuid[9],
+ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
+ );
+ return std::string(id);
+}
+
// backend interface
#define UNUSED GGML_UNUSED
@@ -12394,6 +12417,12 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
ggml_vk_get_device_description(dev_idx, description, description_size);
}
+std::string ggml_backend_vk_get_device_id(int device) {
+ GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+ int dev_idx = vk_instance.device_indices[device];
+ return ggml_vk_get_device_id(dev_idx);
+}
+
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
@@ -12481,6 +12510,7 @@ struct ggml_backend_vk_device_context {
std::string description;
bool is_integrated_gpu;
std::string pci_bus_id;
+ std::string id;
};
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -12493,6 +12523,11 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
return ctx->description.c_str();
}
+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+ return ctx->id.c_str();
+}
+
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
ggml_backend_vk_get_device_memory(ctx->device, free, total);
@@ -12519,6 +12554,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
+ props->id = ggml_backend_vk_device_get_id(dev);
props->type = ggml_backend_vk_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -12965,6 +13001,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
ctx->description = desc;
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+ ctx->id = ggml_backend_vk_get_device_id(i);
devices.push_back(new ggml_backend_device {
/* .iface = */ ggml_backend_vk_device_i,
/* .reg = */ reg,
--
2.51.0

View File

@@ -28,7 +28,7 @@ Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
1 file changed, 9 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 6a278b5e9..87941f872 100644
index b075a18be..d62f412d6 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() {

View File

@@ -1,254 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Fri Sep 5 08:25:03 2025 -0700
Subject: [PATCH] Vulkan PCI and Memory
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 176 ++++++++++++++++++++++-----
1 file changed, 145 insertions(+), 31 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index adea7783..fb7204ce 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12423,31 +12423,99 @@ std::string ggml_backend_vk_get_device_id(int device) {
return ggml_vk_get_device_id(dev_idx);
}
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
- GGML_ASSERT(device < (int) vk_instance.device_indices.size());
- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
+//////////////////////////
+
+struct ggml_backend_vk_device_context {
+ size_t device;
+ std::string name;
+ std::string description;
+ bool is_integrated_gpu;
+ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
+ std::string pci_id;
+ std::string id;
+ std::string uuid;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+};
+
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
+
+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
- vk::PhysicalDeviceMemoryProperties2 memprops = {};
- bool membudget_supported = vk_instance.device_supports_membudget[device];
+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+ vk::PhysicalDeviceProperties2 props2;
+ vkdev.getProperties2(&props2);
- if (membudget_supported) {
- memprops.pNext = &budgetprops;
+ if (!ctx->is_integrated_gpu)
+ {
+ // Use vendor specific management libraries for best VRAM reporting if available
+ switch (props2.properties.vendorID) {
+ case VK_VENDOR_ID_AMD:
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();
+ return;
+ }
+ ggml_hip_mgmt_release();
+ }
+ break;
+ case VK_VENDOR_ID_NVIDIA:
+ if (ggml_nvml_init() == 0) {
+ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_nvml_release();
+ return;
+ }
+ ggml_nvml_release();
+ }
+ break;
+ }
}
- vkdev.getMemoryProperties2(&memprops);
+ // else fallback to memory budget if supported
- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
+ *total = 0;
+ *free = 0;
+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
+ vk::PhysicalDeviceMemoryProperties2 memprops2;
+ memprops2.pNext = &mem_budget_props;
+ vkdev.getMemoryProperties2(&memprops2);
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
+ } else if (ctx->is_integrated_gpu) {
+ // Include shared memory on iGPUs
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
+ }
+ }
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+ *free += mem_budget_props.heapBudget[i];
+ } else if (ctx->is_integrated_gpu) {
+ *free += mem_budget_props.heapBudget[i];
+ }
+ }
+ if (*total > 0 && *free > 0) {
+ return;
+ } else if (*total > 0) {
+ *free = *total;
+ return;
+ }
+ // else just report the physical memory
+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
-
- if (membudget_supported && i < budgetprops.heapUsage.size()) {
- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
- } else {
- *free = heap.size;
- }
+ *free = heap.size;
break;
}
}
@@ -12502,16 +12570,17 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
return std::string(pci_bus_id);
}
-//////////////////////////
-
-struct ggml_backend_vk_device_context {
- size_t device;
- std::string name;
- std::string description;
- bool is_integrated_gpu;
- std::string pci_bus_id;
- std::string id;
-};
+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
+ if (id.empty()) return false;
+ unsigned int d = 0, b = 0, dev = 0, func = 0;
+ // Expected format: dddd:bb:dd.f (all hex)
+ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
+ if (n < 4) return false;
+ if (domain) *domain = (int) d;
+ if (bus) *bus = (int) b;
+ if (device) *device = (int) dev;
+ return true;
+}
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@@ -12530,7 +12599,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
- ggml_backend_vk_get_device_memory(ctx->device, free, total);
+ ggml_backend_vk_get_device_memory(ctx, free, total);
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -12556,7 +12625,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->description = ggml_backend_vk_device_get_description(dev);
props->id = ggml_backend_vk_device_get_id(dev);
props->type = ggml_backend_vk_device_get_type(dev);
- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
+ props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
@@ -12564,6 +12633,17 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
+
+ props->compute_major = ctx->major;
+ props->compute_minor = ctx->minor;
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->is_integrated_gpu;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->library = GGML_VK_NAME;
+ props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -12992,6 +13071,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
+ std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
+
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
@@ -13000,13 +13081,46 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
ctx->name = GGML_VK_NAME + std::to_string(i);
ctx->description = desc;
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
ctx->id = ggml_backend_vk_get_device_id(i);
devices.push_back(new ggml_backend_device {
/* .iface = */ ggml_backend_vk_device_i,
/* .reg = */ reg,
/* .context = */ ctx,
});
+
+ // Gather additional information about the device
+ int dev_idx = vk_instance.device_indices[i];
+ vk::PhysicalDeviceProperties props1;
+ vk_devices[dev_idx].getProperties(&props1);
+ vk::PhysicalDeviceProperties2 props2;
+ vk::PhysicalDeviceIDProperties device_id_props;
+ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props;
+ vk::PhysicalDeviceDriverProperties driver_props;
+ props2.pNext = &device_id_props;
+ device_id_props.pNext = &pci_bus_props;
+ pci_bus_props.pNext = &driver_props;
+ vk_devices[dev_idx].getProperties2(&props2);
+ std::ostringstream oss;
+ oss << std::hex << std::setfill('0');
+ oss << "GPU-";
+ int byteIdx = 0;
+ for (int i = 0; i < 16; ++i, ++byteIdx) {
+ oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
+ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
+ oss << '-';
+ }
+ }
+ ctx->uuid = oss.str();
+ ctx->pci_bus_id = pci_bus_props.pciBus;
+ ctx->pci_device_id = pci_bus_props.pciDevice;
+ ctx->pci_domain_id = pci_bus_props.pciDomain;
+ ctx->id = std::to_string(i);
+ ctx->major = 0;
+ ctx->minor = 0;
+ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
+ ctx->driver_major = 0;
+ ctx->driver_minor = 0;
}
initialized = true;
}
--
2.51.0

View File

@@ -69,7 +69,7 @@ type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error)
Embedding(ctx context.Context, input string) ([]float32, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
@@ -1545,16 +1545,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
type EmbeddingRequest struct {
Content string `json:"content"`
Truncate bool `json:"truncate"`
Content string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float32 `json:"embedding"`
PromptEvalCount int `json:"prompt_eval_count"`
Embedding []float32 `json:"embedding"`
}
func (s *llmServer) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
logutil.Trace("embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
@@ -1563,54 +1561,51 @@ func (s *llmServer) Embedding(ctx context.Context, input string, truncate bool)
} else {
slog.Error("Failed to acquire semaphore", "error", err)
}
return nil, 0, err
return nil, err
}
defer s.sem.Release(1)
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
if err != nil {
return nil, 0, err
return nil, err
} else if status != ServerStatusReady {
return nil, 0, fmt.Errorf("unexpected server status: %s", status)
return nil, fmt.Errorf("unexpected server status: %s", status)
}
data, err := json.Marshal(EmbeddingRequest{Content: input, Truncate: truncate})
data, err := json.Marshal(EmbeddingRequest{Content: input})
if err != nil {
return nil, 0, fmt.Errorf("error marshaling embed data: %w", err)
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, 0, fmt.Errorf("error creating embed request: %w", err)
return nil, fmt.Errorf("error creating embed request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
return nil, 0, fmt.Errorf("do embedding request: %w", err)
return nil, fmt.Errorf("do embedding request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, 0, fmt.Errorf("error reading embed response: %w", err)
return nil, fmt.Errorf("error reading embed response: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm embedding error: %s", body)
return nil, 0, api.StatusError{
StatusCode: resp.StatusCode,
ErrorMessage: string(body),
}
return nil, fmt.Errorf("%s", body)
}
var e EmbeddingResponse
if err := json.Unmarshal(body, &e); err != nil {
return nil, 0, fmt.Errorf("unmarshal tokenize response: %w", err)
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return e.Embedding, e.PromptEvalCount, nil
return e.Embedding, nil
}
type TokenizeRequest struct {

View File

@@ -161,6 +161,7 @@ type Tensor interface {
AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

View File

@@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
if props.library != nil {
info.Library = C.GoString(props.library)
}
info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
if props.device_id != nil {
info.PCIID = C.GoString(props.device_id)
}
info.LibraryPath = ggml.LibPaths()
if props.numeric_id != nil {
info.FilteredID = C.GoString(props.numeric_id)
@@ -1180,6 +1182,10 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
}
func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
if slices.Contains(shape, -1) {
inferShape(t, shape)
}
switch len(shape) {
case 0:
return &Tensor{
@@ -1322,7 +1328,43 @@ func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
}
}
// inferShape resolves at most one -1 entry in shape, in place, so that the
// product of shape equals the number of elements in t. Entries other than -1
// are divided out of t's element count one by one; whatever remains becomes
// the inferred dimension.
//
// It panics if more than one entry is -1, if any entry is 0, or if an explicit
// entry does not evenly divide the remaining element count.
func inferShape(t *Tensor, shape []int) {
	remaining := 1
	for _, n := range t.Shape() {
		remaining *= n
	}

	// Index of the entry to infer; stays negative when shape has no -1.
	inferred := -1
	for i, n := range shape {
		if n == -1 {
			if inferred >= 0 {
				panic("only one dimension can be inferred")
			}
			inferred = i
			continue
		}

		if n == 0 {
			panic("dimension cannot be zero")
		}

		if remaining%n != 0 {
			panic("cannot infer dimension")
		}
		remaining /= n
	}

	if inferred >= 0 {
		shape[inferred] = remaining
	}
}
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
if slices.Contains(shape, -1) {
inferShape(t, shape)
}
switch len(shape) {
case 1:
return &Tensor{
@@ -1535,6 +1577,16 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
}
}
// Conv3D passes t and t2 (in that order) to ggml_conv_3d together with the
// channel count c, the strides (s0, s1, s2), paddings (p0, p1, p2) and
// dilations (d0, d1, d2), then reshapes the result to two dimensions:
// (t.Dim(3)/c, t2.Dim(3)/c).
func (t *Tensor) Conv3D(ctx ml.Context, t2 ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
	conv := &Tensor{
		b: t.b,
		t: C.ggml_conv_3d(
			ctx.(*Context).ctx,
			t.t,
			t2.(*Tensor).t,
			C.int64_t(c),
			C.int(s0), C.int(s1), C.int(s2),
			C.int(p0), C.int(p1), C.int(p2),
			C.int(d0), C.int(d1), C.int(d2),
		),
	}

	return conv.Reshape(ctx, t.Dim(3)/c, t2.Dim(3)/c)
}
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
return &Tensor{
b: t.b,

View File

@@ -174,9 +174,6 @@ extern "C" {
int compute_major;
int compute_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
const char *library;
// number with which the devices are accessed (Vulkan)
const char *numeric_id;

View File

@@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context {
int driver_major;
int driver_minor;
int integrated;
int pciBusID;
int pciDeviceID;
int pciDomainID;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
#if defined(GGML_USE_HIP)
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
ggml_hip_mgmt_release();
return;
}
@@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
if (ggml_nvml_init() == 0) {
int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
ggml_nvml_release();
return;
}
@@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->driver_major = ctx->driver_major;
props->driver_minor = ctx->driver_minor;
props->integrated = ctx->integrated;
props->pci_bus_id = ctx->pciBusID;
props->pci_device_id = ctx->pciDeviceID;
props->pci_domain_id = ctx->pciDomainID;
props->library = GGML_CUDA_NAME;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->driver_major = driverVersion / 1000;
dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
dev_ctx->integrated = prop.integrated;
dev_ctx->pciBusID = prop.pciBusID;
dev_ctx->pciDeviceID = prop.pciDeviceID;
dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,

View File

@@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init();
GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
GGML_API void ggml_nvml_release();
GGML_API int ggml_hip_mgmt_init();
GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
GGML_API void ggml_hip_mgmt_release();
#ifdef __cplusplus

View File

@@ -231,6 +231,7 @@ class vk_memory_logger;
#endif
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static std::string ggml_vk_get_device_id(int device);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) {
const auto& uuid = deviceIDProps.deviceUUID;
char id[64];
snprintf(id, sizeof(id),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5],
uuid[6], uuid[7],
@@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
std::string numeric_id;
int major;
int minor;
int driver_major;
int driver_minor;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
};
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
@@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
switch (props2.properties.vendorID) {
case VK_VENDOR_ID_AMD:
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
ggml_hip_mgmt_release();
return;
}
@@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
if (ggml_nvml_init() == 0) {
int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
ggml_nvml_release();
return;
}
@@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
}
}
vk::PhysicalDeviceProperties2 props2;
if (!ext_support) {
return "";
device.getProperties2(&props2);
if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
return "";
}
// AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
}
vk::PhysicalDeviceProperties2 props = {};
@@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
return "";
}
return std::string(pci_bus_id);
}
@@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->driver_major = ctx->driver_major;
props->driver_minor = ctx->driver_minor;
props->integrated = ctx->is_integrated_gpu;
props->pci_bus_id = ctx->pci_bus_id;
props->pci_device_id = ctx->pci_device_id;
props->pci_domain_id = ctx->pci_domain_id;
props->library = GGML_VK_NAME;
props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
props->numeric_id = ctx->numeric_id.c_str();
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
vk_devices[dev_idx].getProperties2(&props2);
std::ostringstream oss;
oss << std::hex << std::setfill('0');
oss << "GPU-";
int byteIdx = 0;
for (int i = 0; i < 16; ++i, ++byteIdx) {
oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
@@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();
ctx->pci_bus_id = pci_bus_props.pciBus;
ctx->pci_device_id = pci_bus_props.pciDevice;
ctx->pci_domain_id = pci_bus_props.pciDomain;
ctx->id = std::to_string(i);
ctx->major = 0;
ctx->minor = 0;
// TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
ctx->driver_major = 0;
ctx->driver_minor = 0;
ctx->numeric_id = std::to_string(i);
}
initialized = true;
}

View File

@@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() {
if (gpus != NULL) gpus->pVtbl->Release(gpus); \
if (gpu != NULL) gpu->pVtbl->Release(gpu)
int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
std::lock_guard<std::mutex> lock(ggml_adlx_lock);
if (adlx.handle == NULL) {
GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
IADLXGPU* gpu = NULL;
IADLXGPUMetrics *gpuMetrics = NULL;
ADLX_RESULT status;
// The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
uint32_t pci_domain, pci_bus, pci_device, pci_function;
if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
// TODO - parse other formats?
GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
return ADLX_NOT_FOUND;
}
status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
if (ADLX_FAILED(status)) {
GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
continue;
}
adlx_int id;
status = gpu->pVtbl->UniqueId(gpu, &id);
adlx_int uniqueID;
status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
if (ADLX_FAILED(status)) {
GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
gpu->pVtbl->Release(gpu);
gpu = NULL;
continue;
}
if (id != target) {
GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
gpu->pVtbl->Release(gpu);
gpu = NULL;
continue;
@@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() {
return -1;
}
void ggml_hip_mgmt_release() {}
int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
return -1;
}

View File

@@ -0,0 +1,126 @@
package ggml
import (
"errors"
"os"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
// setup writes a minimal GGUF file (only general.architecture set) into the
// test's temp dir, opens a backend on it, and returns an input context from
// that backend. The context and the backend are closed via tb.Cleanup.
func setup(tb testing.TB) ml.Context {
	tb.Helper()

	tmp, err := os.CreateTemp(tb.TempDir(), "*.bin")
	if err != nil {
		tb.Fatal(err)
	}
	defer tmp.Close()

	kv := ggml.KV{"general.architecture": "test"}
	if err := ggml.WriteGGUF(tmp, kv, nil); err != nil {
		tb.Fatal(err)
	}

	backend, err := ml.NewBackend(tmp.Name(), ml.BackendParams{})
	if err != nil {
		tb.Fatal(err)
	}

	inputCtx := backend.NewContext().Input()
	tb.Cleanup(func() {
		// Close the context before the backend that created it.
		inputCtx.Close()
		backend.Close()
	})

	return inputCtx
}
// TestInferShape runs inferShape against a 2x3x4 tensor (24 elements) and
// checks either the resolved shape or the panic message for each case.
func TestInferShape(t *testing.T) {
	type testCase struct {
		name  string
		input []int
		want  []int
		err   error
	}

	cases := []testCase{
		{name: "no inferred shape", input: []int{2, 3, 4}, want: []int{2, 3, 4}},
		{name: "infer begin", input: []int{-1, 3, 4}, want: []int{2, 3, 4}},
		{name: "infer mid", input: []int{2, -1, 4}, want: []int{2, 3, 4}},
		{name: "infer end", input: []int{2, 3, -1}, want: []int{2, 3, 4}},
		{name: "too many inferred dims", input: []int{-1, 3, -1}, err: errors.New("only one dimension can be inferred")},
		{name: "infer gather", input: []int{2, -1}, want: []int{2, 12}},
		{name: "infer gather all", input: []int{-1}, want: []int{24}},
		{name: "infer split", input: []int{2, -1, 3, 2}, want: []int{2, 2, 3, 2}},
		{name: "indivisible infer", input: []int{2, -1, 2, 4}, err: errors.New("cannot infer dimension")},
		{name: "infer zero dim", input: []int{2, 0, 4}, err: errors.New("dimension cannot be zero")},
	}

	ctx := setup(t)
	tensor, ok := ctx.Empty(ml.DTypeF32, 2, 3, 4).(*Tensor)
	if !ok {
		t.Fatal("expected *Tensor")
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			// inferShape reports invalid input by panicking, so the
			// expectation is checked in a deferred recover.
			defer func() {
				r := recover()
				switch {
				case r == nil && tt.err == nil:
					// all good
				case tt.err == nil:
					t.Errorf("unexpected panic: %v", r)
				case r == nil:
					t.Errorf("expected panic but did not get one: %v", tt.err)
				default:
					if errStr, ok := r.(string); ok && errStr != tt.err.Error() {
						t.Errorf("expected panic %q but got %q", tt.err.Error(), errStr)
					}
				}
			}()

			// inferShape mutates tt.input in place; compare it to want.
			inferShape(tensor, tt.input)
			if diff := cmp.Diff(tt.want, tt.input); diff != "" {
				t.Errorf("%s: shape mismatch (-want +got):\n%s", tt.name, diff)
			}
		})
	}
}

View File

@@ -391,6 +391,10 @@ func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison {
if a.PCIID != b.PCIID {
return UniqueDevice
}
// If PCIID is empty, we have to use ID + library for uniqueness
if a.PCIID == "" && a.DeviceID != b.DeviceID {
return UniqueDevice
}
if a.Library == b.Library {
return SameBackendDevice
}
@@ -454,13 +458,13 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
var envVar string
switch d.Library {
case "ROCm":
// ROCm must be filtered as it can crash the runner on unsupported devices
envVar = "ROCR_VISIBLE_DEVICES"
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
}
case "Vulkan":
envVar = "GGML_VK_VISIBLE_DEVICES"
default:
// CUDA and Vulkan are not filtered via env var, but via scheduling decisions
return
}
v, existing := env[envVar]

View File

@@ -4,8 +4,27 @@ import "github.com/ollama/ollama/ml"
// Conv2D is a 2D convolution layer whose tensors are populated from the
// "weight" and "bias" entries named in the gguf struct tags.
type Conv2D struct {
// Weight is the tensor Forward calls Conv2D on.
Weight ml.Tensor `gguf:"weight"`
// Bias is optional; Forward only applies it when it is non-nil.
Bias ml.Tensor `gguf:"bias"`
}
func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
t = m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
if m.Bias != nil {
// Bias shape is (out_channels,) while t shape is (width, height, out_channels, batch)
t = t.Add(ctx, m.Bias.Reshape(ctx, 1, 1, -1))
}
return t
}
// Conv3D is a 3D convolution layer whose tensors are populated from the
// "weight" and "bias" entries named in the gguf struct tags.
type Conv3D struct {
// Weight is the tensor Forward calls Conv3D on.
Weight ml.Tensor `gguf:"weight"`
// Bias is optional; Forward only adds it when it is non-nil.
Bias ml.Tensor `gguf:"bias"`
}
// Forward calls Conv3D on the layer's Weight with input t, channel count c,
// strides (s0, s1, s2), paddings (p0, p1, p2) and dilations (d0, d1, d2).
// When the layer has a Bias it is added to the result as-is.
func (m *Conv3D) Forward(ctx ml.Context, t ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
	out := m.Weight.Conv3D(ctx, t, c, s0, s1, s2, p0, p1, p2, d0, d1, d2)
	if m.Bias == nil {
		return out
	}

	return out.Add(ctx, m.Bias)
}

View File

@@ -14,4 +14,5 @@ import (
_ "github.com/ollama/ollama/model/models/qwen2"
_ "github.com/ollama/ollama/model/models/qwen25vl"
_ "github.com/ollama/ollama/model/models/qwen3"
_ "github.com/ollama/ollama/model/models/qwen3vl"
)

View File

@@ -3,6 +3,7 @@ package qwen3
import (
"cmp"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -210,7 +211,7 @@ var _ model.Model = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
layers := make([]Layer, c.Uint("block_count"))
for i := range layers {
if c.String("general.architecture") == "qwen3moe" {
if strings.HasSuffix(c.String("general.architecture"), "moe") {
layers[i].MLP = &sparse{}
} else {
layers[i].MLP = &dense{}

View File

@@ -0,0 +1,194 @@
package qwen3vl
import (
"fmt"
"image"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/imageproc"
)
// ImageProcessor contains configuration for the Qwen 3 VL image processing
// pipeline: resizing (SmartResize), normalization and patch extraction
// (ProcessImage / createPatches).
type ImageProcessor struct {
// numChannels is the number of image channels packed into each patch.
numChannels int
// patchSize is the edge length, in pixels, of one square patch.
patchSize int
// temporalPatchSize is the number of frames stored per patch; for still
// images createPatches duplicates the single frame to fill them.
temporalPatchSize int
// mergeSize is the edge length, in patches, of the group of patches that
// createPatches emits consecutively.
mergeSize int
// shortestEdge is the lower bound SmartResize enforces on the resized
// height*width product (a pixel count, despite the name).
shortestEdge int
// longestEdge is the upper bound SmartResize enforces on the resized
// height*width product (a pixel count, despite the name).
longestEdge int
// factor is patchSize*mergeSize; SmartResize rounds both dimensions to a
// multiple of it.
factor int
// rescaleFactor is set to 1/255 by newImageProcessor.
// NOTE(review): not read by the code visible here — confirm it is still used.
rescaleFactor float32
// imageMean and imageStd are the per-channel normalization parameters;
// ProcessImage reads their first three entries.
imageMean []float32
imageStd []float32
}
// newImageProcessor builds an ImageProcessor from the "vision.*" keys of the
// model configuration. Each lookup supplies a fallback value; temporal patch
// size, longest edge and rescale factor are fixed rather than read from c.
func newImageProcessor(c fs.Config) ImageProcessor {
	var p ImageProcessor

	p.patchSize = int(c.Uint("vision.patch_size", 14))
	p.mergeSize = int(c.Uint("vision.spatial_merge_size", 2))
	p.numChannels = int(c.Uint("vision.num_channels", 3)) // not set
	p.temporalPatchSize = 2
	p.shortestEdge = int(c.Uint("vision.shortest_edge", 64<<10))

	// FIXME(mxyng): the model defined longest edge (16M) is too large for the default
	// context length of 8K and will panic. Adjusting to 2M for now.
	// p.longestEdge = int(c.Uint("vision.longest_edge", 16<<20))
	p.longestEdge = 2 << 20

	// Resized dimensions must be a multiple of one merged group of patches.
	p.factor = p.patchSize * p.mergeSize
	p.rescaleFactor = 1.0 / 255.0
	p.imageMean = c.Floats("vision.image_mean", imageproc.ImageNetStandardMean[:])
	p.imageStd = c.Floats("vision.image_std", imageproc.ImageNetStandardSTD[:])

	return p
}
// SmartResize returns the (height, width) an image should be resized to so
// that both dimensions are multiples of p.factor and the resulting pixel count
// lies within [p.shortestEdge, p.longestEdge] where possible.
//
// Both dimensions are first rounded to the nearest multiple of factor. If the
// rounded area exceeds p.longestEdge the image is scaled down (rounding each
// side down to a multiple of factor); if it is below p.shortestEdge the image
// is scaled up (rounding each side up).
//
// It panics if either dimension is smaller than factor, or if the ratio of the
// longer side to the shorter side exceeds 200.
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
	factor := p.factor
	if height < factor || width < factor {
		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
	}

	// Compare by cross-multiplication rather than integer division: dividing
	// first would truncate e.g. 200.9 to 200 and let it pass the limit.
	if longSide, shortSide := max(height, width), min(height, width); longSide > 200*shortSide {
		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", float64(longSide)/float64(shortSide)))
	}

	round := func(x float64) int { return int(math.RoundToEven(x)) }

	hBar := round(float64(height)/float64(factor)) * factor
	wBar := round(float64(width)/float64(factor)) * factor

	if hBar*wBar > p.longestEdge {
		// Too many pixels: shrink both sides by the same ratio, rounding down
		// so the result stays within the limit.
		beta := math.Sqrt(float64(height*width) / float64(p.longestEdge))
		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
	} else if hBar*wBar < p.shortestEdge {
		// Too few pixels: grow both sides by the same ratio, rounding up so
		// the result reaches the minimum.
		beta := math.Sqrt(float64(p.shortestEdge) / float64(height*width))
		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
	}

	return hBar, wBar
}
// Grid describes how a processed image is divided into patches.
type Grid struct {
// Height is the number of patch rows (resized height / patch size).
Height int
// Width is the number of patch columns (resized width / patch size).
Width int
// Temporal is the number of temporal steps; ProcessImage sets it to 1 for
// single images.
Temporal int
}
// ProcessImage converts img into the patch tensor consumed by the vision
// model.
//
// The image is resized to the dimensions chosen by SmartResize (which panics
// on images it rejects), normalized with the first three entries of
// p.imageMean / p.imageStd into channel-first order, and rearranged into
// patches by createPatches.
//
// It returns the tensor built by ctx.Input().FromFloats with dimensions
// (patchDim, numPatches), the patch Grid of the resized image, and a wrapped
// error if patch creation fails.
func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) {
	bounds := img.Bounds()
	resizedHeight, resizedWidth := p.SmartResize(bounds.Dy(), bounds.Dx())

	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)

	normalizedPixels := imageproc.Normalize(
		resizedImg,
		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
		true, // rescale
		true, // channelFirst
	)

	// Grid dimensions are counted in patches, not pixels.
	grid := &Grid{
		Height:   resizedHeight / p.patchSize,
		Width:    resizedWidth / p.patchSize,
		Temporal: 1, // For single images, temporal dimension is 1
	}

	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
	if err != nil {
		// Wrap with %w so callers can inspect the cause with errors.Is/As.
		return nil, nil, fmt.Errorf("failed to create patches: %w", err)
	}

	patchDim := p.numChannels * p.temporalPatchSize * p.patchSize * p.patchSize
	numPatches := grid.Temporal * grid.Height * grid.Width

	pixelValues := ctx.Input().FromFloats(patches, patchDim, numPatches)

	return pixelValues, grid, nil
}
// createPatches rearranges channel-first pixel data (height x width per
// channel) into a flat slice of patches.
//
// Patches are emitted in groups of mergeSize x mergeSize neighbours, walking
// the grid left to right and top to bottom. Each patch is laid out per channel
// as temporalPatchSize frames of patchSize x patchSize values; only the first
// frame is read from pixels, the remaining frames are copies of it.
//
// The returned error is always nil.
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
	var (
		numChannels = p.numChannels
		patch       = p.patchSize
		merge       = p.mergeSize
		frames      = p.temporalPatchSize
	)

	frameSize := patch * patch
	channelSize := frames * frameSize
	patchDim := numChannels * channelSize

	out := make([]float32, grid.Temporal*grid.Height*grid.Width*patchDim)

	next := 0 // index of the patch being written
	for t := 0; t < grid.Temporal; t++ {
		for h := 0; h < grid.Height; h += merge {
			for w := 0; w < grid.Width; w += merge {
				// Visit every patch of the current merge group.
				for mh := 0; mh < merge; mh++ {
					for mw := 0; mw < merge; mw++ {
						base := next * patchDim

						for c := 0; c < numChannels; c++ {
							first := base + c*channelSize

							// Fill the first temporal frame from the source image.
							for py := 0; py < patch; py++ {
								y := (h+mh)*patch + py
								for px := 0; px < patch; px++ {
									x := (w+mw)*patch + px

									src := c*height*width + y*width + x // CHW layout
									dst := first + py*patch + px
									if src < len(pixels) && dst < len(out) {
										out[dst] = pixels[src]
									}
								}
							}

							// Replicate that frame into the remaining temporal slots.
							for tp := 1; tp < frames; tp++ {
								frame := first + tp*frameSize
								copy(out[frame:frame+frameSize], out[first:first+frameSize])
							}
						}

						next++
					}
				}
			}
		}
	}

	return out, nil
}

View File

@@ -0,0 +1,204 @@
package qwen3vl
import (
"bytes"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
// Model is the Qwen 3 VL model. It embeds the text model, the vision model
// and the image processor, so their fields and methods (e.g. ProcessImage)
// are promoted onto Model.
type Model struct {
model.Base
model.TextProcessor
*TextModel
// VisionModel's tensors are tagged with the "v" gguf prefix.
*VisionModel `gguf:"v"`
ImageProcessor
// positionCache holds one position id per input emitted by the most recent
// PostTokenize call; Forward uses it to translate batch positions.
positionCache []int32
}
// EncodeMultimodal decodes multimodalData as an image, preprocesses it with
// ProcessImage and runs it through the vision model.
//
// The first returned entry carries the vision output tensor together with the
// image's patch Grid in Data; each deepstack visual embedding follows as its
// own entry with only Tensor set. It returns model.ErrNoVisionModel when the
// vision model has no layers, and passes through decode and preprocessing
// errors unchanged.
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
	if len(m.VisionModel.Layers) == 0 {
		return nil, model.ErrNoVisionModel
	}

	img, _, err := image.Decode(bytes.NewReader(multimodalData))
	if err != nil {
		return nil, err
	}

	pixelValues, grid, err := m.ProcessImage(ctx, img)
	if err != nil {
		return nil, err
	}

	visionOutputs, deepstack := m.VisionModel.Forward(ctx, pixelValues, grid)

	encoded := make([]input.Multimodal, 0, 1+len(deepstack))
	encoded = append(encoded, input.Multimodal{Tensor: visionOutputs, Data: grid})
	for _, embed := range deepstack {
		encoded = append(encoded, input.Multimodal{Tensor: embed})
	}

	return encoded, nil
}
// Token ids that PostTokenize inserts in place of an image input.
// NOTE(review): presumably these are the Qwen tokenizer's vision special-token
// ids — confirm against the model vocabulary.
var (
// tokenVision is the placeholder emitted once per column (Dim(1)) of the
// vision output tensor.
tokenVision int32 = 151655
// tokenVisionStart is emitted immediately before the placeholders.
tokenVisionStart int32 = 151652
// tokenVisionEnd is emitted immediately after the placeholders.
tokenVisionEnd int32 = 151653
)
// modelInput pairs an input with the position id PostTokenize assigns to it.
type modelInput struct {
*input.Input
// position is the explicit position id; PostTokenize treats 0 as "unset"
// and substitutes the previous cached position plus one.
position int32
}
// PostTokenize arranges Qwen 3 VL's inputs for the forward pass.
//
// Inputs without multimodal data are passed through unchanged. Each input that
// carries multimodal data is replaced by a vision-start token, one tokenVision
// placeholder per column of its vision tensor (the first placeholder carries
// the multimodal payload), and a vision-end token.
//
// As a side effect, m.positionCache is reset and refilled with one position id
// per emitted input. The returned error is always nil.
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
// Reuse the cache's backing array but discard positions from earlier calls.
m.positionCache = m.positionCache[:0]
return slices.Collect(func(yield func(*input.Input) bool) {
for i := range inputs {
// Default: the input is emitted as-is with an unset (zero) position.
s := []modelInput{{Input: inputs[i]}}
if mm := inputs[i].Multimodal; mm != nil {
t := mm[0].Tensor
// t.Dim(1) placeholders plus one start and one end token. Note that
// slices.Repeat copies the element value, so every placeholder shares
// the same *input.Input pointer until overwritten below.
s = slices.Repeat([]modelInput{
{
position: int32(i + 1),
Input: &input.Input{Token: tokenVision},
},
}, t.Dim(1)+1+1)
// NOTE(review): positions here are derived from i, the index into the
// original inputs, not from the last cached position — confirm this is
// intended when expanded images or other inputs precede this one.
s[0] = modelInput{
Input: &input.Input{Token: tokenVisionStart},
position: int32(i),
}
// The end token's position skips ahead by the merged grid width.
// The unchecked type assertion panics if Data is not a *Grid.
s[len(s)-1] = modelInput{
Input: &input.Input{Token: tokenVisionEnd},
position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
}
// Only the first placeholder carries the multimodal data; SameBatch is
// set to the number of placeholders, t.Dim(1).
s[1] = modelInput{
Input: &input.Input{
Token: tokenVision,
Multimodal: inputs[i].Multimodal,
MultimodalHash: inputs[i].MultimodalHash,
SameBatch: t.Dim(1),
},
position: int32(i + 1),
}
}
for _, e := range s {
position := e.position
// A zero position means "unset": continue from the previous position.
// The very first emitted input keeps position 0 because the cache is empty.
if position == 0 && len(m.positionCache) > 0 {
position = m.positionCache[len(m.positionCache)-1] + 1
}
m.positionCache = append(m.positionCache, position)
// Stop early if the consumer of the sequence stops accepting values.
if !yield(e.Input) {
return
}
}
}
}), nil
}
// Forward runs the text decoder over one batch and returns the output
// projection of the final hidden states.
//
// Three rows of position ids are built per token. All rows start from the
// same id, taken from positionCache when the batch position is covered by it
// and extrapolated from the last cached id otherwise. For image tokens the
// second and third rows are offset by the token's row and column within the
// merged image grid.
//
// Vision embeddings are copied over the placeholder token embeddings. The
// deepstack feature tensors of every image in the batch are copied into
// zero-initialized buffers shaped like the hidden states; buffer i is added to
// the hidden states after decoder layer i.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions)))
	for i, id := range batch.Positions {
		if id < int32(len(m.positionCache)) {
			id = m.positionCache[id]
		} else if len(m.positionCache) > 0 {
			// Past the cached range: continue counting from the last cached id.
			id = id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
		}

		positionSlice[0][i] = id
		positionSlice[1][i] = id
		positionSlice[2][i] = id
	}

	hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)

	var deepstackVisualEmbeds []ml.Tensor
	for _, mi := range batch.Multimodal {
		visionOutputs := mi.Multimodal[0].Tensor
		// Overwrite the placeholder embeddings starting at the image's batch index.
		ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
			for i := range visionOutputs.Dim(1) {
				w := grid.Width / m.spatialMergeSize
				positionSlice[1][mi.Index+i] += int32(i / w)
				positionSlice[2][mi.Index+i] += int32(i % w)
			}
		}

		for i, mm := range mi.Multimodal[1:] {
			// Allocate each deepstack buffer only once per batch. Re-allocating
			// it for every image would drop the features of all images but the
			// last one when a batch contains several images.
			if i >= len(deepstackVisualEmbeds) {
				deepstackVisualEmbeds = append(deepstackVisualEmbeds, ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...))
			}

			ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
		}
	}

	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice))
	cos, sin := m.rotaryEmbedding(ctx, positions)
	for i, layer := range m.TextModel.Layers {
		if m.Cache != nil {
			m.Cache.SetLayer(i)
		}

		// Only the last layer narrows the hidden states to the requested outputs.
		var outputs ml.Tensor
		if i == len(m.TextModel.Layers)-1 {
			outputs = batch.Outputs
		}

		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options)
		if i < len(deepstackVisualEmbeds) {
			hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
		}
	}

	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, 1e-06)
	return m.Output.Forward(ctx, hiddenStates), nil
}
// New builds a Qwen 3 VL model from the given configuration: a byte-pair
// tokenizer populated from the tokenizer.ggml.* keys, the text model, the
// vision model and the image processor.
//
// The causal cache is created with a callback that discards the position
// cache and reports kvcache.ErrNotSupported.
func New(c fs.Config) (model.Model, error) {
	vocabulary := &model.Vocabulary{
		Values: c.Strings("tokenizer.ggml.tokens"),
		Types:  c.Ints("tokenizer.ggml.token_type"),
		Merges: c.Strings("tokenizer.ggml.merges"),
		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
		EOS: append(
			[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
			c.Ints("tokenizer.ggml.eos_token_ids")...,
		),
	}

	// Pre-tokenization pattern handed to the byte-pair encoder.
	const pretokenizer = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	m := Model{
		TextProcessor:  model.NewBytePairEncoding(vocabulary, pretokenizer),
		TextModel:      newTextModel(c),
		VisionModel:    newVisionModel(c),
		ImageProcessor: newImageProcessor(c),
	}

	// The callback captures m, so it must be installed after m exists.
	onShift := func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) {
		m.positionCache = nil
		return nil, kvcache.ErrNotSupported
	}
	m.Cache = kvcache.NewCausalCache(onShift)

	return &m, nil
}
// init registers New as the constructor for both the dense and the
// mixture-of-experts architecture names.
func init() {
	for _, architecture := range []string{"qwen3vl", "qwen3vlmoe"} {
		model.Register(architecture, New)
	}
}

View File

@@ -0,0 +1,229 @@
package qwen3vl
import (
"cmp"
"math"
"slices"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type TextOptions struct {
hiddenSize,
numHeads,
numKVHeads,
keyLength,
valueLength int
eps,
ropeBase,
ropeScale float32
mropeSections []int
numExperts, numExpertsUsed int
normTopKProb bool
inverseFrequenciesCache []float32
}
func (o TextOptions) headDim() int {
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
}
// TextAttention holds the projections of one self-attention block of the text
// decoder, plus the RMS norms applied to the reshaped queries and keys.
type TextAttention struct {
Query *nn.Linear `gguf:"attn_q"`
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
Key *nn.Linear `gguf:"attn_k"`
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
// Forward computes self-attention for the text decoder. Queries, keys and
// values are projected and split into heads; queries and keys are RMS
// normalized and rotated with the supplied cos/sin tables; attention is scaled
// by 1/sqrt(headDim) and the result is flattened back to one vector per token
// before the output projection.
func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
	tokens := hiddenStates.Dim(1)
	headDim := opts.headDim()

	q := sa.Query.Forward(ctx, hiddenStates)
	k := sa.Key.Forward(ctx, hiddenStates)
	v := sa.Value.Forward(ctx, hiddenStates)

	q = q.Reshape(ctx, headDim, opts.numHeads, tokens)
	k = k.Reshape(ctx, headDim, opts.numKVHeads, tokens)
	v = v.Reshape(ctx, headDim, opts.numKVHeads, tokens)

	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
	k = sa.KeyNorm.Forward(ctx, k, opts.eps)

	q = applyRotaryPositionalEmbedding(ctx, q, cos, sin)
	k = applyRotaryPositionalEmbedding(ctx, k, cos, sin)

	scale := 1. / math.Sqrt(float64(headDim))
	attended := nn.Attention(ctx, q, k, v, scale, cache)
	attended = attended.Reshape(ctx, attended.Dim(0)*attended.Dim(1), tokens)
	return sa.Output.Forward(ctx, attended)
}
// TextMLP is the feed-forward part of a text decoder layer. newTextModel
// installs either the sparse or the dense implementation.
type TextMLP interface {
Forward(ml.Context, ml.Tensor, *TextOptions) ml.Tensor
}
// sparse is the mixture-of-experts feed-forward block: a router plus batched
// gate, up and down projections that are indexed by the selected experts.
type sparse struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Gate *nn.LinearBatch `gguf:"ffn_gate_exps"`
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
}
// Forward routes every token to opts.numExpertsUsed experts and returns the
// weighted sum of the selected experts' outputs.
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
// Flatten to one column per token.
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
// Softmax over the router logits, then pick the top-k experts per token.
routingWeights := routerLogits.Softmax(ctx)
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
// Gather the routing weight of each selected expert.
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
if opts.normTopKProb {
// Renormalize so the selected experts' weights sum to one per token.
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
}
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
// Gated feed-forward through the selected experts.
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates, selectedExperts))
experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
// Sum the weighted expert outputs by adding one strided view per used expert.
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
return nextStates
}
// dense is the non-expert feed-forward block with gate, up and down
// projections.
type dense struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
// Forward applies the gated feed-forward block: the gate projection is
// combined with the up projection through SILU and the result is passed
// through the down projection. The options argument is unused.
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *TextOptions) ml.Tensor {
	gate := mlp.Gate.Forward(ctx, hiddenStates)
	up := mlp.Up.Forward(ctx, hiddenStates)
	activated := gate.SILU(ctx, up)
	return mlp.Down.Forward(ctx, activated)
}
// TextLayer is one decoder layer of the text model: a normalized attention
// block followed by a normalized feed-forward block (dense or sparse).
type TextLayer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
*TextAttention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
TextMLP
}
// Forward runs one decoder layer: normalized self-attention with a residual
// connection, followed by a normalized feed-forward block with a second
// residual connection. When outputs is non-nil, both the attention result and
// its residual are narrowed to the rows selected by outputs before the
// feed-forward block runs.
func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
	skip := hiddenStates

	attended := d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
	attended = d.TextAttention.Forward(ctx, attended, cos, sin, cache, opts)
	if outputs != nil {
		attended = attended.Rows(ctx, outputs)
		skip = skip.Rows(ctx, outputs)
	}
	afterAttention := attended.Add(ctx, skip)

	projected := d.MLPNorm.Forward(ctx, afterAttention, opts.eps)
	projected = d.TextMLP.Forward(ctx, projected, opts)
	return projected.Add(ctx, afterAttention)
}
// TextModel is the text decoder: token embedding, a stack of decoder layers,
// the final norm and the output projection.
type TextModel struct {
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
// NOTE(review): the "alt:token_embd" tag presumably lets the output
// projection fall back to the token embedding weights — confirm against the
// gguf tag handling.
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []TextLayer `gguf:"blk"`
Options *TextOptions
}
// rotaryEmbedding builds the cos and sin tables for the given position ids.
//
// Inverse frequencies 1/ropeBase^(2i/headDim) are computed on first use and
// kept on m.Options. The frequencies of the second and third position rows are
// then copied into every third slot (starting at offsets 1 and 2) of the first
// row's frequencies, for mropeSections[1] and mropeSections[2] slots
// respectively, so the three position axes end up interleaved.
func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) {
positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1))
// Lazily fill the inverse-frequency cache.
// NOTE(review): the cache is written without synchronization — confirm this
// is never reached concurrently.
if len(m.Options.inverseFrequenciesCache) == 0 {
m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2)
for i := range m.Options.inverseFrequenciesCache {
frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim())))
m.Options.inverseFrequenciesCache[i] = 1 / frequency
}
}
inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache))
positions = positions.Cast(ctx, ml.DTypeF32)
frequencies := inverseFrequencies.Mulmat(ctx, positions)
// View over the first position row; it is the destination of the copies below.
interleaved := frequencies.View(ctx,
0, frequencies.Dim(0),
frequencies.Stride(1), frequencies.Dim(1),
)
for _, i := range []int{1, 2} {
// Offset i, stride 3 elements, mropeSections[i] slots.
// Indexing mropeSections[i] requires at least three sections.
args := []int{
i * frequencies.Stride(0), 1,
3 * frequencies.Stride(0), m.Options.mropeSections[i],
frequencies.Stride(1), frequencies.Dim(1),
}
// Source: the same slots taken from position row i.
ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...).
Copy(ctx, interleaved.View(ctx, args[0], args[1:]...)))
}
// Duplicate along the first dimension so the table spans the full head
// dimension.
interleaved = interleaved.Concat(ctx, interleaved, 0)
interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2))
return interleaved.Cos(ctx), interleaved.Sin(ctx)
}
// Compile-time check that *Model implements model.Model.
var _ model.Model = (*Model)(nil)
// newTextModel builds the text decoder skeleton from the configuration. Every
// layer receives a sparse (mixture-of-experts) feed-forward block when the
// architecture name ends in "moe" and a dense one otherwise.
func newTextModel(c fs.Config) *TextModel {
	layers := make([]TextLayer, c.Uint("block_count"))
	useExperts := strings.HasSuffix(c.String("general.architecture"), "moe")
	for i := range layers {
		if useExperts {
			layers[i].TextMLP = &sparse{}
			continue
		}
		layers[i].TextMLP = &dense{}
	}

	// Widen the configured int32 sections to int.
	sections := slices.Collect(func(yield func(int) bool) {
		for _, n := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
			if !yield(int(n)) {
				return
			}
		}
	})

	options := &TextOptions{
		hiddenSize:     int(c.Uint("embedding_length")),
		numHeads:       int(c.Uint("attention.head_count")),
		numKVHeads:     int(c.Uint("attention.head_count_kv")),
		keyLength:      int(c.Uint("attention.key_length")),
		valueLength:    int(c.Uint("attention.value_length")),
		eps:            c.Float("attention.layer_norm_rms_epsilon"),
		ropeBase:       c.Float("rope.freq_base"),
		ropeScale:      c.Float("rope.scaling.factor", 1),
		numExperts:     int(c.Uint("expert_count")),
		numExpertsUsed: int(c.Uint("expert_used_count")),
		normTopKProb:   c.Bool("norm_top_k_prob", true),
		mropeSections:  sections,
	}

	return &TextModel{Layers: layers, Options: options}
}

View File

@@ -0,0 +1,268 @@
package qwen3vl
import (
"iter"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
// VisionAttention holds the query, key, value and output projections of one
// vision encoder attention block.
type VisionAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"`
}
// rotateHalf splits t in two halves along its first dimension and returns the
// negated second half concatenated with the first half.
func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
	half := t.Dim(0) / 2
	secondHalfOffset := t.Stride(0) * t.Dim(0) / 2

	first := t.View(ctx, 0, half, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
	second := t.View(ctx, secondHalfOffset, half, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
	second = second.Contiguous(ctx)

	negated := second.Scale(ctx, -1)
	return negated.Concat(ctx, first, 0)
}
// applyRotaryPositionalEmbedding rotates t with the given tables, returning
// t*cos + rotateHalf(t)*sin.
func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
	scaled := t.Mul(ctx, cos)
	rotated := rotateHalf(ctx, t).Mul(ctx, sin)
	return scaled.Add(ctx, rotated)
}
// Forward computes uncached self-attention for the vision encoder. Queries and
// keys are split into heads and rotated with the supplied cos/sin tables,
// values are only split into heads, and the attention scale is headDim^-0.5.
func (sa *VisionAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
	headDim := opts.headDim()

	q := sa.Query.Forward(ctx, hiddenStates)
	q = q.Reshape(ctx, headDim, opts.numHeads, q.Dim(1))
	q = applyRotaryPositionalEmbedding(ctx, q, cos, sin)

	k := sa.Key.Forward(ctx, hiddenStates)
	k = k.Reshape(ctx, headDim, opts.numHeads, k.Dim(1))
	k = applyRotaryPositionalEmbedding(ctx, k, cos, sin)

	v := sa.Value.Forward(ctx, hiddenStates)
	v = v.Reshape(ctx, headDim, opts.numHeads, v.Dim(1))

	scale := math.Pow(float64(headDim), -0.5)
	attended := nn.Attention(ctx, q, k, v, scale, nil)
	attended = attended.Reshape(ctx, opts.hiddenSize, attended.Dim(2))
	return sa.Output.Forward(ctx, attended)
}
// VisionMLP is the two-layer feed-forward block of a vision encoder layer.
type VisionMLP struct {
FC1 *nn.Linear `gguf:"linear_fc1"`
FC2 *nn.Linear `gguf:"linear_fc2"`
}
// Forward applies FC1, a GELU activation and FC2. The options argument is
// unused.
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts VisionOptions) ml.Tensor {
	activated := mlp.FC1.Forward(ctx, hiddenStates).GELU(ctx)
	return mlp.FC2.Forward(ctx, activated)
}
// VisionEncoderLayer is one vision encoder layer: layer norm and attention,
// followed by layer norm and MLP.
type VisionEncoderLayer struct {
Norm1 *nn.LayerNorm `gguf:"norm1"`
// Attention carries no gguf tag of its own.
Attention *VisionAttention
Norm2 *nn.LayerNorm `gguf:"norm2"`
MLP *VisionMLP `gguf:"mlp"`
}
// Forward runs one vision encoder layer: a normalized attention block and a
// normalized MLP block, each wrapped in a residual connection.
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
	attended := e.Norm1.Forward(ctx, hiddenStates, opts.eps)
	attended = e.Attention.Forward(ctx, attended, cos, sin, opts)
	afterAttention := attended.Add(ctx, hiddenStates)

	projected := e.Norm2.Forward(ctx, afterAttention, opts.eps)
	projected = e.MLP.Forward(ctx, projected, opts)
	return projected.Add(ctx, afterAttention)
}
// VisionOptions holds the hyperparameters of the vision encoder.
type VisionOptions struct {
	hiddenSize        int
	numHeads          int
	patchSize         int
	numChannels       int
	spatialMergeSize  int
	temporalPatchSize int
	// gridPerSide is the side length of the square position embedding table.
	gridPerSide int

	eps       float32
	ropeTheta float32

	// deepstackVisualIndexes lists the encoder layers whose output is also
	// passed through a deepstack merger.
	deepstackVisualIndexes []int32
	mropeSections          []int
}

// headDim returns the per-head dimension, hiddenSize divided by numHeads.
func (o VisionOptions) headDim() int {
	heads := o.numHeads
	return o.hiddenSize / heads
}
// VisionPatchMerger normalizes vision features and projects groups of
// spatialMergeSize*spatialMergeSize patches through a two-layer MLP.
type VisionPatchMerger struct {
Norm *nn.LayerNorm `gguf:"norm"`
FC1 *nn.Linear `gguf:"linear_fc1"`
FC2 *nn.Linear `gguf:"linear_fc2"`
}
// Forward merges spatialMergeSize*spatialMergeSize neighbouring patches into
// one vector and projects it through FC1, GELU and FC2. When postshuffleNorm
// is true the layer norm is applied after the patches have been grouped;
// otherwise it is applied per patch and the grouping happens afterwards.
func (m *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, postshuffleNorm bool, opts VisionOptions) ml.Tensor {
	mergedSize := opts.hiddenSize * opts.spatialMergeSize * opts.spatialMergeSize

	if postshuffleNorm {
		visionOutputs = visionOutputs.Reshape(ctx, mergedSize, -1)
	}

	normalized := m.Norm.Forward(ctx, visionOutputs, opts.eps)
	grouped := normalized.Reshape(ctx, mergedSize, -1)

	activated := m.FC1.Forward(ctx, grouped).GELU(ctx)
	return m.FC2.Forward(ctx, activated)
}
// VisionPositionEmbedding wraps the learned position embedding table of the
// vision encoder.
type VisionPositionEmbedding struct {
PositionEmbedding *nn.Embedding `gguf:"pos_embed"`
}
func makeSlice2D[T int32 | float32](n0, n1 int) iter.Seq[[]T] {
return func(yield func([]T) bool) {
for range n0 {
if !yield(make([]T, n1)) {
return
}
}
}
}
// Forward adds position embeddings to hiddenStates. The learned
// gridPerSide x gridPerSide table is bilinearly interpolated to the image's
// grid.Height x grid.Width patch grid: for every patch the four surrounding
// table entries are looked up, weighted and summed. The result is reordered
// into spatialMergeSize blocks before being added.
func (m *VisionPositionEmbedding) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts VisionOptions) ml.Tensor {
// One row per corner (top-left, top-right, bottom-left, bottom-right).
indexSlice := slices.Collect(makeSlice2D[int32](4, grid.Height*grid.Width))
weightSlice := slices.Collect(makeSlice2D[float32](4, grid.Height*grid.Width))
// Step between neighbouring patches, measured in table cells.
// NOTE(review): divides by (dimension-1); assumes grid.Height and grid.Width
// are greater than 1 — confirm the image processor guarantees this.
stepHeight := float32(opts.gridPerSide-1) / float32(grid.Height-1)
stepWidth := float32(opts.gridPerSide-1) / float32(grid.Width-1)
var i int
for h := range grid.Height {
for w := range grid.Width {
y, x := float32(h)*stepHeight, float32(w)*stepWidth
floorY, floorX := int32(y), int32(x)
// Clamp the upper neighbours to the last table row/column.
ceilY, ceilX := min(floorY+1, int32(opts.gridPerSide-1)), min(floorX+1, int32(opts.gridPerSide-1))
indexSlice[0][i] = floorY*int32(opts.gridPerSide) + floorX
indexSlice[1][i] = floorY*int32(opts.gridPerSide) + ceilX
indexSlice[2][i] = ceilY*int32(opts.gridPerSide) + floorX
indexSlice[3][i] = ceilY*int32(opts.gridPerSide) + ceilX
// Bilinear weights from the fractional parts of y and x.
weightSlice[0][i] = (1 - (y - float32(floorY))) * (1 - (x - float32(floorX)))
weightSlice[1][i] = (1 - (y - float32(floorY))) * (x - float32(floorX))
weightSlice[2][i] = (y - float32(floorY)) * (1 - (x - float32(floorX)))
weightSlice[3][i] = (y - float32(floorY)) * (x - float32(floorX))
i++
}
}
indices := ctx.Input().FromInts(slices.Concat(indexSlice...), grid.Height*grid.Width*4)
weights := ctx.Input().FromFloats(slices.Concat(weightSlice...), 1, grid.Height*grid.Width*4)
n := hiddenStates.Dim(0)
positionEmbeds := m.PositionEmbedding.Forward(ctx, indices)
positionEmbeds = positionEmbeds.Mul(ctx, weights)
// Split the four corners into the last dimension and sum them.
positionEmbeds = positionEmbeds.Reshape(ctx, n, -1, 4)
positionEmbeds = positionEmbeds.View(ctx, 0, n, positionEmbeds.Stride(1), grid.Height*grid.Width).
Add(ctx, positionEmbeds.View(ctx, 1*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
Add(ctx, positionEmbeds.View(ctx, 2*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
Add(ctx, positionEmbeds.View(ctx, 3*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width))
// Reorder into spatial-merge blocks.
positionEmbeds = positionEmbeds.Reshape(ctx, -1, grid.Width/opts.spatialMergeSize, opts.spatialMergeSize, grid.Height/opts.spatialMergeSize)
positionEmbeds = positionEmbeds.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, n, -1)
return hiddenStates.Add(ctx, positionEmbeds)
}
// VisionModel is the vision encoder: patch embedding, interpolated position
// embedding, a stack of encoder layers, the final patch merger and one
// deepstack merger per entry of deepstackVisualIndexes.
type VisionModel struct {
PatchEmbedding *nn.Conv3D `gguf:"patch_embed"`
PositionEmbedding *VisionPositionEmbedding
Layers []VisionEncoderLayer `gguf:"blk"`
PatchMerger *VisionPatchMerger `gguf:"merger"`
DeepstackMerger []*VisionPatchMerger `gguf:"deepstack_merger"`
VisionOptions
}
// positions builds the cos and sin rotary tables for the vision encoder. Each
// patch contributes its (y, x) grid coordinates; the coordinates are reordered
// into spatialMergeSize blocks and used to gather rows from a table of
// i / ropeTheta^(2j/halfDim) values.
func (m *VisionModel) positions(ctx ml.Context, grid *Grid) (_, _ ml.Tensor) {
// Emit y then x for every patch, row by row.
indices := ctx.Input().FromInts(slices.Collect(func(yield func(int32) bool) {
for y := range grid.Height {
for x := range grid.Width {
if !yield(int32(y)) {
return
}
if !yield(int32(x)) {
return
}
}
}
}), grid.Width*grid.Height*2)
// Reorder the coordinate pairs into spatial-merge blocks, then flatten.
indices = indices.Reshape(ctx, -1, grid.Width/m.spatialMergeSize, m.spatialMergeSize, grid.Height/m.spatialMergeSize)
indices = indices.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
indices = indices.Reshape(ctx, -1)
halfDim := m.headDim() / 2
maxGrid := max(grid.Height, grid.Width)
// Frequency table: one row per coordinate value below maxGrid, halfDim/2
// entries per row.
frequencies := ctx.Input().FromFloats(slices.Collect(func(yield func(float32) bool) {
ropeTheta := float64(m.ropeTheta)
for i := range maxGrid {
for j := range halfDim / 2 {
if !yield(float32(i) / float32(math.Pow(ropeTheta, float64(j*2)/float64(halfDim)))) {
return
}
}
}
}), halfDim/2, maxGrid)
// Gather one row per coordinate, then pair the y and x rows of each patch.
embeds := frequencies.Rows(ctx, indices)
embeds = embeds.Reshape(ctx, halfDim, 1, -1)
// Duplicate along the first dimension so the table spans the full head
// dimension.
embeds = embeds.Concat(ctx, embeds, 0)
return embeds.Cos(ctx), embeds.Sin(ctx)
}
// Forward runs the vision encoder over preprocessed pixel values. It returns
// the merged output of the last layer and, for every layer listed in
// deepstackVisualIndexes, the output of that layer passed through its
// deepstack merger. The deepstack results are ordered like
// deepstackVisualIndexes.
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) (ml.Tensor, []ml.Tensor) {
	pixelValues = pixelValues.Reshape(ctx, m.patchSize, m.patchSize, m.temporalPatchSize, -1)

	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.numChannels, m.patchSize, m.patchSize, m.temporalPatchSize, 0, 0, 0, 1, 1, 1)
	hiddenStates = m.PositionEmbedding.Forward(ctx, hiddenStates, grid, m.VisionOptions)

	cos, sin := m.positions(ctx, grid)

	deepstack := make([]ml.Tensor, len(m.deepstackVisualIndexes))
	for layerIndex, layer := range m.Layers {
		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, m.VisionOptions)

		// Tap this layer's output when it is one of the deepstack layers.
		slot := slices.Index(m.deepstackVisualIndexes, int32(layerIndex))
		if slot < 0 {
			continue
		}
		deepstack[slot] = m.DeepstackMerger[slot].Forward(ctx, hiddenStates, true, m.VisionOptions)
	}

	merged := m.PatchMerger.Forward(ctx, hiddenStates, false, m.VisionOptions)
	return merged, deepstack
}
// newVisionModel builds the vision encoder skeleton from the configuration,
// using the defaults given below for every missing vision.* key. One deepstack
// merger slot is reserved per configured deepstack layer index.
func newVisionModel(c fs.Config) *VisionModel {
	deepstackIndexes := c.Ints("vision.deepstack_visual_indexes")

	// Widen the configured int32 sections to int.
	sections := slices.Collect(func(yield func(int) bool) {
		for _, n := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
			if !yield(int(n)) {
				return
			}
		}
	})

	// The position embedding table is square, so its side is the square root
	// of the number of entries.
	positionCount := c.Uint("vision.num_positional_embeddings", 2304)

	options := VisionOptions{
		hiddenSize:             int(c.Uint("vision.embedding_length", 1280)),
		numHeads:               int(c.Uint("vision.attention.head_count", 16)),
		patchSize:              int(c.Uint("vision.patch_size", 14)),
		numChannels:            int(c.Uint("vision.num_channels", 3)),
		eps:                    c.Float("vision.attention.layer_norm_epsilon", 1e-6),
		ropeTheta:              c.Float("vision.rope.freq_base", 10000.0),
		spatialMergeSize:       int(c.Uint("vision.spatial_merge_size", 2)),
		temporalPatchSize:      int(c.Uint("vision.temporal_patch_size", 2)),
		gridPerSide:            int(math.Sqrt(float64(positionCount))),
		mropeSections:          sections,
		deepstackVisualIndexes: deepstackIndexes,
	}

	return &VisionModel{
		Layers:          make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
		DeepstackMerger: make([]*VisionPatchMerger, len(deepstackIndexes)),
		VisionOptions:   options,
	}
}

View File

@@ -709,13 +709,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{
embedding: true,
truncate: req.Truncate,
// TODO (jmorganca): this should be provided by the server via the
// request options and truncated here in the runner, instead of relying on
// the server's truncate logic
truncate: true,
})
if err != nil {
if errors.Is(err, errorInputTooLong) {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
return
}
@@ -758,8 +758,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
embedding := <-seq.embedding
if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
Embedding: embedding,
PromptEvalCount: seq.numPromptInputs,
Embedding: embedding,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}

View File

@@ -235,15 +235,28 @@ func countCommonPrefix(a []*input.Input, b []*input.Input) int32 {
return count
}
// TODO(jessegross): If we need to reprocess the inputs we should ensure that
// we don't split up a SameBatch
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
// ShiftDiscard computes how many inputs can be discarded from the cache. Inputs in the same batch
// are discarded together.
func (c *InputCache) ShiftDiscard(inputs []*input.Input, numKeep int32) int32 {
targetFree := max((c.numCtx-numKeep)/2, 1)
currentFree := c.numCtx - int32(len(inputs))
currentFree := c.numCtx - inputLen
var discard, sameBatch int32
for _, input := range inputs[numKeep:] {
if sameBatch <= 0 && currentFree >= targetFree {
break
}
return max(targetFree-currentFree, 0)
sameBatch--
currentFree++
discard++
if input.SameBatch > 0 {
sameBatch = int32(input.SameBatch)
}
}
return discard
}
type ErrReprocessInputs struct {
@@ -264,7 +277,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
}
inputLen := int32(len(slot.Inputs))
discard := c.ShiftDiscard(inputLen, numKeep)
discard := c.ShiftDiscard(slot.Inputs, numKeep)
if discard <= 0 {
return nil

View File

@@ -3,6 +3,7 @@ package ollamarunner
import (
"errors"
"fmt"
"slices"
"testing"
"time"
@@ -238,59 +239,137 @@ func TestShiftDiscard(t *testing.T) {
name string
numCtx int32
numKeep int32
inputLen int32
inputs []*input.Input
expected int32
}{
{
name: "Shift",
numCtx: 2048,
numKeep: 5,
inputLen: 2048,
inputs: slices.Repeat([]*input.Input{{}}, 2048),
expected: 1021,
},
{
name: "Max Keep",
numCtx: 2048,
numKeep: 2047,
inputLen: 2048,
inputs: slices.Repeat([]*input.Input{{}}, 2048),
expected: 1,
},
{
name: "No Keep",
numCtx: 2048,
numKeep: 0,
inputLen: 2048,
inputs: slices.Repeat([]*input.Input{{}}, 2048),
expected: 1024,
},
{
name: "Truncate",
numCtx: 2048,
numKeep: 5,
inputLen: 5000,
inputs: slices.Repeat([]*input.Input{{}}, 5000),
expected: 3973,
},
{
name: "Truncate Keep",
numCtx: 2048,
numKeep: 2047,
inputLen: 5000,
inputs: slices.Repeat([]*input.Input{{}}, 5000),
expected: 2953,
},
{
name: "No Op",
numCtx: 2048,
numKeep: 5,
inputLen: 512,
inputs: slices.Repeat([]*input.Input{{}}, 512),
expected: 0,
},
{
name: "Same Batch",
numCtx: 2048,
numKeep: 5,
inputs: slices.Collect(func(yield func(*input.Input) bool) {
for range 1024 {
if !yield(&input.Input{}) {
return
}
}
if !yield(&input.Input{SameBatch: 512 - 1}) {
return
}
for range 2048 - 1024 - 1 {
if !yield(&input.Input{}) {
return
}
}
}),
expected: 1531,
},
{
name: "Same Batch Near Start",
numCtx: 2048,
numKeep: 5,
inputs: slices.Collect(func(yield func(*input.Input) bool) {
for range 10 {
if !yield(&input.Input{}) {
return
}
}
if !yield(&input.Input{SameBatch: 512 - 1}) {
return
}
for range 2048 - 10 - 1 {
if !yield(&input.Input{}) {
return
}
}
}),
expected: 1021,
},
{
name: "Consecutive Same Batch",
numCtx: 32,
inputs: slices.Collect(func(yield func(*input.Input) bool) {
for i := range 32 {
input := input.Input{}
if i%10 == 0 {
input.SameBatch = 10 - 1
}
if !yield(&input) {
return
}
}
}),
expected: 20,
},
{
name: "Overlapping Same Batch",
numCtx: 32,
inputs: slices.Collect(func(yield func(*input.Input) bool) {
for i := range 32 {
input := input.Input{}
if slices.Contains([]int{4, 8, 14}, i) {
input.SameBatch = 10 - 1
}
if !yield(&input) {
return
}
}
}),
expected: 24,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
c := InputCache{numCtx: tt.numCtx}
result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
result := c.ShiftDiscard(tt.inputs, tt.numKeep)
if result != tt.expected {
t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
t.Errorf("shiftDiscard(ctx: %v, keep: %v inputs: %v): have %v; want %v", tt.numCtx, tt.numKeep, len(tt.inputs), result, tt.expected)
}
})
}

View File

@@ -214,7 +214,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
parts = []string{prompt}
}
postTokenize := false
for i, part := range parts {
// text - tokenize
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
@@ -257,11 +256,10 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
mmStore.addMultimodal(imageEmbeddings)
inputs = append(inputs, &input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
postTokenize = true
}
}
if visionModel && postTokenize {
if visionModel {
var err error
inputs, err = multimodalProcessor.PostTokenize(inputs)
if err != nil {
@@ -948,13 +946,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{
embedding: true,
truncate: req.Truncate,
// TODO (jmorganca): this should be provided by the server via the
// request options and truncated here in the runner, instead of relying on
// the server's truncate logic
truncate: true,
})
if err != nil {
if errors.Is(err, errorInputTooLong) {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
http.Error(w, fmt.Sprintf("failed to create new sequence: %v", err), http.StatusInternalServerError)
return
}
@@ -995,8 +993,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
}
if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
Embedding: <-seq.embedding,
PromptEvalCount: seq.numPromptInputs,
Embedding: <-seq.embedding,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}

View File

@@ -21,7 +21,6 @@ import (
"os/signal"
"slices"
"strings"
"sync/atomic"
"syscall"
"time"
@@ -143,7 +142,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
// This model is much more capable with a larger context, so set that
// unless it would penalize performance too much
if !s.lowVRAM && slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
if !s.lowVRAM && slices.Contains([]string{
"gptoss", "gpt-oss",
"qwen3vl", "qwen3vlmoe",
}, model.Config.ModelFamily) {
opts.NumCtx = max(opts.NumCtx, 8192)
}
@@ -660,7 +662,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}
r, _, _, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
if err != nil {
handleScheduleError(c, req.Model, err)
return
@@ -673,12 +675,61 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}
kvData, _, err := getModelData(m.ModelPath, false)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
var count int
for i, s := range input {
tokens, err := r.Tokenize(c.Request.Context(), s)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
ctxLen := min(opts.NumCtx, int(kvData.ContextLength()))
if len(tokens) > ctxLen {
if !truncate {
c.JSON(http.StatusBadRequest, gin.H{"error": "input exceeds maximum context length"})
return
}
if bos := kvData.Uint("tokenizer.ggml.bos_token_id"); tokens[0] != int(bos) && kvData.Bool("add_bos_token", true) {
ctxLen--
}
if eos := kvData.Uint("tokenizer.ggml.eos_token_id"); tokens[len(tokens)-1] != int(eos) && kvData.Bool("add_eos_token", true) {
ctxLen--
}
slog.Info("", "ctxLen", ctxLen, "tokenCount", len(tokens))
if ctxLen <= 0 {
// return error if the truncated input would be empty or just special tokens
c.JSON(http.StatusBadRequest, gin.H{"error": "input after truncation exceeds maximum context length"})
return
}
tokens = tokens[:ctxLen]
s, err = r.Detokenize(c.Request.Context(), tokens)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
}
count += len(tokens)
input[i] = s
}
var g errgroup.Group
embeddings := make([][]float32, len(input))
var totalTokens uint64
for i, text := range input {
g.Go(func() error {
embedding, tokenCount, err := r.Embedding(c.Request.Context(), text, truncate)
embedding, err := r.Embedding(c.Request.Context(), text)
if err != nil {
return err
}
@@ -688,18 +739,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {
embedding = normalize(embedding[:req.Dimensions])
}
embeddings[i] = embedding
atomic.AddUint64(&totalTokens, uint64(tokenCount))
return nil
})
}
if err := g.Wait(); err != nil {
var serr api.StatusError
if errors.As(err, &serr) {
c.AbortWithStatusJSON(serr.StatusCode, gin.H{"error": strings.TrimSpace(serr.ErrorMessage)})
} else {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
}
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
return
}
@@ -708,7 +753,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
Embeddings: embeddings,
TotalDuration: time.Since(checkpointStart),
LoadDuration: checkpointLoaded.Sub(checkpointStart),
PromptEvalCount: int(totalTokens),
PromptEvalCount: count,
}
c.JSON(http.StatusOK, resp)
}
@@ -754,7 +799,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
return
}
embedding, _, err := r.Embedding(c.Request.Context(), req.Prompt, true)
embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
return

View File

@@ -390,11 +390,11 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
numParallel = 1
}
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// `mllama`, `qwen3vl`, and `qwen3vlmoe` are snowflakes and use an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe"}, req.model.Config.ModelFamily) && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
}
sessionDuration := envconfig.KeepAlive()

View File

@@ -780,8 +780,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn
return s.completionResp
}
func (s *mockLlm) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
return s.embeddingResp, 0, s.embeddingRespErr
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
return s.embeddingResp, s.embeddingRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {