Compare commits
11 Commits
parth/docs
...
v0.12.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c88647104d | ||
|
|
05aff4a4f1 | ||
|
|
0d140bd1af | ||
|
|
93e45f0f0d | ||
|
|
a342160803 | ||
|
|
f6c29409dc | ||
|
|
7d25b9e194 | ||
|
|
36d64fb531 | ||
|
|
d828517e78 | ||
|
|
14977a9350 | ||
|
|
29f63f37c8 |
@@ -198,6 +198,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &qwen2Model{}
|
||||
case "Qwen2_5_VLForConditionalGeneration":
|
||||
conv = &qwen25VLModel{}
|
||||
case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
|
||||
conv = &qwen3VLModel{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
case "CohereForCausalLM":
|
||||
|
||||
157
convert/convert_qwen3.go
Normal file
157
convert/convert_qwen3.go
Normal file
@@ -0,0 +1,157 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
)
|
||||
|
||||
type qwen3Model struct {
|
||||
ModelParameters
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
NumExperts uint32 `json:"num_experts"`
|
||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
||||
NormTopkProb bool `json:"norm_topk_prob"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeScaling struct {
|
||||
Type string `json:"type"`
|
||||
Factor ropeFactor `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
MropeSection []int32 `json:"mrope_section"`
|
||||
} `json:"rope_scaling"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
}
|
||||
|
||||
// KV implements ModelConverter.
|
||||
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
|
||||
arch := "qwen3"
|
||||
if q.NumExperts > 0 {
|
||||
arch += "moe"
|
||||
}
|
||||
|
||||
kv := q.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = arch
|
||||
kv["block_count"] = q.HiddenLayers
|
||||
kv["context_length"] = q.MaxPositionEmbeddings
|
||||
kv["embedding_length"] = q.HiddenSize
|
||||
kv["feed_forward_length"] = q.IntermediateSize
|
||||
kv["attention.head_count"] = q.NumAttentionHeads
|
||||
kv["attention.head_count_kv"] = q.NumKeyValueHeads
|
||||
kv["attention.key_length"] = q.HeadDim
|
||||
kv["attention.value_length"] = q.HeadDim
|
||||
|
||||
if q.NumExperts > 0 {
|
||||
kv["expert_count"] = q.NumExperts
|
||||
kv["expert_used_count"] = q.NumExpertsPerToken
|
||||
kv["norm_top_k_prob"] = q.NormTopkProb
|
||||
}
|
||||
|
||||
kv["rope.freq_base"] = q.RopeTheta
|
||||
kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
|
||||
|
||||
switch q.RopeScaling.Type {
|
||||
case "":
|
||||
// no scaling
|
||||
case "yarn":
|
||||
kv["rope.scaling.type"] = q.RopeScaling.Type
|
||||
kv["rope.scaling.factor"] = q.RopeScaling.Factor
|
||||
case "mrope", "default":
|
||||
kv["rope.mrope_section"] = q.RopeScaling.MropeSection
|
||||
default:
|
||||
panic("unknown rope scaling type")
|
||||
}
|
||||
return kv
|
||||
}
|
||||
|
||||
// Tensors implements ModelConverter.
|
||||
func (q *qwen3Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
// TODO: handle split experts
|
||||
|
||||
for _, t := range ts {
|
||||
switch {
|
||||
case strings.Contains(t.Name(), "ffn_gate_up_exps"):
|
||||
afterFunc := func(t tensor.Tensor) (tensor.Tensor, error) { return tensor.Transpose(t, 0, 2, 1) }
|
||||
for t := range splitDim(t, 2,
|
||||
split{Replacer: strings.NewReplacer("gate_up", "gate"), afterFunc: afterFunc},
|
||||
split{Replacer: strings.NewReplacer("gate_up", "up"), afterFunc: afterFunc},
|
||||
) {
|
||||
t.Shape[1], t.Shape[2] = t.Shape[2], t.Shape[1]
|
||||
out = append(out, t)
|
||||
}
|
||||
case strings.Contains(t.Name(), "ffn_down_exps"):
|
||||
shape := slices.Clone(t.Shape())
|
||||
shape[1], shape[2] = shape[2], shape[1]
|
||||
t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i := range shape {
|
||||
dims[i] = int(shape[i])
|
||||
}
|
||||
|
||||
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
tt, err := tensor.Transpose(tt, 0, 2, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// flatten tensor so it can be written as a vector
|
||||
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return native.VectorF32(tt.(*tensor.Dense))
|
||||
})
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: shape,
|
||||
WriterTo: t,
|
||||
})
|
||||
default:
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// Replacements implements ModelConverter.
|
||||
func (q *qwen3Model) Replacements() []string {
|
||||
return []string{
|
||||
"lm_head", "output",
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.layers", "blk",
|
||||
"input_layernorm", "attn_norm",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.k_norm", "attn_k_norm",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.q_norm", "attn_q_norm",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"mlp.gate.weight", "ffn_gate_inp.weight",
|
||||
"mlp.experts.down_proj", "ffn_down_exps.weight",
|
||||
"mlp.experts.gate_up_proj", "ffn_gate_up_exps.weight",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"model.norm", "output_norm",
|
||||
}
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*qwen3Model)(nil)
|
||||
116
convert/convert_qwen3vl.go
Normal file
116
convert/convert_qwen3vl.go
Normal file
@@ -0,0 +1,116 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type qwen3VLModel struct {
|
||||
qwen3Model `json:"text_config"`
|
||||
|
||||
VisionModel struct {
|
||||
Depth uint32 `json:"depth"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
NumHeads uint32 `json:"num_heads"`
|
||||
InChannels uint32 `json:"in_channels"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
||||
WindowSize uint32 `json:"window_size"`
|
||||
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
||||
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
|
||||
|
||||
Size struct {
|
||||
ShortestEdge uint32 `json:"shortest_edge"`
|
||||
LongestEdge uint32 `json:"longest_edge"`
|
||||
} `json:"size"`
|
||||
|
||||
ImageMean []float32 `json:"image_mean"`
|
||||
ImageStd []float32 `json:"image_std"`
|
||||
} `json:"vision_config"`
|
||||
}
|
||||
|
||||
func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return json.Unmarshal(bts, &m.VisionModel)
|
||||
}
|
||||
|
||||
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := m.qwen3Model.KV(t)
|
||||
|
||||
arch := "qwen3vl"
|
||||
if m.NumExperts > 0 {
|
||||
arch += "moe"
|
||||
}
|
||||
// override architecture
|
||||
kv["general.architecture"] = arch
|
||||
|
||||
kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32)
|
||||
kv["vision.embedding_length"] = m.VisionModel.HiddenSize
|
||||
kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16)
|
||||
kv["vision.num_channels"] = m.VisionModel.InChannels
|
||||
kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14)
|
||||
kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2)
|
||||
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6)
|
||||
kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4)
|
||||
kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2)
|
||||
kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes
|
||||
|
||||
kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge
|
||||
kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge
|
||||
|
||||
kv["vision.image_mean"] = m.VisionModel.ImageMean
|
||||
kv["vision.image_std"] = m.VisionModel.ImageStd
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var rest []Tensor
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
switch {
|
||||
case strings.Contains(t.Name(), "attn_qkv"):
|
||||
out = append(out, slices.Collect(splitDim(t, 0,
|
||||
split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
|
||||
split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
|
||||
split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
|
||||
))...)
|
||||
case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"):
|
||||
shape := t.Shape()
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...),
|
||||
WriterTo: t,
|
||||
})
|
||||
default:
|
||||
rest = append(rest, t)
|
||||
}
|
||||
}
|
||||
|
||||
return append(m.qwen3Model.Tensors(rest), out...)
|
||||
}
|
||||
|
||||
func (m *qwen3VLModel) Replacements() []string {
|
||||
return append(
|
||||
m.qwen3Model.Replacements(),
|
||||
"model.language_", "",
|
||||
"model.visual", "v",
|
||||
"patch_embed.proj", "patch_embed",
|
||||
"blocks", "blk",
|
||||
"attn.qkv", "attn_qkv",
|
||||
"attn.proj", "attn_out",
|
||||
"deepstack_merger_list", "deepstack_merger",
|
||||
)
|
||||
}
|
||||
@@ -19,8 +19,8 @@ type split struct {
|
||||
dim int
|
||||
slices []tensor.Slice
|
||||
|
||||
// fn is an optional function to apply to the tensor after slicing
|
||||
fn func(tensor.Tensor) (tensor.Tensor, error)
|
||||
// afterFunc is an optional function to apply to the tensor after slicing
|
||||
afterFunc func(tensor.Tensor) (tensor.Tensor, error)
|
||||
}
|
||||
|
||||
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
|
||||
@@ -54,8 +54,8 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
|
||||
|
||||
tt = tensor.Materialize(tt)
|
||||
|
||||
if split.fn != nil {
|
||||
tt, err = split.fn(tt)
|
||||
if split.afterFunc != nil {
|
||||
tt, err = split.afterFunc(tt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -432,7 +432,7 @@ func TestSplitDim(t *testing.T) {
|
||||
t.Run("split with transpose", func(t *testing.T) {
|
||||
next, stop := iter.Pull(splitDim(&r, 1,
|
||||
split{Replacer: strings.NewReplacer("a", "x")},
|
||||
split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
|
||||
split{Replacer: strings.NewReplacer("b", "y"), afterFunc: func(tt tensor.Tensor) (tensor.Tensor, error) {
|
||||
return tensor.Transpose(tt, 1, 0)
|
||||
}},
|
||||
))
|
||||
|
||||
@@ -117,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
|
||||
// In the second pass, we more deeply initialize the GPUs to weed out devices that
|
||||
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
|
||||
slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
|
||||
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
|
||||
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
var wg sync.WaitGroup
|
||||
@@ -129,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
if devices[i].Library == "Metal" {
|
||||
continue
|
||||
}
|
||||
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
|
||||
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
@@ -155,6 +155,12 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
envVar: id, // Filter to just this one GPU
|
||||
}
|
||||
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
|
||||
slog.Debug("filtering device which didn't fully initialize",
|
||||
"id", devices[i].ID,
|
||||
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
|
||||
"pci_id", devices[i].PCIID,
|
||||
"library", devices[i].Library,
|
||||
)
|
||||
needsDelete[i] = true
|
||||
} else {
|
||||
supportedMu.Lock()
|
||||
@@ -170,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
logutil.Trace("supported GPU library combinations", "supported", supported)
|
||||
logutil.Trace("supported GPU library combinations before filtering", "supported", supported)
|
||||
|
||||
filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)
|
||||
|
||||
@@ -372,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
|
||||
}
|
||||
if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] {
|
||||
needsDelete[i] = true
|
||||
slog.Debug("dropping Vulkan duplicate by PCI ID",
|
||||
"vulkan_id", devices[i].ID,
|
||||
"vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
|
||||
slog.Debug("filtering device with duplicate PCI ID",
|
||||
"id", devices[i].ID,
|
||||
"library", devices[i].Library,
|
||||
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
|
||||
"pci_id", devices[i].PCIID,
|
||||
"kept_library", devices[j].Library,
|
||||
"kept_id", devices[j].ID,
|
||||
"kept_library", devices[j].Library,
|
||||
)
|
||||
break
|
||||
}
|
||||
@@ -422,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need
|
||||
}
|
||||
for dev, i := range byLibDirs[libDir] {
|
||||
if _, found := byLibDirs[newest][dev]; found {
|
||||
slog.Debug("filtering device with overlapping libraries",
|
||||
"id", dev,
|
||||
"library", libDir,
|
||||
"delete_index", i,
|
||||
"kept_library", newest,
|
||||
)
|
||||
needsDelete[i] = true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package discover
|
||||
import (
|
||||
"log/slog"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
@@ -26,6 +27,7 @@ type CPU struct {
|
||||
}
|
||||
|
||||
func LogDetails(devices []ml.DeviceInfo) {
|
||||
sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference
|
||||
for _, dev := range devices {
|
||||
var libs []string
|
||||
for _, dir := range dev.LibraryPath {
|
||||
@@ -39,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) {
|
||||
}
|
||||
slog.Info("inference compute",
|
||||
"id", dev.ID,
|
||||
"filtered_id", dev.FilteredID,
|
||||
"library", dev.Library,
|
||||
"compute", dev.Compute(),
|
||||
"name", dev.Name,
|
||||
|
||||
1869
docs/api.md
Normal file
1869
docs/api.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
---
|
||||
title: "Introduction"
|
||||
title: Introduction
|
||||
---
|
||||
|
||||
Ollama's API allows you to run and interact with models programatically.
|
||||
@@ -44,4 +44,4 @@ Several community-maintained libraries are available for Ollama. For a full list
|
||||
|
||||
## Versioning
|
||||
|
||||
Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).
|
||||
Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases).
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
---
|
||||
title: Benchmark
|
||||
---
|
||||
|
||||
Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
|
||||
|
||||
## When to use
|
||||
|
||||
Run these benchmarks when:
|
||||
|
||||
- Making changes to the model inference engine
|
||||
- Modifying model loading/unloading logic
|
||||
- Changing prompt processing or token generation code
|
||||
- Implementing a new model architecture
|
||||
- Testing performance across different hardware setups
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
|
||||
|
||||
## Usage and Examples
|
||||
|
||||
<Note>
|
||||
All commands must be run from the root directory of the Ollama project.
|
||||
</Note>
|
||||
|
||||
Basic syntax:
|
||||
|
||||
```bash
|
||||
go test -bench=. ./benchmark/... -m $MODEL_NAME
|
||||
```
|
||||
|
||||
Required flags:
|
||||
|
||||
- `-bench=.`: Run all benchmarks
|
||||
- `-m`: Model name to benchmark
|
||||
|
||||
Optional flags:
|
||||
|
||||
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
|
||||
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
|
||||
|
||||
Common usage patterns:
|
||||
|
||||
Single benchmark run with a model specified:
|
||||
|
||||
```bash
|
||||
go test -bench=. ./benchmark/... -m llama3.3
|
||||
```
|
||||
|
||||
## Output metrics
|
||||
|
||||
The benchmark reports several key metrics:
|
||||
|
||||
- `gen_tok/s`: Generated tokens per second
|
||||
- `prompt_tok/s`: Prompt processing tokens per second
|
||||
- `ttft_ms`: Time to first token in milliseconds
|
||||
- `load_ms`: Model load time in milliseconds
|
||||
- `gen_tokens`: Total tokens generated
|
||||
- `prompt_tokens`: Total prompt tokens processed
|
||||
|
||||
Each benchmark runs two scenarios:
|
||||
|
||||
- Cold start: Model is loaded from disk for each test
|
||||
- Warm start: Model is pre-loaded in memory
|
||||
|
||||
Three prompt lengths are tested for each scenario:
|
||||
|
||||
- Short prompt (100 tokens)
|
||||
- Medium prompt (500 tokens)
|
||||
- Long prompt (1000 tokens)
|
||||
@@ -17,6 +17,7 @@ Ollama currently supports the following cloud models, with more coming soon:
|
||||
- `kimi-k2:1t-cloud`
|
||||
- `qwen3-coder:480b-cloud`
|
||||
- `glm-4.6:cloud`
|
||||
- `minimax-m2:cloud`
|
||||
|
||||
### Running Cloud models
|
||||
|
||||
|
||||
@@ -58,7 +58,11 @@
|
||||
"redirects": [
|
||||
{
|
||||
"source": "/openai",
|
||||
"destination": "/api/openai"
|
||||
"destination": "/api/openai-compatibility"
|
||||
},
|
||||
{
|
||||
"source": "/api/openai",
|
||||
"destination": "/api/openai-compatibility"
|
||||
}
|
||||
],
|
||||
"navigation": {
|
||||
|
||||
3
docs/troubleshooting.md
Normal file
3
docs/troubleshooting.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Troubleshooting
|
||||
|
||||
For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)
|
||||
@@ -242,13 +242,13 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||
return slices.Contains([]string{
|
||||
"gemma3",
|
||||
"gemma3n",
|
||||
"mistral3",
|
||||
"qwen3",
|
||||
"qwen3moe",
|
||||
"gptoss", "gpt-oss",
|
||||
"llama4",
|
||||
"mistral3",
|
||||
"mllama",
|
||||
"qwen25vl",
|
||||
"gptoss", "gpt-oss",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ By default, these tests are disabled so `go test ./...` will exercise only unit
|
||||
|
||||
The integration tests have 2 modes of operating.
|
||||
|
||||
1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
|
||||
1. By default, on Unix systems, they will start the server on a random port, run the tests, and then shutdown the server. On Windows you must ALWAYS run the server on OLLAMA_HOST for the tests to work.
|
||||
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote based on your `OLLAMA_HOST` environment variable
|
||||
|
||||
> [!IMPORTANT]
|
||||
|
||||
@@ -4,9 +4,7 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"math"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -16,6 +14,10 @@ import (
|
||||
|
||||
func dotProduct[V float32 | float64](v1, v2 []V) V {
|
||||
var result V = 0
|
||||
if len(v1) != len(v2) {
|
||||
return result
|
||||
}
|
||||
|
||||
for i := 0; i < len(v1); i++ {
|
||||
result += v1[i] * v2[i]
|
||||
}
|
||||
@@ -31,9 +33,115 @@ func magnitude[V float32 | float64](v []V) V {
|
||||
}
|
||||
|
||||
func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
|
||||
mag1 := magnitude(v1)
|
||||
mag2 := magnitude(v2)
|
||||
|
||||
if mag1 == 0 || mag2 == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
|
||||
}
|
||||
|
||||
func euclideanDistance[V float32 | float64](v1, v2 []V) V {
|
||||
if len(v1) != len(v2) {
|
||||
return V(math.Inf(1))
|
||||
}
|
||||
|
||||
var sum V = 0
|
||||
for i := 0; i < len(v1); i++ {
|
||||
diff := v1[i] - v2[i]
|
||||
sum += diff * diff
|
||||
}
|
||||
|
||||
return V(math.Sqrt(float64(sum)))
|
||||
}
|
||||
|
||||
func manhattanDistance[V float32 | float64](v1, v2 []V) V {
|
||||
if len(v1) != len(v2) {
|
||||
return V(math.Inf(1))
|
||||
}
|
||||
|
||||
var sum V = 0
|
||||
for i := 0; i < len(v1); i++ {
|
||||
sum += V(math.Abs(float64(v1[i] - v2[i])))
|
||||
}
|
||||
|
||||
return sum
|
||||
}
|
||||
|
||||
func TestEmbedCosineDistanceCorrelation(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
t.Run(model, func(t *testing.T) {
|
||||
testCases := []struct {
|
||||
a string
|
||||
b string
|
||||
c string
|
||||
}{
|
||||
{"cat", "kitten", "dog"},
|
||||
{"king", "queen", "baron"},
|
||||
{"paris", "london", "vancouver"},
|
||||
{"The cat is sleeping on the sofa", "A feline is sleeping on the couch", "Quantum physics is complex"},
|
||||
{"I love programming in python", "Coding in python brings me joy", "Pizza is delicious"},
|
||||
{"Machine learning is fascinating", "Artificial intelligence is amazing", "I need to buy groceries"},
|
||||
{"The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a sleepy dog", "The weather is warm and sunny today"},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
testEmbed := make(map[string][]float32)
|
||||
strs := []string{tc.a, tc.b, tc.c}
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: strs,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
}
|
||||
|
||||
resp, err := embedTestHelper(ctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for cnt, v := range resp.Embeddings {
|
||||
testEmbed[strs[cnt]] = v
|
||||
}
|
||||
|
||||
// Calculate cosine similarities
|
||||
cosAB := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.b])
|
||||
cosAC := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.c])
|
||||
|
||||
// Calculate distances
|
||||
distAB := euclideanDistance(testEmbed[tc.a], testEmbed[tc.b])
|
||||
distAC := euclideanDistance(testEmbed[tc.a], testEmbed[tc.c])
|
||||
|
||||
manhattanAB := manhattanDistance(testEmbed[tc.a], testEmbed[tc.b])
|
||||
manhattanAC := manhattanDistance(testEmbed[tc.a], testEmbed[tc.c])
|
||||
|
||||
// Consistency check: if cosAB > cosAC, then distances should be smaller
|
||||
if cosAB > cosAC {
|
||||
if distAB >= distAC {
|
||||
t.Errorf("Euclidean distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but distAB=%f >= distAC=%f",
|
||||
model, tc.a, tc.b, tc.c, cosAB, cosAC, distAB, distAC)
|
||||
}
|
||||
|
||||
if manhattanAB >= manhattanAC {
|
||||
t.Errorf("Manhattan distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but manhattanAB=%f >= manhattanAC=%f",
|
||||
model, tc.a, tc.b, tc.c, cosAB, cosAC, manhattanAB, manhattanAC)
|
||||
}
|
||||
} else {
|
||||
t.Errorf("Cosine Similarity inconsistency (%s): cosinSim(%s, %s) < cosinSim(%s, %s)",
|
||||
model, tc.a, tc.b, tc.a, tc.c)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllMiniLMEmbeddings(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
@@ -301,216 +409,3 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req
|
||||
|
||||
return client.Embed(ctx, &req)
|
||||
}
|
||||
|
||||
func TestEmbedTruncation(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
t.Run("single input token count", func(t *testing.T) {
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: "why is the sky blue?",
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(ctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount <= 0 {
|
||||
t.Fatalf("expected positive token count, got %d", res.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("batch parallel token counting", func(t *testing.T) {
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: []string{"cat", "dog and mouse", "bird"},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(ctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 3 {
|
||||
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if res.PromptEvalCount <= 0 {
|
||||
t.Fatalf("expected positive token count, got %d", res.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("truncation single input", func(t *testing.T) {
|
||||
truncTrue := true
|
||||
longInput := strings.Repeat("word ", 100)
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: longInput,
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 50},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(ctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount > 50 {
|
||||
t.Fatalf("expected tokens <= 50 after truncation, got %d", res.PromptEvalCount)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount == 0 {
|
||||
t.Fatal("expected non-zero token count after truncation")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("truncation batch", func(t *testing.T) {
|
||||
truncTrue := true
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: []string{"short", strings.Repeat("long ", 100), "medium text"},
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 30},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(ctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 3 {
|
||||
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if res.PromptEvalCount > 90 {
|
||||
t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("truncate false error", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: strings.Repeat("word ", 100),
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(ctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when truncate=false with long input")
|
||||
}
|
||||
|
||||
if !strings.Contains(err.Error(), "exceeds maximum context length") {
|
||||
t.Fatalf("expected context length error, got: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("runner token count accuracy", func(t *testing.T) {
|
||||
baseline := api.EmbedRequest{Model: "all-minilm", Input: "test"}
|
||||
baseRes, err := embedTestHelper(ctx, client, t, baseline)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
batch := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: []string{"test", "test", "test"},
|
||||
}
|
||||
batchRes, err := embedTestHelper(ctx, client, t, batch)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expectedCount := baseRes.PromptEvalCount * 3
|
||||
if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
|
||||
t.Fatalf("expected ~%d tokens (3 × %d), got %d",
|
||||
expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestEmbedStatusCode tests that errors from the embedding endpoint
|
||||
// properly preserve their HTTP status codes when returned to the client.
|
||||
// This test specifically checks the error handling path in EmbedHandler
|
||||
// where api.StatusError errors should maintain their original status code.
|
||||
func TestEmbedStatusCode(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
// Pull the model if needed
|
||||
if err := PullIfMissing(ctx, client, "all-minilm"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
t.Run("truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
longInput := strings.Repeat("word ", 100)
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: longInput,
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(ctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when truncate=false with long input")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error (likely 400 Bad Request)
|
||||
// not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
|
||||
// Verify the error message is meaningful
|
||||
if !strings.Contains(err.Error(), "context length") {
|
||||
t.Errorf("expected error message to mention context length, got: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("batch truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: []string{
|
||||
"short input",
|
||||
strings.Repeat("very long input ", 100),
|
||||
"another short input",
|
||||
},
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(ctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when one input exceeds context with truncate=false")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error, not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -26,6 +26,13 @@ func TestVisionModels(t *testing.T) {
|
||||
{
|
||||
model: "gemma3",
|
||||
},
|
||||
{
|
||||
model: "qwen3-vl:8b",
|
||||
},
|
||||
{
|
||||
// Qwen 3 VL mixture of experts
|
||||
model: "qwen3-vl:30b",
|
||||
},
|
||||
}
|
||||
|
||||
for _, v := range testCases {
|
||||
|
||||
2
integration/testdata/embed.json
vendored
2
integration/testdata/embed.json
vendored
File diff suppressed because one or more lines are too long
@@ -248,12 +248,14 @@ var (
|
||||
"zephyr",
|
||||
}
|
||||
libraryEmbedModels = []string{
|
||||
"qwen3-embedding",
|
||||
"embeddinggemma",
|
||||
"nomic-embed-text",
|
||||
"all-minilm",
|
||||
"bge-large",
|
||||
"bge-m3",
|
||||
"granite-embedding",
|
||||
"mxbai-embed-large",
|
||||
"nomic-embed-text",
|
||||
"paraphrase-multilingual",
|
||||
"snowflake-arctic-embed",
|
||||
"snowflake-arctic-embed2",
|
||||
@@ -321,7 +323,7 @@ func GetTestEndpoint() (*api.Client, string) {
|
||||
}
|
||||
}
|
||||
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && port == defaultPort {
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" && port == defaultPort {
|
||||
port = FindPort()
|
||||
}
|
||||
|
||||
@@ -335,15 +337,20 @@ func GetTestEndpoint() (*api.Client, string) {
|
||||
http.DefaultClient), fmt.Sprintf("%s:%s", host, port)
|
||||
}
|
||||
|
||||
var serverMutex sync.Mutex
|
||||
var serverReady bool
|
||||
var serverLogFile string
|
||||
// Server lifecycle management
|
||||
var (
|
||||
serverMutex sync.Mutex
|
||||
serverReady bool
|
||||
serverLog bytes.Buffer
|
||||
serverDone chan int
|
||||
serverCmd *exec.Cmd
|
||||
)
|
||||
|
||||
func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
|
||||
// Make sure the server has been built
|
||||
CLIName, err := filepath.Abs("../ollama")
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to get absolute path: %w", err)
|
||||
}
|
||||
|
||||
if runtime.GOOS == "windows" {
|
||||
@@ -351,72 +358,42 @@ func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
|
||||
}
|
||||
_, err = os.Stat(CLIName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("CLI missing, did you forget to build first? %w", err)
|
||||
return fmt.Errorf("CLI missing, did you forget to 'go build .' first? %w", err)
|
||||
}
|
||||
serverMutex.Lock()
|
||||
defer serverMutex.Unlock()
|
||||
if serverReady {
|
||||
return nil
|
||||
}
|
||||
serverDone = make(chan int)
|
||||
serverLog.Reset()
|
||||
|
||||
if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
|
||||
slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
|
||||
t.Setenv("OLLAMA_HOST", ollamaHost)
|
||||
}
|
||||
|
||||
logDir := t.TempDir()
|
||||
slog.Info("starting server", "url", ollamaHost)
|
||||
done, err := SpawnServer(ctx, "../ollama", logDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start server: %w", err)
|
||||
}
|
||||
|
||||
serverCmd = exec.Command(CLIName, "serve")
|
||||
serverCmd.Stderr = &serverLog
|
||||
serverCmd.Stdout = &serverLog
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
serverMutex.Lock()
|
||||
defer serverMutex.Unlock()
|
||||
exitCode := <-done
|
||||
if exitCode > 0 {
|
||||
slog.Warn("server failure", "exit", exitCode)
|
||||
}
|
||||
serverReady = false
|
||||
}()
|
||||
|
||||
// TODO wait only long enough for the server to be responsive...
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
serverReady = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func SpawnServer(ctx context.Context, command, logDir string) (chan int, error) {
|
||||
done := make(chan int)
|
||||
fp, err := os.CreateTemp(logDir, "ollama-server-*.log")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create log file: %w", err)
|
||||
}
|
||||
serverLogFile = fp.Name()
|
||||
|
||||
cmd := exec.CommandContext(ctx, command, "serve")
|
||||
cmd.Stderr = fp
|
||||
cmd.Stdout = fp
|
||||
|
||||
go func() {
|
||||
slog.Info("starting server...")
|
||||
if err := cmd.Run(); err != nil {
|
||||
// "signal: killed" expected
|
||||
slog.Info("starting server", "url", ollamaHost)
|
||||
if err := serverCmd.Run(); err != nil {
|
||||
// "signal: killed" expected during normal shutdown
|
||||
if !strings.Contains(err.Error(), "signal") {
|
||||
slog.Info("failed to run server", "error", err)
|
||||
}
|
||||
}
|
||||
var code int
|
||||
if cmd.ProcessState != nil {
|
||||
code = cmd.ProcessState.ExitCode()
|
||||
if serverCmd.ProcessState != nil {
|
||||
code = serverCmd.ProcessState.ExitCode()
|
||||
}
|
||||
slog.Info("server exited")
|
||||
done <- code
|
||||
serverDone <- code
|
||||
}()
|
||||
return done, nil
|
||||
|
||||
serverReady = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func PullIfMissing(ctx context.Context, client *api.Client, modelName string) error {
|
||||
@@ -477,52 +454,65 @@ var serverProcMutex sync.Mutex
|
||||
// Starts the server if needed
|
||||
func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, string, func()) {
|
||||
client, testEndpoint := GetTestEndpoint()
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
|
||||
serverProcMutex.Lock()
|
||||
if err := startServer(t, ctx, testEndpoint); err != nil {
|
||||
cleanup := func() {}
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" {
|
||||
var err error
|
||||
err = startServer(t, ctx, testEndpoint)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
cleanup = func() {
|
||||
serverMutex.Lock()
|
||||
defer serverMutex.Unlock()
|
||||
serverReady = false
|
||||
|
||||
slog.Info("shutting down server")
|
||||
serverCmd.Process.Signal(os.Interrupt)
|
||||
slog.Info("waiting for server to exit")
|
||||
<-serverDone
|
||||
slog.Info("terminate complete")
|
||||
|
||||
if t.Failed() {
|
||||
slog.Warn("SERVER LOG FOLLOWS")
|
||||
io.Copy(os.Stderr, &serverLog)
|
||||
slog.Warn("END OF SERVER")
|
||||
}
|
||||
slog.Info("cleanup complete", "failed", t.Failed())
|
||||
}
|
||||
}
|
||||
// Make sure server is online and healthy before returning
|
||||
listCtx, cancel := context.WithDeadlineCause(
|
||||
ctx,
|
||||
time.Now().Add(120*time.Second),
|
||||
fmt.Errorf("list models took too long"),
|
||||
)
|
||||
defer cancel()
|
||||
models, err := client.ListRunning(listCtx)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(models.Models) > 0 {
|
||||
names := make([]string, len(models.Models))
|
||||
for i, m := range models.Models {
|
||||
names[i] = m.Name
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
t.Fatalf("context done before server ready: %v", ctx.Err())
|
||||
break
|
||||
default:
|
||||
}
|
||||
slog.Info("currently loaded", "models", names)
|
||||
listCtx, cancel := context.WithDeadlineCause(
|
||||
ctx,
|
||||
time.Now().Add(10*time.Second),
|
||||
fmt.Errorf("list models took too long"),
|
||||
)
|
||||
defer cancel()
|
||||
models, err := client.ListRunning(listCtx)
|
||||
if err != nil {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Fatalf("did you forget to start the server: %v", err)
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
continue
|
||||
}
|
||||
if len(models.Models) > 0 {
|
||||
names := make([]string, len(models.Models))
|
||||
for i, m := range models.Models {
|
||||
names[i] = m.Name
|
||||
}
|
||||
slog.Info("currently loaded", "models", names)
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
return client, testEndpoint, func() {
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
|
||||
defer serverProcMutex.Unlock()
|
||||
if t.Failed() {
|
||||
fp, err := os.Open(serverLogFile)
|
||||
if err != nil {
|
||||
slog.Error("failed to open server log", "logfile", serverLogFile, "error", err)
|
||||
return
|
||||
}
|
||||
defer fp.Close()
|
||||
data, err := io.ReadAll(fp)
|
||||
if err != nil {
|
||||
slog.Error("failed to read server log", "logfile", serverLogFile, "error", err)
|
||||
return
|
||||
}
|
||||
slog.Warn("SERVER LOG FOLLOWS")
|
||||
os.Stderr.Write(data)
|
||||
slog.Warn("END OF SERVER")
|
||||
}
|
||||
}
|
||||
}
|
||||
return client, testEndpoint, cleanup
|
||||
}
|
||||
|
||||
func ChatTestHelper(ctx context.Context, t *testing.T, req api.ChatRequest, anyResp []string) {
|
||||
|
||||
@@ -5,24 +5,33 @@ Subject: [PATCH] GPU discovery enhancements
|
||||
|
||||
Expose more information about the devices through backend props, and leverage
|
||||
management libraries for more accurate VRAM usage reporting if available.
|
||||
|
||||
vulkan: get GPU ID (ollama v0.11.5)
|
||||
|
||||
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
|
||||
|
||||
Vulkan PCI and Memory
|
||||
|
||||
fix vulkan PCI ID and ID handling
|
||||
---
|
||||
ggml/include/ggml-backend.h | 11 +
|
||||
ggml/src/CMakeLists.txt | 2 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 74 +++++
|
||||
ggml/src/ggml-cuda/vendors/hip.h | 3 +
|
||||
ggml/src/ggml-impl.h | 8 +
|
||||
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
|
||||
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
|
||||
ggml/src/mem_nvml.cpp | 209 ++++++++++++++
|
||||
8 files changed, 758 insertions(+)
|
||||
ggml/include/ggml-backend.h | 8 +
|
||||
ggml/src/CMakeLists.txt | 2 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 65 ++++
|
||||
ggml/src/ggml-cuda/vendors/hip.h | 3 +
|
||||
ggml/src/ggml-impl.h | 8 +
|
||||
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++--
|
||||
ggml/src/mem_hip.cpp | 452 +++++++++++++++++++++++++++
|
||||
ggml/src/mem_nvml.cpp | 209 +++++++++++++
|
||||
9 files changed, 931 insertions(+), 30 deletions(-)
|
||||
create mode 100644 ggml/src/mem_hip.cpp
|
||||
create mode 100644 ggml/src/mem_nvml.cpp
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index ba181d09d..094fc3c82 100644
|
||||
index ba181d09d..809835243 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -169,6 +169,17 @@ extern "C" {
|
||||
@@ -169,6 +169,14 @@ extern "C" {
|
||||
const char * device_id;
|
||||
// device capabilities
|
||||
struct ggml_backend_dev_caps caps;
|
||||
@@ -31,9 +40,6 @@ index ba181d09d..094fc3c82 100644
|
||||
+ int compute_major;
|
||||
+ int compute_minor;
|
||||
+ int integrated;
|
||||
+ int pci_bus_id;
|
||||
+ int pci_device_id;
|
||||
+ int pci_domain_id;
|
||||
+ const char *library;
|
||||
+ // number with which the devices are accessed (Vulkan)
|
||||
+ const char *numeric_id;
|
||||
@@ -54,7 +60,7 @@ index 0609c6503..aefe43bdd 100644
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 87c6c34a4..816597d2f 100644
|
||||
index 87c6c34a4..b075a18be 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -86,7 +92,7 @@ index 87c6c34a4..816597d2f 100644
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
@@ -3484,6 +3499,14 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
std::string id;
|
||||
@@ -95,22 +101,19 @@ index 87c6c34a4..816597d2f 100644
|
||||
+ int driver_major;
|
||||
+ int driver_minor;
|
||||
+ int integrated;
|
||||
+ int pciBusID;
|
||||
+ int pciDeviceID;
|
||||
+ int pciDomainID;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -3504,6 +3527,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
+
|
||||
+#if defined(GGML_USE_HIP)
|
||||
+ if (ggml_hip_mgmt_init() == 0) {
|
||||
+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
|
||||
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
|
||||
+ if (status == 0) {
|
||||
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
+ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
|
||||
+ ggml_hip_mgmt_release();
|
||||
+ return;
|
||||
+ }
|
||||
@@ -120,7 +123,7 @@ index 87c6c34a4..816597d2f 100644
|
||||
+ if (ggml_nvml_init() == 0) {
|
||||
+ int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
|
||||
+ if (status == 0) {
|
||||
+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
+ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
|
||||
+ ggml_nvml_release();
|
||||
+ return;
|
||||
+ }
|
||||
@@ -130,7 +133,7 @@ index 87c6c34a4..816597d2f 100644
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
@@ -3512,6 +3557,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
@@ -138,7 +141,7 @@ index 87c6c34a4..816597d2f 100644
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
|
||||
@@ -3525,6 +3571,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
|
||||
props->memory_total = props->memory_free = 0;
|
||||
|
||||
@@ -153,15 +156,12 @@ index 87c6c34a4..816597d2f 100644
|
||||
+ props->driver_major = ctx->driver_major;
|
||||
+ props->driver_minor = ctx->driver_minor;
|
||||
+ props->integrated = ctx->integrated;
|
||||
+ props->pci_bus_id = ctx->pciBusID;
|
||||
+ props->pci_device_id = ctx->pciDeviceID;
|
||||
+ props->pci_domain_id = ctx->pciDomainID;
|
||||
+ props->library = GGML_CUDA_NAME;
|
||||
+
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
bool events = false;
|
||||
@@ -4087,6 +4149,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
@@ -169,7 +169,7 @@ index 87c6c34a4..816597d2f 100644
|
||||
|
||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||
@@ -4102,6 +4165,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
|
||||
@@ -181,9 +181,6 @@ index 87c6c34a4..816597d2f 100644
|
||||
+ dev_ctx->driver_major = driverVersion / 1000;
|
||||
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
|
||||
+ dev_ctx->integrated = prop.integrated;
|
||||
+ dev_ctx->pciBusID = prop.pciBusID;
|
||||
+ dev_ctx->pciDeviceID = prop.pciDeviceID;
|
||||
+ dev_ctx->pciDomainID = prop.pciDomainID;
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
@@ -209,7 +206,7 @@ index 1f06be80e..2f9ef2dc0 100644
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
|
||||
index d0fb3bcca..80597b6ea 100644
|
||||
index d0fb3bcca..b63edd0c1 100644
|
||||
--- a/ggml/src/ggml-impl.h
|
||||
+++ b/ggml/src/ggml-impl.h
|
||||
@@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
|
||||
@@ -221,7 +218,7 @@ index d0fb3bcca..80597b6ea 100644
|
||||
+GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
|
||||
+GGML_API void ggml_nvml_release();
|
||||
+GGML_API int ggml_hip_mgmt_init();
|
||||
+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
|
||||
+GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
|
||||
+GGML_API void ggml_hip_mgmt_release();
|
||||
+
|
||||
#ifdef __cplusplus
|
||||
@@ -247,12 +244,319 @@ index f2ff9f322..f356e4a0a 100644
|
||||
props->caps = {
|
||||
/* .async = */ true,
|
||||
/* .host_buffer = */ false,
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index ed83236f4..0bbcecd01 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -231,6 +231,7 @@ class vk_memory_logger;
|
||||
#endif
|
||||
class vk_perf_logger;
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
+static std::string ggml_vk_get_device_id(int device);
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
static constexpr uint32_t p021_max_gqa_ratio = 8;
|
||||
@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
snprintf(description, description_size, "%s", props.deviceName.data());
|
||||
}
|
||||
|
||||
+static std::string ggml_vk_get_device_id(int device) {
|
||||
+ ggml_vk_instance_init();
|
||||
+
|
||||
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
+
|
||||
+ vk::PhysicalDeviceProperties2 props;
|
||||
+ vk::PhysicalDeviceIDProperties deviceIDProps;
|
||||
+ props.pNext = &deviceIDProps;
|
||||
+ devices[device].getProperties2(&props);
|
||||
+
|
||||
+ const auto& uuid = deviceIDProps.deviceUUID;
|
||||
+ char id[64];
|
||||
+ snprintf(id, sizeof(id),
|
||||
+ "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
+ uuid[0], uuid[1], uuid[2], uuid[3],
|
||||
+ uuid[4], uuid[5],
|
||||
+ uuid[6], uuid[7],
|
||||
+ uuid[8], uuid[9],
|
||||
+ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
|
||||
+ );
|
||||
+ return std::string(id);
|
||||
+}
|
||||
+
|
||||
// backend interface
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
+std::string ggml_backend_vk_get_device_id(int device) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
|
||||
+ int dev_idx = vk_instance.device_indices[device];
|
||||
+ return ggml_vk_get_device_id(dev_idx);
|
||||
+}
|
||||
+
|
||||
+//////////////////////////
|
||||
+
|
||||
+struct ggml_backend_vk_device_context {
|
||||
+ size_t device;
|
||||
+ std::string name;
|
||||
+ std::string description;
|
||||
+ bool is_integrated_gpu;
|
||||
+ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
|
||||
+ std::string pci_id;
|
||||
+ std::string id;
|
||||
+ std::string uuid;
|
||||
+ std::string numeric_id;
|
||||
+ int major;
|
||||
+ int minor;
|
||||
+ int driver_major;
|
||||
+ int driver_minor;
|
||||
+};
|
||||
+
|
||||
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
|
||||
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
|
||||
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
|
||||
+
|
||||
+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
|
||||
|
||||
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
|
||||
- vk::PhysicalDeviceMemoryProperties2 memprops = {};
|
||||
- bool membudget_supported = vk_instance.device_supports_membudget[device];
|
||||
+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
+ vk::PhysicalDeviceProperties2 props2;
|
||||
+ vkdev.getProperties2(&props2);
|
||||
|
||||
- if (membudget_supported) {
|
||||
- memprops.pNext = &budgetprops;
|
||||
+ if (!ctx->is_integrated_gpu)
|
||||
+ {
|
||||
+ // Use vendor specific management libraries for best VRAM reporting if available
|
||||
+ switch (props2.properties.vendorID) {
|
||||
+ case VK_VENDOR_ID_AMD:
|
||||
+ if (ggml_hip_mgmt_init() == 0) {
|
||||
+ int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
|
||||
+ if (status == 0) {
|
||||
+ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
|
||||
+ ggml_hip_mgmt_release();
|
||||
+ return;
|
||||
+ }
|
||||
+ ggml_hip_mgmt_release();
|
||||
+ }
|
||||
+ break;
|
||||
+ case VK_VENDOR_ID_NVIDIA:
|
||||
+ if (ggml_nvml_init() == 0) {
|
||||
+ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
|
||||
+ if (status == 0) {
|
||||
+ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
|
||||
+ ggml_nvml_release();
|
||||
+ return;
|
||||
+ }
|
||||
+ ggml_nvml_release();
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
}
|
||||
- vkdev.getMemoryProperties2(&memprops);
|
||||
+ // else fallback to memory budget if supported
|
||||
|
||||
- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
|
||||
- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
|
||||
+ *total = 0;
|
||||
+ *free = 0;
|
||||
+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
|
||||
+ vk::PhysicalDeviceMemoryProperties2 memprops2;
|
||||
+ memprops2.pNext = &mem_budget_props;
|
||||
+ vkdev.getMemoryProperties2(&memprops2);
|
||||
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
|
||||
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
|
||||
+ } else if (ctx->is_integrated_gpu) {
|
||||
+ // Include shared memory on iGPUs
|
||||
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
|
||||
+ }
|
||||
+ }
|
||||
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
|
||||
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
+ *free += mem_budget_props.heapBudget[i];
|
||||
+ } else if (ctx->is_integrated_gpu) {
|
||||
+ *free += mem_budget_props.heapBudget[i];
|
||||
+ }
|
||||
+ }
|
||||
+ if (*total > 0 && *free > 0) {
|
||||
+ return;
|
||||
+ } else if (*total > 0) {
|
||||
+ *free = *total;
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
+ // else just report the physical memory
|
||||
+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
-
|
||||
- if (membudget_supported && i < budgetprops.heapUsage.size()) {
|
||||
- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
|
||||
- } else {
|
||||
- *free = heap.size;
|
||||
- }
|
||||
+ *free = heap.size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
}
|
||||
}
|
||||
|
||||
+ vk::PhysicalDeviceProperties2 props2;
|
||||
if (!ext_support) {
|
||||
- return "";
|
||||
+ device.getProperties2(&props2);
|
||||
+ if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
|
||||
+ return "";
|
||||
+ }
|
||||
+ // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
+ if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
|
||||
+ return "";
|
||||
+ }
|
||||
|
||||
return std::string(pci_bus_id);
|
||||
}
|
||||
|
||||
-//////////////////////////
|
||||
-
|
||||
-struct ggml_backend_vk_device_context {
|
||||
- size_t device;
|
||||
- std::string name;
|
||||
- std::string description;
|
||||
- bool is_integrated_gpu;
|
||||
- std::string pci_bus_id;
|
||||
-};
|
||||
+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
|
||||
+ if (id.empty()) return false;
|
||||
+ unsigned int d = 0, b = 0, dev = 0, func = 0;
|
||||
+ // Expected format: dddd:bb:dd.f (all hex)
|
||||
+ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
|
||||
+ if (n < 4) return false;
|
||||
+ if (domain) *domain = (int) d;
|
||||
+ if (bus) *bus = (int) b;
|
||||
+ if (device) *device = (int) dev;
|
||||
+ return true;
|
||||
+}
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
|
||||
+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
+ return ctx->id.c_str();
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||
- ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||
+ ggml_backend_vk_get_device_memory(ctx, free, total);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
+ props->id = ggml_backend_vk_device_get_id(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
+ props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
+
|
||||
+ props->compute_major = ctx->major;
|
||||
+ props->compute_minor = ctx->minor;
|
||||
+ props->driver_major = ctx->driver_major;
|
||||
+ props->driver_minor = ctx->driver_minor;
|
||||
+ props->integrated = ctx->is_integrated_gpu;
|
||||
+ props->library = GGML_VK_NAME;
|
||||
+ props->numeric_id = ctx->numeric_id.c_str();
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
+ std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
+
|
||||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
+ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
+ ctx->id = ggml_backend_vk_get_device_id(i);
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
+
|
||||
+ // Gather additional information about the device
|
||||
+ int dev_idx = vk_instance.device_indices[i];
|
||||
+ vk::PhysicalDeviceProperties props1;
|
||||
+ vk_devices[dev_idx].getProperties(&props1);
|
||||
+ vk::PhysicalDeviceProperties2 props2;
|
||||
+ vk::PhysicalDeviceIDProperties device_id_props;
|
||||
+ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props;
|
||||
+ vk::PhysicalDeviceDriverProperties driver_props;
|
||||
+ props2.pNext = &device_id_props;
|
||||
+ device_id_props.pNext = &pci_bus_props;
|
||||
+ pci_bus_props.pNext = &driver_props;
|
||||
+ vk_devices[dev_idx].getProperties2(&props2);
|
||||
+ std::ostringstream oss;
|
||||
+ oss << std::hex << std::setfill('0');
|
||||
+ int byteIdx = 0;
|
||||
+ for (int i = 0; i < 16; ++i, ++byteIdx) {
|
||||
+ oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
|
||||
+ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
|
||||
+ oss << '-';
|
||||
+ }
|
||||
+ }
|
||||
+ ctx->uuid = oss.str();
|
||||
+ ctx->major = 0;
|
||||
+ ctx->minor = 0;
|
||||
+ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
|
||||
+ ctx->driver_major = 0;
|
||||
+ ctx->driver_minor = 0;
|
||||
+ ctx->numeric_id = std::to_string(i);
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
|
||||
new file mode 100644
|
||||
index 000000000..8ef19b8cf
|
||||
index 000000000..5a7f5d465
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/mem_hip.cpp
|
||||
@@ -0,0 +1,449 @@
|
||||
@@ -0,0 +1,452 @@
|
||||
+#include "ggml.h"
|
||||
+
|
||||
+#ifdef _WIN32
|
||||
@@ -586,7 +890,7 @@ index 000000000..8ef19b8cf
|
||||
+ if (gpus != NULL) gpus->pVtbl->Release(gpus); \
|
||||
+ if (gpu != NULL) gpu->pVtbl->Release(gpu)
|
||||
+
|
||||
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
|
||||
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
|
||||
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
|
||||
+ if (adlx.handle == NULL) {
|
||||
+ GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
|
||||
@@ -598,9 +902,13 @@ index 000000000..8ef19b8cf
|
||||
+ IADLXGPU* gpu = NULL;
|
||||
+ IADLXGPUMetrics *gpuMetrics = NULL;
|
||||
+ ADLX_RESULT status;
|
||||
+ // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
|
||||
+ adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
|
||||
+
|
||||
+ uint32_t pci_domain, pci_bus, pci_device, pci_function;
|
||||
+ if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
|
||||
+ // TODO - parse other formats?
|
||||
+ GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
|
||||
+ return ADLX_NOT_FOUND;
|
||||
+ }
|
||||
+ status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
|
||||
+ if (ADLX_FAILED(status)) {
|
||||
+ GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
|
||||
@@ -623,16 +931,15 @@ index 000000000..8ef19b8cf
|
||||
+ GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
|
||||
+ continue;
|
||||
+ }
|
||||
+ adlx_int id;
|
||||
+ status = gpu->pVtbl->UniqueId(gpu, &id);
|
||||
+ adlx_int uniqueID;
|
||||
+ status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
|
||||
+ if (ADLX_FAILED(status)) {
|
||||
+ GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
|
||||
+ gpu->pVtbl->Release(gpu);
|
||||
+ gpu = NULL;
|
||||
+ continue;
|
||||
+ }
|
||||
+ if (id != target) {
|
||||
+ GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
|
||||
+ if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
|
||||
+ gpu->pVtbl->Release(gpu);
|
||||
+ gpu = NULL;
|
||||
+ continue;
|
||||
@@ -695,7 +1002,7 @@ index 000000000..8ef19b8cf
|
||||
+ return -1;
|
||||
+}
|
||||
+void ggml_hip_mgmt_release() {}
|
||||
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
|
||||
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
|
||||
+ return -1;
|
||||
+}
|
||||
+
|
||||
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs
|
||||
1 file changed, 68 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
|
||||
index c9073cef..f473a2a2 100644
|
||||
index c9073cef0..f473a2a2c 100644
|
||||
--- a/ggml/src/mem_nvml.cpp
|
||||
+++ b/ggml/src/mem_nvml.cpp
|
||||
@@ -13,6 +13,7 @@
|
||||
@@ -1,95 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Xiaodong Ye <xiaodong.ye@mthreads.com>
|
||||
Date: Mon, 18 Aug 2025 12:48:07 +0800
|
||||
Subject: [PATCH] vulkan: get GPU ID (ollama v0.11.5)
|
||||
|
||||
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
|
||||
---
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 ++++++++++++++++++++++++++++
|
||||
1 file changed, 37 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 061cd078..adea7783 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -11588,6 +11588,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
snprintf(description, description_size, "%s", props.deviceName.data());
|
||||
}
|
||||
|
||||
+static std::string ggml_vk_get_device_id(int device) {
|
||||
+ ggml_vk_instance_init();
|
||||
+
|
||||
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
+
|
||||
+ vk::PhysicalDeviceProperties2 props;
|
||||
+ vk::PhysicalDeviceIDProperties deviceIDProps;
|
||||
+ props.pNext = &deviceIDProps;
|
||||
+ devices[device].getProperties2(&props);
|
||||
+
|
||||
+ const auto& uuid = deviceIDProps.deviceUUID;
|
||||
+ char id[64];
|
||||
+ snprintf(id, sizeof(id),
|
||||
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
+ uuid[0], uuid[1], uuid[2], uuid[3],
|
||||
+ uuid[4], uuid[5],
|
||||
+ uuid[6], uuid[7],
|
||||
+ uuid[8], uuid[9],
|
||||
+ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
|
||||
+ );
|
||||
+ return std::string(id);
|
||||
+}
|
||||
+
|
||||
// backend interface
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
@@ -12394,6 +12417,12 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
+std::string ggml_backend_vk_get_device_id(int device) {
|
||||
+ GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
+ int dev_idx = vk_instance.device_indices[device];
|
||||
+ return ggml_vk_get_device_id(dev_idx);
|
||||
+}
|
||||
+
|
||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
|
||||
@@ -12481,6 +12510,7 @@ struct ggml_backend_vk_device_context {
|
||||
std::string description;
|
||||
bool is_integrated_gpu;
|
||||
std::string pci_bus_id;
|
||||
+ std::string id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -12493,6 +12523,11 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
|
||||
+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
+ return ctx->id.c_str();
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||
ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||
@@ -12519,6 +12554,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
+ props->id = ggml_backend_vk_device_get_id(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
@@ -12965,6 +13001,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
+ ctx->id = ggml_backend_vk_get_device_id(i);
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
--
|
||||
2.51.0
|
||||
@@ -28,7 +28,7 @@ Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
|
||||
1 file changed, 9 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 6a278b5e9..87941f872 100644
|
||||
index b075a18be..d62f412d6 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -1,254 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Hiltgen <daniel@ollama.com>
|
||||
Date: Fri Sep 5 08:25:03 2025 -0700
|
||||
Subject: [PATCH] Vulkan PCI and Memory
|
||||
|
||||
---
|
||||
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 176 ++++++++++++++++++++++-----
|
||||
1 file changed, 145 insertions(+), 31 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index adea7783..fb7204ce 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -12423,31 +12423,99 @@ std::string ggml_backend_vk_get_device_id(int device) {
|
||||
return ggml_vk_get_device_id(dev_idx);
|
||||
}
|
||||
|
||||
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
- GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
|
||||
+//////////////////////////
|
||||
+
|
||||
+struct ggml_backend_vk_device_context {
|
||||
+ size_t device;
|
||||
+ std::string name;
|
||||
+ std::string description;
|
||||
+ bool is_integrated_gpu;
|
||||
+ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
|
||||
+ std::string pci_id;
|
||||
+ std::string id;
|
||||
+ std::string uuid;
|
||||
+ int major;
|
||||
+ int minor;
|
||||
+ int driver_major;
|
||||
+ int driver_minor;
|
||||
+ int pci_bus_id;
|
||||
+ int pci_device_id;
|
||||
+ int pci_domain_id;
|
||||
+};
|
||||
+
|
||||
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
|
||||
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
|
||||
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
|
||||
+
|
||||
+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
|
||||
|
||||
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
|
||||
- vk::PhysicalDeviceMemoryProperties2 memprops = {};
|
||||
- bool membudget_supported = vk_instance.device_supports_membudget[device];
|
||||
+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
+ vk::PhysicalDeviceProperties2 props2;
|
||||
+ vkdev.getProperties2(&props2);
|
||||
|
||||
- if (membudget_supported) {
|
||||
- memprops.pNext = &budgetprops;
|
||||
+ if (!ctx->is_integrated_gpu)
|
||||
+ {
|
||||
+ // Use vendor specific management libraries for best VRAM reporting if available
|
||||
+ switch (props2.properties.vendorID) {
|
||||
+ case VK_VENDOR_ID_AMD:
|
||||
+ if (ggml_hip_mgmt_init() == 0) {
|
||||
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
|
||||
+ if (status == 0) {
|
||||
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
+ ggml_hip_mgmt_release();
|
||||
+ return;
|
||||
+ }
|
||||
+ ggml_hip_mgmt_release();
|
||||
+ }
|
||||
+ break;
|
||||
+ case VK_VENDOR_ID_NVIDIA:
|
||||
+ if (ggml_nvml_init() == 0) {
|
||||
+ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
|
||||
+ if (status == 0) {
|
||||
+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
+ ggml_nvml_release();
|
||||
+ return;
|
||||
+ }
|
||||
+ ggml_nvml_release();
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
}
|
||||
- vkdev.getMemoryProperties2(&memprops);
|
||||
+ // else fallback to memory budget if supported
|
||||
|
||||
- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
|
||||
- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
|
||||
+ *total = 0;
|
||||
+ *free = 0;
|
||||
+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
|
||||
+ vk::PhysicalDeviceMemoryProperties2 memprops2;
|
||||
+ memprops2.pNext = &mem_budget_props;
|
||||
+ vkdev.getMemoryProperties2(&memprops2);
|
||||
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
|
||||
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
|
||||
+ } else if (ctx->is_integrated_gpu) {
|
||||
+ // Include shared memory on iGPUs
|
||||
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
|
||||
+ }
|
||||
+ }
|
||||
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
|
||||
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
+ *free += mem_budget_props.heapBudget[i];
|
||||
+ } else if (ctx->is_integrated_gpu) {
|
||||
+ *free += mem_budget_props.heapBudget[i];
|
||||
+ }
|
||||
+ }
|
||||
+ if (*total > 0 && *free > 0) {
|
||||
+ return;
|
||||
+ } else if (*total > 0) {
|
||||
+ *free = *total;
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
+ // else just report the physical memory
|
||||
+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
-
|
||||
- if (membudget_supported && i < budgetprops.heapUsage.size()) {
|
||||
- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
|
||||
- } else {
|
||||
- *free = heap.size;
|
||||
- }
|
||||
+ *free = heap.size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -12502,16 +12570,17 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
return std::string(pci_bus_id);
|
||||
}
|
||||
|
||||
-//////////////////////////
|
||||
-
|
||||
-struct ggml_backend_vk_device_context {
|
||||
- size_t device;
|
||||
- std::string name;
|
||||
- std::string description;
|
||||
- bool is_integrated_gpu;
|
||||
- std::string pci_bus_id;
|
||||
- std::string id;
|
||||
-};
|
||||
+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
|
||||
+ if (id.empty()) return false;
|
||||
+ unsigned int d = 0, b = 0, dev = 0, func = 0;
|
||||
+ // Expected format: dddd:bb:dd.f (all hex)
|
||||
+ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
|
||||
+ if (n < 4) return false;
|
||||
+ if (domain) *domain = (int) d;
|
||||
+ if (bus) *bus = (int) b;
|
||||
+ if (device) *device = (int) dev;
|
||||
+ return true;
|
||||
+}
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
@@ -12530,7 +12599,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
|
||||
|
||||
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||
- ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||
+ ggml_backend_vk_get_device_memory(ctx, free, total);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
@@ -12556,7 +12625,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
props->id = ggml_backend_vk_device_get_id(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
+ props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
@@ -12564,6 +12633,17 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
+
|
||||
+ props->compute_major = ctx->major;
|
||||
+ props->compute_minor = ctx->minor;
|
||||
+ props->driver_major = ctx->driver_major;
|
||||
+ props->driver_minor = ctx->driver_minor;
|
||||
+ props->integrated = ctx->is_integrated_gpu;
|
||||
+ props->pci_bus_id = ctx->pci_bus_id;
|
||||
+ props->pci_device_id = ctx->pci_device_id;
|
||||
+ props->pci_domain_id = ctx->pci_domain_id;
|
||||
+ props->library = GGML_VK_NAME;
|
||||
+ props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -12992,6 +13071,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
+ std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
+
|
||||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
@@ -13000,13 +13081,46 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
+ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
ctx->id = ggml_backend_vk_get_device_id(i);
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
+
|
||||
+ // Gather additional information about the device
|
||||
+ int dev_idx = vk_instance.device_indices[i];
|
||||
+ vk::PhysicalDeviceProperties props1;
|
||||
+ vk_devices[dev_idx].getProperties(&props1);
|
||||
+ vk::PhysicalDeviceProperties2 props2;
|
||||
+ vk::PhysicalDeviceIDProperties device_id_props;
|
||||
+ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props;
|
||||
+ vk::PhysicalDeviceDriverProperties driver_props;
|
||||
+ props2.pNext = &device_id_props;
|
||||
+ device_id_props.pNext = &pci_bus_props;
|
||||
+ pci_bus_props.pNext = &driver_props;
|
||||
+ vk_devices[dev_idx].getProperties2(&props2);
|
||||
+ std::ostringstream oss;
|
||||
+ oss << std::hex << std::setfill('0');
|
||||
+ oss << "GPU-";
|
||||
+ int byteIdx = 0;
|
||||
+ for (int i = 0; i < 16; ++i, ++byteIdx) {
|
||||
+ oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
|
||||
+ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
|
||||
+ oss << '-';
|
||||
+ }
|
||||
+ }
|
||||
+ ctx->uuid = oss.str();
|
||||
+ ctx->pci_bus_id = pci_bus_props.pciBus;
|
||||
+ ctx->pci_device_id = pci_bus_props.pciDevice;
|
||||
+ ctx->pci_domain_id = pci_bus_props.pciDomain;
|
||||
+ ctx->id = std::to_string(i);
|
||||
+ ctx->major = 0;
|
||||
+ ctx->minor = 0;
|
||||
+ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
|
||||
+ ctx->driver_major = 0;
|
||||
+ ctx->driver_minor = 0;
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
--
|
||||
2.51.0
|
||||
@@ -69,7 +69,7 @@ type LlamaServer interface {
|
||||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||
Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error)
|
||||
Embedding(ctx context.Context, input string) ([]float32, error)
|
||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||
Close() error
|
||||
@@ -1545,16 +1545,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
}
|
||||
|
||||
type EmbeddingRequest struct {
|
||||
Content string `json:"content"`
|
||||
Truncate bool `json:"truncate"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
type EmbeddingResponse struct {
|
||||
Embedding []float32 `json:"embedding"`
|
||||
PromptEvalCount int `json:"prompt_eval_count"`
|
||||
Embedding []float32 `json:"embedding"`
|
||||
}
|
||||
|
||||
func (s *llmServer) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
|
||||
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
|
||||
logutil.Trace("embedding request", "input", input)
|
||||
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
@@ -1563,54 +1561,51 @@ func (s *llmServer) Embedding(ctx context.Context, input string, truncate bool)
|
||||
} else {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
}
|
||||
return nil, 0, err
|
||||
return nil, err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady {
|
||||
return nil, 0, fmt.Errorf("unexpected server status: %s", status)
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status)
|
||||
}
|
||||
|
||||
data, err := json.Marshal(EmbeddingRequest{Content: input, Truncate: truncate})
|
||||
data, err := json.Marshal(EmbeddingRequest{Content: input})
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("error marshaling embed data: %w", err)
|
||||
return nil, fmt.Errorf("error marshaling embed data: %w", err)
|
||||
}
|
||||
|
||||
r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("error creating embed request: %w", err)
|
||||
return nil, fmt.Errorf("error creating embed request: %w", err)
|
||||
}
|
||||
r.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := http.DefaultClient.Do(r)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("do embedding request: %w", err)
|
||||
return nil, fmt.Errorf("do embedding request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("error reading embed response: %w", err)
|
||||
return nil, fmt.Errorf("error reading embed response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
log.Printf("llm embedding error: %s", body)
|
||||
return nil, 0, api.StatusError{
|
||||
StatusCode: resp.StatusCode,
|
||||
ErrorMessage: string(body),
|
||||
}
|
||||
return nil, fmt.Errorf("%s", body)
|
||||
}
|
||||
|
||||
var e EmbeddingResponse
|
||||
if err := json.Unmarshal(body, &e); err != nil {
|
||||
return nil, 0, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||||
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||||
}
|
||||
|
||||
return e.Embedding, e.PromptEvalCount, nil
|
||||
return e.Embedding, nil
|
||||
}
|
||||
|
||||
type TokenizeRequest struct {
|
||||
|
||||
@@ -161,6 +161,7 @@ type Tensor interface {
|
||||
|
||||
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
||||
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
|
||||
|
||||
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
|
||||
|
||||
@@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
|
||||
if props.library != nil {
|
||||
info.Library = C.GoString(props.library)
|
||||
}
|
||||
info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
|
||||
if props.device_id != nil {
|
||||
info.PCIID = C.GoString(props.device_id)
|
||||
}
|
||||
info.LibraryPath = ggml.LibPaths()
|
||||
if props.numeric_id != nil {
|
||||
info.FilteredID = C.GoString(props.numeric_id)
|
||||
@@ -1180,6 +1182,10 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
|
||||
}
|
||||
|
||||
func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
if slices.Contains(shape, -1) {
|
||||
inferShape(t, shape)
|
||||
}
|
||||
|
||||
switch len(shape) {
|
||||
case 0:
|
||||
return &Tensor{
|
||||
@@ -1322,7 +1328,43 @@ func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
}
|
||||
}
|
||||
|
||||
// inferShape updates shape in place to automatically set a single -1 dimesion
|
||||
// based on the input tensor and the other dimensions
|
||||
func inferShape(t *Tensor, shape []int) {
|
||||
total := 1
|
||||
for _, dim := range t.Shape() {
|
||||
total *= dim
|
||||
}
|
||||
|
||||
dim := -1
|
||||
for i := range shape {
|
||||
switch shape[i] {
|
||||
case -1:
|
||||
if dim != -1 {
|
||||
panic("only one dimension can be inferred")
|
||||
}
|
||||
dim = i
|
||||
case 0:
|
||||
panic("dimension cannot be zero")
|
||||
default:
|
||||
if total%shape[i] != 0 {
|
||||
panic("cannot infer dimension")
|
||||
}
|
||||
|
||||
total /= shape[i]
|
||||
}
|
||||
}
|
||||
|
||||
if dim != -1 {
|
||||
shape[dim] = total
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
if slices.Contains(shape, -1) {
|
||||
inferShape(t, shape)
|
||||
}
|
||||
|
||||
switch len(shape) {
|
||||
case 1:
|
||||
return &Tensor{
|
||||
@@ -1535,6 +1577,16 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Conv3D(ctx ml.Context, t2 ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
|
||||
var tt ml.Tensor = &Tensor{
|
||||
b: t.b,
|
||||
t: C.ggml_conv_3d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int64_t(c), C.int(s0), C.int(s1), C.int(s2), C.int(p0), C.int(p1), C.int(p2), C.int(d0), C.int(d1), C.int(d2)),
|
||||
}
|
||||
|
||||
tt = tt.Reshape(ctx, t.Dim(3)/c, t2.Dim(3)/c)
|
||||
return tt
|
||||
}
|
||||
|
||||
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
|
||||
return &Tensor{
|
||||
b: t.b,
|
||||
|
||||
3
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
3
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
@@ -174,9 +174,6 @@ extern "C" {
|
||||
int compute_major;
|
||||
int compute_minor;
|
||||
int integrated;
|
||||
int pci_bus_id;
|
||||
int pci_device_id;
|
||||
int pci_domain_id;
|
||||
const char *library;
|
||||
// number with which the devices are accessed (Vulkan)
|
||||
const char *numeric_id;
|
||||
|
||||
15
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
vendored
15
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
vendored
@@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context {
|
||||
int driver_major;
|
||||
int driver_minor;
|
||||
int integrated;
|
||||
int pciBusID;
|
||||
int pciDeviceID;
|
||||
int pciDomainID;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
if (ggml_hip_mgmt_init() == 0) {
|
||||
int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
|
||||
int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
|
||||
if (status == 0) {
|
||||
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
|
||||
ggml_hip_mgmt_release();
|
||||
return;
|
||||
}
|
||||
@@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
if (ggml_nvml_init() == 0) {
|
||||
int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
|
||||
if (status == 0) {
|
||||
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
|
||||
ggml_nvml_release();
|
||||
return;
|
||||
}
|
||||
@@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
props->driver_major = ctx->driver_major;
|
||||
props->driver_minor = ctx->driver_minor;
|
||||
props->integrated = ctx->integrated;
|
||||
props->pci_bus_id = ctx->pciBusID;
|
||||
props->pci_device_id = ctx->pciDeviceID;
|
||||
props->pci_domain_id = ctx->pciDomainID;
|
||||
props->library = GGML_CUDA_NAME;
|
||||
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
@@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
dev_ctx->driver_major = driverVersion / 1000;
|
||||
dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
|
||||
dev_ctx->integrated = prop.integrated;
|
||||
dev_ctx->pciBusID = prop.pciBusID;
|
||||
dev_ctx->pciDeviceID = prop.pciDeviceID;
|
||||
dev_ctx->pciDomainID = prop.pciDomainID;
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
|
||||
2
ml/backend/ggml/ggml/src/ggml-impl.h
vendored
2
ml/backend/ggml/ggml/src/ggml-impl.h
vendored
@@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init();
|
||||
GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
|
||||
GGML_API void ggml_nvml_release();
|
||||
GGML_API int ggml_hip_mgmt_init();
|
||||
GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
|
||||
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
|
||||
GGML_API void ggml_hip_mgmt_release();
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -231,6 +231,7 @@ class vk_memory_logger;
|
||||
#endif
|
||||
class vk_perf_logger;
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
static std::string ggml_vk_get_device_id(int device);
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
static constexpr uint32_t p021_max_gqa_ratio = 8;
|
||||
@@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) {
|
||||
const auto& uuid = deviceIDProps.deviceUUID;
|
||||
char id[64];
|
||||
snprintf(id, sizeof(id),
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
uuid[0], uuid[1], uuid[2], uuid[3],
|
||||
uuid[4], uuid[5],
|
||||
uuid[6], uuid[7],
|
||||
@@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context {
|
||||
std::string pci_id;
|
||||
std::string id;
|
||||
std::string uuid;
|
||||
std::string numeric_id;
|
||||
int major;
|
||||
int minor;
|
||||
int driver_major;
|
||||
int driver_minor;
|
||||
int pci_bus_id;
|
||||
int pci_device_id;
|
||||
int pci_domain_id;
|
||||
};
|
||||
|
||||
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
|
||||
@@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
switch (props2.properties.vendorID) {
|
||||
case VK_VENDOR_ID_AMD:
|
||||
if (ggml_hip_mgmt_init() == 0) {
|
||||
int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
|
||||
int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
|
||||
if (status == 0) {
|
||||
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
|
||||
ggml_hip_mgmt_release();
|
||||
return;
|
||||
}
|
||||
@@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
if (ggml_nvml_init() == 0) {
|
||||
int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
|
||||
if (status == 0) {
|
||||
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
|
||||
GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
|
||||
ggml_nvml_release();
|
||||
return;
|
||||
}
|
||||
@@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
}
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
if (!ext_support) {
|
||||
return "";
|
||||
device.getProperties2(&props2);
|
||||
if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
|
||||
return "";
|
||||
}
|
||||
// AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
@@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return std::string(pci_bus_id);
|
||||
}
|
||||
@@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
props->driver_major = ctx->driver_major;
|
||||
props->driver_minor = ctx->driver_minor;
|
||||
props->integrated = ctx->is_integrated_gpu;
|
||||
props->pci_bus_id = ctx->pci_bus_id;
|
||||
props->pci_device_id = ctx->pci_device_id;
|
||||
props->pci_domain_id = ctx->pci_domain_id;
|
||||
props->library = GGML_VK_NAME;
|
||||
props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
|
||||
props->numeric_id = ctx->numeric_id.c_str();
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
vk_devices[dev_idx].getProperties2(&props2);
|
||||
std::ostringstream oss;
|
||||
oss << std::hex << std::setfill('0');
|
||||
oss << "GPU-";
|
||||
int byteIdx = 0;
|
||||
for (int i = 0; i < 16; ++i, ++byteIdx) {
|
||||
oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
|
||||
@@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
}
|
||||
}
|
||||
ctx->uuid = oss.str();
|
||||
ctx->pci_bus_id = pci_bus_props.pciBus;
|
||||
ctx->pci_device_id = pci_bus_props.pciDevice;
|
||||
ctx->pci_domain_id = pci_bus_props.pciDomain;
|
||||
ctx->id = std::to_string(i);
|
||||
ctx->major = 0;
|
||||
ctx->minor = 0;
|
||||
// TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
|
||||
ctx->driver_major = 0;
|
||||
ctx->driver_minor = 0;
|
||||
ctx->numeric_id = std::to_string(i);
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
19
ml/backend/ggml/ggml/src/mem_hip.cpp
vendored
19
ml/backend/ggml/ggml/src/mem_hip.cpp
vendored
@@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() {
|
||||
if (gpus != NULL) gpus->pVtbl->Release(gpus); \
|
||||
if (gpu != NULL) gpu->pVtbl->Release(gpu)
|
||||
|
||||
int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
|
||||
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
|
||||
std::lock_guard<std::mutex> lock(ggml_adlx_lock);
|
||||
if (adlx.handle == NULL) {
|
||||
GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
|
||||
@@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
|
||||
IADLXGPU* gpu = NULL;
|
||||
IADLXGPUMetrics *gpuMetrics = NULL;
|
||||
ADLX_RESULT status;
|
||||
// The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
|
||||
adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
|
||||
|
||||
uint32_t pci_domain, pci_bus, pci_device, pci_function;
|
||||
if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
|
||||
// TODO - parse other formats?
|
||||
GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
|
||||
return ADLX_NOT_FOUND;
|
||||
}
|
||||
status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
|
||||
if (ADLX_FAILED(status)) {
|
||||
GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
|
||||
@@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
|
||||
GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
|
||||
continue;
|
||||
}
|
||||
adlx_int id;
|
||||
status = gpu->pVtbl->UniqueId(gpu, &id);
|
||||
adlx_int uniqueID;
|
||||
status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
|
||||
if (ADLX_FAILED(status)) {
|
||||
GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
|
||||
gpu->pVtbl->Release(gpu);
|
||||
gpu = NULL;
|
||||
continue;
|
||||
}
|
||||
if (id != target) {
|
||||
GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
|
||||
if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
|
||||
gpu->pVtbl->Release(gpu);
|
||||
gpu = NULL;
|
||||
continue;
|
||||
@@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() {
|
||||
return -1;
|
||||
}
|
||||
void ggml_hip_mgmt_release() {}
|
||||
int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
|
||||
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
126
ml/backend/ggml/ggml_test.go
Normal file
126
ml/backend/ggml/ggml_test.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package ggml
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
func setup(tb testing.TB) ml.Context {
|
||||
tb.Helper()
|
||||
|
||||
f, err := os.CreateTemp(tb.TempDir(), "*.bin")
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := ggml.WriteGGUF(f, ggml.KV{"general.architecture": "test"}, nil); err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
|
||||
b, err := ml.NewBackend(f.Name(), ml.BackendParams{})
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
|
||||
ctx := b.NewContext().Input()
|
||||
|
||||
tb.Cleanup(func() {
|
||||
ctx.Close()
|
||||
b.Close()
|
||||
})
|
||||
|
||||
return ctx
|
||||
}
|
||||
|
||||
func TestInferShape(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
input []int
|
||||
want []int
|
||||
err error
|
||||
}{
|
||||
{
|
||||
name: "no inferred shape",
|
||||
input: []int{2, 3, 4},
|
||||
want: []int{2, 3, 4},
|
||||
},
|
||||
{
|
||||
name: "infer begin",
|
||||
input: []int{-1, 3, 4},
|
||||
want: []int{2, 3, 4},
|
||||
},
|
||||
{
|
||||
name: "infer mid",
|
||||
input: []int{2, -1, 4},
|
||||
want: []int{2, 3, 4},
|
||||
},
|
||||
{
|
||||
name: "infer end",
|
||||
input: []int{2, 3, -1},
|
||||
want: []int{2, 3, 4},
|
||||
},
|
||||
{
|
||||
name: "too many inferred dims",
|
||||
input: []int{-1, 3, -1},
|
||||
err: errors.New("only one dimension can be inferred"),
|
||||
},
|
||||
{
|
||||
name: "infer gather",
|
||||
input: []int{2, -1},
|
||||
want: []int{2, 12},
|
||||
},
|
||||
{
|
||||
name: "infer gather all",
|
||||
input: []int{-1},
|
||||
want: []int{24},
|
||||
},
|
||||
{
|
||||
name: "infer split",
|
||||
input: []int{2, -1, 3, 2},
|
||||
want: []int{2, 2, 3, 2},
|
||||
},
|
||||
{
|
||||
name: "indivisible infer",
|
||||
input: []int{2, -1, 2, 4},
|
||||
err: errors.New("cannot infer dimension"),
|
||||
},
|
||||
{
|
||||
name: "infer zero dim",
|
||||
input: []int{2, 0, 4},
|
||||
err: errors.New("dimension cannot be zero"),
|
||||
},
|
||||
}
|
||||
|
||||
ctx := setup(t)
|
||||
tensor, ok := ctx.Empty(ml.DTypeF32, 2, 3, 4).(*Tensor)
|
||||
if !ok {
|
||||
t.Fatal("expected *Tensor")
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
defer func() {
|
||||
if r := recover(); r == nil && tt.err == nil {
|
||||
// all good
|
||||
} else if r != nil && tt.err == nil {
|
||||
t.Errorf("unexpected panic: %v", r)
|
||||
} else if r == nil && tt.err != nil {
|
||||
t.Errorf("expected panic but did not get one: %v", tt.err)
|
||||
} else if errStr, ok := r.(string); ok && errStr != tt.err.Error() {
|
||||
t.Errorf("expected panic %q but got %q", tt.err.Error(), errStr)
|
||||
}
|
||||
}()
|
||||
|
||||
inferShape(tensor, tt.input)
|
||||
if diff := cmp.Diff(tt.want, tt.input); diff != "" {
|
||||
t.Errorf("%s: shape mismatch (-want +got):\n%s", tt.name, diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -391,6 +391,10 @@ func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison {
|
||||
if a.PCIID != b.PCIID {
|
||||
return UniqueDevice
|
||||
}
|
||||
// If PCIID is empty, we have to use ID + library for uniqueness
|
||||
if a.PCIID == "" && a.DeviceID != b.DeviceID {
|
||||
return UniqueDevice
|
||||
}
|
||||
if a.Library == b.Library {
|
||||
return SameBackendDevice
|
||||
}
|
||||
@@ -454,13 +458,13 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
|
||||
var envVar string
|
||||
switch d.Library {
|
||||
case "ROCm":
|
||||
// ROCm must be filtered as it can crash the runner on unsupported devices
|
||||
envVar = "ROCR_VISIBLE_DEVICES"
|
||||
if runtime.GOOS != "linux" {
|
||||
envVar = "HIP_VISIBLE_DEVICES"
|
||||
}
|
||||
case "Vulkan":
|
||||
envVar = "GGML_VK_VISIBLE_DEVICES"
|
||||
default:
|
||||
// CUDA and Vulkan are not filtered via env var, but via scheduling decisions
|
||||
return
|
||||
}
|
||||
v, existing := env[envVar]
|
||||
|
||||
@@ -4,8 +4,27 @@ import "github.com/ollama/ollama/ml"
|
||||
|
||||
type Conv2D struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
|
||||
return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
|
||||
t = m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
|
||||
if m.Bias != nil {
|
||||
// Bias shape is (out_channels,) while t shape is (width, height, out_channels, batch)
|
||||
t = t.Add(ctx, m.Bias.Reshape(ctx, 1, 1, -1))
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
type Conv3D struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *Conv3D) Forward(ctx ml.Context, t ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
|
||||
t = m.Weight.Conv3D(ctx, t, c, s0, s1, s2, p0, p1, p2, d0, d1, d2)
|
||||
if m.Bias != nil {
|
||||
t = t.Add(ctx, m.Bias)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
@@ -14,4 +14,5 @@ import (
|
||||
_ "github.com/ollama/ollama/model/models/qwen2"
|
||||
_ "github.com/ollama/ollama/model/models/qwen25vl"
|
||||
_ "github.com/ollama/ollama/model/models/qwen3"
|
||||
_ "github.com/ollama/ollama/model/models/qwen3vl"
|
||||
)
|
||||
|
||||
@@ -3,6 +3,7 @@ package qwen3
|
||||
import (
|
||||
"cmp"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
@@ -210,7 +211,7 @@ var _ model.Model = (*Model)(nil)
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
layers := make([]Layer, c.Uint("block_count"))
|
||||
for i := range layers {
|
||||
if c.String("general.architecture") == "qwen3moe" {
|
||||
if strings.HasSuffix(c.String("general.architecture"), "moe") {
|
||||
layers[i].MLP = &sparse{}
|
||||
} else {
|
||||
layers[i].MLP = &dense{}
|
||||
|
||||
194
model/models/qwen3vl/imageprocessor.go
Normal file
194
model/models/qwen3vl/imageprocessor.go
Normal file
@@ -0,0 +1,194 @@
|
||||
package qwen3vl
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
// ImageProcessor contains configuration for the Qwen 3 VL image processing
// pipeline: the patch/merge geometry plus normalization constants.
type ImageProcessor struct {
	numChannels       int       // color channels expected in the input (RGB = 3)
	patchSize         int       // square patch edge, in pixels
	temporalPatchSize int       // temporal frames packed into one patch
	mergeSize         int       // spatial merge window (patches merged per side)
	shortestEdge      int       // lower bound on total pixels after SmartResize
	longestEdge       int       // upper bound on total pixels after SmartResize
	factor            int       // patchSize * mergeSize; resized edges are multiples of this
	rescaleFactor     float32   // scale applied to 0-255 pixel values (1/255)
	imageMean         []float32 // per-channel normalization mean
	imageStd          []float32 // per-channel normalization standard deviation
}

// newImageProcessor creates a new image processor with default values,
// overridable from the model's vision config.
func newImageProcessor(c fs.Config) ImageProcessor {
	patchSize := int(c.Uint("vision.patch_size", 14))
	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))

	return ImageProcessor{
		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
		patchSize:         patchSize,
		temporalPatchSize: 2,
		mergeSize:         mergeSize,
		shortestEdge:      int(c.Uint("vision.shortest_edge", 64<<10)),
		// FIXME(mxyng): the model defined longest edge (16M) is too large for the default
		// context length of 8K and will panic. Adjusting to 2M for now.
		// longestEdge: int(c.Uint("vision.longest_edge", 16<<20)),
		longestEdge:   2 << 20,
		factor:        patchSize * mergeSize,
		rescaleFactor: 1.0 / 255.0,
		imageMean:     c.Floats("vision.image_mean", imageproc.ImageNetStandardMean[:]),
		imageStd:      c.Floats("vision.image_std", imageproc.ImageNetStandardSTD[:]),
	}
}
|
||||
|
||||
// SmartResize implements the smart resize algorithm
|
||||
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
|
||||
factor := p.factor
|
||||
|
||||
if height < factor || width < factor {
|
||||
panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
|
||||
} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
|
||||
panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
|
||||
}
|
||||
|
||||
round := func(x float64) int { return int(math.RoundToEven(x)) }
|
||||
|
||||
hBar := round(float64(height)/float64(factor)) * factor
|
||||
wBar := round(float64(width)/float64(factor)) * factor
|
||||
|
||||
if hBar*wBar > p.longestEdge {
|
||||
beta := math.Sqrt(float64(height*width) / float64(p.longestEdge))
|
||||
|
||||
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
|
||||
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
|
||||
} else if hBar*wBar < p.shortestEdge {
|
||||
beta := math.Sqrt(float64(p.shortestEdge) / float64(height*width))
|
||||
|
||||
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
|
||||
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
|
||||
}
|
||||
|
||||
return hBar, wBar
|
||||
}
|
||||
|
||||
// Grid describes the patch-grid layout of a processed image.
type Grid struct {
	Height   int // patches along the vertical axis (resized height / patchSize)
	Width    int // patches along the horizontal axis (resized width / patchSize)
	Temporal int // temporal frames; 1 for still images
}
|
||||
|
||||
// ProcessImage resizes, normalizes, and patches an image, returning a
// (patchDim x numPatches) tensor and the resulting patch grid.
func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) {
	origWidth := img.Bounds().Dx()
	origHeight := img.Bounds().Dy()

	// Calculate smart resize dimensions (multiples of patchSize*mergeSize).
	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)

	// Resize image using existing functions
	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)

	normalizedPixels := imageproc.Normalize(
		resizedImg,
		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
		true, // rescale
		true, // channelFirst
	)

	// Calculate grid dimensions
	grid := &Grid{
		Height:   resizedHeight / p.patchSize,
		Width:    resizedWidth / p.patchSize,
		Temporal: 1, // For single images, temporal dimension is 1
	}

	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create patches: %v", err)
	}

	// Each patch flattens channels x temporal frames x patch pixels.
	patchDim := p.numChannels * p.temporalPatchSize *
		p.patchSize * p.patchSize
	numPatches := grid.Temporal * grid.Height * grid.Width

	pixelValues := ctx.Input().FromFloats(patches, patchDim, numPatches)

	// Return patches and grid dimensions
	return pixelValues, grid, nil
}
|
||||
|
||||
// createPatches rearranges normalized CHW pixel data into flattened patches,
// ordered so that each mergeSize x mergeSize group of neighboring patches is
// contiguous (matching the vision tower's spatial-merge layout). The single
// input frame is replicated across all temporalPatchSize frames.
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
	channels := p.numChannels
	patchSize := p.patchSize
	mergeSize := p.mergeSize
	temporalPatchSize := p.temporalPatchSize

	// Calculate output dimensions
	numPatches := grid.Temporal * grid.Height * grid.Width
	patchDim := channels * temporalPatchSize * patchSize * patchSize

	result := make([]float32, numPatches*patchDim)
	patchIndex := 0

	// Single temporal frame handling (copies to all frames)
	for range grid.Temporal {
		for h := 0; h < grid.Height; h += mergeSize {
			for w := 0; w < grid.Width; w += mergeSize {
				// Handle the 2x2 merged patches
				for mh := range mergeSize {
					for mw := range mergeSize {
						baseOffset := patchIndex * patchDim

						// Extract patch data for first temporal frame
						for c := range channels {
							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)

							for py := range patchSize {
								for px := range patchSize {
									// Calculate source pixel coordinates
									y := (h+mh)*patchSize + py
									x := (w+mw)*patchSize + px

									// Source index in input tensor (CHW format)
									srcIdx := c*height*width + y*width + x

									// Destination index in first temporal frame
									dstIdx := channelOffset + (py * patchSize) + px

									// Bounds guard: out-of-range pixels are
									// silently skipped rather than erroring.
									if srcIdx < len(pixels) && dstIdx < len(result) {
										result[dstIdx] = pixels[srcIdx]
									}
								}
							}
						}

						// Copy first temporal frame to all other frames
						if temporalPatchSize > 1 {
							for c := range channels {
								channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
								firstFrameOffset := channelOffset
								frameSize := patchSize * patchSize

								// Copy first frame to all other frames
								for tp := 1; tp < temporalPatchSize; tp++ {
									currentFrameOffset := channelOffset + (tp * frameSize)
									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
										result[firstFrameOffset:firstFrameOffset+frameSize])
								}
							}
						}

						patchIndex++
					}
				}
			}
		}
	}

	return result, nil
}
|
||||
204
model/models/qwen3vl/model.go
Normal file
204
model/models/qwen3vl/model.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package qwen3vl
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
// Model wires together the Qwen 3 VL text decoder, vision tower, tokenizer,
// and image preprocessing.
type Model struct {
	model.Base
	model.TextProcessor

	*TextModel
	*VisionModel `gguf:"v"`

	ImageProcessor

	// positionCache records the mrope position assigned to each token during
	// PostTokenize so Forward can translate batch positions back into them.
	positionCache []int32
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Layers) == 0 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
|
||||
img, _, err := image.Decode(bytes.NewReader(multimodalData))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues, grid, err := m.ProcessImage(ctx, img)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Calculate tensor dimensions
|
||||
visionOutputs, deepstackVisualEmbeds := m.VisionModel.Forward(ctx, pixelValues, grid)
|
||||
mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
|
||||
for i := range deepstackVisualEmbeds {
|
||||
mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
|
||||
}
|
||||
|
||||
return mm, nil
|
||||
}
|
||||
|
||||
// Special token IDs delimiting image embeddings in the token stream.
// NOTE(review): hard-coded for the Qwen vocabulary — confirm against the
// tokenizer config if the vocab ever changes.
var (
	tokenVision      int32 = 151655 // placeholder filled with vision embeddings
	tokenVisionStart int32 = 151652 // opens an image span
	tokenVisionEnd   int32 = 151653 // closes an image span
)

// modelInput pairs an input token with the mrope position it should occupy.
// A zero position means "previous position + 1" (see PostTokenize).
type modelInput struct {
	*input.Input
	position int32
}
|
||||
|
||||
// PostTokenize arranges Qwen 3 VL's inputs for the forward pass.
//
// Each input carrying multimodal data is expanded into: vision_start, one
// vision token per embedding column (the first carrying the tensor via
// SameBatch), then vision_end. An mrope position is recorded for every
// emitted token in m.positionCache; a recorded position of 0 means
// "previous + 1", and all of an image's vision tokens share base position i+1.
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	// Reset but keep capacity; the cache is rebuilt for every prompt.
	m.positionCache = m.positionCache[:0]
	return slices.Collect(func(yield func(*input.Input) bool) {
		for i := range inputs {
			s := []modelInput{{Input: inputs[i]}}
			if mm := inputs[i].Multimodal; mm != nil {
				t := mm[0].Tensor
				// One slot per embedding column plus the start/end markers.
				s = slices.Repeat([]modelInput{
					{
						position: int32(i + 1),
						Input:    &input.Input{Token: tokenVision},
					},
				}, t.Dim(1)+1+1)

				s[0] = modelInput{
					Input:    &input.Input{Token: tokenVisionStart},
					position: int32(i),
				}

				// vision_end sits after the image's row span: base position
				// plus the grid width in merged patches, plus one.
				s[len(s)-1] = modelInput{
					Input:    &input.Input{Token: tokenVisionEnd},
					position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
				}

				// The first vision token carries the embeddings and pins the
				// whole image into a single batch via SameBatch.
				s[1] = modelInput{
					Input: &input.Input{
						Token:          tokenVision,
						Multimodal:     inputs[i].Multimodal,
						MultimodalHash: inputs[i].MultimodalHash,
						SameBatch:      t.Dim(1),
					},
					position: int32(i + 1),
				}
			}

			for _, e := range s {
				position := e.position
				// Position 0 means "follow the previous token".
				if position == 0 && len(m.positionCache) > 0 {
					position = m.positionCache[len(m.positionCache)-1] + 1
				}

				m.positionCache = append(m.positionCache, position)
				if !yield(e.Input) {
					return
				}
			}
		}
	}), nil
}
|
||||
|
||||
// Forward runs the text decoder over a batch, splicing vision embeddings into
// the token embedding stream and applying multimodal (3-axis) rope positions.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	// Three position rows per token (one per mrope axis).
	positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions)))
	for i, id := range batch.Positions {
		if id < int32(len(m.positionCache)) {
			// Token was seen by PostTokenize: use its cached mrope position.
			id = m.positionCache[id]
		} else if len(m.positionCache) > 0 {
			// Past the cache (e.g. generated tokens): continue sequentially
			// from the last cached position.
			id = id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
		}

		positionSlice[0][i] = id
		positionSlice[1][i] = id
		positionSlice[2][i] = id
	}

	hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)

	var deepstackVisualEmbeds []ml.Tensor
	for _, mi := range batch.Multimodal {
		// Overwrite the placeholder embeddings with the vision outputs.
		visionOutputs := mi.Multimodal[0].Tensor
		ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
			// Offset the second and third position rows so image tokens get
			// 2D coordinates within the merged patch grid.
			for i := range visionOutputs.Dim(1) {
				w := grid.Width / m.spatialMergeSize
				positionSlice[1][mi.Index+i] += int32(i / w)
				positionSlice[2][mi.Index+i] += int32(i % w)
			}
		}

		// Deepstack embeddings are added after the matching early decoder
		// layers below; stage each as a full-width zero tensor with the
		// image span filled in.
		deepstackVisualEmbeds = make([]ml.Tensor, len(mi.Multimodal[1:]))
		for i, mm := range mi.Multimodal[1:] {
			deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
			ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
		}
	}

	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice))
	cos, sin := m.rotaryEmbedding(ctx, positions)
	for i, layer := range m.TextModel.Layers {
		if m.Cache != nil {
			m.Cache.SetLayer(i)
		}

		// Only the last layer gathers the requested output rows.
		var outputs ml.Tensor
		if i == len(m.TextModel.Layers)-1 {
			outputs = batch.Outputs
		}

		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options)
		if i < len(deepstackVisualEmbeds) {
			hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
		}
	}

	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, 1e-06)
	return m.Output.Forward(ctx, hiddenStates), nil
}
|
||||
|
||||
// New constructs a Qwen 3 VL model from the GGUF config: BPE tokenizer,
// text decoder, vision tower, and image processor.
func New(c fs.Config) (model.Model, error) {
	m := Model{
		TextProcessor: model.NewBytePairEncoding(
			&model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types:  c.Ints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
				EOS: append(
					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
					c.Ints("tokenizer.ggml.eos_token_ids")...,
				),
			},
			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		),
		TextModel:      newTextModel(c),
		VisionModel:    newVisionModel(c),
		ImageProcessor: newImageProcessor(c),
	}

	// The cache shift callback drops the position cache and reports
	// ErrNotSupported: mrope positions cannot simply be shifted.
	m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) {
		m.positionCache = nil
		return nil, kvcache.ErrNotSupported
	})
	return &m, nil
}
|
||||
|
||||
func init() {
|
||||
model.Register("qwen3vl", New)
|
||||
model.Register("qwen3vlmoe", New)
|
||||
}
|
||||
229
model/models/qwen3vl/model_text.go
Normal file
229
model/models/qwen3vl/model_text.go
Normal file
@@ -0,0 +1,229 @@
|
||||
package qwen3vl
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"math"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
// TextOptions carries the text model's hyperparameters read from the config.
type TextOptions struct {
	hiddenSize,
	numHeads,
	numKVHeads,
	keyLength,
	valueLength int

	eps,
	ropeBase,
	ropeScale float32
	// mropeSections gives the per-axis split of rotary frequency rows used
	// by rotaryEmbedding (indices 1 and 2 are consumed there).
	mropeSections []int

	numExperts, numExpertsUsed int
	normTopKProb               bool

	// inverseFrequenciesCache memoizes 1/ropeBase^(2i/headDim) so the table
	// is computed at most once.
	inverseFrequenciesCache []float32
}

// headDim returns the per-head dimension: the first non-zero of keyLength,
// valueLength, or hiddenSize/numHeads.
func (o TextOptions) headDim() int {
	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
}
|
||||
|
||||
// TextAttention implements grouped-query self-attention with per-head
// RMS-normalized queries/keys and rotary position embeddings.
type TextAttention struct {
	Query     *nn.Linear  `gguf:"attn_q"`
	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
	Key       *nn.Linear  `gguf:"attn_k"`
	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
	Value     *nn.Linear  `gguf:"attn_v"`
	Output    *nn.Linear  `gguf:"attn_output"`
}

// Forward computes attention over hiddenStates using the precomputed cos/sin
// rope tables and the KV cache.
func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
	batchSize := hiddenStates.Dim(1)

	query := sa.Query.Forward(ctx, hiddenStates)
	key := sa.Key.Forward(ctx, hiddenStates)
	value := sa.Value.Forward(ctx, hiddenStates)

	// Split the projections into heads: (headDim, nHeads, tokens).
	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)

	// Normalize per head before applying rope.
	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
	key = sa.KeyNorm.Forward(ctx, key, opts.eps)

	query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
	key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)

	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
	return sa.Output.Forward(ctx, attention)
}
|
||||
|
||||
// TextMLP is the feed-forward sub-block interface, implemented by both the
// dense MLP and the sparse mixture-of-experts variant.
type TextMLP interface {
	Forward(ml.Context, ml.Tensor, *TextOptions) ml.Tensor
}
|
||||
|
||||
// sparse is the mixture-of-experts feed-forward block: a router selects
// numExpertsUsed experts per token and their outputs are combined using the
// routing weights.
type sparse struct {
	Router *nn.Linear      `gguf:"ffn_gate_inp"`
	Gate   *nn.LinearBatch `gguf:"ffn_gate_exps"`
	Up     *nn.LinearBatch `gguf:"ffn_up_exps"`
	Down   *nn.LinearBatch `gguf:"ffn_down_exps"`
}

// Forward routes each token to its top-k experts and returns the weighted
// sum of the expert outputs.
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
	routerLogits := mlp.Router.Forward(ctx, hiddenStates)

	// Softmax over all experts, then keep the top-k per token.
	routingWeights := routerLogits.Softmax(ctx)
	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
	if opts.normTopKProb {
		// Renormalize the selected weights so they sum to 1 per token.
		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
	}

	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))

	// Gated (SwiGLU-style) expert MLP evaluated for the selected experts only.
	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates, selectedExperts))

	experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
	experts = experts.Mul(ctx, routingWeights)

	// Accumulate the per-expert slices (views along dim 1) into one tensor.
	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
	for i := 1; i < opts.numExpertsUsed; i++ {
		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
	}

	return nextStates
}
|
||||
|
||||
type dense struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *TextOptions) ml.Tensor {
|
||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, hiddenStates)
|
||||
}
|
||||
|
||||
// TextLayer is a single decoder block: pre-norm attention followed by a
// pre-norm feed-forward (dense or MoE), each with a residual connection.
type TextLayer struct {
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	*TextAttention

	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
	TextMLP
}

// Forward applies the block to hiddenStates. When outputs is non-nil (only
// for the last layer), both the hidden states and the residual are gathered
// down to the requested output rows before the MLP.
func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts)

	if outputs != nil {
		// Keep only the rows logits are requested for.
		hiddenStates = hiddenStates.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
	}

	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = d.TextMLP.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}
|
||||
|
||||
type TextModel struct {
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
Layers []TextLayer `gguf:"blk"`
|
||||
|
||||
Options *TextOptions
|
||||
}
|
||||
|
||||
func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) {
|
||||
positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1))
|
||||
if len(m.Options.inverseFrequenciesCache) == 0 {
|
||||
m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2)
|
||||
for i := range m.Options.inverseFrequenciesCache {
|
||||
frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim())))
|
||||
m.Options.inverseFrequenciesCache[i] = 1 / frequency
|
||||
}
|
||||
}
|
||||
|
||||
inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache))
|
||||
|
||||
positions = positions.Cast(ctx, ml.DTypeF32)
|
||||
frequencies := inverseFrequencies.Mulmat(ctx, positions)
|
||||
|
||||
interleaved := frequencies.View(ctx,
|
||||
0, frequencies.Dim(0),
|
||||
frequencies.Stride(1), frequencies.Dim(1),
|
||||
)
|
||||
|
||||
for _, i := range []int{1, 2} {
|
||||
args := []int{
|
||||
i * frequencies.Stride(0), 1,
|
||||
3 * frequencies.Stride(0), m.Options.mropeSections[i],
|
||||
frequencies.Stride(1), frequencies.Dim(1),
|
||||
}
|
||||
|
||||
ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...).
|
||||
Copy(ctx, interleaved.View(ctx, args[0], args[1:]...)))
|
||||
}
|
||||
|
||||
interleaved = interleaved.Concat(ctx, interleaved, 0)
|
||||
interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2))
|
||||
return interleaved.Cos(ctx), interleaved.Sin(ctx)
|
||||
}
|
||||
|
||||
var _ model.Model = (*Model)(nil)
|
||||
|
||||
func newTextModel(c fs.Config) *TextModel {
|
||||
layers := make([]TextLayer, c.Uint("block_count"))
|
||||
for i := range layers {
|
||||
if strings.HasSuffix(c.String("general.architecture"), "moe") {
|
||||
layers[i].TextMLP = &sparse{}
|
||||
} else {
|
||||
layers[i].TextMLP = &dense{}
|
||||
}
|
||||
}
|
||||
|
||||
m := TextModel{
|
||||
Layers: layers,
|
||||
Options: &TextOptions{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
keyLength: int(c.Uint("attention.key_length")),
|
||||
valueLength: int(c.Uint("attention.value_length")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.scaling.factor", 1),
|
||||
numExperts: int(c.Uint("expert_count")),
|
||||
numExpertsUsed: int(c.Uint("expert_used_count")),
|
||||
normTopKProb: c.Bool("norm_top_k_prob", true),
|
||||
mropeSections: slices.Collect(func(yield func(int) bool) {
|
||||
for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
|
||||
if !yield(int(section)) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
},
|
||||
}
|
||||
|
||||
return &m
|
||||
}
|
||||
268
model/models/qwen3vl/model_vision.go
Normal file
268
model/models/qwen3vl/model_vision.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package qwen3vl
|
||||
|
||||
import (
|
||||
"iter"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
)
|
||||
|
||||
type VisionAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
}
|
||||
|
||||
func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
|
||||
x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
|
||||
x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
|
||||
return x2.Scale(ctx, -1).Concat(ctx, x1, 0)
|
||||
}
|
||||
|
||||
func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
|
||||
return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
|
||||
}
|
||||
|
||||
func (sa *VisionAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
|
||||
query := sa.Query.Forward(ctx, hiddenStates)
|
||||
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, query.Dim(1))
|
||||
query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenStates)
|
||||
key = key.Reshape(ctx, opts.headDim(), opts.numHeads, key.Dim(1))
|
||||
key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenStates)
|
||||
value = value.Reshape(ctx, opts.headDim(), opts.numHeads, value.Dim(1))
|
||||
|
||||
attention := nn.Attention(ctx, query, key, value, math.Pow(float64(opts.headDim()), -0.5), nil)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2))
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type VisionMLP struct {
|
||||
FC1 *nn.Linear `gguf:"linear_fc1"`
|
||||
FC2 *nn.Linear `gguf:"linear_fc2"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts VisionOptions) ml.Tensor {
|
||||
return mlp.FC2.Forward(ctx, mlp.FC1.Forward(ctx, hiddenStates).GELU(ctx))
|
||||
}
|
||||
|
||||
type VisionEncoderLayer struct {
|
||||
Norm1 *nn.LayerNorm `gguf:"norm1"`
|
||||
Attention *VisionAttention
|
||||
Norm2 *nn.LayerNorm `gguf:"norm2"`
|
||||
MLP *VisionMLP `gguf:"mlp"`
|
||||
}
|
||||
|
||||
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
|
||||
residual := hiddenStates
|
||||
hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = e.Attention.Forward(ctx, hiddenStates, cos, sin, opts)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
residual = hiddenStates
|
||||
hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
|
||||
return hiddenStates.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type VisionOptions struct {
|
||||
hiddenSize,
|
||||
numHeads,
|
||||
patchSize,
|
||||
numChannels,
|
||||
spatialMergeSize,
|
||||
temporalPatchSize,
|
||||
gridPerSide int
|
||||
|
||||
eps,
|
||||
ropeTheta float32
|
||||
|
||||
deepstackVisualIndexes []int32
|
||||
mropeSections []int
|
||||
}
|
||||
|
||||
func (o VisionOptions) headDim() int {
|
||||
return o.hiddenSize / o.numHeads
|
||||
}
|
||||
|
||||
type VisionPatchMerger struct {
|
||||
Norm *nn.LayerNorm `gguf:"norm"`
|
||||
FC1 *nn.Linear `gguf:"linear_fc1"`
|
||||
FC2 *nn.Linear `gguf:"linear_fc2"`
|
||||
}
|
||||
|
||||
func (m *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, postshuffleNorm bool, opts VisionOptions) ml.Tensor {
|
||||
hiddenSize := opts.hiddenSize * opts.spatialMergeSize * opts.spatialMergeSize
|
||||
if postshuffleNorm {
|
||||
visionOutputs = visionOutputs.Reshape(ctx, hiddenSize, -1)
|
||||
}
|
||||
|
||||
visionOutputs = m.Norm.Forward(ctx, visionOutputs, opts.eps)
|
||||
visionOutputs = visionOutputs.Reshape(ctx, hiddenSize, -1)
|
||||
return m.FC2.Forward(ctx, m.FC1.Forward(ctx, visionOutputs).GELU(ctx))
|
||||
}
|
||||
|
||||
type VisionPositionEmbedding struct {
|
||||
PositionEmbedding *nn.Embedding `gguf:"pos_embed"`
|
||||
}
|
||||
|
||||
func makeSlice2D[T int32 | float32](n0, n1 int) iter.Seq[[]T] {
|
||||
return func(yield func([]T) bool) {
|
||||
for range n0 {
|
||||
if !yield(make([]T, n1)) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *VisionPositionEmbedding) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts VisionOptions) ml.Tensor {
|
||||
indexSlice := slices.Collect(makeSlice2D[int32](4, grid.Height*grid.Width))
|
||||
weightSlice := slices.Collect(makeSlice2D[float32](4, grid.Height*grid.Width))
|
||||
|
||||
stepHeight := float32(opts.gridPerSide-1) / float32(grid.Height-1)
|
||||
stepWidth := float32(opts.gridPerSide-1) / float32(grid.Width-1)
|
||||
|
||||
var i int
|
||||
for h := range grid.Height {
|
||||
for w := range grid.Width {
|
||||
y, x := float32(h)*stepHeight, float32(w)*stepWidth
|
||||
|
||||
floorY, floorX := int32(y), int32(x)
|
||||
ceilY, ceilX := min(floorY+1, int32(opts.gridPerSide-1)), min(floorX+1, int32(opts.gridPerSide-1))
|
||||
|
||||
indexSlice[0][i] = floorY*int32(opts.gridPerSide) + floorX
|
||||
indexSlice[1][i] = floorY*int32(opts.gridPerSide) + ceilX
|
||||
indexSlice[2][i] = ceilY*int32(opts.gridPerSide) + floorX
|
||||
indexSlice[3][i] = ceilY*int32(opts.gridPerSide) + ceilX
|
||||
|
||||
weightSlice[0][i] = (1 - (y - float32(floorY))) * (1 - (x - float32(floorX)))
|
||||
weightSlice[1][i] = (1 - (y - float32(floorY))) * (x - float32(floorX))
|
||||
weightSlice[2][i] = (y - float32(floorY)) * (1 - (x - float32(floorX)))
|
||||
weightSlice[3][i] = (y - float32(floorY)) * (x - float32(floorX))
|
||||
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
indices := ctx.Input().FromInts(slices.Concat(indexSlice...), grid.Height*grid.Width*4)
|
||||
weights := ctx.Input().FromFloats(slices.Concat(weightSlice...), 1, grid.Height*grid.Width*4)
|
||||
|
||||
n := hiddenStates.Dim(0)
|
||||
positionEmbeds := m.PositionEmbedding.Forward(ctx, indices)
|
||||
positionEmbeds = positionEmbeds.Mul(ctx, weights)
|
||||
positionEmbeds = positionEmbeds.Reshape(ctx, n, -1, 4)
|
||||
|
||||
positionEmbeds = positionEmbeds.View(ctx, 0, n, positionEmbeds.Stride(1), grid.Height*grid.Width).
|
||||
Add(ctx, positionEmbeds.View(ctx, 1*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
|
||||
Add(ctx, positionEmbeds.View(ctx, 2*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
|
||||
Add(ctx, positionEmbeds.View(ctx, 3*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width))
|
||||
|
||||
positionEmbeds = positionEmbeds.Reshape(ctx, -1, grid.Width/opts.spatialMergeSize, opts.spatialMergeSize, grid.Height/opts.spatialMergeSize)
|
||||
positionEmbeds = positionEmbeds.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, n, -1)
|
||||
return hiddenStates.Add(ctx, positionEmbeds)
|
||||
}
|
||||
|
||||
type VisionModel struct {
|
||||
PatchEmbedding *nn.Conv3D `gguf:"patch_embed"`
|
||||
PositionEmbedding *VisionPositionEmbedding
|
||||
Layers []VisionEncoderLayer `gguf:"blk"`
|
||||
PatchMerger *VisionPatchMerger `gguf:"merger"`
|
||||
DeepstackMerger []*VisionPatchMerger `gguf:"deepstack_merger"`
|
||||
|
||||
VisionOptions
|
||||
}
|
||||
|
||||
func (m *VisionModel) positions(ctx ml.Context, grid *Grid) (_, _ ml.Tensor) {
|
||||
indices := ctx.Input().FromInts(slices.Collect(func(yield func(int32) bool) {
|
||||
for y := range grid.Height {
|
||||
for x := range grid.Width {
|
||||
if !yield(int32(y)) {
|
||||
return
|
||||
}
|
||||
if !yield(int32(x)) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}), grid.Width*grid.Height*2)
|
||||
|
||||
indices = indices.Reshape(ctx, -1, grid.Width/m.spatialMergeSize, m.spatialMergeSize, grid.Height/m.spatialMergeSize)
|
||||
indices = indices.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
indices = indices.Reshape(ctx, -1)
|
||||
|
||||
halfDim := m.headDim() / 2
|
||||
maxGrid := max(grid.Height, grid.Width)
|
||||
frequencies := ctx.Input().FromFloats(slices.Collect(func(yield func(float32) bool) {
|
||||
ropeTheta := float64(m.ropeTheta)
|
||||
for i := range maxGrid {
|
||||
for j := range halfDim / 2 {
|
||||
if !yield(float32(i) / float32(math.Pow(ropeTheta, float64(j*2)/float64(halfDim)))) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}), halfDim/2, maxGrid)
|
||||
|
||||
embeds := frequencies.Rows(ctx, indices)
|
||||
embeds = embeds.Reshape(ctx, halfDim, 1, -1)
|
||||
embeds = embeds.Concat(ctx, embeds, 0)
|
||||
return embeds.Cos(ctx), embeds.Sin(ctx)
|
||||
}
|
||||
|
||||
// Forward computes the vision model for an input tensor
|
||||
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) (ml.Tensor, []ml.Tensor) {
|
||||
pixelValues = pixelValues.Reshape(ctx, m.patchSize, m.patchSize, m.temporalPatchSize, -1)
|
||||
hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.numChannels, m.patchSize, m.patchSize, m.temporalPatchSize, 0, 0, 0, 1, 1, 1)
|
||||
hiddenStates = m.PositionEmbedding.Forward(ctx, hiddenStates, grid, m.VisionOptions)
|
||||
|
||||
cos, sin := m.positions(ctx, grid)
|
||||
|
||||
deepstackStates := make([]ml.Tensor, len(m.deepstackVisualIndexes))
|
||||
for i, layer := range m.Layers {
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, m.VisionOptions)
|
||||
if i := slices.Index(m.deepstackVisualIndexes, int32(i)); i >= 0 {
|
||||
deepstackStates[i] = m.DeepstackMerger[i].Forward(ctx, hiddenStates, true, m.VisionOptions)
|
||||
}
|
||||
}
|
||||
|
||||
hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, false, m.VisionOptions)
|
||||
return hiddenStates, deepstackStates
|
||||
}
|
||||
|
||||
// newVisionModel creates a new instance of the Qwen vision model
|
||||
func newVisionModel(c fs.Config) *VisionModel {
|
||||
deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
|
||||
model := &VisionModel{
|
||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
|
||||
DeepstackMerger: make([]*VisionPatchMerger, len(deepstackVisualIndexes)),
|
||||
VisionOptions: VisionOptions{
|
||||
hiddenSize: int(c.Uint("vision.embedding_length", 1280)),
|
||||
numHeads: int(c.Uint("vision.attention.head_count", 16)),
|
||||
patchSize: int(c.Uint("vision.patch_size", 14)),
|
||||
numChannels: int(c.Uint("vision.num_channels", 3)),
|
||||
eps: c.Float("vision.attention.layer_norm_epsilon", 1e-6),
|
||||
ropeTheta: c.Float("vision.rope.freq_base", 10000.0),
|
||||
spatialMergeSize: int(c.Uint("vision.spatial_merge_size", 2)),
|
||||
temporalPatchSize: int(c.Uint("vision.temporal_patch_size", 2)),
|
||||
gridPerSide: int(math.Sqrt(float64(c.Uint("vision.num_positional_embeddings", 2304)))),
|
||||
mropeSections: slices.Collect(func(yield func(int) bool) {
|
||||
for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
|
||||
if !yield(int(section)) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
deepstackVisualIndexes: deepstackVisualIndexes,
|
||||
},
|
||||
}
|
||||
|
||||
return model
|
||||
}
|
||||
@@ -709,13 +709,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{
|
||||
embedding: true,
|
||||
truncate: req.Truncate,
|
||||
|
||||
// TODO (jmorganca): this should be provided by the server via the
|
||||
// request options and truncated here in the runner, instead of relying on
|
||||
// the server's truncate logic
|
||||
truncate: true,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, errorInputTooLong) {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
@@ -758,8 +758,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
|
||||
embedding := <-seq.embedding
|
||||
|
||||
if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
|
||||
Embedding: embedding,
|
||||
PromptEvalCount: seq.numPromptInputs,
|
||||
Embedding: embedding,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
@@ -235,15 +235,28 @@ func countCommonPrefix(a []*input.Input, b []*input.Input) int32 {
|
||||
return count
|
||||
}
|
||||
|
||||
// TODO(jessegross): If we need to reprocess the inputs we should ensure that
|
||||
// we don't split up a SameBatch
|
||||
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
|
||||
targetFree := (c.numCtx - numKeep) / 2
|
||||
targetFree = max(targetFree, 1)
|
||||
// ShiftDiscard computes how many inputs can be discarded from the cache. Inputs in the same batch
|
||||
// are discarded together.
|
||||
func (c *InputCache) ShiftDiscard(inputs []*input.Input, numKeep int32) int32 {
|
||||
targetFree := max((c.numCtx-numKeep)/2, 1)
|
||||
currentFree := c.numCtx - int32(len(inputs))
|
||||
|
||||
currentFree := c.numCtx - inputLen
|
||||
var discard, sameBatch int32
|
||||
for _, input := range inputs[numKeep:] {
|
||||
if sameBatch <= 0 && currentFree >= targetFree {
|
||||
break
|
||||
}
|
||||
|
||||
return max(targetFree-currentFree, 0)
|
||||
sameBatch--
|
||||
currentFree++
|
||||
discard++
|
||||
|
||||
if input.SameBatch > 0 {
|
||||
sameBatch = int32(input.SameBatch)
|
||||
}
|
||||
}
|
||||
|
||||
return discard
|
||||
}
|
||||
|
||||
type ErrReprocessInputs struct {
|
||||
@@ -264,7 +277,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
|
||||
}
|
||||
|
||||
inputLen := int32(len(slot.Inputs))
|
||||
discard := c.ShiftDiscard(inputLen, numKeep)
|
||||
discard := c.ShiftDiscard(slot.Inputs, numKeep)
|
||||
|
||||
if discard <= 0 {
|
||||
return nil
|
||||
|
||||
@@ -3,6 +3,7 @@ package ollamarunner
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -238,59 +239,137 @@ func TestShiftDiscard(t *testing.T) {
|
||||
name string
|
||||
numCtx int32
|
||||
numKeep int32
|
||||
inputLen int32
|
||||
inputs []*input.Input
|
||||
expected int32
|
||||
}{
|
||||
{
|
||||
name: "Shift",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputLen: 2048,
|
||||
inputs: slices.Repeat([]*input.Input{{}}, 2048),
|
||||
expected: 1021,
|
||||
},
|
||||
{
|
||||
name: "Max Keep",
|
||||
numCtx: 2048,
|
||||
numKeep: 2047,
|
||||
inputLen: 2048,
|
||||
inputs: slices.Repeat([]*input.Input{{}}, 2048),
|
||||
expected: 1,
|
||||
},
|
||||
{
|
||||
name: "No Keep",
|
||||
numCtx: 2048,
|
||||
numKeep: 0,
|
||||
inputLen: 2048,
|
||||
inputs: slices.Repeat([]*input.Input{{}}, 2048),
|
||||
expected: 1024,
|
||||
},
|
||||
{
|
||||
name: "Truncate",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputLen: 5000,
|
||||
inputs: slices.Repeat([]*input.Input{{}}, 5000),
|
||||
expected: 3973,
|
||||
},
|
||||
{
|
||||
name: "Truncate Keep",
|
||||
numCtx: 2048,
|
||||
numKeep: 2047,
|
||||
inputLen: 5000,
|
||||
inputs: slices.Repeat([]*input.Input{{}}, 5000),
|
||||
expected: 2953,
|
||||
},
|
||||
{
|
||||
name: "No Op",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputLen: 512,
|
||||
inputs: slices.Repeat([]*input.Input{{}}, 512),
|
||||
expected: 0,
|
||||
},
|
||||
{
|
||||
name: "Same Batch",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputs: slices.Collect(func(yield func(*input.Input) bool) {
|
||||
for range 1024 {
|
||||
if !yield(&input.Input{}) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if !yield(&input.Input{SameBatch: 512 - 1}) {
|
||||
return
|
||||
}
|
||||
|
||||
for range 2048 - 1024 - 1 {
|
||||
if !yield(&input.Input{}) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
expected: 1531,
|
||||
},
|
||||
{
|
||||
name: "Same Batch Near Start",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputs: slices.Collect(func(yield func(*input.Input) bool) {
|
||||
for range 10 {
|
||||
if !yield(&input.Input{}) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if !yield(&input.Input{SameBatch: 512 - 1}) {
|
||||
return
|
||||
}
|
||||
|
||||
for range 2048 - 10 - 1 {
|
||||
if !yield(&input.Input{}) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
expected: 1021,
|
||||
},
|
||||
{
|
||||
name: "Consecutive Same Batch",
|
||||
numCtx: 32,
|
||||
inputs: slices.Collect(func(yield func(*input.Input) bool) {
|
||||
for i := range 32 {
|
||||
input := input.Input{}
|
||||
if i%10 == 0 {
|
||||
input.SameBatch = 10 - 1
|
||||
}
|
||||
if !yield(&input) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
expected: 20,
|
||||
},
|
||||
{
|
||||
name: "Overlapping Same Batch",
|
||||
numCtx: 32,
|
||||
inputs: slices.Collect(func(yield func(*input.Input) bool) {
|
||||
for i := range 32 {
|
||||
input := input.Input{}
|
||||
if slices.Contains([]int{4, 8, 14}, i) {
|
||||
input.SameBatch = 10 - 1
|
||||
}
|
||||
if !yield(&input) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
expected: 24,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
c := InputCache{numCtx: tt.numCtx}
|
||||
result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
|
||||
result := c.ShiftDiscard(tt.inputs, tt.numKeep)
|
||||
if result != tt.expected {
|
||||
t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
|
||||
t.Errorf("shiftDiscard(ctx: %v, keep: %v inputs: %v): have %v; want %v", tt.numCtx, tt.numKeep, len(tt.inputs), result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -214,7 +214,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
|
||||
parts = []string{prompt}
|
||||
}
|
||||
|
||||
postTokenize := false
|
||||
for i, part := range parts {
|
||||
// text - tokenize
|
||||
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
|
||||
@@ -257,11 +256,10 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
|
||||
mmStore.addMultimodal(imageEmbeddings)
|
||||
|
||||
inputs = append(inputs, &input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
|
||||
postTokenize = true
|
||||
}
|
||||
}
|
||||
|
||||
if visionModel && postTokenize {
|
||||
if visionModel {
|
||||
var err error
|
||||
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
||||
if err != nil {
|
||||
@@ -948,13 +946,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{
|
||||
embedding: true,
|
||||
truncate: req.Truncate,
|
||||
|
||||
// TODO (jmorganca): this should be provided by the server via the
|
||||
// request options and truncated here in the runner, instead of relying on
|
||||
// the server's truncate logic
|
||||
truncate: true,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, errorInputTooLong) {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
http.Error(w, fmt.Sprintf("failed to create new sequence: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
@@ -995,8 +993,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
|
||||
Embedding: <-seq.embedding,
|
||||
PromptEvalCount: seq.numPromptInputs,
|
||||
Embedding: <-seq.embedding,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
@@ -21,7 +21,6 @@ import (
|
||||
"os/signal"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -143,7 +142,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
|
||||
|
||||
// This model is much more capable with a larger context, so set that
|
||||
// unless it would penalize performance too much
|
||||
if !s.lowVRAM && slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
|
||||
if !s.lowVRAM && slices.Contains([]string{
|
||||
"gptoss", "gpt-oss",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
}, model.Config.ModelFamily) {
|
||||
opts.NumCtx = max(opts.NumCtx, 8192)
|
||||
}
|
||||
|
||||
@@ -660,7 +662,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
r, _, _, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
|
||||
r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), []model.Capability{}, req.Options, req.KeepAlive)
|
||||
if err != nil {
|
||||
handleScheduleError(c, req.Model, err)
|
||||
return
|
||||
@@ -673,12 +675,61 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
kvData, _, err := getModelData(m.ModelPath, false)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
var count int
|
||||
for i, s := range input {
|
||||
tokens, err := r.Tokenize(c.Request.Context(), s)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
ctxLen := min(opts.NumCtx, int(kvData.ContextLength()))
|
||||
if len(tokens) > ctxLen {
|
||||
if !truncate {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "input exceeds maximum context length"})
|
||||
return
|
||||
}
|
||||
|
||||
if bos := kvData.Uint("tokenizer.ggml.bos_token_id"); tokens[0] != int(bos) && kvData.Bool("add_bos_token", true) {
|
||||
ctxLen--
|
||||
}
|
||||
|
||||
if eos := kvData.Uint("tokenizer.ggml.eos_token_id"); tokens[len(tokens)-1] != int(eos) && kvData.Bool("add_eos_token", true) {
|
||||
ctxLen--
|
||||
}
|
||||
|
||||
slog.Info("", "ctxLen", ctxLen, "tokenCount", len(tokens))
|
||||
if ctxLen <= 0 {
|
||||
// return error if the truncated input would be empty or just special tokens
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "input after truncation exceeds maximum context length"})
|
||||
return
|
||||
}
|
||||
|
||||
tokens = tokens[:ctxLen]
|
||||
|
||||
s, err = r.Detokenize(c.Request.Context(), tokens)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
count += len(tokens)
|
||||
|
||||
input[i] = s
|
||||
}
|
||||
|
||||
var g errgroup.Group
|
||||
embeddings := make([][]float32, len(input))
|
||||
var totalTokens uint64
|
||||
for i, text := range input {
|
||||
g.Go(func() error {
|
||||
embedding, tokenCount, err := r.Embedding(c.Request.Context(), text, truncate)
|
||||
embedding, err := r.Embedding(c.Request.Context(), text)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -688,18 +739,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
embedding = normalize(embedding[:req.Dimensions])
|
||||
}
|
||||
embeddings[i] = embedding
|
||||
atomic.AddUint64(&totalTokens, uint64(tokenCount))
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := g.Wait(); err != nil {
|
||||
var serr api.StatusError
|
||||
if errors.As(err, &serr) {
|
||||
c.AbortWithStatusJSON(serr.StatusCode, gin.H{"error": strings.TrimSpace(serr.ErrorMessage)})
|
||||
} else {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
|
||||
}
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -708,7 +753,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||
Embeddings: embeddings,
|
||||
TotalDuration: time.Since(checkpointStart),
|
||||
LoadDuration: checkpointLoaded.Sub(checkpointStart),
|
||||
PromptEvalCount: int(totalTokens),
|
||||
PromptEvalCount: count,
|
||||
}
|
||||
c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
@@ -754,7 +799,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
embedding, _, err := r.Embedding(c.Request.Context(), req.Prompt, true)
|
||||
embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
|
||||
return
|
||||
|
||||
@@ -390,11 +390,11 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
|
||||
numParallel = 1
|
||||
}
|
||||
|
||||
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
|
||||
// `mllama`, `qwen3vl`, and `qwen3vlmoe` are snowflakes and uses an encoder cache which cannot be used with num_parallel > 1
|
||||
// ref: https://github.com/ollama/ollama/issues/4165
|
||||
if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
|
||||
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe"}, req.model.Config.ModelFamily) && numParallel != 1 {
|
||||
numParallel = 1
|
||||
slog.Warn("mllama does not currently support parallel requests")
|
||||
slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
|
||||
}
|
||||
|
||||
sessionDuration := envconfig.KeepAlive()
|
||||
|
||||
@@ -780,8 +780,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn
|
||||
return s.completionResp
|
||||
}
|
||||
|
||||
func (s *mockLlm) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
|
||||
return s.embeddingResp, 0, s.embeddingRespErr
|
||||
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
|
||||
return s.embeddingResp, s.embeddingRespErr
|
||||
}
|
||||
|
||||
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
|
||||
Reference in New Issue
Block a user