diff --git a/llm/server.go b/llm/server.go
index c83bd5a40..578360210 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -146,6 +146,16 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
 		if len(projectors) == 0 {
 			textProcessor, err = model.NewTextProcessor(modelPath)
+		} else if len(projectors) == 1 {
+			var canMerge bool
+			canMerge, err = model.CanMergeProjector(modelPath)
+			if err == nil {
+				if !canMerge {
+					err = errors.New("split vision models aren't supported")
+				} else {
+					textProcessor, err = model.NewTextProcessor(modelPath)
+				}
+			}
 		} else {
 			err = errors.New("split vision models aren't supported")
 		}
@@ -479,10 +489,10 @@ type LoadRequest struct {
 	GPULayers      ml.GPULayersList
 	MultiUserCache bool
 
-	// Legacy fields - not used with the Ollama engine
 	ProjectorPath string
-	MainGPU       int
-	UseMmap       bool
+	// Legacy fields - not used with the Ollama engine
+	MainGPU int
+	UseMmap bool
 }
 
 type LoadResponse struct {
diff --git a/model/model.go b/model/model.go
index 0af16da80..ff3fbf882 100644
--- a/model/model.go
+++ b/model/model.go
@@ -37,6 +37,9 @@ type Model interface {
 
 	Backend() ml.Backend
 	Config() config
+
+	PostPopulate()
+	IsOnlineProjectorMergingSupported() bool
 }
 
 // MultimodalProcessor must be implemented by multimodal models.
@@ -90,6 +93,16 @@ func (m *Base) Config() config {
 	return m.config
 }
 
+func (m *Base) PostPopulate() {
+	// Stub. This method can be used for redirecting tensors that
+	// have been renamed by llama.cpp's convert_hf_to_gguf.py,
+	// or for any other model-specific logic.
+}
+
+func (m *Base) IsOnlineProjectorMergingSupported() bool {
+	return false
+}
+
 var models = make(map[string]func(fs.Config) (Model, error))
 
 // Register registers a model constructor for the given architecture
@@ -115,7 +128,8 @@ func New(modelPath string, params ml.BackendParams) (Model, error) {
 
 	base := Base{b: b, config: m.Config()}
 	v := reflect.ValueOf(m)
-	v.Elem().Set(populateFields(base, v.Elem()))
+	v.Elem().Set(PopulateFields(base, v.Elem()))
+	m.PostPopulate()
 
 	return m, nil
 }
@@ -143,6 +157,25 @@ func NewTextProcessor(s string) (TextProcessor, error) {
 	return tp, nil
 }
 
+func CanMergeProjector(s string) (bool, error) {
+	r, err := os.Open(s)
+	if err != nil {
+		return false, err
+	}
+	defer r.Close()
+
+	meta, err := fsggml.Decode(r, -1)
+	if err != nil {
+		return false, err
+	}
+
+	m, err := modelForArch(meta.KV())
+	if err != nil {
+		return false, err
+	}
+	return m.IsOnlineProjectorMergingSupported(), nil
+}
+
 func modelForArch(c fs.Config) (Model, error) {
 	arch := c.Architecture()
 	if pooling.Type(c.Uint("pooling_type")) != pooling.TypeNone {
@@ -157,7 +190,7 @@ func modelForArch(c fs.Config) (Model, error) {
 	return f(c)
 }
 
-func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
+func PopulateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 	t := v.Type()
 
 	if t.Kind() == reflect.Struct {
@@ -172,7 +205,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 			// make a copy
 			tagsCopy := tags
 			if tag := t.Field(i).Tag.Get("gguf"); tag != "" {
-				tagsCopy = append(tagsCopy, parseTag(tag))
+				tagsCopy = append(tagsCopy, ParseTag(tag))
 			}
 
 			if tt == reflect.TypeOf((*Base)(nil)).Elem() {
@@ -194,13 +227,23 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 				} else if len(childNames) == 0 {
 					// current tag has names but no children, create branches for each name
 					for _, name := range names {
-						fullNames = append(fullNames, []string{name})
+						if name == "" {
+							// if an empty alternate name exists, do not add it to the list,
+							// otherwise joining the parts would create double dots in the name
+							fullNames = append(fullNames, []string{})
+						} else {
+							fullNames = append(fullNames, []string{name})
+						}
 					}
 				} else {
 					// merge each name with each child
 					for _, name := range names {
 						for _, childName := range childNames {
-							fullNames = append(fullNames, append([]string{name}, childName...))
+							if name == "" {
+								fullNames = append(fullNames, childName)
+							} else {
+								fullNames = append(fullNames, append([]string{name}, childName...))
+							}
 						}
 					}
 				}
@@ -218,14 +261,14 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 					}
 				}
 			}
 		} else if tt.Kind() == reflect.Pointer || tt.Kind() == reflect.Interface {
-			setPointer(base, vv, tagsCopy)
+			SetPointer(base, vv, tagsCopy)
 		} else if tt.Kind() == reflect.Slice || tt.Kind() == reflect.Array {
 			for i := range vv.Len() {
 				vvv := vv.Index(i)
 				if vvv.Kind() == reflect.Pointer || vvv.Kind() == reflect.Interface {
-					setPointer(base, vvv, append(tagsCopy, Tag{name: strconv.Itoa(i)}))
+					SetPointer(base, vvv, append(tagsCopy, Tag{name: strconv.Itoa(i)}))
 				} else {
-					vvv.Set(populateFields(base, vvv, append(tagsCopy, Tag{name: strconv.Itoa(i)})...))
+					vvv.Set(PopulateFields(base, vvv, append(tagsCopy, Tag{name: strconv.Itoa(i)})...))
 				}
 			}
 		}
@@ -243,7 +286,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 	return v
 }
 
-func setPointer(base Base, v reflect.Value, tags []Tag) {
+func SetPointer(base Base, v reflect.Value, tags []Tag) {
 	vv := v
 	if v.Kind() == reflect.Interface {
 		if v.IsNil() {
@@ -258,7 +301,7 @@ func setPointer(base Base, v reflect.Value, tags []Tag) {
 		vv = reflect.New(v.Type().Elem()).Elem()
 	}
 
-	if f := populateFields(base, vv, tags...); f.CanAddr() {
+	if f := PopulateFields(base, vv, tags...); f.CanAddr() {
 		v.Set(f.Addr())
 	}
 }
@@ -271,7 +314,7 @@ type Tag struct {
 	alternatives []string
 }
 
-func parseTag(s string) (tag Tag) {
+func ParseTag(s string) (tag Tag) {
 	parts := strings.Split(s, ",")
 	if len(parts) > 0 {
 		tag.name = parts[0]
diff --git a/model/model_test.go b/model/model_test.go
index f6d75b230..71b45de58 100644
--- a/model/model_test.go
+++ b/model/model_test.go
@@ -38,7 +38,7 @@ func TestParseTags(t *testing.T) {
 
 	for _, tt := range cases {
 		t.Run(tt.value, func(t *testing.T) {
-			got := parseTag(tt.value)
+			got := ParseTag(tt.value)
 			if diff := cmp.Diff(tt.want, got, cmp.AllowUnexported((Tag{}))); diff != "" {
 				t.Errorf("ParseTags() returned unexpected values (-want +got):\n%s", diff)
 			}
@@ -81,7 +81,7 @@ func TestPopulateFields(t *testing.T) {
 
 	var m fakeModel
 	v := reflect.ValueOf(&m)
-	v.Elem().Set(populateFields(Base{b: &fakeBackend{
+	v.Elem().Set(PopulateFields(Base{b: &fakeBackend{
 		names: []string{
 			"input.weight",
 			"blk.0.attn_q.weight",
@@ -130,7 +130,7 @@ func TestPopulateFieldsAlternateName(t *testing.T) {
 
 	var m fakeModel
 	v := reflect.ValueOf(&m)
-	v.Elem().Set(populateFields(Base{b: &fakeBackend{
+	v.Elem().Set(PopulateFields(Base{b: &fakeBackend{
 		names: []string{
 			"input.weight",
 			"nested.b.weight",
@@ -166,7 +166,7 @@ func TestPopulateFieldsPrefixSuffixName(t *testing.T) {
 		Blocks: make([]fakeBlock, 2),
 	}
 	v := reflect.ValueOf(&m)
-	v.Elem().Set(populateFields(Base{b: &fakeBackend{
+	v.Elem().Set(PopulateFields(Base{b: &fakeBackend{
 		names: []string{
 			"blk.0.a.weight",
 			"blk.0.b_weight",
diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go
index e595f1863..1ae76651f 100644
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@@ -164,6 +164,10 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	return hiddenState, nil
 }
 
+func (m *Model) IsOnlineProjectorMergingSupported() bool {
+	return true
+}
+
 func init() {
 	model.Register("gemma3", New)
 	model.Register("gemma3_embed", newEmbedModel)
diff --git a/model/models/gemma3/model_vision.go b/model/models/gemma3/model_vision.go
index 8b1a8eb00..8cb6ac6d4 100644
--- a/model/models/gemma3/model_vision.go
+++ b/model/models/gemma3/model_vision.go
@@ -76,9 +76,9 @@ type VisionModelOptions struct {
 }
 
 type VisionModel struct {
-	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embedding"`
-	PositionEmbedding *nn.Embedding `gguf:"position_embedding"`
-	PostLayerNorm     *nn.LayerNorm `gguf:"post_layernorm"`
+	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embedding,alt:patch_embd"`
+	PositionEmbedding *nn.Embedding `gguf:"position_embedding,alt:position_embd"`
+	PostLayerNorm     *nn.LayerNorm `gguf:"post_layernorm,alt:post_ln"`
 
 	Layers []VisionEncoderLayer `gguf:"blk"`
 
diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go
index 8230dde39..f14186ef8 100644
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -59,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
 }
 
 type PatchMerger struct {
-	MergingLayer *nn.Linear `gguf:"merging_layer"`
+	MergingLayer *nn.Linear `gguf:"merging_layer,alt:"`
 }
 
 func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, size image.Point, spatialMergeSize int) ml.Tensor {
@@ -72,9 +72,9 @@ func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, size ima
 }
 
 type MultiModalProjector struct {
-	Norm    *nn.RMSNorm `gguf:"norm"`
-	Linear1 *nn.Linear  `gguf:"linear_1"`
-	Linear2 *nn.Linear  `gguf:"linear_2"`
+	Norm    *nn.RMSNorm `gguf:"norm,alt:input_norm"`
+	Linear1 *nn.Linear  `gguf:"linear_1,alt:1"`
+	Linear2 *nn.Linear  `gguf:"linear_2,alt:2"`
 
 	PatchMerger *PatchMerger `gguf:"patch_merger"`
 
 	spatialMergeSize int
@@ -164,6 +164,10 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, positionsScale, batch.Outputs, batch, m.Cache), nil
 }
 
+func (m *Model) IsOnlineProjectorMergingSupported() bool {
+	return true
+}
+
 func init() {
 	model.Register("mistral3", New)
 }
diff --git a/model/models/mistral3/model_vision.go b/model/models/mistral3/model_vision.go
index 1de0412d5..d4750fac1 100644
--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -87,8 +87,8 @@ type VisionModelOptions struct {
 }
 
 type VisionModel struct {
-	PatchEmbedding *nn.Conv2D  `gguf:"patch_conv"`
-	EncoderNorm    *nn.RMSNorm `gguf:"encoder_norm"`
+	PatchEmbedding *nn.Conv2D  `gguf:"patch_conv,alt:patch_embd"`
+	EncoderNorm    *nn.RMSNorm `gguf:"encoder_norm,alt:pre_ln"`
 	Layers         []VisionEncoderLayer `gguf:"blk"`
 
 	*VisionModelOptions
diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index 81296a81b..c1bb8fe2b 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -3,6 +3,7 @@ package qwen25vl
 import (
 	"bytes"
 	"image"
+	"reflect"
 	"slices"
 
 	"github.com/ollama/ollama/fs"
@@ -190,6 +191,28 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	return m.Output.Forward(ctx, hiddenStates), nil
 }
 
+func (m *Model) PostPopulate() {
+	if m.VisionModel.PatchMerger.MLP0.Weight == nil {
+		if tensor := m.Base.Backend().Get("mm.0.weight"); tensor != nil {
+			model.SetPointer(m.Base, reflect.ValueOf(m.VisionModel.PatchMerger.MLP0), []model.Tag{model.ParseTag("mm.0")})
+			model.SetPointer(m.Base, reflect.ValueOf(m.VisionModel.PatchMerger.MLP2), []model.Tag{model.ParseTag("mm.2")})
+			model.SetPointer(m.Base, reflect.ValueOf(m.VisionModel.PatchMerger.LNQ), []model.Tag{model.ParseTag("v.post_ln")})
+		}
+	}
+	if m.VisionModel.PatchEmbedding.PatchConv0.Weight == nil {
+		if tensor := m.Base.Backend().Get("v.patch_embd.weight"); tensor != nil {
+			m.VisionModel.PatchEmbedding.PatchConv0.Weight = tensor
+		}
+		if tensor := m.Base.Backend().Get("v.patch_embd.weight.1"); tensor != nil {
+			m.VisionModel.PatchEmbedding.PatchConv1.Weight = tensor
+		}
+	}
+}
+
+func (m *Model) IsOnlineProjectorMergingSupported() bool {
+	return true
+}
+
 func init() {
 	model.Register("qwen25vl", New)
 }
diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go
index cb1ce8d2c..db10bf85c 100644
--- a/model/models/qwen3vl/model.go
+++ b/model/models/qwen3vl/model.go
@@ -2,7 +2,9 @@ package qwen3vl
 
 import (
 	"bytes"
+	"fmt"
 	"image"
+	"reflect"
 	"slices"
 
 	"github.com/ollama/ollama/fs"
@@ -170,6 +172,27 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	return m.Output.Forward(ctx, hiddenStates), nil
 }
 
+func (m *Model) PostPopulate() {
+	if m.VisionModel.PatchMerger.FC1.Weight == nil {
+		if tensor := m.Base.Backend().Get("mm.0.weight"); tensor != nil {
+			model.SetPointer(m.Base, reflect.ValueOf(m.VisionModel.PatchMerger.FC1), []model.Tag{model.ParseTag("mm.0")})
+			model.SetPointer(m.Base, reflect.ValueOf(m.VisionModel.PatchMerger.FC2), []model.Tag{model.ParseTag("mm.2")})
+			model.SetPointer(m.Base, reflect.ValueOf(m.VisionModel.PatchMerger.Norm), []model.Tag{model.ParseTag("v.post_ln")})
+		}
+	}
+	for i, deepstacks := range m.VisionModel.DeepstackMerger {
+		if deepstacks.FC1.Weight == nil {
+			if tensor := m.Base.Backend().Get(fmt.Sprintf("v.deepstack.%d.weight", m.VisionModel.deepstackVisualIndexes[i])); tensor != nil {
+				model.SetPointer(m.Base, reflect.ValueOf(deepstacks), []model.Tag{model.ParseTag(fmt.Sprintf("v.deepstack.%d", m.VisionModel.deepstackVisualIndexes[i]))})
+			}
+		}
+	}
+}
+
+func (m *Model) IsOnlineProjectorMergingSupported() bool {
+	return true
+}
+
 func New(c fs.Config) (model.Model, error) {
 	m := Model{
 		TextProcessor: model.NewBytePairEncoding(
diff --git a/model/models/qwen3vl/model_vision.go b/model/models/qwen3vl/model_vision.go
index 761281edc..76004d551 100644
--- a/model/models/qwen3vl/model_vision.go
+++ b/model/models/qwen3vl/model_vision.go
@@ -94,8 +94,8 @@ func (o VisionOptions) headDim() int {
 
 type VisionPatchMerger struct {
 	Norm *nn.LayerNorm `gguf:"norm"`
-	FC1  *nn.Linear    `gguf:"linear_fc1"`
-	FC2  *nn.Linear    `gguf:"linear_fc2"`
+	FC1  *nn.Linear    `gguf:"linear_fc1,alt:fc.fc1"`
+	FC2  *nn.Linear    `gguf:"linear_fc2,alt:fc.fc2"`
 }
 
 func (m *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, postshuffleNorm bool, opts VisionOptions) ml.Tensor {
@@ -241,6 +241,13 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
 // newVisionModel creates a new instance of the Qwen vision model
 func newVisionModel(c fs.Config) *VisionModel {
 	deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
+	if deepstackVisualIndexes == nil && c.Bools("vision.is_deepstack_layers") != nil {
+		for i, flag := range c.Bools("vision.is_deepstack_layers") {
+			if flag {
+				deepstackVisualIndexes = append(deepstackVisualIndexes, int32(i))
+			}
+		}
+	}
 	model := &VisionModel{
 		Layers:          make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
 		DeepstackMerger: make([]*VisionPatchMerger, len(deepstackVisualIndexes)),
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index a756cba23..a7e24302f 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -1171,6 +1171,7 @@ func (s *Server) allocModel(
 	mpath string,
 	params ml.BackendParams,
 	loraPath []string,
+	projectorPath string,
 	parallel int,
 	kvCacheType string,
 	kvSize int,
@@ -1302,7 +1303,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
 
 	s.batchSize = req.BatchSize
 
-	err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
+	err := s.allocModel(s.modelPath, params, req.LoraPath, req.ProjectorPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
 	if err != nil {
 		s.closeModel()
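For reference, a minimal, hypothetical sketch (not part of the diff) of how the new projector-merge check composes for a caller: `loadTextProcessor` is an illustrative helper that mirrors the NewLlamaServer hunk in llm/server.go, using only the exported model.CanMergeProjector and model.NewTextProcessor shown above; the paths in main are placeholders.

```go
package main

import (
	"errors"
	"fmt"

	"github.com/ollama/ollama/model"
)

// loadTextProcessor is a hypothetical helper mirroring the NewLlamaServer hunk:
// with no projector the text processor loads directly; with a single projector
// file the model loads only if its architecture reports that online projector
// merging is supported; anything else is rejected as a split vision model.
func loadTextProcessor(modelPath string, projectors []string) (model.TextProcessor, error) {
	switch {
	case len(projectors) == 0:
		return model.NewTextProcessor(modelPath)
	case len(projectors) == 1:
		canMerge, err := model.CanMergeProjector(modelPath)
		if err != nil {
			return nil, err
		}
		if !canMerge {
			return nil, errors.New("split vision models aren't supported")
		}
		return model.NewTextProcessor(modelPath)
	default:
		return nil, errors.New("split vision models aren't supported")
	}
}

func main() {
	// Placeholder paths; substitute real GGUF files when experimenting.
	tp, err := loadTextProcessor("model.gguf", []string{"projector.gguf"})
	if err != nil {
		fmt.Println("load failed:", err)
		return
	}
	fmt.Printf("text processor ready: %T\n", tp)
}
```

The fallback behavior is unchanged by the diff: architectures that do not override IsOnlineProjectorMergingSupported inherit the Base stub, which returns false, so they still fail with the same "split vision models aren't supported" error as before.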