diff --git a/model/models/models.go b/model/models/models.go
index c880a4720..650211dfd 100644
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -12,4 +12,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
+	_ "github.com/ollama/ollama/model/models/qwen3vl"
 )
diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index d73f499d2..fd5cc5381 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -44,8 +44,8 @@ func New(c fs.Config) (model.Model, error) {
 			},
 		),
 		TextModel:      NewTextModel(c),
-		VisionModel:    newVisionModel(c),
-		ImageProcessor: newImageProcessor(c),
+		VisionModel:    NewVisionModel(c),
+		ImageProcessor: NewImageProcessor(c),
 	}
 
 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -65,8 +65,8 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 	}
 
 	// Calculate tensor dimensions
-	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+	patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+		m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width
 
 	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go
index 4d7afaa14..e6a622ab0 100644
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -345,8 +345,8 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 	return positionalEmbedding
 }
 
-// newVisionModel creates a new instance of the Qwen vision model
-func newVisionModel(c fs.Config) *VisionModel {
+// NewVisionModel creates a new instance of the Qwen vision model
+func NewVisionModel(c fs.Config) *VisionModel {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
 	numHeads := int(c.Uint("vision.attention.head_count", 16))
diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go
index dc91bdea5..bd52c6e04 100644
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@@ -11,40 +11,40 @@ import (
 
 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
-	numChannels       int
-	patchSize         int
-	temporalPatchSize int
-	mergeSize         int
-	minPixels         int
-	maxPixels         int
-	factor            int
-	rescaleFactor     float32
-	imageMean         []float32
-	imageStd          []float32
+	NumChannels       int
+	PatchSize         int
+	TemporalPatchSize int
+	MergeSize         int
+	MinPixels         int
+	MaxPixels         int
+	Factor            int
+	RescaleFactor     float32
+	ImageMean         []float32
+	ImageStd          []float32
 }
 
 // newImageProcessor creates a new image processor with default values
-func newImageProcessor(c fs.Config) ImageProcessor {
+func NewImageProcessor(c fs.Config) ImageProcessor {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
 
 	return ImageProcessor{
-		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
-		patchSize:         patchSize,
-		temporalPatchSize: 2,
-		mergeSize:         mergeSize,
-		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
-		factor:            patchSize * mergeSize,
-		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
+		NumChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		PatchSize:         patchSize,
+		TemporalPatchSize: 2,
+		MergeSize:         mergeSize,
+		MinPixels:         56 * 56,
+		MaxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		Factor:            patchSize * mergeSize,
+		RescaleFactor:     1.0 / 255.0,
+		ImageMean:         imageproc.ClipDefaultMean[:],
+		ImageStd:          imageproc.ClipDefaultSTD[:],
 	}
 }
 
 // SmartResize implements the smart resize algorithm
 func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-	factor := p.factor
+	factor := p.Factor
 
 	if height < factor || width < factor {
 		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
@@ -57,13 +57,13 @@ func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
 	hBar := round(float64(height)/float64(factor)) * factor
 	wBar := round(float64(width)/float64(factor)) * factor
 
-	if hBar*wBar > p.maxPixels {
-		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+	if hBar*wBar > p.MaxPixels {
+		beta := math.Sqrt(float64(height*width) / float64(p.MaxPixels))
 		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
 		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-	} else if hBar*wBar < p.minPixels {
-		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+	} else if hBar*wBar < p.MinPixels {
+		beta := math.Sqrt(float64(p.MinPixels) / float64(height*width))
 		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
 		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
 	}
 
@@ -90,16 +90,16 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 
 	normalizedPixels := imageproc.Normalize(
 		resizedImg,
-		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
-		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		[3]float32{p.ImageMean[0], p.ImageMean[1], p.ImageMean[2]},
+		[3]float32{p.ImageStd[0], p.ImageStd[1], p.ImageStd[2]},
 		true, // rescale
 		true, // channelFirst
 	)
 
 	// Calculate grid dimensions
 	grid := &Grid{
-		Height:   resizedHeight / p.patchSize,
-		Width:    resizedWidth / p.patchSize,
+		Height:   resizedHeight / p.PatchSize,
+		Width:    resizedWidth / p.PatchSize,
 		Temporal: 1, // For single images, temporal dimension is 1
 	}
 
@@ -113,10 +113,10 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 }
 
 func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-	channels := p.numChannels
-	patchSize := p.patchSize
-	mergeSize := p.mergeSize
-	temporalPatchSize := p.temporalPatchSize
+	channels := p.NumChannels
+	patchSize := p.PatchSize
+	mergeSize := p.MergeSize
+	temporalPatchSize := p.TemporalPatchSize
 
 	// Calculate output dimensions
 	numPatches := grid.Temporal * grid.Height * grid.Width
diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go
new file mode 100644
index 000000000..630604b02
--- /dev/null
+++ b/model/models/qwen3vl/model.go
@@ -0,0 +1,153 @@
+package qwen3vl
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+	"github.com/ollama/ollama/model/models/qwen25vl"
+	"github.com/ollama/ollama/model/models/qwen3"
+)
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TextModel *qwen3.Model
+	*qwen25vl.VisionModel
+
+	qwen25vl.ImageProcessor
+}
+
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+func New(c fs.Config) (model.Model, error) {
+	textModel, err := qwen3.New(c)
+	if err != nil {
+		return nil, err
+	}
+
+	m := &Model{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		TextModel:      textModel.(*qwen3.Model),
+		VisionModel:    qwen25vl.NewVisionModel(c),
+		ImageProcessor: qwen25vl.NewImageProcessor(c),
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *qwen25vl.Grid, error) {
+	image, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, nil, err
+	}
+
+	f32s, grid, err := m.ImageProcessor.ProcessImage(image)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Calculate tensor dimensions
+	patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+		m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+
+	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+
+	return pixelValues, grid, nil
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	pixels, grid, err := m.PixelValues(ctx, multimodalData)
+	if err != nil {
+		return nil, err
+	}
+
+	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
+}
+
+// PostTokenize arranges Qwen-3-VL's inputs for the forward pass
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+	var result []*input.Input
+
+	var (
+		imageToken       int32 = 151655
+		visionStartToken int32 = 151652
+		visionEndToken   int32 = 151653
+	)
+
+	nImg := 0
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			// If not a multimodal input, add it to the result unchanged
+			result = append(result, inp)
+		} else {
+			// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
+			// the image tokens with a prompt, so we add a prefix here
+			nImg++
+			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			if err != nil {
+				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
+			}
+			for i := range pre {
+				result = append(result, &input.Input{Token: pre[i]})
+			}
+
+			patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
+
+			// First add the vision start token
+			result = append(result, &input.Input{Token: visionStartToken})
+
+			// Add the image token with the multimodal tensor data at the first position
+			result = append(result, &input.Input{
+				Token:          imageToken,
+				Multimodal:     inp.Multimodal,
+				MultimodalHash: inp.MultimodalHash,
+				SameBatch:      patchesPerChunk,
+			})
+
+			// Add the placeholder tokens for the remaining positions (patchesPerChunk-1)
+			result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
+
+			result = append(result, &input.Input{Token: visionEndToken})
+		}
+	}
+
+	return result, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	return m.TextModel.Forward(ctx, batch)
+}
+
+func init() {
+	model.Register("qwen3vl", New)
+}