wip

vision conversion
image processing
2025-03-21 13:17:13 -07:00 · 2025-03-20 15:37:21 -07:00 · 2025-03-20 15:15:04 -07:00 · 2025-03-20 12:54:20 -07:00 · 2025-03-20 12:47:42 -07:00 · 2025-03-20 12:44:02 -07:00
17 changed files with 1227508 additions and 247 deletions
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -182,10 +182,10 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {

 	var conv ModelConverter
 	switch p.Architectures[0] {
-	case "LlamaForCausalLM", "MistralForCausalLM":
+	case "LlamaForCausalLM":
 		conv = &llamaModel{}
 	case "Mistral3ForConditionalGeneration":
-		conv = &mistralModel{}
+		conv = &mistral3Model{}
 	case "MixtralForCausalLM":
 		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
@@ -248,5 +248,10 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}

+	// iterate through all ts and print the name
+	for _, t := range ts {
+		fmt.Print(t.Name(), "\n")
+	}
+
 	return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts))
 }
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -3,7 +3,6 @@ package convert
 import (
 	"cmp"
 	"fmt"
-	"math"
 	"strings"

 	"github.com/pdevine/tensor"
@@ -12,10 +11,12 @@ import (
 	"github.com/ollama/ollama/fs/ggml"
 )

-type mistralModel struct {
+type mistral3Model struct {
 	ModelParameters
-	// Text model parameters
-	TextConfig struct {
+	ImageTokenIndex    uint32 `json:"image_token_index"`
+	SpatialMergeSize   uint32 `json:"spatial_merge_size"`
+	VisionFeatureLayer int32  `json:"vision_feature_layer"`
+	TextModel          struct {
 		NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 		MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 		HiddenSize            uint32  `json:"hidden_size"`
@@ -25,121 +26,83 @@ type mistralModel struct {
 		RopeTheta             float32 `json:"rope_theta"`
 		RMSNormEPS            float32 `json:"rms_norm_eps"`
 		HeadDim               uint32  `json:"head_dim"`
+		SlidingWindow         *uint32 `json:"sliding_window"`
+		HiddenAct             string  `json:"hidden_act"`
+		VocabSize             uint32  `json:"vocab_size"`
 	} `json:"text_config"`
-
-	// Vision model parameters
-	VisionConfig struct {
+	VisionModel struct {
+		NumAttentionHeads uint32  `json:"num_attention_heads"`
 		NumHiddenLayers   uint32  `json:"num_hidden_layers"`
 		HiddenSize        uint32  `json:"hidden_size"`
 		IntermediateSize  uint32  `json:"intermediate_size"`
-		NumAttentionHeads uint32  `json:"num_attention_heads"`
 		ImageSize         uint32  `json:"image_size"`
+		NumChannels       uint32  `json:"num_channels"`
 		PatchSize         uint32  `json:"patch_size"`
+		HeadDim           uint32  `json:"head_dim"`
+		HiddenAct         string  `json:"hidden_act"`
 		RopeTheta         float32 `json:"rope_theta"`
 	} `json:"vision_config"`
-
-	// Multimodal specific parameters
-	ImageTokenIndex         uint32 `json:"image_token_index"`
-	MultimodalProjectorBias bool   `json:"multimodal_projector_bias"`
+	MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
 	ProjectorHiddenAct      string `json:"projector_hidden_act"`
-	SpatialMergeSize        uint32 `json:"spatial_merge_size"`
-	VisionFeatureLayer      int32  `json:"vision_feature_layer"`
-
-	// For RoPE scaling if needed
-	RopeScaling struct {
-		Type                            string  `json:"type"`
-		RopeType                        string  `json:"rope_type"`
-		Factor                          float32 `json:"factor"`
-		LowFrequencyFactor              float32 `json:"low_freq_factor"`
-		HighFrequencyFactor             float32 `json:"high_freq_factor"`
-		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
-
-		factors ropeFactor
-	} `json:"rope_scaling"`
 }

-func (p *mistralModel) KV(t *Tokenizer) ggml.KV {
+func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "mistral"
-	kv["mistral.vocab_size"] = p.VocabSize
-	kv["mistral.image_token_index"] = p.ImageTokenIndex
-	kv["mistral.multimodal_projector_bias"] = p.MultimodalProjectorBias
-	kv["mistral.projector_hidden_act"] = p.ProjectorHiddenAct
-	kv["mistral.spatial_merge_size"] = p.SpatialMergeSize
-	// kv["mistral.vision_feature_layer"] = p.VisionFeatureLayer
+	kv["general.architecture"] = "mistral3"
+	kv["mistral3.vocab_size"] = p.TextModel.VocabSize

-	// Text model config
-	kv["mistral.block_count"] = p.TextConfig.NumHiddenLayers
-	kv["mistral.context_length"] = p.TextConfig.MaxPositionEmbeddings
-	kv["mistral.embedding_length"] = p.TextConfig.HiddenSize
-	kv["mistral.feed_forward_length"] = p.TextConfig.IntermediateSize
-	kv["mistral.attention.head_count"] = p.TextConfig.NumAttentionHeads
-	kv["mistral.attention.head_count_kv"] = p.TextConfig.NumKeyValueHeads
-	kv["mistral.rope.dimension_count"] = p.TextConfig.HiddenSize / p.TextConfig.NumAttentionHeads
-	kv["mistral.rope.freq_base"] = p.TextConfig.RopeTheta
-	kv["mistral.attention.layer_norm_rms_epsilon"] = p.TextConfig.RMSNormEPS
-	kv["mistral.attention.key_length"] = p.TextConfig.HeadDim
-	kv["mistral.attention.value_length"] = p.TextConfig.HeadDim
+	// Text configuration (values read from the text_config section of the model config)
+	kv["mistral3.block_count"] = p.TextModel.NumHiddenLayers
+	kv["mistral3.context_length"] = p.TextModel.MaxPositionEmbeddings
+	kv["mistral3.embedding_length"] = p.TextModel.HiddenSize
+	kv["mistral3.feed_forward_length"] = p.TextModel.IntermediateSize
+	kv["mistral3.attention.head_count"] = p.TextModel.NumAttentionHeads
+	kv["mistral3.attention.head_count_kv"] = p.TextModel.NumKeyValueHeads
+	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.TextModel.RMSNormEPS
+	kv["mistral3.attention.key_length"] = p.TextModel.HeadDim
+	kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
+	kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads) // per-head width; the layer count is unrelated
+	kv["mistral3.rope.freq_base"] = p.TextModel.RopeTheta

-	// Vision model config
-	kv["mistral.vision.block_count"] = p.VisionConfig.NumHiddenLayers
-	kv["mistral.vision.embedding_length"] = p.VisionConfig.HiddenSize
-	kv["mistral.vision.feed_forward_length"] = p.VisionConfig.IntermediateSize
-	kv["mistral.vision.attention.head_count"] = p.VisionConfig.NumAttentionHeads
-	kv["mistral.vision.image_size"] = p.VisionConfig.ImageSize
-	kv["mistral.vision.patch_size"] = p.VisionConfig.PatchSize
-	kv["mistral.vision.rope.freq_base"] = p.VisionConfig.RopeTheta
+	// Vision configuration (values read from the vision_config section of the model config)
+	kv["mistral3.vision.block_count"] = p.VisionModel.NumHiddenLayers
+	kv["mistral3.vision.embedding_length"] = p.VisionModel.HiddenSize
+	kv["mistral3.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
+	kv["mistral3.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
+	kv["mistral3.vision.attention.key_length"] = p.VisionModel.HeadDim
+	kv["mistral3.vision.image_size"] = p.VisionModel.ImageSize
+	kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
+	kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
+	// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
+	kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta

-	// If RoPE scaling is present
-	if p.RopeScaling.Type == "linear" {
-		kv["mistral.rope.scaling.type"] = p.RopeScaling.Type
-		kv["mistral.rope.scaling.factor"] = p.RopeScaling.Factor
-	} else if p.RopeScaling.RopeType == "llama3" {
-		dim := p.TextConfig.HiddenSize / p.TextConfig.NumAttentionHeads
-		for i := uint32(0); i < dim; i += 2 {
-			factor := cmp.Or(p.RopeScaling.Factor, 8.0)
-			factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
-			factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
+	// Multimodal configuration (top-level fields of the model config)
+	kv["mistral3.image_token_index"] = p.ImageTokenIndex
+	kv["mistral3.spatial_merge_size"] = p.SpatialMergeSize

-			original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
-			lambdaLow := float32(original) / factorLow
-			lambdaHigh := float32(original) / factorHigh
+	kv["mistral3.mm.projector_bias"] = p.MultiModalProjectorBias

-			lambda := 2 * math.Pi * math.Pow(float64(p.TextConfig.RopeTheta), float64(i)/float64(dim))
-			if lambda < float64(lambdaHigh) {
-				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
-			} else if lambda > float64(lambdaLow) {
-				p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
-			} else {
-				smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
-				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
-			}
-		}
+	if p.ProjectorHiddenAct != "" {
+		kv["mistral3.mm.projector_hidden_act"] = p.ProjectorHiddenAct
 	}

 	return kv
 }

-func (p *mistralModel) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
 	var out []ggml.Tensor

-	if p.RopeScaling.factors != nil {
-		out = append(out, ggml.Tensor{
-			Name:     "rope_freqs.weight",
-			Kind:     0,
-			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
-			WriterTo: p.RopeScaling.factors,
-		})
-	}
-
 	for _, t := range ts {
-		// Process tensors that require repacking
 		if strings.HasSuffix(t.Name(), "attn_q.weight") ||
 			strings.HasSuffix(t.Name(), "attn_k.weight") {
 			t.SetRepacker(p.repack)
 		}

-		// Add all tensors to output
+		// Skip certain vision model tensors that might need special handling
+		if strings.HasPrefix(t.Name(), "patch_merger.") || strings.HasPrefix(t.Name(), "pre_mm_projector_output_norm.") {
+			continue
+		}
+
 		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
@@ -151,22 +114,37 @@ func (p *mistralModel) Tensors(ts []Tensor) []ggml.Tensor {
 	return out
 }

-func (p *mistralModel) Replacements() []string {
+func (p *mistral3Model) Replacements() []string {
 	return []string{
+		// Text model replacements
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"post_attention_layernorm", "ffn_norm",
+		"lm_head", "output",
+		"model.embed_tokens.weight", "token_embd.weight",
+		"model.norm.weight", "output_norm.weight",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+
 		// Language model replacements
 		"language_model.model.embed_tokens", "token_embd",
-		"language_model.model.norm", "output_norm",
 		"language_model.model.layers", "blk",
-		"language_model.model.layers.*.input_layernorm", "input_layernorm",
-		"language_model.model.layers.*.self_attn.q_proj", "self_attn.q_proj",
-		"language_model.model.layers.*.self_attn.k_proj", "self_attn.k_proj",
-		"language_model.model.layers.*.self_attn.v_proj", "self_attn.v_proj",
-		"language_model.model.layers.*.self_attn.o_proj", "self_attn.o_proj",
-		"language_model.model.layers.*.mlp.gate_proj", "mlp.gate_proj",
-		"language_model.model.layers.*.mlp.down_proj", "mlp.down_proj",
-		"language_model.model.layers.*.mlp.up_proj", "mlp.up_proj",
-		"language_model.model.layers.*.post_attention_layernorm", "post_attention_layernorm",
+		"language_model.model.layers.*.input_layernorm", "attn_norm",
+		"language_model.model.layers.*.self_attn.q_proj", "attn_q",
+		"language_model.model.layers.*.self_attn.k_proj", "attn_k",
+		"language_model.model.layers.*.self_attn.v_proj", "attn_v",
+		"language_model.model.layers.*.self_attn.o_proj", "attn_output",
+		"language_model.model.layers.*.mlp.gate_proj", "ffn_gate",
+		"language_model.model.layers.*.mlp.down_proj", "ffn_down",
+		"language_model.model.layers.*.mlp.up_proj", "ffn_up",
+		"language_model.model.layers.*.post_attention_layernorm", "ffn_norm",
 		"language_model.lm_head", "output",
+		"language_model.model.norm", "output_norm",

 		// Vision model replacements - map to shorter prefixes
 		"vision_tower", "v",
@@ -185,14 +163,21 @@ func (p *mistralModel) Replacements() []string {
 		"vision_tower.transformer.layers.*.ffn_norm", "v.ffn_norm",
 		"vision_tower.ln_pre", "v.encoder_norm",
 		"vision_tower.patch_conv", "v.patch_conv",
+		"vision_tower.embeddings", "v.embeddings",
+
+		// Alternative vision model paths
+		"vision_model.vision_model.embeddings", "v.embeddings",
+		"vision_model.vision_model", "v",
+		"vision_model.layers", "v.blk",

 		// Multimodal projector components
 		"multi_modal_projector.patch_merger", "mm.patch_merger",
 		"multi_modal_projector.norm", "mm.norm",
+		"multi_modal_projector.linear", "mm.projection",
 	}
 }

-func (p *mistralModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *mistral3Model) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for _, dim := range shape {
 		dims = append(dims, int(dim))
@@ -200,17 +185,9 @@ func (p *mistralModel) repack(name string, data []float32, shape []uint64) ([]fl

 	var heads uint32
 	if strings.HasSuffix(name, "attn_q.weight") {
-		if strings.Contains(name, "vision") {
-			heads = p.VisionConfig.NumAttentionHeads
-		} else {
-			heads = p.TextConfig.NumAttentionHeads
-		}
+		heads = p.TextModel.NumAttentionHeads
 	} else if strings.HasSuffix(name, "attn_k.weight") {
-		if strings.Contains(name, "vision") {
-			heads = p.VisionConfig.NumAttentionHeads
-		} else {
-			heads = cmp.Or(p.TextConfig.NumKeyValueHeads, p.TextConfig.NumAttentionHeads)
-		}
+		heads = cmp.Or(p.TextModel.NumKeyValueHeads, p.TextModel.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
 	}
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -62,10 +62,7 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 		Pattern string
 		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
 	}{
-		{"model-*-of-*.safetensors", parseSafetensors},
-		{"model.safetensors", parseSafetensors},
-		{"adapters.safetensors", parseSafetensors},
-		{"adapter_model.safetensors", parseSafetensors},
+		{"*.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -10,7 +10,7 @@ import (
 	"github.com/ollama/ollama/model/input"
 )

-type TextOptions struct {
+type TextConfig struct {
 	hiddenSize, numHeads, numKVHeads int
 	attnKeyLen, attnValLen           int
 	eps, ropeScale                   float32
@@ -27,7 +27,7 @@ type TextModel struct {
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

-	*TextOptions
+	*TextConfig
 }

 const (
@@ -55,7 +55,7 @@ func newTextModel(c ml.Config) *TextModel {
 			},
 		),
 		Layers: make([]TextLayer, numBlocks),
-		TextOptions: &TextOptions{
+		TextConfig: &TextConfig{
 			hiddenSize:     int(c.Uint("embedding_length")),
 			numHeads:       int(c.Uint("attention.head_count")),
 			numKVHeads:     int(c.Uint("attention.head_count_kv")),
@@ -84,7 +84,7 @@ type TextSelfAttention struct {
 	Output    *nn.Linear  `gguf:"attn_output"`
 }

-func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	ropeType := uint32(2)

@@ -120,12 +120,12 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	ropeBase := m.TextOptions.ropeLocalBase
+	ropeBase := m.TextConfig.ropeLocalBase
 	if (layer+1)%gemmaGlobalCacheCount == 0 {
-		ropeBase = m.TextOptions.ropeGlobalBase
+		ropeBase = m.TextConfig.ropeGlobalBase
 	}

-	return key.RoPE(ctx, shift, nil, uint32(m.TextOptions.attnKeyLen), uint32(2), ropeBase, m.TextOptions.ropeScale), nil
+	return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
 }

 type TextMLP struct {
@@ -134,7 +134,7 @@ type TextMLP struct {
 	Gate *nn.Linear `gguf:"ffn_gate"`
 }

-func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
+func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextConfig) ml.Tensor {
 	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
 	return mlp.Down.Forward(ctx, hiddenState)
 }
@@ -148,7 +148,7 @@ type TextLayer struct {
 	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
 }

-func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -173,7 +173,7 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,

 func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, opts input.Options, cache kvcache.Cache) ml.Tensor {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
-	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextOptions.hiddenSize)))
+	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextConfig.hiddenSize)))

 	// set image embeddings
 	var except []int
@@ -206,7 +206,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			lastLayerOutputs = outputs
 		}

-		hiddenState = layer.Forward(ctx, i, hiddenState, positions, lastLayerOutputs, cache, m.TextOptions)
+		hiddenState = layer.Forward(ctx, i, hiddenState, positions, lastLayerOutputs, cache, m.TextConfig)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
--- a/model/models/gemma3/process_image.go
+++ b/model/models/gemma3/process_image.go
@@ -51,7 +51,7 @@ func (p *ImageProcessor) pack(img image.Image, mean, std [3]float32) []float32 {
 func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
 	outputSize := image.Point{p.imageSize, p.imageSize}
 	newImage := imageproc.Composite(img)
-	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
+	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBicubic)

 	data := p.pack(newImage, imageproc.ImageNetStandardMean, imageproc.ImageNetStandardSTD)
 	return data, nil
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -13,9 +13,9 @@ import (
 )

 type Options struct {
-	hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32
+	hiddenSize, numHeads, numKVHeads, headDim int
+	eps, ropeBase, ropeScale                  float32
+	ropeDim                                   uint32
 }

 type Model struct {
@@ -37,6 +37,8 @@ func New(c ml.Config) (model.Model, error) {

 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
+			// TODO: need to set this in the conversion for mistral:
+			// tokenizer.ggml.pretokenizer = [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
@@ -53,6 +55,7 @@ func New(c ml.Config) (model.Model, error) {
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			headDim:    int(c.Uint("attention.key_length")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
@@ -75,24 +78,36 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	headDim := opts.hiddenSize / opts.numHeads
 	ropeType := uint32(0)
+	// Get head dimension - use explicit value if available, otherwise calculate
+	headDim := opts.headDim
+	if headDim == 0 {
+		headDim = opts.hiddenSize / opts.numHeads
+	}

+	// Query projection and reshape
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
 	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

+	// Key projection and reshape
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

+	// Value projection and reshape
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

+	// Attention computation
 	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
 	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)

+	// Reshape attention output for final projection
+	outputDim := headDim * opts.numHeads
+	kqv = kqv.Reshape(ctx, outputDim, batchSize)
+
+	// Apply output projection
 	return sa.Output.Forward(ctx, kqv)
 }

--- a/model/models/mistral3/imageproc.go
+++ b/model/models/mistral3/imageproc.go
@@ -1,4 +1,4 @@
-package pixtral
+package mistral3

 import (
 	"fmt"
@@ -8,6 +8,7 @@ import (
 	"io"
 	"math"

+	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model/imageproc"
 )

@@ -27,8 +28,8 @@ func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.

 	if ratio > 1.0 {
 		newSize = image.Point{
-			int(math.Ceil(float64(b.Max.X) / ratio)),
-			int(math.Ceil(float64(b.Max.Y) / ratio)),
+			int(math.Floor(float64(b.Max.X) / ratio)),
+			int(math.Floor(float64(b.Max.Y) / ratio)),
 		}
 	}

@@ -66,3 +67,30 @@ func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
 	opts := map[string]any{}
 	return data, opts, nil
 }
+
+type ImageProcessor struct {
+	imageSize   int
+	patchSize   int
+	numChannels int
+	longestEdge int
+}
+
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size", 1540)),
+		patchSize:   int(c.Uint("vision.patch_size", 14)),
+		numChannels: int(c.Uint("vision.num_channels", 3)),
+		longestEdge: int(c.Uint("vision.longest_edge", 1024)),
+	}
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
+
+	newImage := imageproc.Composite(img)
+	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
+
+	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+
+	return data, nil
+}
--- a/model/models/mistral3/imageproc_test.go
+++ b/model/models/mistral3/imageproc_test.go
@@ -1,4 +1,4 @@
-package pixtral
+package mistral3

 import (
 	"bytes"
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -0,0 +1,139 @@
+package mistral3
+
+import (
+	"bytes"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Model struct {
+	model.Base
+	*TextModel
+	*VisionModel         `gguf:"v,vision"`
+	*MultiModalProjector `gguf:"mm"`
+
+	ImageProcessor
+}
+
+// Implement MultimodalProcessor interface
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+func New(c ml.Config) (model.Model, error) {
+	textModel, err := NewTextModel(c)
+	if err != nil {
+		return nil, err
+	}
+
+	m := &Model{
+		TextModel:           textModel,
+		VisionModel:         newVisionModel(c),
+		ImageProcessor:      newImageProcessor(c),
+		MultiModalProjector: newMultiModalProjector(c),
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}

+	// Decode the raw bytes; the local is named img so the image package stays visible
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}

+	// Convert the decoded image into a flat float32 slice
+	f32s, err := m.ImageProcessor.ProcessImage(img)
+	if err != nil {
+		return nil, err
+	}

+	// NOTE(review): ProcessImage resizes to a computed size; confirm it matches the imageSize x imageSize shape declared here
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.numChannels,
+	)
+	if err != nil {
+		return nil, err
+	}

+	// Forward pass through vision model
+	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)

+	// Project to text embedding space
+	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.VisionModel.eps)

+	return visionOutputs, nil
+}
+
+func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
+	var result []input.Input

+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			result = append(result, inp)
+		} else {
+			inputMultimodal := inp.Multimodal.(ml.Tensor)

+			// Emit the image token (imageTokenIndex), then the input carrying the image tensor and its hash
+			result = append(result,
+				input.Input{Token: int32(m.MultiModalProjector.imageTokenIndex)},             // Image token
+				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // Image data
+			)

+			// Pad with Dim(1)-1 placeholder inputs (Token 0) after the image tensor
+			result = append(result, slices.Repeat([]input.Input{{Token: 0}}, inputMultimodal.Dim(1)-1)...)
+		}
+	}

+	return result, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
+	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
+	if err != nil {
+		return nil, err
+	}
+
+	// Handle multimodal inputs
+	// var except []int
+	// hiddenState := m.TextModel.TokenEmbedding.Forward(ctx, inputs)
+
+	// for _, image := range opts.Multimodal {
+	// 	visionOutputs := image.Multimodal.(ml.Tensor)
+
+	// 	// Copy vision outputs into the hidden state
+	// 	ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+
+	// 	for i := range visionOutputs.Dim(1) {
+	// 		except = append(except, image.Index+i)
+	// 	}
+	// }
+
+	return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil
+}
+
+func init() {
+	model.Register("mistral3", New)
+}
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -1,4 +1,4 @@
-package llama
+package mistral3

 import (
 	"fmt"
@@ -12,13 +12,13 @@ import (
 	"github.com/ollama/ollama/model/input"
 )

-type Options struct {
+type TextOptions struct {
 	hiddenSize, numHeads, numKVHeads, headDim int
 	eps, ropeBase, ropeScale                  float32
 	ropeDim                                   uint32
 }

-type Model struct {
+type TextModel struct {
 	model.Base
 	model.BytePairEncoding

@@ -27,67 +27,18 @@ type Model struct {
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

-	*Options
-}
-
-func New(c ml.Config) (model.Model, error) {
-	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
-		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
-	}
-
-	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Uints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
-		Layers: make([]Layer, c.Uint("block_count")),
-		Options: &Options{
-			hiddenSize: int(c.Uint("embedding_length")),
-			numHeads:   int(c.Uint("attention.head_count")),
-			numKVHeads: int(c.Uint("attention.head_count_kv")),
-			headDim:    int(c.Uint("attention.key_length")),
-			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
-			ropeDim:    c.Uint("rope.dimension_count"),
-		},
-	}
-
-	fmt.Println("Model Parameters:")
-	fmt.Printf("  model_type: %q\n", "gpt2")
-	fmt.Printf("  vocab_size: %d\n", len(c.Strings("tokenizer.ggml.tokens")))
-	fmt.Printf("  hidden_size: %d\n", m.Options.hiddenSize)
-	fmt.Printf("  num_hidden_layers: %d\n", c.Uint("block_count"))
-	fmt.Printf("  num_attention_heads: %d\n", m.Options.numHeads)
-	fmt.Printf("  num_key_value_heads: %d\n", m.Options.numKVHeads)
-	fmt.Printf("  rms_norm_eps: %g\n", m.Options.eps)
-	fmt.Printf("  rope_theta: %g\n", m.Options.ropeBase)
-	fmt.Printf("  bos_token_id: %d\n", c.Uint("tokenizer.ggml.bos_token_id"))
-	fmt.Printf("  eos_token_id: %d\n", c.Uint("tokenizer.ggml.eos_token_id"))
-	fmt.Printf("  pad_token_id: %d\n", c.Uint("tokenizer.ggml.pad_token_id", 0))
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-
-	return &m, nil
+	*TextOptions
 }

 type SelfAttention struct {
-	Query       *nn.Linear `gguf:"self_attn.q_proj"`
-	Key         *nn.Linear `gguf:"self_attn.k_proj"`
-	Value       *nn.Linear `gguf:"self_attn.v_proj"`
-	Output      *nn.Linear `gguf:"self_attn.o_proj"`
+	Query       *nn.Linear `gguf:"attn_q"`
+	Key         *nn.Linear `gguf:"attn_k"`
+	Value       *nn.Linear `gguf:"attn_v"`
+	Output      *nn.Linear `gguf:"attn_output"`
 	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
 }

-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	ropeType := uint32(0)
 	// Get head dimension - use explicit value if available, otherwise calculate
@@ -122,29 +73,29 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	return sa.Output.Forward(ctx, kqv)
 }

-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil // rotary dim first, then rope type 0
 }

 type MLP struct {
-	Up   *nn.Linear `gguf:"mlp.up_proj"`
-	Down *nn.Linear `gguf:"mlp.down_proj"`
-	Gate *nn.Linear `gguf:"mlp.gate_proj"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
 }

-func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
 	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
 	return mlp.Down.Forward(ctx, hiddenState)
 }

 type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"input_layernorm"`
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
 	SelfAttention *SelfAttention
-	MLPNorm       *nn.RMSNorm `gguf:"post_attention_layernorm"`
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
 	MLP           *MLP
 }

-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -165,43 +116,56 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 	return hiddenState.Add(ctx, residual)
 }

-func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
-	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
-	if err != nil {
-		return nil, err
-	}
-
-	positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
-	if err != nil {
-		return nil, err
-	}
-
-	// Get token embeddings
+// Forward embeds inputs with TokenEmbedding, runs every layer in order against
+// the supplied cache (selecting the cache layer before each one), then applies
+// OutputNorm followed by the Output projection and returns the result.
+//
+// outputs is handed only to the final layer; all earlier layers receive the
+// zero-value tensor. opts is accepted but not read by this method.
+func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, opts input.Options, cache kvcache.Cache) ml.Tensor {
+	// Process text inputs
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)

+	// Process through text transformer layers
 	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
+		cache.SetLayer(i)

 		var lastLayerOutputs ml.Tensor
 		if i == len(m.Layers)-1 {
 			lastLayerOutputs = outputs
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, cache, m.TextOptions)
 	}

-	// Apply output normalization
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
-
-	// Apply output projection
-	return m.Output.Forward(ctx, hiddenState), nil
+	return m.Output.Forward(ctx, hiddenState)
 }

-func init() {
-	model.Register("mistral", New)
+// NewTextModel builds a TextModel from the configuration c.
+//
+// It returns an error unless tokenizer.ggml.model equals "gpt2"
+// (case-insensitive). Otherwise it constructs the byte-pair-encoding tokenizer
+// from the tokenizer.ggml.* keys (falling back to the built-in pretokenizer
+// pattern, BOS id 1, EOS id 2, add-BOS true and add-EOS false), allocates
+// block_count layers, and fills TextOptions from the embedding, attention and
+// rope keys. rope.freq_scale defaults to 1; the other TextOptions keys are
+// read without an explicit default.
+func NewTextModel(c ml.Config) (*TextModel, error) {
+	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
+		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
+	}
+
+	textModel := &TextModel{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+			},
+		),
+		Layers: make([]Layer, c.Uint("block_count")),
+		TextOptions: &TextOptions{
+			hiddenSize: int(c.Uint("embedding_length")),
+			numHeads:   int(c.Uint("attention.head_count")),
+			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			headDim:    int(c.Uint("attention.key_length")),
+			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:   c.Float("rope.freq_base"),
+			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeDim:    c.Uint("rope.dimension_count"),
+		},
+	}
+
+	return textModel, nil
 }
--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -0,0 +1,143 @@
+package mistral3
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+// batchSize is the batch dimension used by the reshapes in
+// VisionSelfAttention.Forward; it is fixed at 1.
+var batchSize int = 1
+
+// VisionSelfAttention holds the query, key, value and output projections of a
+// vision encoder attention block, plus the rope factor tensor passed to RoPE.
+type VisionSelfAttention struct {
+	Query       *nn.Linear `gguf:"attn_q"`
+	Key         *nn.Linear `gguf:"attn_k"`
+	Value       *nn.Linear `gguf:"attn_v"`
+	Output      *nn.Linear `gguf:"attn_output"`
+	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
+}
+
+// Forward runs self-attention over hiddenState.
+//
+// It projects hiddenState to query, key and value, reshapes each to
+// (headDim, numHeads, batchSize), applies RoPE to query and key with
+// positionIDs, calls nn.Attention with a scale of 1/sqrt(headDim) and a nil
+// final argument, reshapes the result to (hiddenSize, attention.Dim(2),
+// batchSize) and returns it through the Output projection.
+//
+// NOTE(review): the reshape targets carry no per-patch dimension — confirm
+// they match the shape of the hidden state produced by VisionModel.Forward.
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	headDim := opts.headDim
+
+	query := sa.Query.Forward(ctx, hiddenState)
+	key := sa.Key.Forward(ctx, hiddenState)
+	value := sa.Value.Forward(ctx, hiddenState)
+
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	key = key.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	value = value.Reshape(ctx, headDim, opts.numHeads, batchSize)
+
+	ropeType := uint32(0)
+	query = query.RoPE(ctx, positionIDs, sa.RopeFactors, uint32(headDim), ropeType, opts.ropeBase, opts.ropeScale)
+	key = key.RoPE(ctx, positionIDs, sa.RopeFactors, uint32(headDim), ropeType, opts.ropeBase, opts.ropeScale)
+
+	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
+	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
+
+	return sa.Output.Forward(ctx, attention)
+}
+
+// VisionMLP groups the gate, up and down linear projections of the vision
+// encoder feed-forward block.
+type VisionMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward computes Down(GELU(Gate(x)) * Up(x)) for the given hidden state.
+// opts is accepted but not read by this method.
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+// VisionEncoderLayer is one vision transformer block: an attention norm, the
+// self-attention module, a feed-forward norm, and the MLP.
+//
+// NOTE(review): MLP carries a gguf:"mlp" tag whereas the text Layer.MLP field
+// is untagged — confirm the tag matches the tensor names the converter emits.
+type VisionEncoderLayer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *VisionSelfAttention
+
+	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP     *VisionMLP  `gguf:"mlp"`
+}
+
+// Forward applies the block as two pre-norm residual stages: AttentionNorm then
+// SelfAttention added back to the input, followed by FFNNorm then MLP added
+// back to the attention result. Both norms use opts.eps.
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	residual := hiddenState
+
+	// self attention
+	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, positionIDs, opts)
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	// feed forward
+	hiddenState = e.FFNNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
+	return hiddenState.Add(ctx, residual)
+}
+
+// VisionModelOptions carries the vision encoder hyperparameters populated by
+// newVisionModel. In the code visible in this file, intermediateSize and
+// numChannels are set but never read; the remaining fields are consumed by the
+// Forward methods.
+type VisionModelOptions struct {
+	hiddenSize       int
+	numHeads         int
+	headDim          int
+	intermediateSize int
+	imageSize        int
+	patchSize        int
+	numChannels      int
+	eps              float32
+	ropeBase         float32
+	ropeScale        float32
+}
+
+// VisionModel is the vision encoder: a patch-embedding convolution, an encoder
+// norm, and a stack of encoder layers, with VisionModelOptions embedded so its
+// fields are reachable directly on the model.
+type VisionModel struct {
+	PatchEmbedding *nn.Conv2D           `gguf:"patch_conv"`
+	EncoderNorm    *nn.LayerNorm        `gguf:"encoder_norm"`
+	Layers         []VisionEncoderLayer `gguf:"blk"`
+
+	*VisionModelOptions
+}
+
+// Forward encodes pixelValues into patch hidden states.
+//
+// The patch count is (imageSize/patchSize) squared, using integer division.
+// pixelValues goes through PatchEmbedding (called with patchSize for the first
+// two integer arguments, then 0, 0, 1, 1 — presumably stride, padding and
+// dilation; confirm against nn.Conv2D), is reshaped to
+// (numPatches, hiddenSize) and permuted so the first two axes swap. Position
+// IDs are the sequence 0..numPatches-1. EncoderNorm is applied before the
+// encoder layers, and the output of the last layer is returned.
+//
+// Forward panics if the position tensor cannot be created.
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
+	numPatchesH := m.imageSize / m.patchSize
+	numPatchesW := m.imageSize / m.patchSize
+	numPatches := numPatchesH * numPatchesW
+
+	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
+	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
+	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+
+	// Create position IDs
+	positions := make([]int32, numPatches)
+	for i := range positions {
+		positions[i] = int32(i)
+	}
+
+	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
+	if err != nil {
+		panic(err)
+	}
+
+	// Apply encoder normalization
+	hiddenState = m.EncoderNorm.Forward(ctx, hiddenState, m.eps)
+
+	// Process through transformer layers
+	for _, layer := range m.Layers {
+		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, m.VisionModelOptions)
+	}
+
+	return hiddenState
+}
+
+// newVisionModel builds a VisionModel from the vision.* keys of c, allocating
+// vision.block_count layers (default 24) and falling back to the defaults
+// written inline for every option. PatchEmbedding and EncoderNorm are not set
+// here (presumably populated later from the gguf tags — confirm with the
+// loader).
+func newVisionModel(c ml.Config) *VisionModel {
+	return &VisionModel{
+		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
+		VisionModelOptions: &VisionModelOptions{
+			hiddenSize:       int(c.Uint("vision.embedding_length", 1024)),
+			numHeads:         int(c.Uint("vision.attention.head_count", 16)),
+			headDim:          int(c.Uint("vision.attention.key_length", 64)),
+			intermediateSize: int(c.Uint("vision.feed_forward_length", 4096)),
+			imageSize:        int(c.Uint("vision.image_size", 1540)),
+			patchSize:        int(c.Uint("vision.patch_size", 14)),
+			numChannels:      int(c.Uint("vision.num_channels", 3)),
+			eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-05),
+			ropeBase:         c.Float("vision.rope.freq_base", 10000.0),
+			ropeScale:        c.Float("vision.rope.freq_scale", 1.0),
+		},
+	}
+}
--- a/model/models/mistral3/multimodal_proj.go
+++ b/model/models/mistral3/multimodal_proj.go
@@ -0,0 +1,38 @@
+package mistral3
+
+import (
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+// MultiModalProjector normalizes vision outputs, optionally pools them, and
+// projects them with a linear layer. In the code visible in this file only
+// spatialMergeSize is read by Forward; imageTokenIndex and hasBias are set by
+// newMultiModalProjector but not otherwise used.
+type MultiModalProjector struct {
+	Norm       *nn.RMSNorm `gguf:"norm"`
+	Projection *nn.Linear  `gguf:"projection"`
+
+	spatialMergeSize int
+	imageTokenIndex  int
+	hasBias          bool
+}
+
+// Forward applies Norm to visionOutputs with the given eps, average-pools the
+// result with a spatialMergeSize kernel when spatialMergeSize is greater than
+// 1, and returns the Projection of the outcome.
+func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
+	// Apply normalization
+	visionOutputs = p.Norm.Forward(ctx, visionOutputs, eps)
+
+	// If the spatial merge size is > 1, average pool the patches
+	if p.spatialMergeSize > 1 {
+		// Implementation depends on how the model handles spatial merging
+		// For simplicity, we'll use a spatial pooling approach
+		visionOutputs = visionOutputs.AvgPool2D(ctx, p.spatialMergeSize, p.spatialMergeSize, 0)
+	}
+
+	// Project to text embedding dimension
+	return p.Projection.Forward(ctx, visionOutputs)
+}
+
+// newMultiModalProjector builds a MultiModalProjector from c, reading
+// spatial_merge_size (default 2), image_token_index (default 10) and
+// mm.projector_bias (default false). Norm and Projection are left unset here.
+func newMultiModalProjector(c ml.Config) *MultiModalProjector {
+	return &MultiModalProjector{
+		spatialMergeSize: int(c.Uint("spatial_merge_size", 2)),
+		imageTokenIndex:  int(c.Uint("image_token_index", 10)),
+		hasBias:          c.Bool("mm.projector_bias", false),
+	}
+}
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -4,6 +4,6 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/llama"
-	_ "github.com/ollama/ollama/model/models/mistral"
+	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
 )
--- a/model/process_text_test.go
+++ b/model/process_text_test.go
@@ -222,14 +222,10 @@ func tekken(t testing.TB) TextProcessor {
 	defer configFile.Close()

 	var config struct {
-		AddBosToken bool `json:"add_bos_token"`
-		AddEosToken bool `json:"add_eos_token"`
-		BosToken    struct {
-			Content string `json:"content"`
-		} `json:"bos_token"`
-		EosToken struct {
-			Content string `json:"content"`
-		} `json:"eos_token"`
+		AddBosToken bool   `json:"add_bos_token"`
+		AddEosToken bool   `json:"add_eos_token"`
+		BosToken    string `json:"bos_token"`
+		EosToken    string `json:"eos_token"`
 	}
 	if err := json.NewDecoder(configFile).Decode(&config); err != nil {
 		t.Fatal(err)
@@ -319,8 +315,8 @@ func tekken(t testing.TB) TextProcessor {
 		Types:  types,
 		Scores: scores,
 		Merges: merges,
-		BOS:    vocab[config.BosToken.Content],
-		EOS:    vocab[config.EosToken.Content],
+		BOS:    vocab[config.BosToken],
+		EOS:    vocab[config.EosToken],
 		AddBOS: config.AddBosToken,
 		AddEOS: config.AddEosToken,
 	}
--- a/model/testdata/mistral-small/tokenizer.json
+++ b/model/testdata/mistral-small/tokenizer.json
--- a/model/testdata/mistral-small/tokenizer_config.json
+++ b/model/testdata/mistral-small/tokenizer_config.json
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -211,16 +211,10 @@ func filesForModel(path string) ([]string, error) {
 	}

 	var files []string
-	if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 {
+	if st, _ := glob(filepath.Join(path, "*.safetensors"), "application/octet-stream"); len(st) > 0 {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
-	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// covers adapters.safetensors
-		files = append(files, st...)
-	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// covers adapter_model.safetensors
-		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
Author	SHA1	Message	Date
Bruce MacDonald	9de1410542	wip	2025-03-21 13:17:13 -07:00
Bruce MacDonald	9a9944fc6b	vision conversion	2025-03-20 15:37:21 -07:00
Bruce MacDonald	26767c665a	image processing	2025-03-20 15:15:04 -07:00
Bruce MacDonald	1eab2c85cc	split text model to its own file	2025-03-20 12:54:20 -07:00
Bruce MacDonald	c133341847	...	2025-03-20 12:47:42 -07:00
Bruce MacDonald	ca07379f57	mistral3 arch	2025-03-20 12:44:02 -07:00
Bruce MacDonald	713f7550a1	wip: test fixes	2025-03-20 11:19:42 -07:00
Bruce MacDonald	fe796cfc75	convert: mistral-3.1-2503 text component	2025-03-20 10:58:23 -07:00
Bruce MacDonald	434f793075	minimal convert	2025-03-19 16:56:52 -07:00