The updated interface supports variadic attention options, which removes the need for individual `AttentionWith...` functions. This lets more models use the attention interface, e.g. models with custom masks, logit softcapping, etc. Additionally, the interface should be less error prone since there are now reasonable defaults for all optional parameters.
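For context, this is the standard functional-options pattern in Go. A minimal, self-contained sketch of the idea follows; every name in it (`attnConfig`, `AttentionOption`, `WithCausalMask`, `WithLogitSoftcap`) is a placeholder for illustration, not the actual `nn` package API:

```go
package main

import "fmt"

// attnConfig collects the optional attention parameters; its zero values act
// as the defaults. Every name here (attnConfig, AttentionOption, WithCausalMask,
// WithLogitSoftcap) is a placeholder, not the actual ollama nn package API.
type attnConfig struct {
	causalMask   bool
	logitSoftcap float32 // 0 means softcapping is disabled
}

// AttentionOption mutates the config; callers pass only the options they need.
type AttentionOption func(*attnConfig)

// WithCausalMask enables a causal attention mask.
func WithCausalMask() AttentionOption {
	return func(c *attnConfig) { c.causalMask = true }
}

// WithLogitSoftcap caps attention logits at the given value.
func WithLogitSoftcap(softcap float32) AttentionOption {
	return func(c *attnConfig) { c.logitSoftcap = softcap }
}

// Attention starts from the defaults and applies any caller-supplied options,
// replacing one AttentionWith... variant per combination of parameters.
func Attention(opts ...AttentionOption) {
	cfg := attnConfig{}
	for _, opt := range opts {
		opt(&cfg)
	}
	fmt.Printf("causalMask=%v logitSoftcap=%v\n", cfg.causalMask, cfg.logitSoftcap)
}

func main() {
	Attention()                       // defaults only
	Attention(WithLogitSoftcap(50.0)) // e.g. a model that softcaps attention logits
}
```

With this shape, a call site that only needs the defaults, such as the `nn.Attention(ctx, q, k, v, cache)` call in the file below, passes no options at all.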
137 lines · 4.6 KiB · Go
package qwen25vl

import (
	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/rope"
)

// TextOptions holds the text model hyperparameters read from the GGUF config.
type TextOptions struct {
	hiddenSize, numHeads, numKVHeads int
	ropeDim, originalContextLength   int
	eps, ropeBase, ropeScale         float32
	mropeSections                    []int
}

// applyRotaryPositionEmbeddings applies multi-section rotary position
// embeddings (mRoPE) to states at the given positions.
func (o TextOptions) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale, rope.WithMRoPE(o.mropeSections))
}

// TextModel is the text decoder: token embeddings, a stack of transformer
// layers, a final norm, and an output projection.
type TextModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Layers         []Layer       `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

	*TextOptions
	positionCache []int32
}

// NewTextModel builds a TextModel and its options from the GGUF config.
func NewTextModel(c fs.Config) *TextModel {
	m := TextModel{
		Layers: make([]Layer, c.Uint("block_count")),
		TextOptions: &TextOptions{
			hiddenSize:            int(c.Uint("embedding_length")),
			numHeads:              int(c.Uint("attention.head_count")),
			numKVHeads:            int(c.Uint("attention.head_count_kv")),
			ropeDim:               int(c.Uint("rope.dimension_count", 128)),
			originalContextLength: int(c.Uint("context_length", 128000)),
			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
			ropeBase:              c.Float("rope.freq_base"),
			ropeScale:             c.Float("rope.scaling.factor", 1),
			mropeSections: func() []int {
				sections := c.Ints("rope.mrope_section")
				s := make([]int, len(sections))
				for i, section := range sections {
					s[i] = int(section)
				}
				return s
			}(),
		},
	}

	return &m
}

// SelfAttention implements the multi-head self-attention mechanism
// with separate projections for query, key, value and output transformations
type SelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
	batchSize := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads

	q := sa.Query.Forward(ctx, hiddenState)
	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
	q = opts.applyRotaryPositionEmbeddings(ctx, q, positionIDs)

	k := sa.Key.Forward(ctx, hiddenState)
	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
	k = opts.applyRotaryPositionEmbeddings(ctx, k, positionIDs)

	v := sa.Value.Forward(ctx, hiddenState)
	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

	kqv := nn.Attention(ctx, q, k, v, cache)
	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)

	return sa.Output.Forward(ctx, kqv)
}

// Shift applies rotary position embeddings to the key tensor for causal attention caching
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	m.positionCache = nil
	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
}

// MLP implements the feed-forward network component with SwiGLU activation
type MLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
	Gate *nn.Linear `gguf:"ffn_gate"`
}

func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
	// Apply SwiGLU activation gating
	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx, mlp.Up.Forward(ctx, hiddenState))
	// Project back to hidden dimension
	return mlp.Down.Forward(ctx, hiddenState)
}

// Layer represents a single transformer layer combining self-attention and feed-forward components
type Layer struct {
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention *SelfAttention
	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
	MLP           *MLP
}

func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
	// Self-attention branch with residual connection
	residual := hiddenState

	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)

	// In the final layer (outputs != nil), optimize by pruning to just the token positions
	// we need logits for.
	if outputs != nil {
		hiddenState = hiddenState.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
	}

	hiddenState = hiddenState.Add(ctx, residual)

	// Feed-forward branch with residual connection
	residual = hiddenState
	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
}