The updated interface supports variadic attention options, which removes the need for individual `AttentionWith...` functions. This means more models can use the attention interface, e.g. models with custom masks, logit softcapping, etc. Additionally, this interface should be less error prone since there are now reasonable defaults for all optional parameters.
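For illustration, a hedged before/after sketch of a call site. The `Attention` entry point, its parameters, and the old helper's signature are assumptions for illustration only; they are not shown in the file below:

    // Before: a dedicated helper per feature combination (names illustrative).
    out := AttentionWithSinks(ctx, query, key, value, sinks, scale)

    // After: one entry point, passing only the options a model actually needs.
    out := Attention(ctx, query, key, value,
        attention.WithScale(scale),
        attention.WithSinks(sinks),
    )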
package attention

import (
	"github.com/ollama/ollama/ml"
)

type Options struct {
	// Scale is a scaling factor applied to the attention scores. Default is 1/√d_k.
	Scale float64

	// LogitSoftcap is used to apply a soft cap to the logits before softmax.
	LogitSoftcap float32

	// Mask is used in some attention mechanisms to mask out certain positions.
	Mask ml.Tensor

	// Sinks is used in some attention mechanisms to store additional data.
	Sinks ml.Tensor

	// MLA is used in some attention mechanisms for multi-latent attention.
	MLA ml.Tensor

	// Cached indicates whether key/value were retrieved from cache.
	Cached bool
}

func WithScale(scale float64) func(*Options) {
	return func(o *Options) {
		o.Scale = scale
	}
}

func WithSinks(sinks ml.Tensor) func(*Options) {
	return func(o *Options) {
		o.Sinks = sinks
	}
}

func WithMLA(mla ml.Tensor) func(*Options) {
	return func(o *Options) {
		o.MLA = mla
	}
}

func WithMask(mask ml.Tensor) func(*Options) {
	return func(o *Options) {
		o.Mask = mask
	}
}

func WithLogitSoftcap(softcap float32) func(*Options) {
	return func(o *Options) {
		o.LogitSoftcap = softcap
	}
}
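The `With...` helpers follow Go's standard functional options pattern: fields left unset keep their zero values, which act as the defaults mentioned above (e.g. a zero `Scale` can be interpreted downstream as 1/√d_k). A minimal sketch of how an implementation might fold the options, assuming a hypothetical `applyOptions` helper that is not part of this file:

    // applyOptions is a hypothetical helper, not part of the file above.
    // It applies each variadic option function to a zero-valued Options,
    // so unset fields retain their defaults.
    func applyOptions(opts ...func(*Options)) Options {
        var o Options
        for _, opt := range opts {
            opt(&o)
        }
        return o
    }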