package attention

import (
	"github.com/ollama/ollama/ml"
)

// Options configures how attention is computed.
type Options struct {
	// Scale is a scaling factor applied to the attention scores. Default is 1/√d_k.
	Scale float64

	// LogitSoftcap applies a soft cap to the attention logits before softmax.
	LogitSoftcap float32

	// Mask is used by some attention mechanisms to mask out certain positions (e.g. future tokens under causal attention).
	Mask ml.Tensor

	// Sinks is used by some attention mechanisms to hold attention-sink values (extra logits that absorb attention mass).
	Sinks ml.Tensor

	// MLA is used by attention mechanisms that implement Multi-head Latent Attention (MLA).
	MLA ml.Tensor

	// Cached indicates whether key/value were retrieved from cache.
	Cached bool
}

// WithScale sets the scaling factor applied to the attention scores.
func WithScale(scale float64) func(*Options) {
	return func(o *Options) {
		o.Scale = scale
	}
}

// WithSinks sets the attention sink tensor.
func WithSinks(sinks ml.Tensor) func(*Options) {
	return func(o *Options) {
		o.Sinks = sinks
	}
}

// WithMLA sets the tensor used for Multi-head Latent Attention.
func WithMLA(mla ml.Tensor) func(*Options) {
	return func(o *Options) {
		o.MLA = mla
	}
}

// WithMask sets the attention mask.
func WithMask(mask ml.Tensor) func(*Options) {
	return func(o *Options) {
		o.Mask = mask
	}
}

// WithLogitSoftcap sets the soft cap applied to the attention logits before softmax.
func WithLogitSoftcap(softcap float32) func(*Options) {
	return func(o *Options) {
		o.LogitSoftcap = softcap
	}
}
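
// applyOptions is an illustrative sketch (a hypothetical helper, not part of
// the original file) showing how a caller folds these functional options into
// an Options value. The variadic signature mirrors the common Go
// functional-options pattern and is an assumption; it is not necessarily this
// repository's actual attention entry point. For example:
//
//	opts := applyOptions(WithScale(0.125), WithLogitSoftcap(50))
func applyOptions(fns ...func(*Options)) Options {
	// Start from the zero value and let each option mutate it in place.
	var opts Options
	for _, fn := range fns {
		fn(&opts)
	}
	return opts
}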