Compare commits

..

8 Commits

Author SHA1 Message Date
Bruce MacDonald
31d04eb795 model: add a test for model forward pass during implementation
Adds a new test file to verify model forward pass behavior through
JSON-specified test cases. The framework loads model files (.gguf) and their
corresponding test specifications to validate expected outputs using greedy
sampling.
2025-02-18 14:21:10 -08:00
Michael Yang
7b5d916a9a ci: set owner/group in tarball
set owner and group when building the linux tarball so extracted files
are consistent. this is the behaviour of release tarballs in version
0.5.7 and lower
2025-02-18 20:11:09 +00:00
benhaotang
33ad61b112 Add OpenDeepResearcher-via-searxng to Community Integrations (#9138) 2025-02-18 11:39:11 -08:00
L. Jiang
716e365615 test: add test cases for HumanNumber (#9108) 2025-02-18 11:35:26 -08:00
innightwolfsleep
3b4424ff98 readme: add LLM Telegram Bot to community integrations (#9150) 2025-02-18 10:04:30 -05:00
James-William-Kincaid-III
0667baddc6 docs: fix incorrect shortcut key in windows.md (#9098) 2025-02-15 15:38:24 -05:00
Bruce MacDonald
d006e1e09b model: document high-level model interface (#9122) 2025-02-14 16:01:00 -08:00
Daniel Hiltgen
df2680b4b9 Wire up system info log for new engine (#9123) 2025-02-14 15:55:33 -08:00
13 changed files with 105 additions and 338 deletions

View File

@@ -329,7 +329,9 @@ jobs:
done
working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
- run: |
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
done
- uses: actions/upload-artifact@v4
with:
name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}

View File

@@ -381,6 +381,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
### Cloud
@@ -548,6 +549,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
- [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
### Supported backends

View File

@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
## Troubleshooting
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
the explorer window by hitting `<Ctrl>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains most resent logs from the GUI application
- *server.log* contains the most recent server logs

View File

@@ -12,6 +12,9 @@ func TestHumanNumber(t *testing.T) {
testCases := []testCase{
{0, "0"},
{999, "999"},
{1000, "1K"},
{1001, "1K"},
{1000000, "1M"},
{125000000, "125M"},
{500500000, "500.50M"},

View File

@@ -305,6 +305,10 @@ func (b *testBackend) NewContext() ml.Context {
return &testContext{}
}
func (b *testBackend) SystemInfo() string {
return "not implemented"
}
type testContext struct{}
func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
@@ -430,7 +434,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
panic("not implemented")
}
func (t *testTensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
panic("not implemented")
}

View File

@@ -23,6 +23,7 @@ type Backend interface {
Config() Config
Get(name string) Tensor
NewContext() Context
SystemInfo() string
}
var backends = make(map[string]func(*os.File) (Backend, error))
@@ -43,42 +44,6 @@ func NewBackend(f *os.File) (Backend, error) {
return nil, fmt.Errorf("unsupported backend")
}
// RopeType specifies the type of RoPE (Rotary Position Embedding) to use, these types are implemented in the backend
type RopeType int
const (
RopeTypeStandard RopeType = iota
_ // not yet used
RopeTypeNeoX
)
// RopeConfig contains all configuration for the RoPE (Rotary Position Embedding) operation
type RopeConfig struct {
// PositionIDs contains the position indices for each token in the sequence
// These indices are used to calculate the rotary embeddings
PositionIDs Tensor
// RopeFactors is an optional tensor containing pre-computed rotation factors
RopeFactors Tensor
// RopeDim specifies the dimension size for the rotary embeddings
RopeDim uint32
// RopeType indicates which RoPE variant to use (e.g. normal or neox)
RopeType RopeType
// OrigCtxLen stores the original context length the model was trained with
OrigCtxLen int
// RopeBase is the base value used in the frequency calculation
RopeBase float32
// RopeScale is a scaling factor applied to position indices
RopeScale float32
// YaRN parameters can be added here if they need to be configurable
}
type Context interface {
Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
@@ -111,7 +76,7 @@ type Tensor interface {
Scale(ctx Context, s float64) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
RoPE(ctx Context, rc RopeConfig) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
Tanh(ctx Context) Tensor
GELU(ctx Context) Tensor

View File

@@ -1,11 +1,27 @@
package ggml
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
/*
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
#include <stdlib.h>
#include <stdint.h>
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
COMPILER inline get_compiler() {
#if defined(__clang__)
return COMP_CLANG;
#elif defined(__GNUC__)
return COMP_GCC;
#else
return UNKNOWN_COMPILER;
#endif
}
*/
import "C"
import (
@@ -579,9 +595,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
}
}
func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
if rc.RopeFactors == nil {
rc.RopeFactors = &Tensor{}
const (
ropeTypeNorm C.int = iota
)
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
if ropeFactors == nil {
ropeFactors = &Tensor{}
}
dequant := t.t
@@ -591,15 +611,12 @@ func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
return &Tensor{
t: C.ggml_rope_ext(
ctx.(*Context).ctx,
dequant,
rc.PositionIDs.(*Tensor).t,
rc.RopeFactors.(*Tensor).t,
C.int(rc.RopeDim),
C.int(rc.RopeType),
C.int(rc.OrigCtxLen),
C.float(rc.RopeBase),
C.float(rc.RopeScale),
ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
C.int(ropeDim),
131072, // YaRN n_ctx_train
ropeTypeNorm, // ROPE_TYPE_NORM
C.float(ropeBase),
C.float(ropeScale),
0., // YaRN ext_factor
1., // YaRN attn_factor
32., // YaRN beta_fast
@@ -625,3 +642,34 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
}
}
func (b *Backend) SystemInfo() string {
var compiler string
switch C.get_compiler() {
case C.COMP_UNKNOWN:
compiler = "cgo(unknown_compiler)"
case C.COMP_GCC:
compiler = "cgo(gcc)"
case C.COMP_CLANG:
compiler = "cgo(clang)"
}
var s string
for i := range C.ggml_backend_reg_count() {
reg := C.ggml_backend_reg_get(i)
fName := C.CString("ggml_backend_get_features")
defer C.free(unsafe.Pointer(fName))
get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName)
if get_features_fn != nil {
s += C.GoString(C.ggml_backend_reg_name(reg))
s += " : "
for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
s += C.GoString(features.name)
s += " = "
s += C.GoString(features.value)
s += " | "
}
}
}
return s + compiler
}

View File

@@ -21,6 +21,7 @@ import (
_ "github.com/ollama/ollama/ml/backend"
)
// Options contains the inputs for a model forward pass
type Options struct {
Inputs []int32
Positions []int32
@@ -34,11 +35,13 @@ type config struct {
Cache kvcache.Cache
}
// Base implements the common fields and methods for all models
type Base struct {
b ml.Backend
config
}
// Backend returns the underlying backend that will run the model
func (m *Base) Backend() ml.Backend {
return m.b
}
@@ -47,6 +50,7 @@ func (m *Base) Config() config {
return m.config
}
// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
type Model interface {
Forward(ml.Context, Options) (ml.Tensor, error)
@@ -56,6 +60,7 @@ type Model interface {
var models = make(map[string]func(ml.Config) (Model, error))
// Register registers a model constructor for the given architecture
func Register(name string, f func(ml.Config) (Model, error)) {
if _, ok := models[name]; ok {
panic("model: model already registered")
@@ -64,8 +69,9 @@ func Register(name string, f func(ml.Config) (Model, error)) {
models[name] = f
}
func New(s string) (Model, error) {
r, err := os.Open(s)
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(modelPath string) (Model, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}

View File

@@ -10,10 +10,10 @@ import (
)
type Options struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
ctxLen, hiddenSize, numHeads, numKVHeads int
eps, ropeBase, ropeScale float32
ropeDim uint32
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
hiddenSize, numHeads, numKVHeads int
eps, ropeBase, ropeScale float32
ropeDim uint32
}
type Model struct {
@@ -46,7 +46,6 @@ func New(c ml.Config) (model.Model, error) {
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ctxLen: int(c.Uint("context_length")),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
@@ -68,23 +67,14 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
rc := ml.RopeConfig{
PositionIDs: positionIDs,
RopeFactors: opts.RopeFactors,
RopeDim: opts.ropeDim,
RopeType: ml.RopeTypeStandard,
OrigCtxLen: opts.ctxLen,
RopeBase: opts.ropeBase,
RopeScale: opts.ropeScale,
}
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, rc)
q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, rc)
k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -109,18 +99,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(
ctx,
ml.RopeConfig{
PositionIDs: shift,
RopeFactors: m.Options.RopeFactors,
RopeDim: m.Options.ropeDim,
RopeType: ml.RopeTypeStandard,
OrigCtxLen: m.Options.ctxLen,
RopeBase: m.Options.ropeBase,
RopeScale: m.Options.ropeScale,
},
), nil
return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
}
type MLP struct {

View File

@@ -19,23 +19,14 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
rc := ml.RopeConfig{
PositionIDs: positions,
RopeFactors: opts.RopeFactors,
RopeDim: opts.ropeDim,
RopeType: ml.RopeTypeStandard,
OrigCtxLen: opts.ctxLen,
RopeBase: opts.ropeBase,
RopeScale: opts.ropeScale,
}
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = query.RoPE(ctx, rc)
query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key = key.RoPE(ctx, rc)
key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -61,18 +52,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
// This will only get called for layers in the cache, which are just the self attention layers
return key.RoPE(
ctx,
ml.RopeConfig{
PositionIDs: shift,
RopeFactors: m.RopeFactors,
RopeDim: m.ropeDim,
RopeType: ml.RopeTypeStandard,
OrigCtxLen: m.ctxLen,
RopeBase: m.ropeBase,
RopeScale: m.ropeScale,
},
), nil
return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
}
type TextMLP struct {
@@ -209,9 +189,9 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, cr
type TextModelOptions struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
ctxLen, hiddenSize, numHeads, numKVHeads int
eps, ropeBase, ropeScale float32
ropeDim uint32
hiddenSize, numHeads, numKVHeads int
eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []uint32
}

View File

@@ -3,5 +3,4 @@ package models
import (
_ "github.com/ollama/ollama/model/models/llama"
_ "github.com/ollama/ollama/model/models/mllama"
_ "github.com/ollama/ollama/model/models/qwen2"
)

View File

@@ -1,222 +0,0 @@
package qwen2
import (
"math"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type Options struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
contextLength int
hiddenSize int
numAttnHeads int
numKVHeads int
modelEpsilon float32
ropeBaseFreq float32
ropeFreqScale float32
ropeDimensions uint32
}
type Model struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
*Options
}
func New(c ml.Config) (model.Model, error) {
m := &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},
),
Layers: make([]Layer, c.Uint("block_count")),
Options: &Options{
hiddenSize: int(c.Uint("embedding_length")),
numAttnHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
modelEpsilon: c.Float("attention.layer_norm_rms_epsilon"),
contextLength: int(c.Uint("context_length")),
ropeBaseFreq: c.Float("rope.freq_base"),
ropeFreqScale: c.Float("rope.freq_scale", 1),
ropeDimensions: c.Uint("rope.dimension_count", 64),
},
}
m.Cache = kvcache.NewCausalCache(m.Shift)
return m, nil
}
// Shift applies rotary position embeddings to the key tensor for causal attention caching
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(
ctx,
ml.RopeConfig{
PositionIDs: shift,
RopeFactors: m.Options.RopeFactors,
RopeDim: m.Options.ropeDimensions,
RopeType: ml.RopeTypeNeoX,
OrigCtxLen: m.Options.contextLength,
RopeBase: m.Options.ropeBaseFreq,
RopeScale: m.Options.ropeFreqScale,
},
), nil
}
// SelfAttention implements the multi-head self-attention mechanism
// with separate projections for query, key, value and output transformations
type SelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, inputPositions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
// Initialize dimensions and configuration
batchSize := hiddenState.Dim(1)
headDimension := opts.hiddenSize / opts.numAttnHeads
ropeConfig := ml.RopeConfig{
PositionIDs: inputPositions,
RopeFactors: nil,
RopeDim: opts.ropeDimensions,
RopeType: ml.RopeTypeNeoX,
OrigCtxLen: opts.contextLength,
RopeBase: opts.ropeBaseFreq,
RopeScale: opts.ropeFreqScale,
}
// Project and reshape query states with rotary embeddings
queryStates := sa.Query.Forward(ctx, hiddenState)
queryStates = queryStates.Reshape(ctx, headDimension, opts.numAttnHeads, batchSize)
queryStates = queryStates.RoPE(ctx, ropeConfig)
// Project and reshape key states with rotary embeddings
keyStates := sa.Key.Forward(ctx, hiddenState)
keyStates = keyStates.Reshape(ctx, headDimension, opts.numKVHeads, batchSize)
keyStates = keyStates.RoPE(ctx, ropeConfig)
// Project and reshape value states
valueStates := sa.Value.Forward(ctx, hiddenState)
valueStates = valueStates.Reshape(ctx, headDimension, opts.numKVHeads, batchSize)
// Update and retrieve from KV cache
cache.Put(ctx, keyStates, valueStates)
keyStates, valueStates, attentionMask := cache.Get(ctx)
// Prepare tensors for attention computation
queryStates = queryStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
keyStates = keyStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
valueStates = valueStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
// Apply scaling and attention mask to scores
attentionScores := keyStates.MulmatFullPrec(ctx, queryStates)
attentionScores = attentionScores.Scale(ctx, 1.0/math.Sqrt(float64(headDimension)))
attentionScores = attentionScores.Add(ctx, attentionMask)
// Compute scaled dot-product attention
attentionProbs := attentionScores.Softmax(ctx)
// Apply attention weights and reshape
weightedStates := valueStates.Mulmat(ctx, attentionProbs)
weightedStates = weightedStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
weightedStates = weightedStates.Reshape(ctx, opts.hiddenSize, batchSize)
// Project to output dimension
return sa.Output.Forward(ctx, weightedStates)
}
// MLP implements the feed-forward network component with SwiGLU activation
type MLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
Gate *nn.Linear `gguf:"ffn_gate"`
}
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
// Apply SwiGLU activation gating
gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
upProjection := mlp.Up.Forward(ctx, hiddenState)
intermediateStates := gateActivation.Mul(ctx, upProjection)
// Project back to hidden dimension
return mlp.Down.Forward(ctx, intermediateStates)
}
// Layer represents a single transformer layer combining self-attention and feed-forward components
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
SelfAttention *SelfAttention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
// Self-attention branch with residual connection
residual := hiddenState
normalizedAttention := l.AttentionNorm.Forward(ctx, hiddenState, opts.modelEpsilon)
attentionOutput := l.SelfAttention.Forward(ctx, normalizedAttention, positionIDs, cache, opts)
hiddenState = attentionOutput.Add(ctx, residual)
// Feed-forward branch with residual connection
residual = hiddenState
normalizedMLP := l.MLPNorm.Forward(ctx, hiddenState, opts.modelEpsilon)
mlpOutput := l.MLP.Forward(ctx, normalizedMLP, opts)
output := mlpOutput.Add(ctx, residual)
return output
}
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
// Convert input tokens and positions to tensors
inputTensor, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
if err != nil {
return nil, err
}
positionsTensor, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
if err != nil {
return nil, err
}
// Initial token embedding
hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)
// Process through transformer layers
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
}
// Final layer normalization and output projection
normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
logits := m.Output.Forward(ctx, normalizedOutput)
// Extract requested output token positions
outputsTensor, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
if err != nil {
return nil, err
}
return logits.Rows(ctx, outputsTensor), nil
}
func init() {
model.Register("qwen2", New)
}

View File

@@ -813,6 +813,8 @@ func (s *Server) loadModel(
panic(err)
}
slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
// TODO(jessegross): LoRA loading
if lpath.String() != "" {
panic("loras are not yet implemented")
@@ -881,7 +883,6 @@ func Execute(args []string) error {
})
slog.SetDefault(slog.New(handler))
slog.Info("starting ollama engine")
// TODO(jessegross): Some system info would be useful
server := &Server{
batchSize: *batchSize,