Compare commits
1 Commits
v0.7.1
...
brucemacd/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a2cd7b48a |
@@ -51,8 +51,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
|
|||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
|
||||||
|
|
||||||
add_compile_definitions(NDEBUG)
|
|
||||||
|
|
||||||
set(GGML_CPU ON)
|
set(GGML_CPU ON)
|
||||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
|
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
|
||||||
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
|
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
|
||||||
|
|||||||
@@ -405,7 +405,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||||||
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
|
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
|
||||||
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
|
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
|
||||||
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
|
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
|
||||||
- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
|
|
||||||
|
|
||||||
### Cloud
|
### Cloud
|
||||||
|
|
||||||
|
|||||||
56
cmd/cmd.go
56
cmd/cmd.go
@@ -747,38 +747,11 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
|
|||||||
case float64:
|
case float64:
|
||||||
v = fmt.Sprintf("%g", vData)
|
v = fmt.Sprintf("%g", vData)
|
||||||
case []any:
|
case []any:
|
||||||
targetWidth := 10 // Small width where we are displaying the data in a column
|
n := 3
|
||||||
|
if len(vData) < n {
|
||||||
var itemsToShow int
|
n = len(vData)
|
||||||
totalWidth := 1 // Start with 1 for opening bracket
|
|
||||||
|
|
||||||
// Find how many we can fit
|
|
||||||
for i := range vData {
|
|
||||||
itemStr := fmt.Sprintf("%v", vData[i])
|
|
||||||
width := runewidth.StringWidth(itemStr)
|
|
||||||
|
|
||||||
// Add separator width (", ") for all items except the first
|
|
||||||
if i > 0 {
|
|
||||||
width += 2
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if adding this item would exceed our width limit
|
|
||||||
if totalWidth+width > targetWidth && i > 0 {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
totalWidth += width
|
|
||||||
itemsToShow++
|
|
||||||
}
|
|
||||||
|
|
||||||
// Format the output
|
|
||||||
if itemsToShow < len(vData) {
|
|
||||||
v = fmt.Sprintf("%v", vData[:itemsToShow])
|
|
||||||
v = strings.TrimSuffix(v, "]")
|
|
||||||
v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
|
|
||||||
} else {
|
|
||||||
v = fmt.Sprintf("%v", vData)
|
|
||||||
}
|
}
|
||||||
|
v = fmt.Sprintf("%v", vData[:n])
|
||||||
default:
|
default:
|
||||||
v = fmt.Sprintf("%T", vData)
|
v = fmt.Sprintf("%T", vData)
|
||||||
}
|
}
|
||||||
@@ -799,19 +772,10 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
|
|||||||
|
|
||||||
head := func(s string, n int) (rows [][]string) {
|
head := func(s string, n int) (rows [][]string) {
|
||||||
scanner := bufio.NewScanner(strings.NewReader(s))
|
scanner := bufio.NewScanner(strings.NewReader(s))
|
||||||
count := 0
|
for scanner.Scan() && (len(rows) < n || n < 0) {
|
||||||
for scanner.Scan() {
|
if text := scanner.Text(); text != "" {
|
||||||
text := strings.TrimSpace(scanner.Text())
|
rows = append(rows, []string{"", strings.TrimSpace(text)})
|
||||||
if text == "" {
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
count++
|
|
||||||
if n < 0 || count <= n {
|
|
||||||
rows = append(rows, []string{"", text})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if n >= 0 && count > n {
|
|
||||||
rows = append(rows, []string{"", "..."})
|
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -1236,11 +1200,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := client.Heartbeat(cmd.Context()); err != nil {
|
if err := client.Heartbeat(cmd.Context()); err != nil {
|
||||||
if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
|
if !strings.Contains(err.Error(), " refused") {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := startApp(cmd.Context(), client); err != nil {
|
if err := startApp(cmd.Context(), client); err != nil {
|
||||||
return fmt.Errorf("ollama server not responding - %w", err)
|
return errors.New("could not connect to ollama app, is it running?")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@@ -1318,7 +1282,7 @@ func NewCLI() *cobra.Command {
|
|||||||
}
|
}
|
||||||
|
|
||||||
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
||||||
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
|
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
|
||||||
|
|
||||||
showCmd := &cobra.Command{
|
showCmd := &cobra.Command{
|
||||||
Use: "show MODEL",
|
Use: "show MODEL",
|
||||||
|
|||||||
@@ -225,7 +225,6 @@ Weigh anchor!
|
|||||||
System
|
System
|
||||||
You are a pirate!
|
You are a pirate!
|
||||||
Ahoy, matey!
|
Ahoy, matey!
|
||||||
...
|
|
||||||
|
|
||||||
`
|
`
|
||||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||||
|
|||||||
@@ -4,27 +4,17 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unsafe"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"golang.org/x/sys/windows"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
Installer = "OllamaSetup.exe"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func startApp(ctx context.Context, client *api.Client) error {
|
func startApp(ctx context.Context, client *api.Client) error {
|
||||||
if len(isProcRunning(Installer)) > 0 {
|
// log.Printf("XXX Attempting to find and start ollama app")
|
||||||
return fmt.Errorf("upgrade in progress...")
|
|
||||||
}
|
|
||||||
AppName := "ollama app.exe"
|
AppName := "ollama app.exe"
|
||||||
exe, err := os.Executable()
|
exe, err := os.Executable()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -66,41 +56,3 @@ func startApp(ctx context.Context, client *api.Client) error {
|
|||||||
}
|
}
|
||||||
return waitForServer(ctx, client)
|
return waitForServer(ctx, client)
|
||||||
}
|
}
|
||||||
|
|
||||||
func isProcRunning(procName string) []uint32 {
|
|
||||||
pids := make([]uint32, 2048)
|
|
||||||
var ret uint32
|
|
||||||
if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
|
|
||||||
slog.Debug("failed to check for running installers", "error", err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
pids = pids[:ret]
|
|
||||||
var matches []uint32
|
|
||||||
for _, pid := range pids {
|
|
||||||
if pid == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
defer windows.CloseHandle(hProcess)
|
|
||||||
var module windows.Handle
|
|
||||||
var cbNeeded uint32
|
|
||||||
cb := (uint32)(unsafe.Sizeof(module))
|
|
||||||
if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
var sz uint32 = 1024 * 8
|
|
||||||
moduleName := make([]uint16, sz)
|
|
||||||
cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
|
|
||||||
if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
|
|
||||||
if strings.EqualFold(exeFile, procName) {
|
|
||||||
matches = append(matches, pid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return matches
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -53,11 +53,8 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, sv := range t.SpecialVocabulary {
|
for _, sv := range t.SpecialVocabulary {
|
||||||
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
|
|
||||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
||||||
if len(sv.IDs) > 0 {
|
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
|
||||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return kv
|
return kv
|
||||||
|
|||||||
@@ -139,8 +139,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
|
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
||||||
strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
|
|
||||||
if !p.skipRepack {
|
if !p.skipRepack {
|
||||||
t.SetRepacker(p.repack)
|
t.SetRepacker(p.repack)
|
||||||
}
|
}
|
||||||
@@ -182,9 +181,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
|
|||||||
}
|
}
|
||||||
|
|
||||||
var heads uint32
|
var heads uint32
|
||||||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
|
if strings.HasSuffix(name, "attn_q.weight") {
|
||||||
heads = p.NumAttentionHeads
|
heads = p.NumAttentionHeads
|
||||||
} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
|
} else if strings.HasSuffix(name, "attn_k.weight") {
|
||||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
||||||
|
|||||||
@@ -94,9 +94,7 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|||||||
var out []*ggml.Tensor
|
var out []*ggml.Tensor
|
||||||
var text []Tensor
|
var text []Tensor
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
|
if t.Name() == "v.position_embd.gate" {
|
||||||
text = append(text, t)
|
|
||||||
} else if t.Name() == "v.position_embd.gate" {
|
|
||||||
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
||||||
tt := t.Clone()
|
tt := t.Clone()
|
||||||
tt.SetRepacker(m.repack(name))
|
tt.SetRepacker(m.repack(name))
|
||||||
@@ -107,21 +105,23 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|||||||
WriterTo: tt,
|
WriterTo: tt,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
} else {
|
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
||||||
if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
t.SetRepacker(m.repack(t.Name()))
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
|
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
}
|
|
||||||
|
|
||||||
out = append(out, &ggml.Tensor{
|
out = append(out, &ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
WriterTo: t,
|
WriterTo: t,
|
||||||
})
|
})
|
||||||
|
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
||||||
|
out = append(out, &ggml.Tensor{
|
||||||
|
Name: t.Name(),
|
||||||
|
Kind: t.Kind(),
|
||||||
|
Shape: t.Shape(),
|
||||||
|
WriterTo: t,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
text = append(text, t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,35 +137,16 @@ func (m *mllamaModel) repack(name string) Repacker {
|
|||||||
|
|
||||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||||
|
|
||||||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
|
t, err = tensor.Tanh(t)
|
||||||
heads := m.VisionModel.AttentionHeads
|
if err != nil {
|
||||||
if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
return nil, err
|
||||||
return nil, err
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.T(0, 2, 1, 3); err != nil {
|
if name == "v.position_embd.gate" {
|
||||||
return nil, err
|
t, err = tensor.Sub(float32(1), t)
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.Reshape(dims...); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.Transpose(); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
t, err = tensor.Tanh(t)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if name == "v.position_embd.gate" {
|
|
||||||
t, err = tensor.Sub(float32(1), t)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
t = tensor.Materialize(t)
|
t = tensor.Materialize(t)
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
|
|||||||
}
|
}
|
||||||
t.Cleanup(func() { r.Close() })
|
t.Cleanup(func() { r.Close() })
|
||||||
|
|
||||||
m, err := ggml.Decode(r, -1)
|
m, _, err := ggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
|
|||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
m, err := ggml.Decode(r, -1)
|
m, _, err := ggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,7 +110,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
||||||
// noop
|
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
} else {
|
} else {
|
||||||
@@ -172,34 +171,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
|
|
||||||
} else if err != nil {
|
|
||||||
return nil, err
|
|
||||||
} else {
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
var p map[string]json.RawMessage
|
|
||||||
if err := json.NewDecoder(f).Decode(&p); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, st := range specialTokenTypes {
|
|
||||||
if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
|
|
||||||
var ids []int32
|
|
||||||
if err := json.Unmarshal(bts, &ids); err != nil {
|
|
||||||
// value is not a list so the existing ID is used
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
|
|
||||||
return sv.Type == st
|
|
||||||
}); i >= 0 {
|
|
||||||
t.SpecialVocabulary[i].IDs = ids
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return t, nil
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -309,9 +280,6 @@ type SpecialVocabulary struct {
|
|||||||
ID int
|
ID int
|
||||||
Content string
|
Content string
|
||||||
AddToken bool
|
AddToken bool
|
||||||
|
|
||||||
// IDs is populated by generation_config.json
|
|
||||||
IDs []int32
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sv SpecialVocabulary) Key() string {
|
func (sv SpecialVocabulary) Key() string {
|
||||||
|
|||||||
@@ -247,67 +247,6 @@ func TestParseTokenizer(t *testing.T) {
|
|||||||
Pre: "default",
|
Pre: "default",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "generation config eos token ids",
|
|
||||||
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
|
|
||||||
"tokenizer.json": strings.NewReader(`{
|
|
||||||
"added_tokens": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"content": "<bos>",
|
|
||||||
"special": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"content": "<eos>",
|
|
||||||
"special": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"content": "<eot>",
|
|
||||||
"special": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"content": "<eom>",
|
|
||||||
"special": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"model": {
|
|
||||||
"vocab": {
|
|
||||||
"<bos>": 0,
|
|
||||||
"<eos>": 1,
|
|
||||||
"<eot>": 2,
|
|
||||||
"<eom>": 3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}`),
|
|
||||||
"tokenizer_config.json": strings.NewReader(`{
|
|
||||||
"add_bos_token": true,
|
|
||||||
"add_eos_token": false,
|
|
||||||
"bos_token": "<bos>",
|
|
||||||
"eos_token": "<eos>"
|
|
||||||
}`),
|
|
||||||
"generation_config.json": strings.NewReader(`{
|
|
||||||
"bos_token_id": 0,
|
|
||||||
"eos_token_id": [1, 2, 3]
|
|
||||||
}`),
|
|
||||||
}),
|
|
||||||
specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
|
|
||||||
want: &Tokenizer{
|
|
||||||
Vocabulary: &Vocabulary{
|
|
||||||
Model: "gpt2",
|
|
||||||
Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
|
|
||||||
Scores: []float32{0, 1, 2, 3},
|
|
||||||
Types: []int32{3, 3, 3, 3},
|
|
||||||
},
|
|
||||||
SpecialVocabulary: []*SpecialVocabulary{
|
|
||||||
{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
|
|
||||||
{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
|
|
||||||
},
|
|
||||||
Pre: "default",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range cases {
|
for _, tt := range cases {
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"math"
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -15,7 +16,6 @@ import (
|
|||||||
type GGML struct {
|
type GGML struct {
|
||||||
container
|
container
|
||||||
model
|
model
|
||||||
Length int64
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type model interface {
|
type model interface {
|
||||||
@@ -387,12 +387,12 @@ func DetectContentType(b []byte) string {
|
|||||||
//
|
//
|
||||||
// It collects array values for arrays with a size less than or equal to
|
// It collects array values for arrays with a size less than or equal to
|
||||||
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
||||||
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||||
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
||||||
|
|
||||||
var magic uint32
|
var magic uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var c container
|
var c container
|
||||||
@@ -402,25 +402,24 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
|||||||
case FILE_MAGIC_GGUF_BE:
|
case FILE_MAGIC_GGUF_BE:
|
||||||
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
||||||
default:
|
default:
|
||||||
return nil, errors.New("invalid file magic")
|
return nil, 0, errors.New("invalid file magic")
|
||||||
}
|
}
|
||||||
|
|
||||||
model, err := c.Decode(rs)
|
model, err := c.Decode(rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// final model type
|
// final model type
|
||||||
return &GGML{
|
return &GGML{
|
||||||
container: c,
|
container: c,
|
||||||
model: model,
|
model: model,
|
||||||
Length: offset,
|
}, offset, nil
|
||||||
}, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||||
@@ -654,15 +653,24 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
case "qwen25vl":
|
case "qwen25vl":
|
||||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
||||||
|
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
|
||||||
|
temporalPatchSize := uint64(2)
|
||||||
|
|
||||||
numPatches := maxPixels / (patchSize * patchSize)
|
// Calculate max possible patches based on max_pixels
|
||||||
|
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
|
||||||
|
maxWidth := maxPixels / maxHeight
|
||||||
|
maxGridHeight := maxHeight / patchSize
|
||||||
|
maxGridWidth := maxWidth / patchSize
|
||||||
|
// Account for merged patches (2x2 grid)
|
||||||
|
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
|
||||||
|
|
||||||
|
// Calculate graph size based on typical operations in ProcessImage and createPatches
|
||||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
||||||
// Normalized pixels
|
// Normalized pixels
|
||||||
maxPixels*numChannels +
|
maxPixels*numChannels +
|
||||||
// Patches storage (numPatches * channels * patchSize^2)
|
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
|
||||||
numPatches*numChannels*patchSize*patchSize +
|
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
|
||||||
// Self-attention calculations
|
// Self-attention calculations (similar to other architectures)
|
||||||
numPatches*numPatches*headCount +
|
numPatches*numPatches*headCount +
|
||||||
// Additional buffer for processing
|
// Additional buffer for processing
|
||||||
embeddingLength*numPatches)
|
embeddingLength*numPatches)
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) {
|
|||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
ff, err := Decode(r, 0)
|
ff, _, err := Decode(r, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
|
|||||||
}
|
}
|
||||||
testCases := []testCase{
|
testCases := []testCase{
|
||||||
{
|
{
|
||||||
model: "qwen2.5vl",
|
model: "llava:7b",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
model: "llama3.2-vision",
|
model: "llama3.2-vision",
|
||||||
@@ -60,7 +60,6 @@ func TestVisionModels(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestIntegrationSplitBatch(t *testing.T) {
|
func TestIntegrationSplitBatch(t *testing.T) {
|
||||||
skipUnderMinVRAM(t, 6)
|
|
||||||
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
req := api.GenerateRequest{
|
req := api.GenerateRequest{
|
||||||
|
|||||||
@@ -211,9 +211,10 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
|
|||||||
c.curCellRange.max = len(c.cells) - 1
|
c.curCellRange.max = len(c.cells) - 1
|
||||||
}
|
}
|
||||||
|
|
||||||
c.curMask = c.buildMask(ctx)
|
var err error
|
||||||
|
c.curMask, err = c.buildMask(ctx)
|
||||||
|
|
||||||
return nil
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func newRange() cellRange {
|
func newRange() cellRange {
|
||||||
@@ -296,7 +297,7 @@ func roundUp(length, pad int) int {
|
|||||||
// Builds a mask of history x batch indicating whether for each token in the batch the
|
// Builds a mask of history x batch indicating whether for each token in the batch the
|
||||||
// token in the history should apply. This is based on both the sequence and causality (the
|
// token in the history should apply. This is based on both the sequence and causality (the
|
||||||
// position of the history is not ahead of the token in the batch).
|
// position of the history is not ahead of the token in the batch).
|
||||||
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||||
// Align and pad the two dimensions as required by the backend
|
// Align and pad the two dimensions as required by the backend
|
||||||
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
||||||
|
|
||||||
@@ -324,7 +325,10 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
|||||||
mask[i] = float32(math.Inf(-1))
|
mask[i] = float32(math.Inf(-1))
|
||||||
}
|
}
|
||||||
|
|
||||||
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if c.config.MaskDType != ml.DTypeF32 {
|
if c.config.MaskDType != ml.DTypeF32 {
|
||||||
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
||||||
@@ -332,7 +336,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
|||||||
maskTensor = out
|
maskTensor = out
|
||||||
}
|
}
|
||||||
|
|
||||||
return maskTensor
|
return maskTensor, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
||||||
@@ -487,7 +491,12 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
|
|||||||
if !slices.Equal(c.opts.Except, opts.Except) {
|
if !slices.Equal(c.opts.Except, opts.Except) {
|
||||||
c.opts = opts
|
c.opts = opts
|
||||||
if ctx != nil {
|
if ctx != nil {
|
||||||
c.curMask = c.buildMask(ctx)
|
var err error
|
||||||
|
c.curMask, err = c.buildMask(ctx)
|
||||||
|
if err != nil {
|
||||||
|
// This error should never occur because we have previously built a mask with the same shape
|
||||||
|
panic(fmt.Errorf("SetCausal: %w", err))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -643,7 +652,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
|
kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
for i, key := range c.keys {
|
for i, key := range c.keys {
|
||||||
if key == nil {
|
if key == nil {
|
||||||
|
|||||||
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
|
|||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor := context.FromFloatSlice(test.in, test.inShape...)
|
tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
out, _, mask := cache.Get(context)
|
out, _, mask := cache.Get(context)
|
||||||
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
// with window size 4, nothing has slid out of the window yet
|
// with window size 4, nothing has slid out of the window yet
|
||||||
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
// only the latest position has overlapping windows
|
// only the latest position has overlapping windows
|
||||||
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
|||||||
return c.Empty(dtype, shape...)
|
return c.Empty(dtype, shape...)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||||
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||||
|
|
||||||
copy(t.data, s)
|
copy(t.data, s)
|
||||||
|
|
||||||
return t
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||||
f := make([]float32, len(s))
|
f := make([]float32, len(s))
|
||||||
for i := range f {
|
for i := range f {
|
||||||
f[i] = float32(s[i])
|
f[i] = float32(s[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
out := c.FromFloatSlice(f, shape...)
|
out, _ := c.FromFloatSlice(f, shape...)
|
||||||
out.(*testTensor).dtype = ml.DTypeI32
|
out.(*testTensor).dtype = ml.DTypeI32
|
||||||
|
|
||||||
return out
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
|
|||||||
s = append(s, i)
|
s = append(s, i)
|
||||||
}
|
}
|
||||||
|
|
||||||
out := c.FromFloatSlice(s, len(s))
|
out, _ := c.FromFloatSlice(s, len(s))
|
||||||
out.(*testTensor).dtype = dtype
|
out.(*testTensor).dtype = dtype
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
|||||||
|
|
||||||
func (c *testContext) Compute(...ml.Tensor) {}
|
func (c *testContext) Compute(...ml.Tensor) {}
|
||||||
|
|
||||||
func (c *testContext) Reserve() {}
|
func (c *testContext) Reserve() error { return nil }
|
||||||
|
|
||||||
func (c *testContext) MaxGraphNodes() int {
|
func (c *testContext) MaxGraphNodes() int {
|
||||||
return 10
|
return 10
|
||||||
|
|||||||
@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
|
|||||||
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
|
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
|
||||||
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
||||||
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
||||||
cparams.penalty_present = C.float(params.PenaltyPresent)
|
cparams.penalty_present = C.float(params.PenaltyFreq)
|
||||||
cparams.seed = C.uint32_t(params.Seed)
|
cparams.seed = C.uint32_t(params.Seed)
|
||||||
|
|
||||||
grammar := C.CString(params.Grammar)
|
grammar := C.CString(params.Grammar)
|
||||||
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
|
|||||||
defer C.free(unsafe.Pointer(cStr))
|
defer C.free(unsafe.Pointer(cStr))
|
||||||
|
|
||||||
// Allocate buffer for grammar based on schema length but with upper bound
|
// Allocate buffer for grammar based on schema length but with upper bound
|
||||||
maxLen := max(32768, min(1024*1024, len(schema)*4))
|
maxLen := min(1024*1024, len(schema)*4)
|
||||||
buf := make([]byte, maxLen)
|
buf := make([]byte, maxLen)
|
||||||
|
|
||||||
// Call C function to convert schema to grammar
|
// Call C function to convert schema to grammar
|
||||||
@@ -602,7 +602,7 @@ type Grammar struct {
|
|||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
|
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
|
||||||
cGrammar := C.CString(grammar)
|
cGrammar := C.CString(grammar)
|
||||||
defer C.free(unsafe.Pointer(cGrammar))
|
defer C.free(unsafe.Pointer(cGrammar))
|
||||||
|
|
||||||
@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
|
|||||||
cEogTokens[i] = C.uint32_t(token)
|
cEogTokens[i] = C.uint32_t(token)
|
||||||
}
|
}
|
||||||
|
|
||||||
g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
|
g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
|
||||||
if g == nil {
|
if g == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,156 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jesse Gross <jesse@ollama.com>
|
|
||||||
Date: Fri, 18 Apr 2025 15:58:19 -0700
|
|
||||||
Subject: [PATCH] graph memory reporting on failure
|
|
||||||
|
|
||||||
---
|
|
||||||
ggml/include/ggml-alloc.h | 6 ++++++
|
|
||||||
ggml/include/ggml-backend.h | 6 ++++++
|
|
||||||
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
|
|
||||||
ggml/src/ggml-backend.cpp | 10 ++++++++++
|
|
||||||
4 files changed, 56 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
|
||||||
index 2cb150fd..781b1e10 100644
|
|
||||||
--- a/ggml/include/ggml-alloc.h
|
|
||||||
+++ b/ggml/include/ggml-alloc.h
|
|
||||||
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
||||||
|
|
||||||
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
|
||||||
|
|
||||||
+struct ggml_allocr_buffer_status {
|
|
||||||
+ size_t size;
|
|
||||||
+ bool allocated;
|
|
||||||
+};
|
|
||||||
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
|
||||||
+
|
|
||||||
// Utils
|
|
||||||
// Create a buffer and allocate all the tensors in a ggml_context
|
|
||||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
|
||||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
|
||||||
index 778927f6..74e46716 100644
|
|
||||||
--- a/ggml/include/ggml-backend.h
|
|
||||||
+++ b/ggml/include/ggml-backend.h
|
|
||||||
@@ -304,6 +304,12 @@ extern "C" {
|
|
||||||
|
|
||||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
|
|
||||||
+ struct ggml_backend_buffer_status {
|
|
||||||
+ size_t size;
|
|
||||||
+ bool allocated;
|
|
||||||
+ };
|
|
||||||
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
+
|
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
|
||||||
index 5fd379f6..04812990 100644
|
|
||||||
--- a/ggml/src/ggml-alloc.c
|
|
||||||
+++ b/ggml/src/ggml-alloc.c
|
|
||||||
@@ -364,6 +364,7 @@ struct node_alloc {
|
|
||||||
struct ggml_gallocr {
|
|
||||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
|
||||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
|
||||||
+ size_t *buffer_sizes; // [n_buffers]
|
|
||||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
|
||||||
int n_buffers;
|
|
||||||
|
|
||||||
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|
||||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
|
||||||
GGML_ASSERT(galloc->buffers != NULL);
|
|
||||||
|
|
||||||
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
|
||||||
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
|
|
||||||
+
|
|
||||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
|
||||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
|
||||||
|
|
||||||
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|
||||||
ggml_hash_set_free(&galloc->hash_set);
|
|
||||||
free(galloc->hash_values);
|
|
||||||
free(galloc->bufts);
|
|
||||||
+ free(galloc->buffer_sizes);
|
|
||||||
free(galloc->buffers);
|
|
||||||
free(galloc->buf_tallocs);
|
|
||||||
free(galloc->node_allocs);
|
|
||||||
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+ bool success = true;
|
|
||||||
+
|
|
||||||
// reallocate buffers if needed
|
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
||||||
// if the buffer type is used multiple times, we reuse the same buffer
|
|
||||||
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
|
||||||
- if (galloc->buffers[i] == NULL) {
|
|
||||||
+ if (galloc->buffers[i]) {
|
|
||||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
+ } else {
|
|
||||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
||||||
- return false;
|
|
||||||
+ galloc->buffer_sizes[i] = new_size;
|
|
||||||
+ success = false;
|
|
||||||
}
|
|
||||||
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
+ } else {
|
|
||||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
- return true;
|
|
||||||
+ return success;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
|
||||||
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
||||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
|
||||||
}
|
|
||||||
|
|
||||||
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
||||||
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
|
||||||
+
|
|
||||||
+ for (int i = 0; i < buffer_id; i++) {
|
|
||||||
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
|
|
||||||
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
|
|
||||||
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
|
|
||||||
+ // case and we still want to know the attempted size.
|
|
||||||
+
|
|
||||||
+ struct ggml_allocr_buffer_status status = {0, true};
|
|
||||||
+ return status;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
|
|
||||||
+ return status;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
// utils
|
|
||||||
|
|
||||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
|
||||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
|
||||||
index 0ce73a99..be335e8c 100644
|
|
||||||
--- a/ggml/src/ggml-backend.cpp
|
|
||||||
+++ b/ggml/src/ggml-backend.cpp
|
|
||||||
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
|
||||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
||||||
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
||||||
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
+
|
|
||||||
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
|
|
||||||
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
|
|
||||||
+
|
|
||||||
+ return status;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
@@ -1,9 +1,12 @@
|
|||||||
package llm
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"cmp"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"maps"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -82,11 +85,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
var graphOffload uint64
|
var graphOffload uint64
|
||||||
|
|
||||||
// Projectors loaded into GPU0 only
|
// Projectors loaded into GPU0 only
|
||||||
var llamaEngineProjectorWeights uint64
|
var projectorWeights uint64
|
||||||
|
var projectorGraph uint64
|
||||||
// Projectors loaded with output layer
|
|
||||||
var ollamaEngineProjectorWeights uint64
|
|
||||||
var ollamaEngineProjectorGraph uint64
|
|
||||||
|
|
||||||
// Conditional output size on GPU 0
|
// Conditional output size on GPU 0
|
||||||
var memoryLayerOutput uint64
|
var memoryLayerOutput uint64
|
||||||
@@ -111,23 +111,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
||||||
|
|
||||||
for _, projector := range projectors {
|
for _, projector := range projectors {
|
||||||
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
|
weight := projectorMemoryRequirements(projector)
|
||||||
|
projectorWeights += weight
|
||||||
|
|
||||||
// multimodal models require at least 2048 context
|
// multimodal models require at least 2048 context
|
||||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||||
}
|
}
|
||||||
if llamaEngineProjectorWeights == 0 {
|
if projectorWeights == 0 && projectorGraph == 0 {
|
||||||
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
|
projectorWeights, projectorGraph = f.VisionGraphSize()
|
||||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
layers := f.Tensors().GroupLayers()
|
layers := f.Tensors().GroupLayers()
|
||||||
// add one layer worth of memory as a buffer
|
// add one layer (chosing the max layer) worth of memory as a buffer
|
||||||
if blk0, ok := layers["blk.0"]; ok {
|
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
|
||||||
layerSize = blk0.Size()
|
return cmp.Compare(a.Size(), b.Size())
|
||||||
} else {
|
}).Size()
|
||||||
slog.Warn("model missing blk.0 layer size")
|
|
||||||
}
|
|
||||||
|
|
||||||
var kvct string
|
var kvct string
|
||||||
if envconfig.FlashAttention() &&
|
if envconfig.FlashAttention() &&
|
||||||
@@ -165,7 +163,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
graphFullOffload = graphPartialOffload
|
graphFullOffload = graphPartialOffload
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output layer handled at the end if we have space
|
|
||||||
if layer, ok := layers["output_norm"]; ok {
|
if layer, ok := layers["output_norm"]; ok {
|
||||||
memoryLayerOutput += layer.Size()
|
memoryLayerOutput += layer.Size()
|
||||||
}
|
}
|
||||||
@@ -175,7 +172,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
memoryLayerOutput += layer.Size()
|
memoryLayerOutput += layer.Size()
|
||||||
}
|
}
|
||||||
|
|
||||||
gpuZeroOverhead := llamaEngineProjectorWeights
|
// Output layer handled at the end if we have space
|
||||||
|
gpuZeroOverhead := projectorWeights + projectorGraph
|
||||||
|
|
||||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||||
var layerCount int
|
var layerCount int
|
||||||
@@ -218,8 +216,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
if len(gpusWithSpace) > 0 {
|
if len(gpusWithSpace) > 0 {
|
||||||
gpuZeroID = gpusWithSpace[0].i
|
gpuZeroID = gpusWithSpace[0].i
|
||||||
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
||||||
} else {
|
|
||||||
overflow += gpuZeroOverhead
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// For all the layers, find where they can fit on the GPU(s)
|
// For all the layers, find where they can fit on the GPU(s)
|
||||||
@@ -260,24 +256,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Determine if we need to consider output then find where it fits
|
// Determine if we need to consider output then find where it fits
|
||||||
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
|
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
||||||
if memoryLastLayer > 0 {
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
|
g := gpusWithSpace[layerCount%j]
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
g := gpusWithSpace[layerCount%j]
|
if g.g.FreeMemory > overhead+used+memoryLayerOutput {
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
gpuAllocations[g.i] += memoryLayerOutput
|
||||||
if g.g.FreeMemory > overhead+used+memoryLastLayer {
|
layerCounts[g.i]++
|
||||||
gpuAllocations[g.i] += memoryLastLayer
|
layerCount++
|
||||||
layerCounts[g.i]++
|
break
|
||||||
layerCount++
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if layerCount < int(f.KV().BlockCount())+1 {
|
if layerCount < int(f.KV().BlockCount())+1 {
|
||||||
fullyLoaded = false
|
fullyLoaded = false
|
||||||
overflow += memoryLastLayer
|
overflow += memoryLayerOutput
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,8 +328,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
memoryLayerOutput: memoryLayerOutput,
|
memoryLayerOutput: memoryLayerOutput,
|
||||||
graphFullOffload: graphFullOffload,
|
graphFullOffload: graphFullOffload,
|
||||||
graphPartialOffload: graphPartialOffload,
|
graphPartialOffload: graphPartialOffload,
|
||||||
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
|
projectorWeights: projectorWeights,
|
||||||
projectorGraph: ollamaEngineProjectorGraph,
|
projectorGraph: projectorGraph,
|
||||||
}
|
}
|
||||||
|
|
||||||
if gpus[0].Library == "cpu" {
|
if gpus[0].Library == "cpu" {
|
||||||
@@ -422,7 +415,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
|
|||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
ggml, err := ggml.Decode(file, 1024)
|
ggml, _, err := ggml.Decode(file, 1024)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
|||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
ggml, err := ggml.Decode(f, maxArraySize)
|
ggml, _, err := ggml.Decode(f, maxArraySize)
|
||||||
return ggml, err
|
return ggml, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
121
ml/backend.go
121
ml/backend.go
@@ -6,6 +6,7 @@ import (
|
|||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"os"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -14,11 +15,6 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Backend interface {
|
type Backend interface {
|
||||||
Load(ctx context.Context, progress func(float32)) error
|
|
||||||
|
|
||||||
// BackendMemory returns the memory allocations that were made for this model
|
|
||||||
BackendMemory() BackendMemory
|
|
||||||
|
|
||||||
Config() fs.Config
|
Config() fs.Config
|
||||||
Get(name string) Tensor
|
Get(name string) Tensor
|
||||||
NewContext() Context
|
NewContext() Context
|
||||||
@@ -56,6 +52,10 @@ type CacheConfig struct {
|
|||||||
|
|
||||||
// BackendParams controls how the backend loads and executes models
|
// BackendParams controls how the backend loads and executes models
|
||||||
type BackendParams struct {
|
type BackendParams struct {
|
||||||
|
// Progress is a callback function that allows reporting percentage completion
|
||||||
|
// of model loading
|
||||||
|
Progress func(float32)
|
||||||
|
|
||||||
// NumThreads sets the number of threads to use if running on the CPU
|
// NumThreads sets the number of threads to use if running on the CPU
|
||||||
NumThreads int
|
NumThreads int
|
||||||
|
|
||||||
@@ -72,87 +72,9 @@ type BackendParams struct {
|
|||||||
FlashAttention bool
|
FlashAttention bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// ErrNoMem is returned when panicing due to insufficient memory. It includes
|
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
|
||||||
// the attempted memory allocation.
|
|
||||||
type ErrNoMem struct {
|
|
||||||
BackendMemory
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e ErrNoMem) Error() string {
|
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
|
||||||
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
|
|
||||||
}
|
|
||||||
|
|
||||||
type AllocationStatus int
|
|
||||||
|
|
||||||
const (
|
|
||||||
// Unallocated memory - have not yet attempted to allocate
|
|
||||||
Unallocated AllocationStatus = iota
|
|
||||||
|
|
||||||
// Failed memory - tried to allocate the memory and did not succeed
|
|
||||||
Failed
|
|
||||||
|
|
||||||
// Allocated memory = tried and succeeded to allocate memory
|
|
||||||
Allocated
|
|
||||||
)
|
|
||||||
|
|
||||||
// Memory is the size of an allocation and whether it was successful.
|
|
||||||
type Memory struct {
|
|
||||||
Size uint64
|
|
||||||
Status AllocationStatus
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m Memory) String() string {
|
|
||||||
s := fmt.Sprint(m.Size)
|
|
||||||
|
|
||||||
switch m.Status {
|
|
||||||
case Unallocated:
|
|
||||||
s += "U"
|
|
||||||
case Failed:
|
|
||||||
s += "F"
|
|
||||||
case Allocated:
|
|
||||||
s += "A"
|
|
||||||
}
|
|
||||||
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeviceMemory provides a breakdown of the memory needed
|
|
||||||
// per device, such as a CPU or GPU.
|
|
||||||
type DeviceMemory struct {
|
|
||||||
// Name is the name of the device as labeled by the backend. It
|
|
||||||
// may not be persistent across instances of the runner.
|
|
||||||
Name string
|
|
||||||
|
|
||||||
// Weights is the per-layer memory needed for the model weights.
|
|
||||||
Weights []Memory
|
|
||||||
|
|
||||||
// Cache is the per-layer memory needed for the KV cache.
|
|
||||||
Cache []Memory
|
|
||||||
|
|
||||||
// Graph is the size of the compute graph. It is not per-layer.
|
|
||||||
Graph Memory
|
|
||||||
}
|
|
||||||
|
|
||||||
// BackendMemory provides the amount of memory required to load the model
|
|
||||||
// per device based on the BackendParams. In some cases, not all required
|
|
||||||
// allocations will be known at this point. However, the size of the most recent
|
|
||||||
// allocation is guaranteed to be provided so that if it failed, the caller can
|
|
||||||
// accommodate that to make forward progress.
|
|
||||||
type BackendMemory struct {
|
|
||||||
// InputsWeights are always located on the CPU and cannot be moved
|
|
||||||
InputWeights Memory
|
|
||||||
|
|
||||||
// CPU model components are located in system memory. This does not
|
|
||||||
// include unified memory allocated through the GPU.
|
|
||||||
CPU DeviceMemory
|
|
||||||
|
|
||||||
// GPU model components are located on one or more GPUs.
|
|
||||||
GPUs []DeviceMemory
|
|
||||||
}
|
|
||||||
|
|
||||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
|
||||||
|
|
||||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
|
||||||
if _, ok := backends[name]; ok {
|
if _, ok := backends[name]; ok {
|
||||||
panic("backend: backend already registered")
|
panic("backend: backend already registered")
|
||||||
}
|
}
|
||||||
@@ -160,9 +82,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
|
|||||||
backends[name] = f
|
backends[name] = f
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
|
||||||
if backend, ok := backends["ggml"]; ok {
|
if backend, ok := backends["ggml"]; ok {
|
||||||
return backend(modelPath, params)
|
return backend(ctx, f, params)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, fmt.Errorf("unsupported backend")
|
return nil, fmt.Errorf("unsupported backend")
|
||||||
@@ -171,8 +93,8 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
|||||||
type Context interface {
|
type Context interface {
|
||||||
Empty(dtype DType, shape ...int) Tensor
|
Empty(dtype DType, shape ...int) Tensor
|
||||||
Zeros(dtype DType, shape ...int) Tensor
|
Zeros(dtype DType, shape ...int) Tensor
|
||||||
FromFloatSlice(s []float32, shape ...int) Tensor
|
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
|
||||||
FromIntSlice(s []int32, shape ...int) Tensor
|
FromIntSlice(s []int32, shape ...int) (Tensor, error)
|
||||||
|
|
||||||
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
|
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
|
||||||
Arange(start, stop, step float32, dtype DType) Tensor
|
Arange(start, stop, step float32, dtype DType) Tensor
|
||||||
@@ -184,7 +106,7 @@ type Context interface {
|
|||||||
// graph, simply preallocates memory. Typically called with a
|
// graph, simply preallocates memory. Typically called with a
|
||||||
// worst case graph to ensure all resources are available for
|
// worst case graph to ensure all resources are available for
|
||||||
// for future inference.
|
// for future inference.
|
||||||
Reserve()
|
Reserve() error
|
||||||
|
|
||||||
MaxGraphNodes() int
|
MaxGraphNodes() int
|
||||||
Close()
|
Close()
|
||||||
@@ -197,6 +119,21 @@ type Context interface {
|
|||||||
Layer(int) Context
|
Layer(int) Context
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RopeOptions contains optional parameters for RoPE function
|
||||||
|
type RopeOptions struct {
|
||||||
|
OriginalContextLen uint32
|
||||||
|
}
|
||||||
|
|
||||||
|
// RopeOption defines a function that modifies RopeOpts
|
||||||
|
type RopeOption func(*RopeOptions)
|
||||||
|
|
||||||
|
// WithContextLen sets a custom context length
|
||||||
|
func WithContextLen(len uint32) RopeOption {
|
||||||
|
return func(opts *RopeOptions) {
|
||||||
|
opts.OriginalContextLen = len
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type Tensor interface {
|
type Tensor interface {
|
||||||
Dim(n int) int
|
Dim(n int) int
|
||||||
Stride(n int) int
|
Stride(n int) int
|
||||||
@@ -210,8 +147,6 @@ type Tensor interface {
|
|||||||
Neg(ctx Context) Tensor
|
Neg(ctx Context) Tensor
|
||||||
Add(ctx Context, t2 Tensor) Tensor
|
Add(ctx Context, t2 Tensor) Tensor
|
||||||
Mul(ctx Context, t2 Tensor) Tensor
|
Mul(ctx Context, t2 Tensor) Tensor
|
||||||
Div(ctx Context, t2 Tensor) Tensor
|
|
||||||
|
|
||||||
Mulmat(ctx Context, t2 Tensor) Tensor
|
Mulmat(ctx Context, t2 Tensor) Tensor
|
||||||
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
||||||
MulmatID(ctx Context, t2, ids Tensor) Tensor
|
MulmatID(ctx Context, t2, ids Tensor) Tensor
|
||||||
@@ -220,11 +155,11 @@ type Tensor interface {
|
|||||||
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
||||||
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
||||||
Scale(ctx Context, s float64) Tensor
|
Scale(ctx Context, s float64) Tensor
|
||||||
SumRows(ctx Context) Tensor
|
|
||||||
|
|
||||||
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
||||||
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||||
|
|
||||||
|
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
|
||||||
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||||
|
|
||||||
Sin(ctx Context) Tensor
|
Sin(ctx Context) Tensor
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import "C"
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
@@ -29,7 +30,6 @@ import (
|
|||||||
"github.com/ollama/ollama/logutil"
|
"github.com/ollama/ollama/logutil"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -44,15 +44,8 @@ func devices() []*C.struct_ggml_backend_device {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Backend struct {
|
type Backend struct {
|
||||||
// modelPath is the location of the model data
|
|
||||||
modelPath string
|
|
||||||
|
|
||||||
meta *fsggml.GGML
|
meta *fsggml.GGML
|
||||||
|
|
||||||
// tensorLoadTargets maps from the name of the tensor in the file
|
|
||||||
// to the name that is used by the model definition
|
|
||||||
tensorLoadTargets map[string][]string
|
|
||||||
|
|
||||||
sched *C.struct_ggml_backend_sched
|
sched *C.struct_ggml_backend_sched
|
||||||
schedBackends []*C.struct_ggml_backend
|
schedBackends []*C.struct_ggml_backend
|
||||||
schedBufts []*C.struct_ggml_backend_buffer_type
|
schedBufts []*C.struct_ggml_backend_buffer_type
|
||||||
@@ -65,26 +58,14 @@ type Backend struct {
|
|||||||
// layers is the backend used for repeating layers
|
// layers is the backend used for repeating layers
|
||||||
layers map[int]*C.struct_ggml_backend_buffer_type
|
layers map[int]*C.struct_ggml_backend_buffer_type
|
||||||
|
|
||||||
// requiredMemory is the cumulative memory allocations needed by the backend
|
|
||||||
requiredMemory *ml.BackendMemory
|
|
||||||
|
|
||||||
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
|
|
||||||
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
|
|
||||||
|
|
||||||
flashAttention bool
|
flashAttention bool
|
||||||
|
|
||||||
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
|
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
|
||||||
maxGraphNodes int
|
maxGraphNodes int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
|
||||||
r, err := os.Open(modelPath)
|
meta, n, err := fsggml.Decode(r, -1)
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer r.Close()
|
|
||||||
|
|
||||||
meta, err := fsggml.Decode(r, -1)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -99,9 +80,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
"num_key_values", len(meta.KV()),
|
"num_key_values", len(meta.KV()),
|
||||||
)
|
)
|
||||||
|
|
||||||
var requiredMemory ml.BackendMemory
|
|
||||||
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
|
|
||||||
|
|
||||||
type deviceBufferType struct {
|
type deviceBufferType struct {
|
||||||
d *C.struct_ggml_backend_device
|
d *C.struct_ggml_backend_device
|
||||||
bts []*C.struct_ggml_backend_buffer_type
|
bts []*C.struct_ggml_backend_buffer_type
|
||||||
@@ -122,8 +100,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
blocks := int(meta.KV().BlockCount())
|
|
||||||
|
|
||||||
// create list of buffer types for the cpu
|
// create list of buffer types for the cpu
|
||||||
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
|
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
|
||||||
for _, d := range append(accels, append(gpus, cpus...)...) {
|
for _, d := range append(accels, append(gpus, cpus...)...) {
|
||||||
@@ -131,27 +107,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||||
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||||
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
|
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
|
||||||
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
|
|
||||||
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
|
|
||||||
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
|
|
||||||
|
|
||||||
// create list of buffer types for each gpu
|
// create list of buffer types for each gpu
|
||||||
var gpuDeviceBufferTypes []deviceBufferType
|
var gpuDeviceBufferTypes []deviceBufferType
|
||||||
requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
|
for _, d := range gpus {
|
||||||
for i, d := range gpus {
|
|
||||||
bt := C.ggml_backend_dev_buffer_type(d)
|
bt := C.ggml_backend_dev_buffer_type(d)
|
||||||
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
|
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
|
||||||
d: d,
|
d: d,
|
||||||
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
|
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
|
||||||
})
|
})
|
||||||
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
|
|
||||||
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
|
|
||||||
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
|
|
||||||
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
useDefaultSplit := true
|
useDefaultSplit := true
|
||||||
@@ -190,6 +156,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
// inputs always use cpu
|
// inputs always use cpu
|
||||||
input := cpuDeviceBufferType
|
input := cpuDeviceBufferType
|
||||||
|
|
||||||
|
blocks := int(meta.KV().BlockCount())
|
||||||
|
|
||||||
// define a range of gpu layers. anything outside of this range is assigned to the cpu
|
// define a range of gpu layers. anything outside of this range is assigned to the cpu
|
||||||
gpuRangeStart := max(0, blocks-params.NumGPULayers)
|
gpuRangeStart := max(0, blocks-params.NumGPULayers)
|
||||||
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
|
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
|
||||||
@@ -230,7 +198,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
|
|
||||||
// contexts are shared by tensors of the same buffer type
|
// contexts are shared by tensors of the same buffer type
|
||||||
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
|
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
|
||||||
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
|
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
|
||||||
for _, bt := range bts {
|
for _, bt := range bts {
|
||||||
if _, ok := ctxs[bt]; !ok {
|
if _, ok := ctxs[bt]; !ok {
|
||||||
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
|
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
|
||||||
@@ -256,16 +224,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
C.ggml_set_name(tt, cname)
|
C.ggml_set_name(tt, cname)
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
|
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
|
||||||
|
|
||||||
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
|
|
||||||
if layer == -1 {
|
|
||||||
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
|
|
||||||
requiredMemory.InputWeights.Status = ml.Allocated
|
|
||||||
requiredMemory.InputWeights.Size += uint64(size)
|
|
||||||
} else {
|
|
||||||
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
|
|
||||||
}
|
|
||||||
|
|
||||||
//nolint:staticcheck // TODO: check if buffer type supports this tensor
|
//nolint:staticcheck // TODO: check if buffer type supports this tensor
|
||||||
return tt
|
return tt
|
||||||
}
|
}
|
||||||
@@ -287,22 +245,22 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
for _, t := range meta.Tensors().Items() {
|
for _, t := range meta.Tensors().Items() {
|
||||||
switch {
|
switch {
|
||||||
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
|
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
|
||||||
createTensor(tensor{source: t}, input.bts, -1)
|
createTensor(tensor{source: t}, input.bts)
|
||||||
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
||||||
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
|
createTensor(tensor{source: t, target: "output.weight"}, output.bts)
|
||||||
}
|
}
|
||||||
case contains(t.Name, "cls", "output", "output_norm"):
|
case contains(t.Name, "cls", "output", "output_norm"):
|
||||||
createTensor(tensor{source: t}, output.bts, blocks)
|
createTensor(tensor{source: t}, output.bts)
|
||||||
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
|
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
|
||||||
// TODO: assign vision tensors to the gpu if possible
|
// TODO: assign vision tensors to the gpu if possible
|
||||||
createTensor(tensor{source: t}, output.bts, blocks)
|
createTensor(tensor{source: t}, output.bts)
|
||||||
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
|
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
|
||||||
// these tensors should be repeated per layer
|
// these tensors should be repeated per layer
|
||||||
for i, layer := range layers {
|
for i, layer := range layers {
|
||||||
createTensor(tensor{
|
createTensor(tensor{
|
||||||
source: t,
|
source: t,
|
||||||
target: "blk." + strconv.Itoa(i) + "." + t.Name,
|
target: "blk." + strconv.Itoa(i) + "." + t.Name,
|
||||||
}, layer.bts, i)
|
}, layer.bts)
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
layerIndex := -1
|
layerIndex := -1
|
||||||
@@ -313,10 +271,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if layerIndex >= 0 {
|
if layerIndex >= 0 {
|
||||||
createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
|
createTensor(tensor{source: t}, layers[layerIndex].bts)
|
||||||
} else {
|
} else {
|
||||||
// load all other tensors on the cpu
|
// load all other tensors on the cpu
|
||||||
createTensor(tensor{source: t}, input.bts, -1)
|
createTensor(tensor{source: t}, input.bts)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -329,18 +287,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
|
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
|
||||||
for i := range btDeviceMemory[bt].Weights {
|
|
||||||
if btDeviceMemory[bt].Weights[i].Size != 0 {
|
|
||||||
if b != nil {
|
|
||||||
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
|
|
||||||
} else {
|
|
||||||
btDeviceMemory[bt].Weights[i].Status = ml.Failed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if b == nil {
|
if b == nil {
|
||||||
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
|
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
|
||||||
}
|
}
|
||||||
|
|
||||||
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
|
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
|
||||||
@@ -359,6 +307,73 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var doneBytes atomic.Uint64
|
||||||
|
totalBytes := uint64(n) - meta.Tensors().Offset
|
||||||
|
|
||||||
|
g, ctx := errgroup.WithContext(ctx)
|
||||||
|
g.SetLimit(runtime.GOMAXPROCS(0))
|
||||||
|
for _, t := range meta.Tensors().Items() {
|
||||||
|
t := t
|
||||||
|
g.Go(func() error {
|
||||||
|
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
|
||||||
|
for i := range tts {
|
||||||
|
target := targets[t.Name][i]
|
||||||
|
if target == "" {
|
||||||
|
target = t.Name
|
||||||
|
}
|
||||||
|
|
||||||
|
tt, ok := tensors[target]
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("unassigned tensor: %s", t.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
tts[i] = tt
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
|
||||||
|
// seeking around within an FD shared between all goroutines.
|
||||||
|
file, err := os.Open(r.Name())
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("file open error", "file", r.Name(), "error", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
|
||||||
|
bts := make([]byte, 128*format.KibiByte)
|
||||||
|
|
||||||
|
var s uint64
|
||||||
|
for s < t.Size() {
|
||||||
|
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("file read error", "file", r.Name(), "error", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tts {
|
||||||
|
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
s += uint64(n)
|
||||||
|
|
||||||
|
if params.Progress != nil {
|
||||||
|
done := doneBytes.Add(uint64(n))
|
||||||
|
params.Progress(float32(done) / float32(totalBytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := g.Wait(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// map devices to backend buffer types so new tensors can be assigned to the correct device
|
// map devices to backend buffer types so new tensors can be assigned to the correct device
|
||||||
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
|
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
|
||||||
|
|
||||||
@@ -382,11 +397,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
|
|
||||||
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
|
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
|
||||||
return &Backend{
|
return &Backend{
|
||||||
modelPath: modelPath,
|
flashAttention: params.FlashAttention,
|
||||||
flashAttention: params.FlashAttention,
|
meta: meta,
|
||||||
meta: meta,
|
tensors: tensors,
|
||||||
tensorLoadTargets: targets,
|
|
||||||
tensors: tensors,
|
|
||||||
sched: C.ggml_backend_sched_new(
|
sched: C.ggml_backend_sched_new(
|
||||||
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
|
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
|
||||||
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
||||||
@@ -405,9 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||||||
}
|
}
|
||||||
return m
|
return m
|
||||||
}(),
|
}(),
|
||||||
requiredMemory: &requiredMemory,
|
maxGraphNodes: maxGraphNodes,
|
||||||
btDeviceMemory: btDeviceMemory,
|
|
||||||
maxGraphNodes: maxGraphNodes,
|
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -415,81 +426,6 @@ func init() {
|
|||||||
ml.RegisterBackend("ggml", New)
|
ml.RegisterBackend("ggml", New)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
|
|
||||||
var doneBytes atomic.Uint64
|
|
||||||
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
|
|
||||||
|
|
||||||
g, ctx := errgroup.WithContext(ctx)
|
|
||||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
|
||||||
for _, t := range b.meta.Tensors().Items() {
|
|
||||||
t := t
|
|
||||||
g.Go(func() error {
|
|
||||||
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
|
|
||||||
for i := range tts {
|
|
||||||
target := b.tensorLoadTargets[t.Name][i]
|
|
||||||
if target == "" {
|
|
||||||
target = t.Name
|
|
||||||
}
|
|
||||||
|
|
||||||
tt, ok := b.tensors[target]
|
|
||||||
if !ok {
|
|
||||||
return fmt.Errorf("unassigned tensor: %s", t.Name)
|
|
||||||
}
|
|
||||||
|
|
||||||
tts[i] = tt
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
|
|
||||||
// seeking around within an FD shared between all goroutines.
|
|
||||||
file, err := os.Open(b.modelPath)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("file open error", "file", b.modelPath, "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
|
|
||||||
bts := make([]byte, 128*format.KibiByte)
|
|
||||||
|
|
||||||
var s uint64
|
|
||||||
for s < t.Size() {
|
|
||||||
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
|
|
||||||
if err := ctx.Err(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("file read error", "file", b.modelPath, "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tts {
|
|
||||||
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
|
|
||||||
}
|
|
||||||
|
|
||||||
s += uint64(n)
|
|
||||||
|
|
||||||
if progress != nil {
|
|
||||||
done := doneBytes.Add(uint64(n))
|
|
||||||
progress(float32(done) / float32(totalBytes))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := g.Wait(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Backend) BackendMemory() ml.BackendMemory {
|
|
||||||
return *b.requiredMemory
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Backend) Config() fs.Config {
|
func (b *Backend) Config() fs.Config {
|
||||||
return b.meta.KV()
|
return b.meta.KV()
|
||||||
}
|
}
|
||||||
@@ -521,7 +457,6 @@ func (b *Backend) NewContextSize(n int) ml.Context {
|
|||||||
no_alloc: true,
|
no_alloc: true,
|
||||||
}),
|
}),
|
||||||
allocatedBuffers: &allocatedBuffers,
|
allocatedBuffers: &allocatedBuffers,
|
||||||
layer: -1,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -548,9 +483,6 @@ type Context struct {
|
|||||||
|
|
||||||
// maxGraphNodes is the maximum allowed number of graph nodes in this context
|
// maxGraphNodes is the maximum allowed number of graph nodes in this context
|
||||||
maxGraphNodes int
|
maxGraphNodes int
|
||||||
|
|
||||||
// layer is the graph layer that this context is allocating for - assumed to be cache
|
|
||||||
layer int
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Input() ml.Context {
|
func (c *Context) Input() ml.Context {
|
||||||
@@ -561,7 +493,6 @@ func (c *Context) Input() ml.Context {
|
|||||||
buft: c.b.input,
|
buft: c.b.input,
|
||||||
allocatedBuffers: c.allocatedBuffers,
|
allocatedBuffers: c.allocatedBuffers,
|
||||||
maxGraphNodes: c.maxGraphNodes,
|
maxGraphNodes: c.maxGraphNodes,
|
||||||
layer: -1,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -576,7 +507,6 @@ func (c *Context) Layer(i int) ml.Context {
|
|||||||
buft: buft,
|
buft: buft,
|
||||||
allocatedBuffers: c.allocatedBuffers,
|
allocatedBuffers: c.allocatedBuffers,
|
||||||
maxGraphNodes: c.maxGraphNodes,
|
maxGraphNodes: c.maxGraphNodes,
|
||||||
layer: i,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -614,34 +544,22 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Reserve() {
|
func (c *Context) Reserve() error {
|
||||||
reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
|
if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
|
||||||
|
C.ggml_backend_sched_reset(c.b.sched)
|
||||||
|
return errors.New("failed to reserve graph")
|
||||||
|
}
|
||||||
|
|
||||||
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
|
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
|
||||||
|
|
||||||
// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
|
|
||||||
for _, bt := range c.b.schedBufts {
|
|
||||||
c.b.btDeviceMemory[bt].Graph = ml.Memory{}
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := range c.b.schedBackends {
|
for i := range c.b.schedBackends {
|
||||||
bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
|
size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
|
||||||
|
|
||||||
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
|
|
||||||
graph.Size += uint64(bufferStatus.size)
|
|
||||||
if bufferStatus.allocated && graph.Status != ml.Failed {
|
|
||||||
graph.Status = ml.Allocated
|
|
||||||
} else {
|
|
||||||
graph.Status = ml.Failed
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
|
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
|
||||||
"size", format.HumanBytes2(uint64(bufferStatus.size)))
|
"size", format.HumanBytes2(uint64(size)))
|
||||||
}
|
}
|
||||||
|
|
||||||
if !reserved {
|
C.ggml_backend_sched_reset(c.b.sched)
|
||||||
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
|
|
||||||
}
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) MaxGraphNodes() int {
|
func (c *Context) MaxGraphNodes() int {
|
||||||
@@ -661,7 +579,7 @@ func pad(length, pad C.size_t) C.size_t {
|
|||||||
return ((length + pad - 1) / pad) * pad
|
return ((length + pad - 1) / pad) * pad
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
|
||||||
if c.buft == nil {
|
if c.buft == nil {
|
||||||
panic("set Input or Layer before creating tensors")
|
panic("set Input or Layer before creating tensors")
|
||||||
}
|
}
|
||||||
@@ -684,7 +602,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
|||||||
|
|
||||||
if len(shape) < 1 || shape[0] == 0 {
|
if len(shape) < 1 || shape[0] == 0 {
|
||||||
var shape C.int64_t = 0
|
var shape C.int64_t = 0
|
||||||
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
|
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
|
||||||
} else if len(shape) > 4 {
|
} else if len(shape) > 4 {
|
||||||
panic("unsupported number of dimensions")
|
panic("unsupported number of dimensions")
|
||||||
}
|
}
|
||||||
@@ -697,43 +615,40 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
|||||||
|
|
||||||
t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
|
t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
|
||||||
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
|
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
|
||||||
|
|
||||||
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
|
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
|
||||||
if c.layer >= 0 {
|
|
||||||
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
|
|
||||||
|
|
||||||
cache.Size += uint64(size)
|
|
||||||
if b != nil {
|
|
||||||
cache.Status = ml.Allocated
|
|
||||||
} else {
|
|
||||||
cache.Status = ml.Failed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if b == nil {
|
if b == nil {
|
||||||
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
|
return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
|
||||||
}
|
}
|
||||||
|
|
||||||
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
|
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
|
||||||
|
|
||||||
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
|
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
|
||||||
return &Tensor{b: c.b, t: t}
|
return &Tensor{b: c.b, t: t}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
return c.newTensor(dtype, shape)
|
t, err := c.newTensor(dtype, shape)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
t := c.newTensor(dtype, shape)
|
t, err := c.newTensor(dtype, shape)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
C.ggml_set_zero(t.(*Tensor).t)
|
C.ggml_set_zero(t.(*Tensor).t)
|
||||||
return t
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
func checkShape[S ~[]E, E any](s S, shape ...int) {
|
func checkShape[S ~[]E, E any](s S, shape ...int) error {
|
||||||
n := len(s)
|
n := len(s)
|
||||||
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
return
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, v := range shape {
|
for _, v := range shape {
|
||||||
@@ -741,32 +656,44 @@ func checkShape[S ~[]E, E any](s S, shape ...int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if n != 1 {
|
if n != 1 {
|
||||||
panic(fmt.Errorf("invalid shape: %v", shape))
|
return fmt.Errorf("invalid shape: %v", shape)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||||
checkShape(s, shape...)
|
if err := checkShape(s, shape...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
t := c.newTensor(ml.DTypeF32, shape)
|
t, err := c.newTensor(ml.DTypeF32, shape)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if len(s) > 0 {
|
if len(s) > 0 {
|
||||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||||
}
|
}
|
||||||
|
|
||||||
return t
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||||
checkShape(s, shape...)
|
if err := checkShape(s, shape...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
t := c.newTensor(ml.DTypeI32, shape)
|
t, err := c.newTensor(ml.DTypeI32, shape)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
if len(s) > 0 {
|
if len(s) > 0 {
|
||||||
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
|
||||||
}
|
}
|
||||||
|
|
||||||
return t
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
@@ -784,7 +711,12 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
|||||||
arange = append(arange, int32(i))
|
arange = append(arange, int32(i))
|
||||||
}
|
}
|
||||||
|
|
||||||
return c.Input().FromIntSlice(arange, len(arange))
|
t, err := c.Input().FromIntSlice(arange, len(arange))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return t
|
||||||
default:
|
default:
|
||||||
panic("unsupported dtype for arange")
|
panic("unsupported dtype for arange")
|
||||||
}
|
}
|
||||||
@@ -935,13 +867,6 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
|
||||||
return &Tensor{
|
|
||||||
b: t.b,
|
|
||||||
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||||
return &Tensor{
|
return &Tensor{
|
||||||
b: t.b,
|
b: t.b,
|
||||||
@@ -990,8 +915,6 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
|
|||||||
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||||
if len(shape) != 4 {
|
if len(shape) != 4 {
|
||||||
panic("expected 4 dimensions")
|
panic("expected 4 dimensions")
|
||||||
} else if shape[3] != 0 {
|
|
||||||
panic("cuda does not support 4d tensors")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &Tensor{
|
return &Tensor{
|
||||||
@@ -1059,13 +982,6 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
|
|
||||||
return &Tensor{
|
|
||||||
b: t.b,
|
|
||||||
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
|
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
|
||||||
return &Tensor{
|
return &Tensor{
|
||||||
b: t.b,
|
b: t.b,
|
||||||
@@ -1137,15 +1053,28 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
|
const (
|
||||||
|
ropeTypeNorm C.int = 0
|
||||||
|
ropeTypeNeox C.int = 2
|
||||||
|
ropeTypeMrope C.int = 8
|
||||||
|
ropeTypeVision C.int = 24
|
||||||
|
)
|
||||||
|
|
||||||
|
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
|
||||||
// Default options
|
// Default options
|
||||||
opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
|
opts := &ml.RopeOptions{
|
||||||
|
OriginalContextLen: 131072,
|
||||||
|
}
|
||||||
|
|
||||||
// Apply any provided options
|
// Apply any provided options
|
||||||
for _, option := range options {
|
for _, option := range options {
|
||||||
option(opts)
|
option(opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ropeFactors == nil {
|
||||||
|
ropeFactors = &Tensor{b: t.b}
|
||||||
|
}
|
||||||
|
|
||||||
dequant := t.t
|
dequant := t.t
|
||||||
if C.ggml_is_quantized(t.t._type) {
|
if C.ggml_is_quantized(t.t._type) {
|
||||||
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
|
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
|
||||||
@@ -1156,11 +1085,11 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
|
|||||||
t: C.ggml_rope_ext(
|
t: C.ggml_rope_ext(
|
||||||
ctx.(*Context).ctx,
|
ctx.(*Context).ctx,
|
||||||
dequant,
|
dequant,
|
||||||
positions.(*Tensor).t,
|
positionIDs.(*Tensor).t,
|
||||||
opts.Factors.(*Tensor).t,
|
ropeFactors.(*Tensor).t,
|
||||||
C.int(ropeDim),
|
C.int(ropeDim),
|
||||||
C.int(opts.Type),
|
C.int(ropeType),
|
||||||
C.int(opts.OriginalContextLength),
|
C.int(opts.OriginalContextLen),
|
||||||
C.float(ropeBase),
|
C.float(ropeBase),
|
||||||
C.float(ropeScale),
|
C.float(ropeScale),
|
||||||
C.float(0.0),
|
C.float(0.0),
|
||||||
|
|||||||
6
ml/backend/ggml/ggml/include/ggml-alloc.h
vendored
6
ml/backend/ggml/ggml/include/ggml-alloc.h
vendored
@@ -66,12 +66,6 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
|
|||||||
|
|
||||||
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status {
|
|
||||||
size_t size;
|
|
||||||
bool allocated;
|
|
||||||
};
|
|
||||||
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
|
||||||
|
|
||||||
// Utils
|
// Utils
|
||||||
// Create a buffer and allocate all the tensors in a ggml_context
|
// Create a buffer and allocate all the tensors in a ggml_context
|
||||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||||
|
|||||||
6
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
6
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
@@ -304,12 +304,6 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
|
|
||||||
struct ggml_backend_buffer_status {
|
|
||||||
size_t size;
|
|
||||||
bool allocated;
|
|
||||||
};
|
|
||||||
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||||
|
|
||||||
|
|||||||
38
ml/backend/ggml/ggml/src/ggml-alloc.c
vendored
38
ml/backend/ggml/ggml/src/ggml-alloc.c
vendored
@@ -364,7 +364,6 @@ struct node_alloc {
|
|||||||
struct ggml_gallocr {
|
struct ggml_gallocr {
|
||||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
ggml_backend_buffer_t * buffers; // [n_buffers]
|
||||||
size_t *buffer_sizes; // [n_buffers]
|
|
||||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||||
int n_buffers;
|
int n_buffers;
|
||||||
|
|
||||||
@@ -388,9 +387,6 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|||||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
||||||
GGML_ASSERT(galloc->buffers != NULL);
|
GGML_ASSERT(galloc->buffers != NULL);
|
||||||
|
|
||||||
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
|
||||||
GGML_ASSERT(galloc->buffer_sizes != NULL);
|
|
||||||
|
|
||||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||||
|
|
||||||
@@ -457,7 +453,6 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|||||||
ggml_hash_set_free(&galloc->hash_set);
|
ggml_hash_set_free(&galloc->hash_set);
|
||||||
free(galloc->hash_values);
|
free(galloc->hash_values);
|
||||||
free(galloc->bufts);
|
free(galloc->bufts);
|
||||||
free(galloc->buffer_sizes);
|
|
||||||
free(galloc->buffers);
|
free(galloc->buffers);
|
||||||
free(galloc->buf_tallocs);
|
free(galloc->buf_tallocs);
|
||||||
free(galloc->node_allocs);
|
free(galloc->node_allocs);
|
||||||
@@ -753,8 +748,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = true;
|
|
||||||
|
|
||||||
// reallocate buffers if needed
|
// reallocate buffers if needed
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||||
// if the buffer type is used multiple times, we reuse the same buffer
|
// if the buffer type is used multiple times, we reuse the same buffer
|
||||||
@@ -776,20 +769,15 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||||
if (galloc->buffers[i]) {
|
if (galloc->buffers[i] == NULL) {
|
||||||
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
} else {
|
|
||||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||||
galloc->buffer_sizes[i] = new_size;
|
return false;
|
||||||
success = false;
|
|
||||||
}
|
}
|
||||||
} else {
|
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return success;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||||
@@ -946,24 +934,6 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|||||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|
||||||
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
|
||||||
|
|
||||||
for (int i = 0; i < buffer_id; i++) {
|
|
||||||
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
|
|
||||||
// This buffer is the same as a previous one due to the same buffer type being used multiple times
|
|
||||||
// (See above.) However, we need a different check because multiple buffers might be NULL in our
|
|
||||||
// case and we still want to know the attempted size.
|
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status status = {0, true};
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||||
|
|||||||
10
ml/backend/ggml/ggml/src/ggml-backend.cpp
vendored
10
ml/backend/ggml/ggml/src/ggml-backend.cpp
vendored
@@ -1629,16 +1629,6 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
|||||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
|
|
||||||
struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
|
|
||||||
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
|
|
||||||
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ package cpu
|
|||||||
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
|
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
|
||||||
// #cgo CXXFLAGS: -std=c++17
|
// #cgo CXXFLAGS: -std=c++17
|
||||||
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
|
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
|
||||||
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
|
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
|
||||||
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
|
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
|
||||||
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
|
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
|
||||||
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
|
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
|
||||||
|
|||||||
@@ -4,6 +4,6 @@ package metal
|
|||||||
|
|
||||||
//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
|
//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
|
||||||
|
|
||||||
// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
|
// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
|
||||||
// #cgo LDFLAGS: -framework Metal -framework MetalKit
|
// #cgo LDFLAGS: -framework Metal -framework MetalKit
|
||||||
import "C"
|
import "C"
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
// fast provides implementations of fast (fused) operations for increased performance.
|
|
||||||
package fast
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fastRoPE is an interface for tensors that support fast rotary positional embedding.
|
|
||||||
type fastRoPE interface {
|
|
||||||
RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
|
|
||||||
}
|
|
||||||
|
|
||||||
// RoPE applies rotary positional embedding to tensor `t`.
|
|
||||||
func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
|
|
||||||
if t, ok := t.(fastRoPE); ok {
|
|
||||||
return t.RoPE(ctx, positions, dim, base, scale, options...)
|
|
||||||
}
|
|
||||||
|
|
||||||
panic("RoPE not implemented for this tensor type")
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
package rope
|
|
||||||
|
|
||||||
import "github.com/ollama/ollama/ml"
|
|
||||||
|
|
||||||
// Options contains optional parameters for RoPE function
|
|
||||||
type Options struct {
|
|
||||||
OriginalContextLength int
|
|
||||||
Type int
|
|
||||||
Factors ml.Tensor
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithOriginalContextLength sets a custom context length
|
|
||||||
func WithOriginalContextLength(n int) func(*Options) {
|
|
||||||
return func(opts *Options) {
|
|
||||||
opts.OriginalContextLength = n
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithType sets RoPE type to NeoX
|
|
||||||
func WithTypeNeoX() func(*Options) {
|
|
||||||
return func(opts *Options) {
|
|
||||||
opts.Type = 2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithFactors sets custom rope factors
|
|
||||||
func WithFactors(factors ml.Tensor) func(*Options) {
|
|
||||||
return func(opts *Options) {
|
|
||||||
if factors != nil {
|
|
||||||
opts.Factors = factors
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -2,30 +2,16 @@ package input
|
|||||||
|
|
||||||
import "github.com/ollama/ollama/ml"
|
import "github.com/ollama/ollama/ml"
|
||||||
|
|
||||||
// Multimodal is a multimodal embedding or a component of one.
|
|
||||||
// For example, it could be a row of an image that can be processed
|
|
||||||
// independently.
|
|
||||||
type Multimodal struct {
|
|
||||||
// Tensor is the embedding data. Implementations may chose what to
|
|
||||||
// store here or it may be nil if not needed. However, any ml.Tensor
|
|
||||||
// objects must be stored here and not in Data.
|
|
||||||
Tensor ml.Tensor
|
|
||||||
|
|
||||||
// Data is implementation-specific opaque data, such as metadata on how
|
|
||||||
// to layout Tensor. It may be nil if not needed. It may also store larger
|
|
||||||
// objects such as complete images if they are to be processed later.
|
|
||||||
Data any
|
|
||||||
}
|
|
||||||
|
|
||||||
// Input represents one token in the input stream
|
// Input represents one token in the input stream
|
||||||
type Input struct {
|
type Input struct {
|
||||||
// Token is a single element of text.
|
// Token is a single element of text.
|
||||||
Token int32
|
Token int32
|
||||||
|
|
||||||
// Multimodal is represents a non-text element such as an
|
// Multimodal is opaque data representing a non-text
|
||||||
// image (or part of one if the image can be processed in pieces).
|
// element such as an image (or part of one if the image
|
||||||
// It may be used either together with Token or on its own.
|
// can be processed in pieces). It may be either together
|
||||||
Multimodal []Multimodal
|
// with Token or on its own.
|
||||||
|
Multimodal any
|
||||||
|
|
||||||
// MultimodalHash is a unique representation of the data
|
// MultimodalHash is a unique representation of the data
|
||||||
// stored in Multimodal, used for caching and comparing
|
// stored in Multimodal, used for caching and comparing
|
||||||
@@ -46,7 +32,7 @@ type Input struct {
|
|||||||
// Positions slice.
|
// Positions slice.
|
||||||
type MultimodalIndex struct {
|
type MultimodalIndex struct {
|
||||||
Index int
|
Index int
|
||||||
Multimodal []Multimodal
|
Multimodal any
|
||||||
}
|
}
|
||||||
|
|
||||||
// Batch contains the inputs for a model forward pass
|
// Batch contains the inputs for a model forward pass
|
||||||
|
|||||||
@@ -40,13 +40,12 @@ type MultimodalProcessor interface {
|
|||||||
// EncodeMultimodal processes a single input (such as an image) and
|
// EncodeMultimodal processes a single input (such as an image) and
|
||||||
// generates an output (typically an embedding) that can be used by the model.
|
// generates an output (typically an embedding) that can be used by the model.
|
||||||
//
|
//
|
||||||
// The return value is one or more tensors, each with optional model-specific
|
// The return value is most typically an ml.Tensor, however, different
|
||||||
// opaque metadata. Typically, the tensors might be views into an embedding
|
// type are possible, such as an object containing a tensor plus
|
||||||
// with each view representing a chunk of data that can be processed independently
|
// additional metadata, a slice of tensors or even just the original input.
|
||||||
// in different batches.
|
|
||||||
//
|
//
|
||||||
// The result may be cached by the runner.
|
// The result may be cached by the runner.
|
||||||
EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
|
EncodeMultimodal(ml.Context, []byte) (any, error)
|
||||||
|
|
||||||
// PostTokenize is called after tokenization to allow the model to edit the
|
// PostTokenize is called after tokenization to allow the model to edit the
|
||||||
// input stream to correctly arrange multimodal elements.
|
// input stream to correctly arrange multimodal elements.
|
||||||
@@ -98,8 +97,14 @@ func Register(name string, f func(fs.Config) (Model, error)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// New initializes a new model instance with the provided configuration based on the metadata in the model file
|
// New initializes a new model instance with the provided configuration based on the metadata in the model file
|
||||||
func New(modelPath string, params ml.BackendParams) (Model, error) {
|
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
|
||||||
b, err := ml.NewBackend(modelPath, params)
|
r, err := os.Open(modelPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
b, err := ml.NewBackend(ctx, r, params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -128,7 +133,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
meta, err := fsggml.Decode(r, -1)
|
meta, _, err := fsggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -287,7 +292,11 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
|
|||||||
return nil, errors.New("batch size cannot be less than 1")
|
return nil, errors.New("batch size cannot be less than 1")
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
|
var err error
|
||||||
|
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
cache := m.Config().Cache
|
cache := m.Config().Cache
|
||||||
if cache != nil {
|
if cache != nil {
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
"github.com/ollama/ollama/model"
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
@@ -45,13 +43,10 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
EOS: append(
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
@@ -85,10 +80,11 @@ type SelfAttention struct {
|
|||||||
|
|
||||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
|
ropeType := uint32(2)
|
||||||
|
|
||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
if opts.largeModelScaling {
|
if opts.largeModelScaling {
|
||||||
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
||||||
@@ -98,7 +94,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
|
|
||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
||||||
@@ -128,7 +124,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
|
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type MLP struct {
|
type MLP struct {
|
||||||
@@ -175,8 +171,15 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||||
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
|
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
|
||||||
|
|||||||
@@ -60,16 +60,12 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(1),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
EOT: int32(106),
|
||||||
[]int32{
|
AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
|
||||||
int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
|
||||||
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
|
|
||||||
},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
@@ -86,7 +82,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
return &m, nil
|
return &m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) == 0 {
|
if len(m.VisionModel.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -101,30 +97,33 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s,
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
|
||||||
m.ImageProcessor.imageSize,
|
m.ImageProcessor.imageSize,
|
||||||
m.ImageProcessor.imageSize,
|
m.ImageProcessor.imageSize,
|
||||||
m.ImageProcessor.numChannels,
|
m.ImageProcessor.numChannels,
|
||||||
)
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||||
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
|
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
|
||||||
return []input.Multimodal{{Tensor: visionOutputs}}, nil
|
return visionOutputs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
var result []input.Input
|
var result []input.Input
|
||||||
|
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if len(inp.Multimodal) == 0 {
|
if inp.Multimodal == nil {
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
} else {
|
} else {
|
||||||
inputMultimodal := inp.Multimodal[0].Tensor
|
inputMultimodal := inp.Multimodal.(ml.Tensor)
|
||||||
|
|
||||||
result = append(result,
|
result = append(result,
|
||||||
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
|
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
|
||||||
input.Input{Token: 255999}, // "<start_of_image>""
|
input.Input{Token: 255999}, // "<start_of_image>""
|
||||||
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
|
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
|
||||||
)
|
)
|
||||||
|
|
||||||
// add image token placeholders
|
// add image token placeholders
|
||||||
@@ -141,8 +140,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -75,6 +73,7 @@ type TextSelfAttention struct {
|
|||||||
|
|
||||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
|
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
|
ropeType := uint32(2)
|
||||||
|
|
||||||
ropeBase := opts.ropeLocalBase
|
ropeBase := opts.ropeLocalBase
|
||||||
if (layer+1)%gemmaGlobalCacheCount == 0 {
|
if (layer+1)%gemmaGlobalCacheCount == 0 {
|
||||||
@@ -84,7 +83,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
|
|||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
||||||
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
|
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
if opts.largeModelScaling {
|
if opts.largeModelScaling {
|
||||||
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
||||||
@@ -95,7 +94,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
|
|||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
||||||
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
|
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
||||||
@@ -113,7 +112,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
|
|||||||
ropeBase = m.TextConfig.ropeGlobalBase
|
ropeBase = m.TextConfig.ropeGlobalBase
|
||||||
}
|
}
|
||||||
|
|
||||||
return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
|
return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextMLP struct {
|
type TextMLP struct {
|
||||||
@@ -166,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
// set image embeddings
|
// set image embeddings
|
||||||
var except []int
|
var except []int
|
||||||
for _, image := range batch.Multimodal {
|
for _, image := range batch.Multimodal {
|
||||||
visionOutputs := image.Multimodal[0].Tensor
|
visionOutputs := image.Multimodal.(ml.Tensor)
|
||||||
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
||||||
|
|
||||||
for i := range visionOutputs.Dim(1) {
|
for i := range visionOutputs.Dim(1) {
|
||||||
|
|||||||
@@ -1,23 +1,22 @@
|
|||||||
package llama
|
package llama
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"cmp"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
"github.com/ollama/ollama/model"
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
hiddenSize, numHeads, numKVHeads int
|
||||||
headDim, ropeDim int
|
|
||||||
eps, ropeBase, ropeScale float32
|
eps, ropeBase, ropeScale float32
|
||||||
|
ropeDim uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
type Model struct {
|
type Model struct {
|
||||||
@@ -33,6 +32,10 @@ type Model struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
|
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
|
||||||
|
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
|
||||||
|
}
|
||||||
|
|
||||||
m := Model{
|
m := Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||||
@@ -40,13 +43,13 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
@@ -54,11 +57,10 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
headDim: int(c.Uint("attention.key_length")),
|
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
|
ropeDim: c.Uint("rope.dimension_count"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,31 +77,31 @@ type SelfAttention struct {
|
|||||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
headDim := opts.hiddenSize / opts.numHeads
|
||||||
ropeDim := cmp.Or(opts.ropeDim, headDim)
|
ropeType := uint32(0)
|
||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
|
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
key := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
|
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
value := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
|
|
||||||
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
|
||||||
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
|
||||||
|
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
|
return sa.Output.Forward(ctx, kqv)
|
||||||
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
|
|
||||||
return sa.Output.Forward(ctx, attention)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
|
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||||
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type MLP struct {
|
type MLP struct {
|
||||||
@@ -120,11 +122,11 @@ type Layer struct {
|
|||||||
MLP *MLP
|
MLP *MLP
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||||
residual := hiddenState
|
residual := hiddenState
|
||||||
|
|
||||||
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
|
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
|
||||||
|
|
||||||
// In the final layer (outputs != nil), optimize by pruning to just the token positions
|
// In the final layer (outputs != nil), optimize by pruning to just the token positions
|
||||||
// we need logits for.
|
// we need logits for.
|
||||||
@@ -142,19 +144,27 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tenso
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
for i, layer := range m.Layers {
|
||||||
m.Cache.SetLayer(i)
|
m.Cache.SetLayer(i)
|
||||||
|
|
||||||
var outputs ml.Tensor
|
var lastLayerOutputs ml.Tensor
|
||||||
if i == len(m.Layers)-1 {
|
if i == len(m.Layers)-1 {
|
||||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
lastLayerOutputs = outputs
|
||||||
}
|
}
|
||||||
|
|
||||||
hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
|
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
|
||||||
}
|
}
|
||||||
|
|
||||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -40,13 +41,13 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
@@ -62,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
return &m, nil
|
return &m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) < 1 {
|
if len(m.VisionModel.Layers) < 1 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -77,7 +78,10 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
|
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
|
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
|
||||||
|
|
||||||
@@ -88,86 +92,81 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
pixelValues := tilesLocal
|
pixelValues := tilesLocal
|
||||||
|
|
||||||
if len(pixelsGlobal) > 0 {
|
if len(pixelsGlobal) > 0 {
|
||||||
tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
|
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
|
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
|
||||||
}
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||||
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
|
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
|
||||||
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
|
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
|
||||||
|
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
|
||||||
var multimodal []input.Multimodal
|
|
||||||
aspectRatio := image.Point{ratioW, ratioH}
|
|
||||||
|
|
||||||
var offset int
|
|
||||||
patchesPerChunk := projectedOutputs.Dim(1)
|
|
||||||
if aspectRatio.Y*aspectRatio.X > 1 {
|
|
||||||
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
|
|
||||||
|
|
||||||
for range aspectRatio.Y {
|
|
||||||
for x := range aspectRatio.X {
|
|
||||||
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
|
|
||||||
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
|
|
||||||
patchesPerChunk)
|
|
||||||
var separator separator
|
|
||||||
if x < aspectRatio.X-1 {
|
|
||||||
separator.x = true // <|tile_x_separator|>
|
|
||||||
} else {
|
|
||||||
separator.y = true // <|tile_y_separator|>
|
|
||||||
}
|
|
||||||
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
|
|
||||||
offset += patchesPerChunk
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
|
|
||||||
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
|
|
||||||
patchesPerChunk)
|
|
||||||
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
|
|
||||||
|
|
||||||
return multimodal, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type separator struct {
|
type chunks struct {
|
||||||
x bool
|
*Model
|
||||||
y bool
|
ml.Tensor
|
||||||
|
aspectRatio image.Point
|
||||||
|
|
||||||
|
dataOnce sync.Once
|
||||||
|
data []float32
|
||||||
|
}
|
||||||
|
|
||||||
|
type chunk struct {
|
||||||
|
*chunks
|
||||||
|
s, n int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *chunk) floats() []float32 {
|
||||||
|
r.dataOnce.Do(func() {
|
||||||
|
temp := r.Backend().NewContext()
|
||||||
|
defer temp.Close()
|
||||||
|
temp.Forward(r.Tensor).Compute(r.Tensor)
|
||||||
|
r.data = r.Floats()
|
||||||
|
})
|
||||||
|
|
||||||
|
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
var result []input.Input
|
var result []input.Input
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if len(inp.Multimodal) == 0 {
|
if inp.Multimodal == nil {
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
t := inp.Multimodal.(*chunks)
|
||||||
var imageInputs []input.Input
|
var imageInputs []input.Input
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
|
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
|
||||||
|
|
||||||
for i, mm := range inp.Multimodal {
|
var offset int
|
||||||
patchesPerChunk := mm.Tensor.Dim(1)
|
patchesPerChunk := t.Dim(1)
|
||||||
|
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
|
||||||
|
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
|
||||||
|
|
||||||
if i < len(inp.Multimodal)-1 {
|
for range t.aspectRatio.Y {
|
||||||
separator := mm.Data.(*separator)
|
for x := range t.aspectRatio.X {
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
if x < t.aspectRatio.X-1 {
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
|
||||||
if separator.x {
|
}
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
|
offset += patchesPerChunk
|
||||||
}
|
}
|
||||||
if separator.y {
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
|
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
|
||||||
}
|
|
||||||
} else {
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
|
||||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
|
||||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||||
|
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||||
|
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
|
||||||
|
|
||||||
result = append(result, imageInputs...)
|
result = append(result, imageInputs...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,8 +174,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,8 +8,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -33,8 +31,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
|
|||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
|
|
||||||
if useRope {
|
if useRope {
|
||||||
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
|
||||||
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
|
||||||
}
|
}
|
||||||
|
|
||||||
if opts.useQKNorm {
|
if opts.useQKNorm {
|
||||||
@@ -82,7 +80,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
|
|||||||
|
|
||||||
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
|
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
|
||||||
for i := 1; i < opts.numExpertsUsed; i++ {
|
for i := 1; i < opts.numExpertsUsed; i++ {
|
||||||
nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
|
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
|
||||||
}
|
}
|
||||||
|
|
||||||
return nextStates
|
return nextStates
|
||||||
@@ -212,7 +210,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
||||||
|
|
||||||
for _, mi := range batch.Multimodal {
|
for _, mi := range batch.Multimodal {
|
||||||
img := mi.Multimodal[0].Tensor
|
f32s := mi.Multimodal.(*chunk).floats()
|
||||||
|
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,7 +226,11 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
|
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
|
var err error
|
||||||
|
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
for i, layer := range m.Layers {
|
||||||
@@ -248,5 +255,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
|
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -245,7 +245,10 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
|
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
|
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -31,26 +32,31 @@ var _ model.MultimodalProcessor = (*Model)(nil)
|
|||||||
var _ model.TextProcessor = (*Model)(nil)
|
var _ model.TextProcessor = (*Model)(nil)
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
|
textModel, err := NewTextModel(c)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
m := &Model{
|
m := &Model{
|
||||||
|
TextModel: textModel,
|
||||||
|
VisionModel: newVisionModel(c),
|
||||||
|
ImageProcessor: newImageProcessor(c),
|
||||||
|
MultiModalProjector: newMultiModalProjector(c),
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
TextModel: newTextModel(c),
|
|
||||||
VisionModel: newVisionModel(c),
|
|
||||||
ImageProcessor: newImageProcessor(c),
|
|
||||||
MultiModalProjector: newMultiModalProjector(c),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||||
@@ -99,7 +105,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) == 0 {
|
if len(m.VisionModel.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -114,20 +120,46 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||||
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
|
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
|
||||||
|
|
||||||
// split into patches to be sent to the text transformer
|
// split into patches to be sent to the text transformer
|
||||||
rows := make([]input.Multimodal, size.Y)
|
parent := imageFeatures{tensor: features}
|
||||||
|
rows := make([]*imageRow, size.Y)
|
||||||
for i := range rows {
|
for i := range rows {
|
||||||
rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
|
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rows, nil
|
return rows, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type imageFeatures struct {
|
||||||
|
tensor ml.Tensor
|
||||||
|
|
||||||
|
dataOnce sync.Once
|
||||||
|
data []float32
|
||||||
|
}
|
||||||
|
|
||||||
|
type imageRow struct {
|
||||||
|
parent *imageFeatures
|
||||||
|
s int
|
||||||
|
shape []int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *imageRow) data() []float32 {
|
||||||
|
n := 1
|
||||||
|
for _, s := range r.shape {
|
||||||
|
n *= s
|
||||||
|
}
|
||||||
|
|
||||||
|
return r.parent.data[r.s*n : (r.s+1)*n]
|
||||||
|
}
|
||||||
|
|
||||||
// PostTokenize arranges Mistral 3's inputs for the forward pass
|
// PostTokenize arranges Mistral 3's inputs for the forward pass
|
||||||
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
|
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
|
||||||
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
|
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
|
||||||
@@ -136,14 +168,15 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
var result []input.Input
|
var result []input.Input
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if len(inp.Multimodal) == 0 {
|
if inp.Multimodal == nil {
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
} else {
|
} else {
|
||||||
for i, row := range inp.Multimodal {
|
inputMultimodal := inp.Multimodal.([]*imageRow)
|
||||||
|
for i, row := range inputMultimodal {
|
||||||
// [IMG]
|
// [IMG]
|
||||||
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
|
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
|
||||||
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
|
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
|
||||||
if i == len(inp.Multimodal)-1 {
|
if i == len(inputMultimodal)-1 {
|
||||||
// [IMG_END]
|
// [IMG_END]
|
||||||
result = append(result, input.Input{Token: 13})
|
result = append(result, input.Input{Token: 13})
|
||||||
} else {
|
} else {
|
||||||
@@ -158,8 +191,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,24 +1,27 @@
|
|||||||
package mistral3
|
package mistral3
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"cmp"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
"github.com/ollama/ollama/model"
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
type TextOptions struct {
|
type TextOptions struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
hiddenSize, numHeads, numKVHeads, headDim int
|
||||||
headDim, ropeDim int
|
eps, ropeBase, ropeScale float32
|
||||||
eps, ropeBase, ropeScale float32
|
ropeDim uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextModel struct {
|
type TextModel struct {
|
||||||
|
model.Base
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||||
Layers []Layer `gguf:"blk"`
|
Layers []Layer `gguf:"blk"`
|
||||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||||
@@ -36,15 +39,19 @@ type SelfAttention struct {
|
|||||||
|
|
||||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
ropeType := uint32(0)
|
||||||
|
headDim := opts.headDim
|
||||||
|
if headDim == 0 {
|
||||||
|
headDim = opts.hiddenSize / opts.numHeads
|
||||||
|
}
|
||||||
|
|
||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
@@ -55,7 +62,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
|
return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type MLP struct {
|
type MLP struct {
|
||||||
@@ -102,7 +109,20 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
|
|
||||||
// image embeddings
|
// image embeddings
|
||||||
for _, image := range batch.Multimodal {
|
for _, image := range batch.Multimodal {
|
||||||
imageFeature := image.Multimodal[0].Tensor
|
row := image.Multimodal.(*imageRow)
|
||||||
|
row.parent.dataOnce.Do(func() {
|
||||||
|
// use a new, throwaway context so the image tensor is not added to the graph
|
||||||
|
temp := m.Backend().NewContext()
|
||||||
|
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
|
||||||
|
row.parent.data = row.parent.tensor.Floats()
|
||||||
|
temp.Close()
|
||||||
|
})
|
||||||
|
|
||||||
|
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
|
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,18 +141,24 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
return m.Output.Forward(ctx, hiddenState)
|
return m.Output.Forward(ctx, hiddenState)
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTextModel(c fs.Config) *TextModel {
|
func NewTextModel(c fs.Config) (*TextModel, error) {
|
||||||
return &TextModel{
|
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
|
||||||
|
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
|
||||||
|
}
|
||||||
|
|
||||||
|
textModel := &TextModel{
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
TextOptions: &TextOptions{
|
TextOptions: &TextOptions{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
headDim: int(c.Uint("attention.key_length")),
|
headDim: int(c.Uint("attention.key_length")),
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
|
ropeDim: c.Uint("rope.dimension_count"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return textModel, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,8 +110,15 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
|
h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
|
||||||
w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||||
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||||
@@ -144,7 +151,10 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
|
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
|
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
|
||||||
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
|
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
|
||||||
@@ -160,7 +170,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
|
|||||||
|
|
||||||
func newVisionModel(c fs.Config) *VisionModel {
|
func newVisionModel(c fs.Config) *VisionModel {
|
||||||
return &VisionModel{
|
return &VisionModel{
|
||||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
|
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
|
||||||
VisionModelOptions: &VisionModelOptions{
|
VisionModelOptions: &VisionModelOptions{
|
||||||
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
|
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
|
||||||
numHeads: int(c.Uint("vision.attention.head_count", 16)),
|
numHeads: int(c.Uint("vision.attention.head_count", 16)),
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ package mllama
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -38,13 +37,13 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
@@ -59,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
return &m, nil
|
return &m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
|
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -74,20 +73,21 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if ratio.numTiles() < m.maxNumTiles {
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
|
||||||
// Pad tiles to maxNumTiles
|
if err != nil {
|
||||||
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
|
return nil, err
|
||||||
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
|
pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
|
||||||
aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
|
||||||
|
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
|
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
|
||||||
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
||||||
projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
|
return m.Projector.Forward(ctx, crossAttentionStates), nil
|
||||||
|
|
||||||
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||||
@@ -103,11 +103,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
var crossAttentionStates ml.Tensor
|
var crossAttentionStates ml.Tensor
|
||||||
if len(batch.Multimodal) > 0 {
|
if len(batch.Multimodal) > 0 {
|
||||||
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
|
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
|
||||||
}
|
}
|
||||||
|
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: attention mask, cross attention mask
|
// TODO: attention mask, cross attention mask
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
|
||||||
|
|||||||
@@ -8,8 +8,6 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type TextSelfAttention struct {
|
type TextSelfAttention struct {
|
||||||
@@ -23,14 +21,15 @@ type TextSelfAttention struct {
|
|||||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||||
batchSize := hiddenState.Dim(1)
|
batchSize := hiddenState.Dim(1)
|
||||||
headDim := opts.hiddenSize / opts.numHeads
|
headDim := opts.hiddenSize / opts.numHeads
|
||||||
|
ropeType := uint32(0)
|
||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenState)
|
query := sa.Query.Forward(ctx, hiddenState)
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
key := sa.Key.Forward(ctx, hiddenState)
|
key := sa.Key.Forward(ctx, hiddenState)
|
||||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||||
|
|
||||||
value := sa.Value.Forward(ctx, hiddenState)
|
value := sa.Value.Forward(ctx, hiddenState)
|
||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
@@ -45,7 +44,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
|
|||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
// This will only get called for layers in the cache, which are just the self attention layers
|
// This will only get called for layers in the cache, which are just the self attention layers
|
||||||
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
|
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
|
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return key, nil
|
return key, nil
|
||||||
@@ -200,8 +199,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
|
|||||||
|
|
||||||
type TextModelOptions struct {
|
type TextModelOptions struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
hiddenSize, numHeads, numKVHeads int
|
||||||
ropeDim int
|
|
||||||
eps, ropeBase, ropeScale float32
|
eps, ropeBase, ropeScale float32
|
||||||
|
ropeDim uint32
|
||||||
|
|
||||||
crossAttentionLayers []int32
|
crossAttentionLayers []int32
|
||||||
}
|
}
|
||||||
@@ -241,10 +240,10 @@ func newTextModel(c fs.Config) *TextModel {
|
|||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
|
ropeDim: c.Uint("rope.dimension_count"),
|
||||||
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
|
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ type VisionSelfAttention struct {
|
|||||||
Key *nn.Linear `gguf:"attn_k"`
|
Key *nn.Linear `gguf:"attn_k"`
|
||||||
Value *nn.Linear `gguf:"attn_v"`
|
Value *nn.Linear `gguf:"attn_v"`
|
||||||
Output *nn.Linear `gguf:"attn_output"`
|
Output *nn.Linear `gguf:"attn_output"`
|
||||||
|
|
||||||
|
Gate ml.Tensor `gguf:"attn_gate"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||||
@@ -23,16 +25,27 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
|
|||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenState)
|
query := sa.Query.Forward(ctx, hiddenState)
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
||||||
|
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
|
|
||||||
key := sa.Key.Forward(ctx, hiddenState)
|
key := sa.Key.Forward(ctx, hiddenState)
|
||||||
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
||||||
|
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
|
|
||||||
value := sa.Value.Forward(ctx, hiddenState)
|
value := sa.Value.Forward(ctx, hiddenState)
|
||||||
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
||||||
|
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
|
scores := key.Mulmat(ctx, query)
|
||||||
|
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||||
|
scores = scores.Softmax(ctx)
|
||||||
|
|
||||||
|
attention := value.Mulmat(ctx, scores)
|
||||||
|
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
|
||||||
|
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
||||||
return sa.Output.Forward(ctx, attention)
|
|
||||||
|
hiddenState = sa.Output.Forward(ctx, attention)
|
||||||
|
return hiddenState
|
||||||
}
|
}
|
||||||
|
|
||||||
type VisionMLP struct {
|
type VisionMLP struct {
|
||||||
@@ -63,18 +76,21 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
|
|||||||
// self attention
|
// self attention
|
||||||
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||||
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
||||||
|
|
||||||
if e.AttentionGate != nil {
|
if e.AttentionGate != nil {
|
||||||
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
|
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
|
||||||
}
|
}
|
||||||
hiddenState = hiddenState.Add(ctx, residual)
|
hiddenState = hiddenState.Add(ctx, residual)
|
||||||
residual = hiddenState
|
residual = hiddenState
|
||||||
|
|
||||||
|
// feed forward
|
||||||
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||||
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
||||||
|
hiddenState = hiddenState.Add(ctx, residual)
|
||||||
if e.MLPGate != nil {
|
if e.MLPGate != nil {
|
||||||
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
|
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
|
||||||
}
|
}
|
||||||
hiddenState = hiddenState.Add(ctx, residual)
|
|
||||||
return hiddenState
|
return hiddenState
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,5 @@ import (
|
|||||||
_ "github.com/ollama/ollama/model/models/llama4"
|
_ "github.com/ollama/ollama/model/models/llama4"
|
||||||
_ "github.com/ollama/ollama/model/models/mistral3"
|
_ "github.com/ollama/ollama/model/models/mistral3"
|
||||||
_ "github.com/ollama/ollama/model/models/mllama"
|
_ "github.com/ollama/ollama/model/models/mllama"
|
||||||
_ "github.com/ollama/ollama/model/models/qwen2"
|
|
||||||
_ "github.com/ollama/ollama/model/models/qwen25vl"
|
_ "github.com/ollama/ollama/model/models/qwen25vl"
|
||||||
_ "github.com/ollama/ollama/model/models/qwen3"
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,164 +0,0 @@
|
|||||||
package qwen2
|
|
||||||
|
|
||||||
import (
|
|
||||||
"cmp"
|
|
||||||
"math"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
|
||||||
"github.com/ollama/ollama/kvcache"
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/ml/nn"
|
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Options struct {
|
|
||||||
hiddenSize, numHeads, numKVHeads int
|
|
||||||
headDim, ropeDim int
|
|
||||||
eps, ropeBase, ropeScale float32
|
|
||||||
}
|
|
||||||
|
|
||||||
type Attention struct {
|
|
||||||
Query *nn.Linear `gguf:"attn_q"`
|
|
||||||
Key *nn.Linear `gguf:"attn_k"`
|
|
||||||
Value *nn.Linear `gguf:"attn_v"`
|
|
||||||
Output *nn.Linear `gguf:"attn_output"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
batchSize := hiddenStates.Dim(1)
|
|
||||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
|
||||||
ropeDim := cmp.Or(opts.ropeDim, headDim)
|
|
||||||
|
|
||||||
query := attn.Query.Forward(ctx, hiddenStates)
|
|
||||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
|
||||||
|
|
||||||
key := attn.Key.Forward(ctx, hiddenStates)
|
|
||||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
|
||||||
|
|
||||||
value := attn.Value.Forward(ctx, hiddenStates)
|
|
||||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
|
||||||
|
|
||||||
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
|
|
||||||
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
|
|
||||||
|
|
||||||
return attn.Output.Forward(ctx, attention)
|
|
||||||
}
|
|
||||||
|
|
||||||
type MLP struct {
|
|
||||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
|
||||||
Up *nn.Linear `gguf:"ffn_up"`
|
|
||||||
Down *nn.Linear `gguf:"ffn_down"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
|
|
||||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
|
||||||
return mlp.Down.Forward(ctx, hiddenStates)
|
|
||||||
}
|
|
||||||
|
|
||||||
type DecoderLayer struct {
|
|
||||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
|
||||||
Attention *Attention
|
|
||||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
|
||||||
MLP *MLP
|
|
||||||
}
|
|
||||||
|
|
||||||
func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
residual := hiddenStates
|
|
||||||
|
|
||||||
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
|
|
||||||
if outputs != nil {
|
|
||||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
|
||||||
residual = residual.Rows(ctx, outputs)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
||||||
residual = hiddenStates
|
|
||||||
|
|
||||||
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.MLP.Forward(ctx, hiddenStates)
|
|
||||||
return hiddenStates.Add(ctx, residual)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Model struct {
|
|
||||||
model.Base
|
|
||||||
model.BytePairEncoding
|
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
|
||||||
Layers []DecoderLayer `gguf:"blk"`
|
|
||||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
|
||||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
|
||||||
|
|
||||||
Options
|
|
||||||
}
|
|
||||||
|
|
||||||
// Forward implements model.Model.
|
|
||||||
func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
|
||||||
|
|
||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
|
||||||
m.Cache.SetLayer(i)
|
|
||||||
|
|
||||||
var outputs ml.Tensor
|
|
||||||
if i == len(m.Layers)-1 {
|
|
||||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
|
||||||
hiddenStates = m.Output.Forward(ctx, hiddenStates)
|
|
||||||
return hiddenStates, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
|
||||||
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
|
|
||||||
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
|
||||||
m := Model{
|
|
||||||
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
EOS: append(
|
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
|
||||||
),
|
|
||||||
Options: Options{
|
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
|
||||||
headDim: int(c.Uint("attention.key_length")),
|
|
||||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
|
||||||
return &m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
model.Register("qwen2", New)
|
|
||||||
}
|
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"image"
|
"image"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
"github.com/ollama/ollama/fs"
|
||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
@@ -34,13 +35,12 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
|
||||||
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
EOS: append(
|
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
TextModel: NewTextModel(c),
|
TextModel: NewTextModel(c),
|
||||||
@@ -69,12 +69,15 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
|
|||||||
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
|
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
|
||||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||||
|
|
||||||
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
|
pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
return pixelValues, grid, nil
|
return pixelValues, grid, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||||
if len(m.VisionModel.Layers) == 0 {
|
if len(m.VisionModel.Layers) == 0 {
|
||||||
return nil, model.ErrNoVisionModel
|
return nil, model.ErrNoVisionModel
|
||||||
}
|
}
|
||||||
@@ -85,7 +88,31 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
|
|||||||
}
|
}
|
||||||
|
|
||||||
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
|
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
|
||||||
return []input.Multimodal{{Tensor: visionOutputs}}, nil
|
return &chunks{Model: m, Tensor: visionOutputs}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type chunks struct {
|
||||||
|
*Model
|
||||||
|
ml.Tensor
|
||||||
|
|
||||||
|
dataOnce sync.Once
|
||||||
|
data []float32
|
||||||
|
}
|
||||||
|
|
||||||
|
type chunk struct {
|
||||||
|
*chunks
|
||||||
|
s, n int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *chunk) floats() []float32 {
|
||||||
|
r.dataOnce.Do(func() {
|
||||||
|
temp := r.Backend().NewContext()
|
||||||
|
defer temp.Close()
|
||||||
|
temp.Forward(r.Tensor).Compute(r.Tensor)
|
||||||
|
r.data = r.Floats()
|
||||||
|
})
|
||||||
|
|
||||||
|
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
|
||||||
}
|
}
|
||||||
|
|
||||||
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
|
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
|
||||||
@@ -115,15 +142,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
result = append(result, input.Input{Token: pre[i]})
|
result = append(result, input.Input{Token: pre[i]})
|
||||||
}
|
}
|
||||||
|
|
||||||
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
|
// This is an image token with multimodal data
|
||||||
|
chunksData := inp.Multimodal.(*chunks)
|
||||||
|
patchesPerChunk := chunksData.Dim(1)
|
||||||
|
|
||||||
// First add the vision start token
|
// First add the vision start token
|
||||||
result = append(result, input.Input{Token: visionStartToken})
|
result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
|
||||||
|
|
||||||
// Add the image token with the multimodal tensor data at the first position
|
// Add the image token with the multimodal tensor data at the first position
|
||||||
|
// Create a chunk with proper s and n values
|
||||||
result = append(result, input.Input{
|
result = append(result, input.Input{
|
||||||
Token: imageToken,
|
Token: imageToken,
|
||||||
Multimodal: inp.Multimodal,
|
Multimodal: &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
|
||||||
MultimodalHash: inp.MultimodalHash,
|
MultimodalHash: inp.MultimodalHash,
|
||||||
SameBatch: patchesPerChunk,
|
SameBatch: patchesPerChunk,
|
||||||
})
|
})
|
||||||
@@ -139,8 +169,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
|
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,15 +7,13 @@ import (
|
|||||||
"github.com/ollama/ollama/kvcache"
|
"github.com/ollama/ollama/kvcache"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
"github.com/ollama/ollama/ml/nn"
|
"github.com/ollama/ollama/ml/nn"
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
"github.com/ollama/ollama/model/input"
|
||||||
)
|
)
|
||||||
|
|
||||||
type TextOptions struct {
|
type TextOptions struct {
|
||||||
hiddenSize, numHeads, numKVHeads int
|
ctxLen, hiddenSize, numHeads, numKVHeads int
|
||||||
ropeDim, originalContextLength int
|
eps, ropeBase, ropeScale float32
|
||||||
eps, ropeBase, ropeScale float32
|
ropeDim, defaultContextLen uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextModel struct {
|
type TextModel struct {
|
||||||
@@ -31,14 +29,15 @@ func NewTextModel(c fs.Config) *TextModel {
|
|||||||
m := TextModel{
|
m := TextModel{
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
TextOptions: &TextOptions{
|
TextOptions: &TextOptions{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
ctxLen: int(c.Uint("context_length")),
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
numHeads: int(c.Uint("attention.head_count")),
|
||||||
ropeDim: int(c.Uint("rope.dimension_count", 128)),
|
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||||
originalContextLength: int(c.Uint("context_length", 128000)),
|
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
ropeBase: c.Float("rope.freq_base"),
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
ropeScale: c.Float("rope.freq_scale", 1),
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
ropeDim: c.Uint("rope.dimension_count", 128),
|
||||||
|
defaultContextLen: c.Uint("context_length", 128000),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,11 +59,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
|
|
||||||
q := sa.Query.Forward(ctx, hiddenState)
|
q := sa.Query.Forward(ctx, hiddenState)
|
||||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||||
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
|
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
|
||||||
|
|
||||||
k := sa.Key.Forward(ctx, hiddenState)
|
k := sa.Key.Forward(ctx, hiddenState)
|
||||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
|
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
|
||||||
|
|
||||||
v := sa.Value.Forward(ctx, hiddenState)
|
v := sa.Value.Forward(ctx, hiddenState)
|
||||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||||
@@ -78,7 +77,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
|||||||
|
|
||||||
// Shift applies rotary position embeddings to the key tensor for causal attention caching
|
// Shift applies rotary position embeddings to the key tensor for causal attention caching
|
||||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
|
return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// MLP implements the feed-forward network component with SwiGLU activation
|
// MLP implements the feed-forward network component with SwiGLU activation
|
||||||
@@ -130,7 +129,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
|||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
||||||
|
|
||||||
for _, mi := range batch.Multimodal {
|
for _, mi := range batch.Multimodal {
|
||||||
img := mi.Multimodal[0].Tensor
|
f32s := mi.Multimodal.(*chunk).floats()
|
||||||
|
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package qwen25vl
|
package qwen25vl
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"slices"
|
"slices"
|
||||||
|
|
||||||
@@ -43,8 +44,10 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
|
mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
// Reshape to match [seqLength, seqLength, 1] for broadcasting
|
// Reshape to match [seqLength, seqLength, 1] for broadcasting
|
||||||
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
|
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
|
||||||
|
|
||||||
@@ -300,7 +303,10 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
t := ctx.Input().FromIntSlice(index, len(index))
|
t, err := ctx.Input().FromIntSlice(index, len(index))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
return t, bounds
|
return t, bounds
|
||||||
}
|
}
|
||||||
@@ -320,7 +326,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
|
|||||||
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
|
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
|
freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
|
||||||
|
}
|
||||||
|
|
||||||
// Create position coordinates (y,x pairs) for the grid
|
// Create position coordinates (y,x pairs) for the grid
|
||||||
// In PyTorch: Equivalent to generating position ids with torch.arange()
|
// In PyTorch: Equivalent to generating position ids with torch.arange()
|
||||||
@@ -330,7 +339,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
|
|||||||
coords = append(coords, int32(y), int32(x))
|
coords = append(coords, int32(y), int32(x))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
|
pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("failed to create tensor from positions: %w", err))
|
||||||
|
}
|
||||||
|
|
||||||
// Reshape and permute positions to match spatial merging pattern
|
// Reshape and permute positions to match spatial merging pattern
|
||||||
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
|
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
|
||||||
|
|||||||
@@ -1,233 +0,0 @@
|
|||||||
package qwen3
|
|
||||||
|
|
||||||
import (
|
|
||||||
"cmp"
|
|
||||||
"math"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs"
|
|
||||||
"github.com/ollama/ollama/kvcache"
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/ml/nn"
|
|
||||||
"github.com/ollama/ollama/ml/nn/fast"
|
|
||||||
"github.com/ollama/ollama/ml/nn/rope"
|
|
||||||
"github.com/ollama/ollama/model"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Options struct {
|
|
||||||
hiddenSize, numHeads, numKVHeads int
|
|
||||||
eps float32
|
|
||||||
ropeBase, ropeScale float32
|
|
||||||
|
|
||||||
keyLength, valueLength int
|
|
||||||
|
|
||||||
numExperts, numExpertsUsed int
|
|
||||||
normTopKProb bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func (o Options) headDim() int {
|
|
||||||
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Attention struct {
|
|
||||||
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
|
|
||||||
Query *nn.Linear `gguf:"attn_q"`
|
|
||||||
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
|
|
||||||
Key *nn.Linear `gguf:"attn_k"`
|
|
||||||
Value *nn.Linear `gguf:"attn_v"`
|
|
||||||
Output *nn.Linear `gguf:"attn_output"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
batchSize := hiddenStates.Dim(1)
|
|
||||||
|
|
||||||
query := sa.Query.Forward(ctx, hiddenStates)
|
|
||||||
key := sa.Key.Forward(ctx, hiddenStates)
|
|
||||||
value := sa.Value.Forward(ctx, hiddenStates)
|
|
||||||
|
|
||||||
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
|
|
||||||
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
|
||||||
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
|
||||||
|
|
||||||
query = sa.QueryNorm.Forward(ctx, query, opts.eps)
|
|
||||||
key = sa.KeyNorm.Forward(ctx, key, opts.eps)
|
|
||||||
|
|
||||||
query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
|
||||||
|
|
||||||
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
|
|
||||||
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
|
|
||||||
return sa.Output.Forward(ctx, attention)
|
|
||||||
}
|
|
||||||
|
|
||||||
type MLP interface {
|
|
||||||
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
|
|
||||||
}
|
|
||||||
|
|
||||||
type sparse struct {
|
|
||||||
Router *nn.Linear `gguf:"ffn_gate_inp"`
|
|
||||||
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
|
|
||||||
Up ml.Tensor `gguf:"ffn_up_exps.weight"`
|
|
||||||
Down ml.Tensor `gguf:"ffn_down_exps.weight"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
|
|
||||||
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
|
|
||||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
|
|
||||||
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
|
|
||||||
|
|
||||||
routingWeights := routerLogits.Softmax(ctx)
|
|
||||||
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
|
|
||||||
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
|
|
||||||
if opts.normTopKProb {
|
|
||||||
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
|
|
||||||
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
|
|
||||||
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
|
|
||||||
|
|
||||||
upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts)
|
|
||||||
|
|
||||||
hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts)
|
|
||||||
hiddenStates = hiddenStates.SILU(ctx)
|
|
||||||
hiddenStates = hiddenStates.Mul(ctx, upStates)
|
|
||||||
|
|
||||||
experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts)
|
|
||||||
experts = experts.Mul(ctx, routingWeights)
|
|
||||||
|
|
||||||
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
|
|
||||||
for i := 1; i < opts.numExpertsUsed; i++ {
|
|
||||||
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
|
|
||||||
}
|
|
||||||
|
|
||||||
return nextStates
|
|
||||||
}
|
|
||||||
|
|
||||||
type dense struct {
|
|
||||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
|
||||||
Up *nn.Linear `gguf:"ffn_up"`
|
|
||||||
Down *nn.Linear `gguf:"ffn_down"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
|
|
||||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
|
||||||
return mlp.Down.Forward(ctx, hiddenStates)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Layer struct {
|
|
||||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
|
||||||
*Attention
|
|
||||||
|
|
||||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
|
||||||
MLP
|
|
||||||
}
|
|
||||||
|
|
||||||
func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
|
||||||
residual := hiddenStates
|
|
||||||
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
|
|
||||||
|
|
||||||
if outputs != nil {
|
|
||||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
|
||||||
residual = residual.Rows(ctx, outputs)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
||||||
|
|
||||||
residual = hiddenStates
|
|
||||||
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
||||||
hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
|
|
||||||
return hiddenStates.Add(ctx, residual)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Model struct {
|
|
||||||
model.Base
|
|
||||||
model.BytePairEncoding
|
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
|
||||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
|
||||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
|
||||||
|
|
||||||
Layers []Layer `gguf:"blk"`
|
|
||||||
|
|
||||||
*Options
|
|
||||||
}
|
|
||||||
|
|
||||||
// Forward implements model.Model.
|
|
||||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
|
||||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
|
||||||
|
|
||||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
|
||||||
|
|
||||||
for i, layer := range m.Layers {
|
|
||||||
m.Cache.SetLayer(i)
|
|
||||||
|
|
||||||
var outputs ml.Tensor
|
|
||||||
if i == len(m.Layers)-1 {
|
|
||||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
|
|
||||||
}
|
|
||||||
|
|
||||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
|
||||||
return m.Output.Forward(ctx, hiddenStates), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
|
||||||
return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ model.Model = (*Model)(nil)
|
|
||||||
|
|
||||||
func New(c fs.Config) (model.Model, error) {
|
|
||||||
layers := make([]Layer, c.Uint("block_count"))
|
|
||||||
for i := range layers {
|
|
||||||
if c.String("general.architecture") == "qwen3moe" {
|
|
||||||
layers[i].MLP = &sparse{}
|
|
||||||
} else {
|
|
||||||
layers[i].MLP = &dense{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m := Model{
|
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
|
||||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
|
||||||
&model.Vocabulary{
|
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
|
||||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
EOS: append(
|
|
||||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
|
||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
|
||||||
),
|
|
||||||
},
|
|
||||||
),
|
|
||||||
Layers: layers,
|
|
||||||
Options: &Options{
|
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
|
||||||
numHeads: int(c.Uint("attention.head_count")),
|
|
||||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
|
||||||
keyLength: int(c.Uint("attention.key_length")),
|
|
||||||
valueLength: int(c.Uint("attention.value_length")),
|
|
||||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
|
||||||
ropeBase: c.Float("rope.freq_base"),
|
|
||||||
ropeScale: c.Float("rope.freq_scale", 1),
|
|
||||||
numExperts: int(c.Uint("expert_count")),
|
|
||||||
numExpertsUsed: int(c.Uint("expert_used_count")),
|
|
||||||
normTopKProb: c.Bool("norm_top_k_prob", true),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
|
||||||
return &m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
model.Register("qwen3", New)
|
|
||||||
model.Register("qwen3moe", New)
|
|
||||||
}
|
|
||||||
@@ -5,13 +5,116 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"iter"
|
"iter"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/dlclark/regexp2"
|
"github.com/dlclark/regexp2"
|
||||||
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
|
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
|
||||||
"github.com/ollama/ollama/logutil"
|
"github.com/ollama/ollama/logutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type Special int32
|
||||||
|
|
||||||
|
const (
|
||||||
|
SpecialBOS Special = iota
|
||||||
|
SpecialEOS
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
TOKEN_TYPE_NORMAL = iota + 1
|
||||||
|
TOKEN_TYPE_UNKNOWN
|
||||||
|
TOKEN_TYPE_CONTROL
|
||||||
|
TOKEN_TYPE_USER_DEFINED
|
||||||
|
TOKEN_TYPE_UNUSED
|
||||||
|
TOKEN_TYPE_BYTE
|
||||||
|
)
|
||||||
|
|
||||||
|
type TextProcessor interface {
|
||||||
|
Encode(s string, addSpecial bool) ([]int32, error)
|
||||||
|
Decode([]int32) (string, error)
|
||||||
|
Is(int32, Special) bool
|
||||||
|
Vocabulary() *Vocabulary
|
||||||
|
}
|
||||||
|
|
||||||
|
type Vocabulary struct {
|
||||||
|
Values []string
|
||||||
|
Types []int32
|
||||||
|
Scores []float32
|
||||||
|
Merges []string
|
||||||
|
|
||||||
|
BOS, EOS, EOT int32
|
||||||
|
AddBOS, AddEOS, AddEOT bool
|
||||||
|
|
||||||
|
specialOnce sync.Once
|
||||||
|
special []string
|
||||||
|
|
||||||
|
valuesOnce sync.Once
|
||||||
|
values map[string]int32
|
||||||
|
|
||||||
|
mergeOnce sync.Once
|
||||||
|
merge map[string]int32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Is(id int32, special Special) bool {
|
||||||
|
switch special {
|
||||||
|
case SpecialBOS:
|
||||||
|
return id == v.BOS
|
||||||
|
case SpecialEOS:
|
||||||
|
return id == v.EOS || id == v.EOT
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Encode(s string) int32 {
|
||||||
|
v.valuesOnce.Do(func() {
|
||||||
|
v.values = make(map[string]int32, len(v.Values))
|
||||||
|
for i, value := range v.Values {
|
||||||
|
v.values[value] = int32(i)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if id, ok := v.values[s]; ok {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Decode(id int32) string {
|
||||||
|
return v.Values[id]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) SpecialVocabulary() []string {
|
||||||
|
v.specialOnce.Do(func() {
|
||||||
|
for i := range v.Values {
|
||||||
|
if slices.Contains([]int{105, 106}, i) {
|
||||||
|
v.special = append(v.special, v.Values[i])
|
||||||
|
} else if v.Types[i] == TOKEN_TYPE_CONTROL {
|
||||||
|
v.special = append(v.special, v.Values[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
return v.special
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *Vocabulary) Merge(left, right string) int {
|
||||||
|
v.mergeOnce.Do(func() {
|
||||||
|
v.merge = make(map[string]int32, len(v.Merges))
|
||||||
|
for i, merge := range v.Merges {
|
||||||
|
v.merge[merge] = int32(i)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if id, ok := v.merge[left+" "+right]; ok {
|
||||||
|
return int(id)
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
type BytePairEncoding struct {
|
type BytePairEncoding struct {
|
||||||
pre *regexp2.Regexp
|
pre *regexp2.Regexp
|
||||||
vocab *Vocabulary
|
vocab *Vocabulary
|
||||||
@@ -201,12 +304,27 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
|
|
||||||
|
|
||||||
if addSpecial && len(ids) > 0 {
|
if addSpecial && len(ids) > 0 {
|
||||||
ids = bpe.vocab.addSpecials(ids)
|
if bpe.vocab.AddBOS {
|
||||||
|
if ids[0] == bpe.vocab.BOS {
|
||||||
|
slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
|
||||||
|
ids = append([]int32{bpe.vocab.BOS}, ids...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if bpe.vocab.AddEOS {
|
||||||
|
if ids[len(ids)-1] == bpe.vocab.EOS {
|
||||||
|
slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
|
||||||
|
ids = append(ids, bpe.vocab.EOS)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
|
||||||
return ids, nil
|
return ids, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -234,6 +352,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
|
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
|
||||||
return sb.String(), nil
|
return sb.String(), nil
|
||||||
}
|
}
|
||||||
@@ -182,12 +182,27 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
|
|
||||||
|
|
||||||
if addSpecial && len(ids) > 0 {
|
if addSpecial && len(ids) > 0 {
|
||||||
ids = spm.vocab.addSpecials(ids)
|
if spm.vocab.AddBOS {
|
||||||
|
if ids[0] == spm.vocab.BOS {
|
||||||
|
slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
|
||||||
|
ids = append([]int32{spm.vocab.BOS}, ids...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if spm.vocab.AddEOS {
|
||||||
|
if ids[len(ids)-1] == spm.vocab.EOS {
|
||||||
|
slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
|
||||||
|
ids = append(ids, spm.vocab.EOS)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
|
||||||
return ids, nil
|
return ids, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -246,6 +261,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
|
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
|
||||||
return sb.String(), nil
|
return sb.String(), nil
|
||||||
}
|
}
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
package model
|
|
||||||
|
|
||||||
const (
|
|
||||||
TOKEN_TYPE_NORMAL = iota + 1
|
|
||||||
TOKEN_TYPE_UNKNOWN
|
|
||||||
TOKEN_TYPE_CONTROL
|
|
||||||
TOKEN_TYPE_USER_DEFINED
|
|
||||||
TOKEN_TYPE_UNUSED
|
|
||||||
TOKEN_TYPE_BYTE
|
|
||||||
)
|
|
||||||
|
|
||||||
type TextProcessor interface {
|
|
||||||
Encode(s string, addSpecial bool) ([]int32, error)
|
|
||||||
Decode([]int32) (string, error)
|
|
||||||
Is(int32, Special) bool
|
|
||||||
Vocabulary() *Vocabulary
|
|
||||||
}
|
|
||||||
@@ -1,112 +0,0 @@
|
|||||||
package model
|
|
||||||
|
|
||||||
import (
|
|
||||||
"log/slog"
|
|
||||||
"slices"
|
|
||||||
"sync"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Special int32
|
|
||||||
|
|
||||||
const (
|
|
||||||
SpecialBOS Special = iota
|
|
||||||
SpecialEOS
|
|
||||||
)
|
|
||||||
|
|
||||||
type Vocabulary struct {
|
|
||||||
Values []string
|
|
||||||
Types []int32
|
|
||||||
Scores []float32
|
|
||||||
Merges []string
|
|
||||||
|
|
||||||
BOS, EOS []int32
|
|
||||||
AddBOS, AddEOS bool
|
|
||||||
|
|
||||||
specialOnce sync.Once
|
|
||||||
special []string
|
|
||||||
|
|
||||||
valuesOnce sync.Once
|
|
||||||
values map[string]int32
|
|
||||||
|
|
||||||
mergeOnce sync.Once
|
|
||||||
merge map[string]int32
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Is(id int32, special Special) bool {
|
|
||||||
switch special {
|
|
||||||
case SpecialBOS:
|
|
||||||
return slices.Contains(v.BOS, id)
|
|
||||||
case SpecialEOS:
|
|
||||||
return slices.Contains(v.EOS, id)
|
|
||||||
default:
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) addSpecials(ids []int32) []int32 {
|
|
||||||
if v.AddBOS && len(v.BOS) > 0 {
|
|
||||||
if slices.Contains(v.BOS, ids[0]) {
|
|
||||||
slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("adding bos token to prompt", "id", v.BOS)
|
|
||||||
ids = append([]int32{v.BOS[0]}, ids...)
|
|
||||||
}
|
|
||||||
|
|
||||||
if v.AddEOS && len(v.EOS) > 0 {
|
|
||||||
if slices.Contains(v.BOS, ids[len(ids)-1]) {
|
|
||||||
slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("adding eos token to prompt", "id", v.EOS)
|
|
||||||
ids = append(ids, v.EOS[0])
|
|
||||||
}
|
|
||||||
|
|
||||||
return ids
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Encode(s string) int32 {
|
|
||||||
v.valuesOnce.Do(func() {
|
|
||||||
v.values = make(map[string]int32, len(v.Values))
|
|
||||||
for i, value := range v.Values {
|
|
||||||
v.values[value] = int32(i)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
if id, ok := v.values[s]; ok {
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Decode(id int32) string {
|
|
||||||
return v.Values[id]
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) SpecialVocabulary() []string {
|
|
||||||
v.specialOnce.Do(func() {
|
|
||||||
for i := range v.Values {
|
|
||||||
if v.Types[i] == TOKEN_TYPE_CONTROL {
|
|
||||||
v.special = append(v.special, v.Values[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
return v.special
|
|
||||||
}
|
|
||||||
|
|
||||||
func (v *Vocabulary) Merge(left, right string) int {
|
|
||||||
v.mergeOnce.Do(func() {
|
|
||||||
v.merge = make(map[string]int32, len(v.Merges))
|
|
||||||
for i, merge := range v.Merges {
|
|
||||||
v.merge[merge] = int32(i)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
if id, ok := v.merge[left+" "+right]; ok {
|
|
||||||
return int(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
|
|||||||
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
||||||
"used", numPast, "remaining", len(prompt)-numPast)
|
"used", numPast, "remaining", len(prompt)-numPast)
|
||||||
|
|
||||||
slot.Inputs = prompt[:numPast]
|
|
||||||
prompt = prompt[numPast:]
|
prompt = prompt[numPast:]
|
||||||
|
slot.Inputs = slot.Inputs[:numPast]
|
||||||
|
|
||||||
return slot, prompt, nil
|
return slot, prompt, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
|
|||||||
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
||||||
"used", numPast, "remaining", int32(len(prompt))-numPast)
|
"used", numPast, "remaining", int32(len(prompt))-numPast)
|
||||||
|
|
||||||
slot.Inputs = prompt[:numPast]
|
|
||||||
prompt = prompt[numPast:]
|
prompt = prompt[numPast:]
|
||||||
|
slot.Inputs = slot.Inputs[:numPast]
|
||||||
|
|
||||||
return slot, prompt, nil
|
return slot, prompt, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package ollamarunner
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"image"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -11,6 +12,10 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestCountCommon(t *testing.T) {
|
func TestCountCommon(t *testing.T) {
|
||||||
|
imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
|
||||||
|
imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
|
||||||
|
imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
t1 []input.Input
|
t1 []input.Input
|
||||||
@@ -31,20 +36,20 @@ func TestCountCommon(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Image Prefix",
|
name: "Image Prefix",
|
||||||
t1: []input.Input{{MultimodalHash: 1}},
|
t1: []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
|
||||||
t2: []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
|
t2: []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
|
||||||
expected: 1,
|
expected: 1,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Mixed",
|
name: "Mixed",
|
||||||
t1: []input.Input{{Token: 1}, {MultimodalHash: 1}},
|
t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
|
||||||
t2: []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
|
t2: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
|
||||||
expected: 2,
|
expected: 2,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Mixed, Same Length",
|
name: "Mixed, Same Length",
|
||||||
t1: []input.Input{{Token: 1}, {MultimodalHash: 1}},
|
t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
|
||||||
t2: []input.Input{{Token: 1}, {MultimodalHash: 2}},
|
t2: []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
|
||||||
expected: 1,
|
expected: 1,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,113 +0,0 @@
|
|||||||
package ollamarunner
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/ml"
|
|
||||||
"github.com/ollama/ollama/model/input"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Tensors can't be used across multiple compute graphs. This is a problem
|
|
||||||
// if a single embedding is split across batches using views since all of
|
|
||||||
// the views will have the same source tensor. We also don't want to
|
|
||||||
// recompute the entire embedding for each batch.
|
|
||||||
//
|
|
||||||
// To avoid this, we compute all of the tensors for the embedding on the
|
|
||||||
// first use and then store the result in system memory. When we need
|
|
||||||
// additional tensors, we recreate them from the stored data.
|
|
||||||
|
|
||||||
// multimodalEntry represents the embeddings of a single object (such
|
|
||||||
// as an image).
|
|
||||||
type multimodalEntry struct {
|
|
||||||
// mm is the original set of tensors created by EncodeMultimodal
|
|
||||||
mm []input.Multimodal
|
|
||||||
|
|
||||||
// data is the computed result of mm. Nil if not yet computed
|
|
||||||
data [][]float32
|
|
||||||
}
|
|
||||||
|
|
||||||
// multimodalStore maps from an individual tensor (of which there
|
|
||||||
// may be many in a single multimodal object) to its parent embedding
|
|
||||||
type multimodalStore map[ml.Tensor]*multimodalEntry
|
|
||||||
|
|
||||||
func newMultimodalStore() multimodalStore {
|
|
||||||
return make(multimodalStore)
|
|
||||||
}
|
|
||||||
|
|
||||||
// addMultimodal stores an embedding for later use in a compute graph
|
|
||||||
func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
|
|
||||||
entry := &multimodalEntry{mm: embedding}
|
|
||||||
|
|
||||||
for _, e := range embedding {
|
|
||||||
if e.Tensor != nil {
|
|
||||||
m[e.Tensor] = entry
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// getMultimodal takes a source set of tensors (which may contain a whole or
|
|
||||||
// parts of one or more images) and returns the equivalent that can be used in
|
|
||||||
// the current context
|
|
||||||
func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
|
|
||||||
out := make([]input.Multimodal, len(in))
|
|
||||||
for i := range out {
|
|
||||||
if in[i].Tensor != nil {
|
|
||||||
var err error
|
|
||||||
out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
out[i].Data = in[i].Data
|
|
||||||
}
|
|
||||||
|
|
||||||
return out, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
|
|
||||||
entry := m[in]
|
|
||||||
|
|
||||||
if entry.data == nil {
|
|
||||||
computeCtx := backend.NewContext()
|
|
||||||
defer computeCtx.Close()
|
|
||||||
|
|
||||||
var tensors []ml.Tensor
|
|
||||||
for _, t := range entry.mm {
|
|
||||||
if t.Tensor != nil {
|
|
||||||
tensors = append(tensors, t.Tensor)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(tensors) == 0 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
computeCtx.Forward(tensors...)
|
|
||||||
entry.data = make([][]float32, len(entry.mm))
|
|
||||||
|
|
||||||
if !reserve {
|
|
||||||
computeCtx.Compute(tensors...)
|
|
||||||
|
|
||||||
for i, t := range entry.mm {
|
|
||||||
if t.Tensor != nil {
|
|
||||||
entry.data[i] = t.Tensor.Floats()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
computeCtx.Reserve()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, t := range entry.mm {
|
|
||||||
if in == t.Tensor {
|
|
||||||
if !reserve {
|
|
||||||
return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
|
|
||||||
} else {
|
|
||||||
return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil, errors.New("multimodal tensor not found")
|
|
||||||
}
|
|
||||||
@@ -1,14 +1,12 @@
|
|||||||
package ollamarunner
|
package ollamarunner
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"hash/maphash"
|
"hash/maphash"
|
||||||
"image"
|
|
||||||
"log"
|
"log"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net"
|
"net"
|
||||||
@@ -22,7 +20,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"golang.org/x/image/bmp"
|
|
||||||
"golang.org/x/sync/semaphore"
|
"golang.org/x/sync/semaphore"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
@@ -43,9 +40,6 @@ type Sequence struct {
|
|||||||
// multimodal embeddings
|
// multimodal embeddings
|
||||||
ctxs []ml.Context
|
ctxs []ml.Context
|
||||||
|
|
||||||
// mmStore holds multimodal embeddings to mange memory and enable splitting across batches
|
|
||||||
mmStore multimodalStore
|
|
||||||
|
|
||||||
// batch index
|
// batch index
|
||||||
iBatch int
|
iBatch int
|
||||||
|
|
||||||
@@ -107,7 +101,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
|||||||
|
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
|
||||||
inputs, ctxs, mmStore, err := s.inputs(prompt, images)
|
inputs, ctxs, err := s.inputs(prompt, images)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to process inputs: %w", err)
|
return nil, fmt.Errorf("failed to process inputs: %w", err)
|
||||||
} else if len(inputs) == 0 {
|
} else if len(inputs) == 0 {
|
||||||
@@ -162,7 +156,6 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
|||||||
|
|
||||||
return &Sequence{
|
return &Sequence{
|
||||||
ctxs: ctxs,
|
ctxs: ctxs,
|
||||||
mmStore: mmStore,
|
|
||||||
inputs: inputs,
|
inputs: inputs,
|
||||||
numPromptInputs: len(inputs),
|
numPromptInputs: len(inputs),
|
||||||
startProcessingTime: startTime,
|
startProcessingTime: startTime,
|
||||||
@@ -181,10 +174,9 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
|||||||
// inputs processes the prompt and images into a list of inputs
|
// inputs processes the prompt and images into a list of inputs
|
||||||
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
||||||
// decoding images
|
// decoding images
|
||||||
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) {
|
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
|
||||||
var inputs []input.Input
|
var inputs []input.Input
|
||||||
var ctxs []ml.Context
|
var ctxs []ml.Context
|
||||||
var mmStore multimodalStore
|
|
||||||
|
|
||||||
var parts []string
|
var parts []string
|
||||||
var matches [][]string
|
var matches [][]string
|
||||||
@@ -195,7 +187,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
re := regexp.MustCompile(`\[img-(\d+)\]`)
|
re := regexp.MustCompile(`\[img-(\d+)\]`)
|
||||||
parts = re.Split(prompt, -1)
|
parts = re.Split(prompt, -1)
|
||||||
matches = re.FindAllStringSubmatch(prompt, -1)
|
matches = re.FindAllStringSubmatch(prompt, -1)
|
||||||
mmStore = newMultimodalStore()
|
|
||||||
} else {
|
} else {
|
||||||
parts = []string{prompt}
|
parts = []string{prompt}
|
||||||
}
|
}
|
||||||
@@ -205,7 +196,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
// text - tokenize
|
// text - tokenize
|
||||||
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
|
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
@@ -225,7 +216,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
}
|
}
|
||||||
|
|
||||||
if imageIndex < 0 {
|
if imageIndex < 0 {
|
||||||
return nil, nil, nil, fmt.Errorf("invalid image index: %d", n)
|
return nil, nil, fmt.Errorf("invalid image index: %d", n)
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx := s.model.Backend().NewContext()
|
ctx := s.model.Backend().NewContext()
|
||||||
@@ -233,15 +224,13 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
ctxs = append(ctxs, ctx)
|
ctxs = append(ctxs, ctx)
|
||||||
imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
|
imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
s.multimodalHash.Reset()
|
s.multimodalHash.Reset()
|
||||||
_, _ = s.multimodalHash.Write(images[imageIndex].Data)
|
_, _ = s.multimodalHash.Write(images[imageIndex].Data)
|
||||||
imageHash := s.multimodalHash.Sum64()
|
imageHash := s.multimodalHash.Sum64()
|
||||||
|
|
||||||
mmStore.addMultimodal(imageEmbeddings)
|
|
||||||
|
|
||||||
inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
|
inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
|
||||||
postTokenize = true
|
postTokenize = true
|
||||||
}
|
}
|
||||||
@@ -251,11 +240,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
|
|||||||
var err error
|
var err error
|
||||||
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return inputs, ctxs, mmStore, nil
|
return inputs, ctxs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type Server struct {
|
type Server struct {
|
||||||
@@ -374,9 +363,6 @@ func (s *Server) processBatch() error {
|
|||||||
}
|
}
|
||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
ctx := s.model.Backend().NewContext()
|
|
||||||
defer ctx.Close()
|
|
||||||
|
|
||||||
var batchInputs []int32
|
var batchInputs []int32
|
||||||
var batch input.Batch
|
var batch input.Batch
|
||||||
|
|
||||||
@@ -447,11 +433,7 @@ func (s *Server) processBatch() error {
|
|||||||
|
|
||||||
batchInputs = append(batchInputs, inp.Token)
|
batchInputs = append(batchInputs, inp.Token)
|
||||||
if inp.Multimodal != nil {
|
if inp.Multimodal != nil {
|
||||||
mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false)
|
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
|
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
|
||||||
@@ -477,6 +459,9 @@ func (s *Server) processBatch() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx := s.model.Backend().NewContext()
|
||||||
|
defer ctx.Close()
|
||||||
|
|
||||||
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
|
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to decode batch: %w", err)
|
return fmt.Errorf("failed to decode batch: %w", err)
|
||||||
@@ -735,71 +720,12 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||||||
ctx := s.model.Backend().NewContext()
|
ctx := s.model.Backend().NewContext()
|
||||||
defer ctx.Close()
|
defer ctx.Close()
|
||||||
|
|
||||||
var err error
|
|
||||||
inputs := make([]input.Input, s.batchSize)
|
|
||||||
mmStore := newMultimodalStore()
|
|
||||||
|
|
||||||
// Multimodal strategy:
|
|
||||||
// - Encode a 2048x2048 image. This assumes that a single image of this
|
|
||||||
// size is sufficient to trigger the worst case. This is currently true
|
|
||||||
// because for existing models, only a single image fits in a batch.
|
|
||||||
// - Add the embedding to a full batch of tokens - this is necessary because
|
|
||||||
// the model may be looking for non-image data, such as <image> tags.
|
|
||||||
// - Run PostTokenize to execute any transformations between generated
|
|
||||||
// embeddings and what the forward pass expects.
|
|
||||||
// - The result may now be larger than a batch (images may not fit in a
|
|
||||||
// single batch), so trim based on what will fit and must be grouped together.
|
|
||||||
// - Fill out the rest of the space with text tokens.
|
|
||||||
if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
|
|
||||||
mmCtx := s.model.Backend().NewContext()
|
|
||||||
defer mmCtx.Close()
|
|
||||||
|
|
||||||
img := image.NewGray(image.Rect(0, 0, 2048, 2048))
|
|
||||||
var buf bytes.Buffer
|
|
||||||
bmp.Encode(&buf, img)
|
|
||||||
|
|
||||||
if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil {
|
|
||||||
mmStore.addMultimodal(inputs[0].Multimodal)
|
|
||||||
|
|
||||||
inputs, err = multimodalProcessor.PostTokenize(inputs)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, inp := range inputs {
|
|
||||||
minBatch := 1 + inp.SameBatch
|
|
||||||
if minBatch > s.batchSize {
|
|
||||||
inputs = inputs[i:min(i+minBatch, len(inputs))]
|
|
||||||
break
|
|
||||||
} else if i+minBatch > s.batchSize {
|
|
||||||
inputs = inputs[:i]
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(inputs) < s.batchSize {
|
|
||||||
newInputs := make([]input.Input, s.batchSize)
|
|
||||||
copy(newInputs, inputs)
|
|
||||||
inputs = newInputs
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var batch input.Batch
|
var batch input.Batch
|
||||||
|
|
||||||
batchInputs := make([]int32, len(inputs))
|
inputs := make([]int32, s.batchSize)
|
||||||
batch.Positions = make([]int32, len(inputs))
|
batch.Positions = make([]int32, len(inputs))
|
||||||
batch.Sequences = make([]int, len(inputs))
|
batch.Sequences = make([]int, len(inputs))
|
||||||
for i, inp := range inputs {
|
for i := range inputs {
|
||||||
batchInputs[i] = inp.Token
|
|
||||||
if inp.Multimodal != nil {
|
|
||||||
mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm})
|
|
||||||
}
|
|
||||||
|
|
||||||
batch.Positions[i] = int32(i)
|
batch.Positions[i] = int32(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -808,7 +734,11 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||||||
batch.Outputs[i] = int32(i)
|
batch.Outputs[i] = int32(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Inputs = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))
|
var err error
|
||||||
|
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
cache := s.model.Config().Cache
|
cache := s.model.Config().Cache
|
||||||
if cache != nil {
|
if cache != nil {
|
||||||
@@ -823,12 +753,16 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.Forward(t).Reserve()
|
err = ctx.Forward(t).Reserve()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) initModel(
|
func (s *Server) loadModel(
|
||||||
|
ctx context.Context,
|
||||||
mpath string,
|
mpath string,
|
||||||
params ml.BackendParams,
|
params ml.BackendParams,
|
||||||
lpath multiLPath,
|
lpath multiLPath,
|
||||||
@@ -836,21 +770,21 @@ func (s *Server) initModel(
|
|||||||
kvCacheType string,
|
kvCacheType string,
|
||||||
kvSize int,
|
kvSize int,
|
||||||
multiUserCache bool,
|
multiUserCache bool,
|
||||||
) error {
|
) {
|
||||||
var err error
|
var err error
|
||||||
s.model, err = model.New(mpath, params)
|
s.model, err = model.New(ctx, mpath, params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(jessegross): LoRA loading
|
// TODO(jessegross): LoRA loading
|
||||||
if lpath.String() != "" {
|
if lpath.String() != "" {
|
||||||
return errors.New("loras are not yet implemented")
|
panic("loras are not yet implemented")
|
||||||
}
|
}
|
||||||
|
|
||||||
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
|
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if !s.cache.enabled && parallel > 1 {
|
if !s.cache.enabled && parallel > 1 {
|
||||||
@@ -862,30 +796,7 @@ func (s *Server) initModel(
|
|||||||
s.seqs = make([]*Sequence, s.parallel)
|
s.seqs = make([]*Sequence, s.parallel)
|
||||||
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
||||||
|
|
||||||
return s.reserveWorstCaseGraph()
|
err = s.reserveWorstCaseGraph()
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Server) load(
|
|
||||||
ctx context.Context,
|
|
||||||
mpath string,
|
|
||||||
params ml.BackendParams,
|
|
||||||
lpath multiLPath,
|
|
||||||
parallel int,
|
|
||||||
kvCacheType string,
|
|
||||||
kvSize int,
|
|
||||||
multiUserCache bool,
|
|
||||||
) {
|
|
||||||
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
|
|
||||||
|
|
||||||
err = s.model.Backend().Load(ctx,
|
|
||||||
func(progress float32) {
|
|
||||||
s.progress = progress
|
|
||||||
})
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
@@ -929,14 +840,9 @@ func Execute(args []string) error {
|
|||||||
status: llm.ServerStatusLoadingModel,
|
status: llm.ServerStatusLoadingModel,
|
||||||
}
|
}
|
||||||
|
|
||||||
server.cond = sync.NewCond(&server.mu)
|
|
||||||
server.ready.Add(1)
|
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
// TODO(jessegross): Parameters that need to be implemented:
|
// TODO(jessegross): Parameters that need to be implemented:
|
||||||
// no-mmap
|
// no-mmap
|
||||||
|
// mlock
|
||||||
|
|
||||||
var tensorSplitFloats []float32
|
var tensorSplitFloats []float32
|
||||||
if *tensorSplit != "" {
|
if *tensorSplit != "" {
|
||||||
@@ -949,6 +855,9 @@ func Execute(args []string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
params := ml.BackendParams{
|
params := ml.BackendParams{
|
||||||
|
Progress: func(progress float32) {
|
||||||
|
server.progress = progress
|
||||||
|
},
|
||||||
NumThreads: *threads,
|
NumThreads: *threads,
|
||||||
NumGPULayers: *numGPULayers,
|
NumGPULayers: *numGPULayers,
|
||||||
MainGPU: *mainGPU,
|
MainGPU: *mainGPU,
|
||||||
@@ -956,7 +865,14 @@ func Execute(args []string) error {
|
|||||||
FlashAttention: *flashAttention,
|
FlashAttention: *flashAttention,
|
||||||
}
|
}
|
||||||
|
|
||||||
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
|
server.ready.Add(1)
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
|
||||||
|
|
||||||
|
server.cond = sync.NewCond(&server.mu)
|
||||||
|
|
||||||
go server.run(ctx)
|
go server.run(ctx)
|
||||||
|
|
||||||
addr := "127.0.0.1:" + strconv.Itoa(*port)
|
addr := "127.0.0.1:" + strconv.Itoa(*port)
|
||||||
|
|||||||
218
runner/ollamarunner/runner_test.go
Normal file
218
runner/ollamarunner/runner_test.go
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
package ollamarunner
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/fs"
|
||||||
|
"github.com/ollama/ollama/ml"
|
||||||
|
"github.com/ollama/ollama/model"
|
||||||
|
"github.com/ollama/ollama/model/input"
|
||||||
|
"github.com/ollama/ollama/sample"
|
||||||
|
"golang.org/x/sync/semaphore"
|
||||||
|
)
|
||||||
|
|
||||||
|
// testBackend implements ml.Backend with minimal functionality required for tests.
|
||||||
|
type testBackend struct{}
|
||||||
|
|
||||||
|
func (b *testBackend) Config() fs.Config { return testConfig{} }
|
||||||
|
func (b *testBackend) Get(string) ml.Tensor { return nil }
|
||||||
|
func (b *testBackend) NewContext() ml.Context { return &testContext{} }
|
||||||
|
func (b *testBackend) NewContextSize(int) ml.Context { return &testContext{} }
|
||||||
|
|
||||||
|
// testConfig is a stub implementation of fs.Config used by testBackend.
|
||||||
|
type testConfig struct{}
|
||||||
|
|
||||||
|
func (testConfig) Architecture() string { return "" }
|
||||||
|
func (testConfig) String(string, ...string) string { return "" }
|
||||||
|
func (testConfig) Uint(string, ...uint32) uint32 { return 0 }
|
||||||
|
func (testConfig) Float(string, ...float32) float32 { return 0 }
|
||||||
|
func (testConfig) Bool(string, ...bool) bool { return false }
|
||||||
|
func (testConfig) Strings(string, ...[]string) []string { return nil }
|
||||||
|
func (testConfig) Ints(string, ...[]int32) []int32 { return nil }
|
||||||
|
func (testConfig) Floats(string, ...[]float32) []float32 { return nil }
|
||||||
|
|
||||||
|
type testContext struct{}
|
||||||
|
|
||||||
|
func (c *testContext) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
|
sz := 1
|
||||||
|
for _, s := range shape {
|
||||||
|
sz *= s
|
||||||
|
}
|
||||||
|
return &testTensor{dtype: dtype, data: make([]float32, sz), shape: shape}
|
||||||
|
}
|
||||||
|
func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor { return c.Empty(dtype, shape...) }
|
||||||
|
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||||
|
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||||
|
copy(t.data, s)
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||||
|
f := make([]float32, len(s))
|
||||||
|
for i, v := range s {
|
||||||
|
f[i] = float32(v)
|
||||||
|
}
|
||||||
|
out, _ := c.FromFloatSlice(f, shape...)
|
||||||
|
out.(*testTensor).dtype = ml.DTypeI32
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
|
return c.Empty(dtype, int((stop-start)/step))
|
||||||
|
}
|
||||||
|
func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
||||||
|
func (c *testContext) Compute(...ml.Tensor) {}
|
||||||
|
func (c *testContext) Reserve() error { return nil }
|
||||||
|
func (c *testContext) MaxGraphNodes() int { return 0 }
|
||||||
|
func (c *testContext) Close() {}
|
||||||
|
func (c *testContext) Input() ml.Context { return c }
|
||||||
|
func (c *testContext) Layer(int) ml.Context { return c }
|
||||||
|
|
||||||
|
type testTensor struct {
|
||||||
|
ml.Tensor
|
||||||
|
dtype ml.DType
|
||||||
|
data []float32
|
||||||
|
shape []int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *testTensor) Dim(n int) int { return t.shape[n] }
|
||||||
|
func (t *testTensor) Stride(n int) int { return 0 }
|
||||||
|
func (t *testTensor) Shape() []int { return t.shape }
|
||||||
|
func (t *testTensor) DType() ml.DType { return t.dtype }
|
||||||
|
func (t *testTensor) Bytes() []byte { return nil }
|
||||||
|
func (t *testTensor) Floats() []float32 {
|
||||||
|
out := make([]float32, len(t.data))
|
||||||
|
copy(out, t.data)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
func (t *testTensor) Neg(ctx ml.Context) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||||
|
out, _ := ctx.(*testContext).FromFloatSlice(nil, len(t.data))
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
func (t *testTensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor { return nil }
|
||||||
|
func (t *testTensor) Softmax(ctx ml.Context) ml.Tensor { return nil }
|
||||||
|
// LayerNorm is a stub satisfying ml.Tensor; it returns nil.
func (t *testTensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, e float32) ml.Tensor {
	return nil
}

// View allocates an empty tensor of the requested shape on the test
// context; the byte offset is ignored.
func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	return ctx.(*testContext).Empty(t.dtype, shape...)
}

// Copy writes this tensor's data into dest, which must be a *testTensor,
// and returns nil rather than the destination tensor.
func (t *testTensor) Copy(ctx ml.Context, dest ml.Tensor) ml.Tensor {
	copy(dest.(*testTensor).data, t.data)
	return nil
}
|
||||||
|
|
||||||
|
// fakeModel implements model.Model and model.TextProcessor with canned
// per-call logits so batch processing can be driven deterministically.
type fakeModel struct {
	model.Base

	decode  map[int32]string // token id -> decoded text fragment
	logits  [][]float32      // logits returned by successive Forward calls
	call    int              // number of Forward invocations so far
	backend ml.Backend       // lazily created by Backend()
}
|
||||||
|
|
||||||
|
func (f *fakeModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||||
|
idx := f.call
|
||||||
|
if idx >= len(f.logits) {
|
||||||
|
idx = len(f.logits) - 1
|
||||||
|
}
|
||||||
|
f.call++
|
||||||
|
return ctx.FromFloatSlice(f.logits[idx], len(f.logits[idx]))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeModel) Backend() ml.Backend {
|
||||||
|
if f.backend == nil {
|
||||||
|
f.backend = &testBackend{}
|
||||||
|
}
|
||||||
|
return f.backend
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeModel) Encode(string, bool) ([]int32, error) { return nil, nil }
|
||||||
|
func (f *fakeModel) Decode(ids []int32) (string, error) {
|
||||||
|
var s string
|
||||||
|
for _, id := range ids {
|
||||||
|
s += f.decode[id]
|
||||||
|
}
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
func (f *fakeModel) Is(id int32, sp model.Special) bool { return false }
|
||||||
|
func (f *fakeModel) Vocabulary() *model.Vocabulary { return &model.Vocabulary{} }
|
||||||
|
|
||||||
|
var _ model.Model = (*fakeModel)(nil)
|
||||||
|
var _ model.TextProcessor = (*fakeModel)(nil)
|
||||||
|
|
||||||
|
func TestProcessBatchUnicode(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
decode map[int32]string
|
||||||
|
logits [][]float32
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "emoji",
|
||||||
|
decode: map[int32]string{0: "A", 1: "😀", 2: "👍", 3: "!"},
|
||||||
|
logits: [][]float32{{10, 0, 0, 0}, {0, 10, 0, 0}, {0, 0, 10, 0}, {0, 0, 0, 10}},
|
||||||
|
want: "A😀👍!",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ascii",
|
||||||
|
decode: map[int32]string{0: "H", 1: "e", 2: "y"},
|
||||||
|
logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
|
||||||
|
want: "Hey",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multibyte",
|
||||||
|
decode: map[int32]string{0: "世", 1: "界", 2: "😊"},
|
||||||
|
logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
|
||||||
|
want: "世界😊",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
m := &fakeModel{decode: tt.decode, logits: tt.logits}
|
||||||
|
|
||||||
|
s := &Server{model: m, batchSize: 1, parallel: 1}
|
||||||
|
s.cache = &InputCache{enabled: true, slots: []InputCacheSlot{{Id: 0}}, numCtx: 10}
|
||||||
|
s.seqs = make([]*Sequence, 1)
|
||||||
|
s.seqsSem = semaphore.NewWeighted(1)
|
||||||
|
if err := s.seqsSem.Acquire(context.Background(), 1); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
s.cond = sync.NewCond(&s.mu)
|
||||||
|
|
||||||
|
seq := &Sequence{
|
||||||
|
inputs: []input.Input{{Token: 0}},
|
||||||
|
cache: &s.cache.slots[0],
|
||||||
|
responses: make(chan string, 10),
|
||||||
|
quit: make(chan bool, 1),
|
||||||
|
numPredict: len(tt.logits),
|
||||||
|
sampler: sample.NewSampler(0, 0, 0, 0, 0, nil),
|
||||||
|
embedding: make(chan []float32, 1),
|
||||||
|
}
|
||||||
|
s.seqs[0] = seq
|
||||||
|
|
||||||
|
for {
|
||||||
|
if err := s.processBatch(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if s.seqs[0] == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var result string
|
||||||
|
for r := range seq.responses {
|
||||||
|
result += r
|
||||||
|
}
|
||||||
|
|
||||||
|
if result != tt.want {
|
||||||
|
t.Fatalf("got %q want %q", result, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa
|
|||||||
vocabIds[i] = uint32(i)
|
vocabIds[i] = uint32(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS)
|
grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)})
|
||||||
if grammar == nil {
|
if grammar == nil {
|
||||||
return nil, errors.New("sample: failed to initialize grammar")
|
return nil, errors.New("sample: failed to initialize grammar")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
|
|||||||
}
|
}
|
||||||
defer bin.Close()
|
defer bin.Close()
|
||||||
|
|
||||||
f, err := ggml.Decode(bin, -1)
|
f, _, err := ggml.Decode(bin, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -430,7 +430,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
|
|||||||
fnWrap := func(n uint64) {
|
fnWrap := func(n uint64) {
|
||||||
done := doneBytes.Add(n)
|
done := doneBytes.Add(n)
|
||||||
progress := float32(done) / float32(totalBytes)
|
progress := float32(done) / float32(totalBytes)
|
||||||
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
|
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
|
||||||
}
|
}
|
||||||
ftype, err := ggml.ParseFileType(quantizeType)
|
ftype, err := ggml.ParseFileType(quantizeType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -467,7 +467,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
f, err := ggml.Decode(temp, 1024)
|
f, _, err := ggml.Decode(temp, 1024)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
|
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -501,26 +501,47 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
|
|||||||
return nil, errOnlyGGUFSupported
|
return nil, errOnlyGGUFSupported
|
||||||
}
|
}
|
||||||
|
|
||||||
f, err := ggml.Decode(blob, -1)
|
stat, err := blob.Stat()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
mediatype := "application/vnd.ollama.image.model"
|
var offset int64
|
||||||
if f.KV().Kind() == "adapter" {
|
for offset < stat.Size() {
|
||||||
mediatype = "application/vnd.ollama.image.adapter"
|
f, n, err := ggml.Decode(blob, 1024)
|
||||||
} else if (f.KV().Uint("block_count") == 0 && f.KV().Uint("vision.block_count") > 0) || f.KV().Kind() == "projector" {
|
if errors.Is(err, io.EOF) {
|
||||||
// if a model has vision.block_count but not block_count, it is a standalone vision model
|
break
|
||||||
mediatype = "application/vnd.ollama.image.projector"
|
} else if err != nil {
|
||||||
}
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
layer, err := NewLayerFromLayer(digest, mediatype, blob.Name())
|
mediatype := "application/vnd.ollama.image.model"
|
||||||
if err != nil {
|
if f.KV().Kind() == "adapter" {
|
||||||
slog.Debug("could not create new layer from layer", "error", err)
|
mediatype = "application/vnd.ollama.image.adapter"
|
||||||
return nil, err
|
} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
|
||||||
}
|
mediatype = "application/vnd.ollama.image.projector"
|
||||||
|
}
|
||||||
|
|
||||||
layers = append(layers, &layerGGML{layer, f})
|
var layer Layer
|
||||||
|
if digest != "" && n == stat.Size() && offset == 0 {
|
||||||
|
layer, err = NewLayerFromLayer(digest, mediatype, blob.Name())
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug("could not create new layer from layer", "error", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
|
||||||
|
if layer.Digest == "" {
|
||||||
|
layer, err = NewLayer(io.NewSectionReader(blob, offset, n), mediatype)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
layers = append(layers, &layerGGML{layer, f})
|
||||||
|
offset = n
|
||||||
|
}
|
||||||
|
|
||||||
return detectChatTemplate(layers)
|
return detectChatTemplate(layers)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
f, err := ggml.Decode(r, 1024)
|
f, _, err := ggml.Decode(r, 1024)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
|
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
|
||||||
capabilities = append(capabilities, model.CapabilityEmbedding)
|
capabilities = append(capabilities, model.CapabilityEmbedding)
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
|||||||
}
|
}
|
||||||
defer blob.Close()
|
defer blob.Close()
|
||||||
|
|
||||||
f, err := ggml.Decode(blob, -1)
|
f, _, err := ggml.Decode(blob, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -120,30 +120,14 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
|
|||||||
|
|
||||||
if newType.IsQuantized() {
|
if newType.IsQuantized() {
|
||||||
nx := shape[0]
|
nx := shape[0]
|
||||||
|
ny := uint64(1)
|
||||||
|
if len(shape) > 1 {
|
||||||
|
ny = shape[1]
|
||||||
|
}
|
||||||
qk_k := newType.BlockSize()
|
qk_k := newType.BlockSize()
|
||||||
|
|
||||||
// Check if first dimension is divisible by block size
|
|
||||||
if nx%qk_k != 0 {
|
if nx%qk_k != 0 {
|
||||||
// Store the original type for logging
|
slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
|
||||||
originalType := newType
|
newType = fsggml.TensorTypeF16
|
||||||
|
|
||||||
// Select appropriate fallback based on original type
|
|
||||||
switch newType {
|
|
||||||
case fsggml.TensorTypeQ4_K:
|
|
||||||
newType = fsggml.TensorTypeQ5_0
|
|
||||||
case fsggml.TensorTypeQ5_K:
|
|
||||||
newType = fsggml.TensorTypeQ5_1
|
|
||||||
case fsggml.TensorTypeQ6_K:
|
|
||||||
newType = fsggml.TensorTypeQ8_0
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final check - if still incompatible, fall back to F16
|
|
||||||
if nx%newType.BlockSize() != 0 {
|
|
||||||
newType = fsggml.TensorTypeF16
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
|
|
||||||
nx, qk_k, originalType.String(), newType.String()))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return newType
|
return newType
|
||||||
|
|||||||
@@ -271,7 +271,7 @@ func TestQuantizeModel(t *testing.T) {
|
|||||||
t.Fatal(err.Error())
|
t.Fatal(err.Error())
|
||||||
}
|
}
|
||||||
defer fp.Close()
|
defer fp.Close()
|
||||||
meta, err := fsggml.Decode(fp, -1)
|
meta, _, err := fsggml.Decode(fp, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err.Error())
|
t.Fatal(err.Error())
|
||||||
}
|
}
|
||||||
@@ -303,7 +303,7 @@ func TestQuantizeModel(t *testing.T) {
|
|||||||
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
||||||
}
|
}
|
||||||
defer fpNew.Close()
|
defer fpNew.Close()
|
||||||
newMeta, err := fsggml.Decode(fpNew, -1)
|
newMeta, _, err := fsggml.Decode(fpNew, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -387,17 +387,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
|
|||||||
s.loadedMu.Unlock()
|
s.loadedMu.Unlock()
|
||||||
runner.refMu.Unlock()
|
runner.refMu.Unlock()
|
||||||
slog.Debug("duplicate expired event, ignoring", "runner", runner)
|
slog.Debug("duplicate expired event, ignoring", "runner", runner)
|
||||||
} else if runner.pid != runnerToUnload.pid {
|
|
||||||
// If the pids do not match, we likely had multiple load
|
|
||||||
// failures for the same model in quick succession due to
|
|
||||||
// request context canceled and are draining the queue of
|
|
||||||
// events. Ensure the orphaned runner is properly shut down, but
|
|
||||||
// do not delete the mismatched loaded runner, or wait for VRAM
|
|
||||||
// convergence.
|
|
||||||
slog.Debug("orphaned runner shutting down", "orphan", runner, "loaded", runnerToUnload)
|
|
||||||
runner.unload()
|
|
||||||
s.loadedMu.Unlock()
|
|
||||||
runner.refMu.Unlock()
|
|
||||||
} else {
|
} else {
|
||||||
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
|
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
|
||||||
finished := runner.waitForVRAMRecovery()
|
finished := runner.waitForVRAMRecovery()
|
||||||
|
|||||||
Reference in New Issue
Block a user