From efdd9b76da5a6b7bfb2fc3a2ee5034ce98808abe Mon Sep 17 00:00:00 2001 From: cvrunmin Date: Thu, 20 Nov 2025 16:43:47 +0800 Subject: [PATCH] gguf: add split gguf loading --- discover/runner.go | 1 + fs/ggml/ggml.go | 179 +++++++++++++++++++++++- fs/ggml/gguf.go | 8 +- go.mod | 2 +- llama/llama.go | 14 +- llm/server.go | 86 ++++++++---- ml/backend.go | 8 +- ml/backend/ggml/ggml.go | 50 +++++-- ml/backend/ggml/ggml_test.go | 2 +- model/model.go | 4 +- runner/llamarunner/runner.go | 21 +-- runner/ollamarunner/runner.go | 23 +-- server/images.go | 43 ++++-- server/routes.go | 8 +- server/routes_debug_test.go | 4 +- server/routes_generate_renderer_test.go | 4 +- server/routes_generate_test.go | 14 +- server/routes_harmony_streaming_test.go | 6 +- server/sched.go | 10 +- server/sched_test.go | 16 +-- 20 files changed, 386 insertions(+), 117 deletions(-) diff --git a/discover/runner.go b/discover/runner.go index 34a9364ec..71ceb6722 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -426,6 +426,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map cmd, port, err := llm.StartRunner( true, // ollama engine "", // no model + make([]string, 0), ollamaLibDirs, out, extraEnvs, diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 6ce9724f2..892b1bcfc 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -7,8 +7,10 @@ import ( "fmt" "io" "log/slog" + "maps" "math" "slices" + "sort" "strings" "github.com/ollama/ollama/format" @@ -26,6 +28,18 @@ type model interface { Tensors() Tensors } +type MetaGGML struct { + Shards []GGML + ShardPaths []string + Tensors ForeignTensors + kv KV +} + +type GGUFSplitInfo struct { + no uint32 + count uint32 +} + type KV map[string]any func (kv KV) Architecture() string { @@ -49,6 +63,17 @@ func (kv KV) FileType() FileType { return FileTypeUnknown } +func (kv KV) GGUFSplitInfo() *GGUFSplitInfo { + no := kv.Uint("split.no", 0xffffffff) + if no == 0xffffffff { + return nil + } + return &GGUFSplitInfo{ + no: no, + count: kv.Uint("split.count"), + } +} + func (kv KV) BlockCount() uint64 { return uint64(kv.Uint("block_count")) } @@ -268,7 +293,7 @@ type arrayValueTypes interface { } func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) { - if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") { + if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") { key = kv.Architecture() + "." 
+ key } @@ -285,6 +310,14 @@ type Tensors struct { Offset uint64 } +type ForeignTensor struct { + *Tensor + ModelPath string + TensorRegionOffset uint64 +} + +type ForeignTensors []ForeignTensor + func (s Tensors) Items(prefix ...string) []*Tensor { if len(prefix) == 0 { return s.items @@ -323,6 +356,41 @@ func (ts Tensors) GroupLayers() map[string]Layer { return layers } +func (s ForeignTensors) Items(prefix ...string) []*Tensor { + var items []*Tensor + for i := range s { + if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) { + items = append(items, s[i].Tensor) + } + } + + return items +} + +func (ts ForeignTensors) GroupLayers() map[string]Layer { + layers := make(map[string]Layer) + for i := range ts { + t := ts[i].Tensor + parts := strings.Split(t.Name, ".") + if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 { + if len(parts) > index+2 { + // blk and mm should have a number after them, join it + parts = append( + []string{strings.Join(parts[:index+2], ".")}, + parts[index+2:]...) + } + } + + if _, ok := layers[parts[0]]; !ok { + layers[parts[0]] = make(Layer) + } + + layers[parts[0]][strings.Join(parts[1:], ".")] = t + } + + return layers +} + type Layer map[string]*Tensor func (l Layer) Size() (size uint64) { @@ -550,7 +618,89 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) { }, nil } +func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) { + if len(shards) != len(shardsPaths) { + return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths)) + } + li := make(ForeignTensors, 0) + for i := range shards { + gs := shards[i] + tensors := gs.Tensors() + for k := range tensors.items { + tensor := tensors.items[k] + li = append(li, ForeignTensor{ + Tensor: tensor, + ModelPath: shardsPaths[i], + TensorRegionOffset: tensors.Offset, + }) + } + } + return &li, nil +} + +func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML { + type wrapper struct { + ggml GGML + path string + weight int64 + } + var wrappers []wrapper + for i := range ggmls { + iSplitInfo := ggmls[i].KV().GGUFSplitInfo() + var weight int64 = 0 + if iSplitInfo == nil { + weight = -1 + } else { + weight = int64((*iSplitInfo).no) + } + wrappers = append(wrappers, wrapper{ + ggml: ggmls[i], + path: ggmlPaths[i], + weight: weight, + }) + } + sort.SliceStable(wrappers, func(i, j int) bool { + return wrappers[i].weight < wrappers[j].weight + }) + metaGgml := MetaGGML{} + for i := range wrappers { + if i == 0 { + kv := maps.Clone(wrappers[i].ggml.KV()) + // remove the keys contained in split gguf files. add more if needed. 
+ delete(kv, "split.no") + delete(kv, "split.count") + delete(kv, "split.tensors.count") + metaGgml.kv = kv + } + metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml) + metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path) + } + ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths) + metaGgml.Tensors = *ft + return metaGgml +} + +func simpleWrapGGML(ggml GGML) MetaGGML { + // simply wrap a single GGML without creating foreign tensors + return MetaGGML{ + Shards: []GGML{ggml}, + ShardPaths: []string{""}, + kv: ggml.KV(), + } +} + +func WrapGGML(ggml GGML) MetaGGML { + metaggml := simpleWrapGGML(ggml) + ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths) + metaggml.Tensors = *ft + return metaggml +} + func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) { + return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention) +} + +func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) { context *= uint64(numParallel) embedding := f.KV().EmbeddingLength() @@ -564,7 +714,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri embeddingHeadsK := f.KV().EmbeddingHeadCountK() embeddingHeadsV := f.KV().EmbeddingHeadCountV() - layers := f.Tensors().GroupLayers() + layers := f.Tensors.GroupLayers() bytesPerElement := kvCacheBytesPerElement(kvCacheType) @@ -662,7 +812,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri ) var ropeFreqsCount uint64 - if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok { + if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok { if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { ropeFreqsCount = ropeFreqsWeights.Elements() } @@ -802,6 +952,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { + return simpleWrapGGML(f).SupportsKVCacheType(cacheType) +} +func (f MetaGGML) SupportsKVCacheType(cacheType string) bool { if cacheType == "" || cacheType == "f16" { return true } @@ -811,6 +964,10 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool { // SupportsFlashAttention checks if the model supports flash attention func (f GGML) SupportsFlashAttention() bool { + return simpleWrapGGML(f).SupportsFlashAttention() +} + +func (f MetaGGML) SupportsFlashAttention() bool { _, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())] if isEmbedding { return false } @@ -828,6 +985,10 @@ func (f GGML) SupportsFlashAttention() bool { // FlashAttention checks if the model should enable flash attention func (f GGML) FlashAttention() bool { + return simpleWrapGGML(f).FlashAttention() +} + +func (f MetaGGML) FlashAttention() bool { return slices.Contains([]string{ "gemma3", "gptoss", "gpt-oss", @@ -849,3 +1010,15 @@ func kvCacheBytesPerElement(cacheType string) float64 { return 2 // f16 (default) } } + +func (f MetaGGML) KV() KV { + return f.kv +} + +func (f MetaGGML) TotalTensorBytes() uint64 { + totalBytes := uint64(0) + for i := range f.Shards { + totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset + } + return totalBytes +} diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index b694deadb..3081a2def 100644 ---
a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -138,7 +138,7 @@ func (llm *gguf) numKV() uint64 { } } -func (llm *gguf) Decode(rs io.ReadSeeker) error { +func (llm *gguf) Decode(rs io.ReadSeeker, mainKV ...KV) error { // decode key-values for i := 0; uint64(i) < llm.numKV(); i++ { k, err := readGGUFString(llm, rs) @@ -235,7 +235,11 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { // patch KV with parameter count llm.kv["general.parameter_count"] = llm.parameters - alignment := llm.kv.Uint("general.alignment", 32) + alignment := llm.kv.Uint("general.alignment", 0xffffffff) + if alignment == 0xffffffff { + // try to get alignment from main shard instead. + alignment = append(mainKV, make(KV))[0].Uint("general.alignment", 32) + } offset, err := rs.Seek(0, io.SeekCurrent) if err != nil { diff --git a/go.mod b/go.mod index 6e7dc1d6d..400db2504 100644 --- a/go.mod +++ b/go.mod @@ -72,7 +72,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.5 github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect golang.org/x/arch v0.8.0 // indirect diff --git a/llama/llama.go b/llama/llama.go index 582d4128c..49acb8cb3 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -256,7 +256,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool { return true } -func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { +func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) { cparams := C.llama_model_default_params() cparams.n_gpu_layers = C.int(params.NumGpuLayers) cparams.main_gpu = C.int32_t(params.MainGpu) @@ -300,7 +300,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { cparams.progress_callback_user_data = unsafe.Pointer(&handle) } - m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)} + var splitPaths []*C.char + mp := C.CString(modelPath) + defer C.free(unsafe.Pointer(mp)) + splitPaths = append(splitPaths, mp) + for i := range extraModelPaths { + mp := C.CString(extraModelPaths[i]) + defer C.free(unsafe.Pointer(mp)) + splitPaths = append(splitPaths, mp) + } + + m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)} if m.c == nil { return nil, fmt.Errorf("unable to load model: %s", modelPath) } diff --git a/llm/server.go b/llm/server.go index 4eaa88df0..e3decfc72 100644 --- a/llm/server.go +++ b/llm/server.go @@ -84,12 +84,13 @@ type LlamaServer interface { // llmServer is an instance of a runner hosting a single model type llmServer struct { - port int - cmd *exec.Cmd - done chan error // Channel to signal when the process exits - status *StatusWriter - options api.Options - modelPath string + port int + cmd *exec.Cmd + done chan error // Channel to signal when the process exits + status *StatusWriter + options api.Options + modelPath string + extraModelPaths []string loadRequest LoadRequest // Parameters used to initialize the runner mem *ml.BackendMemory // Memory allocations for this model @@ -109,7 +110,7 @@ type llmServer struct { type llamaServer struct { llmServer - ggml *ggml.GGML + ggml *ggml.MetaGGML } type ollamaServer struct { @@ -123,7 +124,7 @@ type ollamaServer struct { // It collects array values for arrays with a size less than or equal to // 
maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If // the maxArraySize is negative, all arrays are collected. -func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { +func LoadModel(model string, extraModels []string, maxArraySize int) (*ggml.MetaGGML, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -134,12 +135,36 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { } defer f.Close() - ggml, err := ggml.Decode(f, maxArraySize) - return ggml, err + ggml1, err := ggml.Decode(f, maxArraySize) + if err != nil { + return nil, err + } + if ggml1.KV().GGUFSplitInfo() != nil { + loadedGgml := []ggml.GGML{*ggml1} + for i := range extraModels { + extraModel := extraModels[i] + f, err := os.Open(extraModel) + if err != nil { + return nil, err + } + defer f.Close() + + ggml1, err := ggml.Decode(f, maxArraySize) + if err != nil { + return nil, err + } + loadedGgml = append(loadedGgml, *ggml1) + } + metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...)) + return &metaggml, nil + } else { + metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model}) + return &metaggml, nil + } } // NewLlamaServer will run a server for the given GPUs -func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { +func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var llamaModel *llama.Model var textProcessor model.TextProcessor var err error @@ -155,7 +180,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st } } if textProcessor == nil { - llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true}) + llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true}) if err != nil { return nil, err } @@ -225,24 +250,26 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st cmd, port, err := StartRunner( textProcessor != nil, modelPath, + extraModelPaths, gpuLibs, status, ml.GetVisibleDevicesEnv(gpus), ) s := llmServer{ - port: port, - cmd: cmd, - status: status, - options: opts, - modelPath: modelPath, - loadRequest: loadRequest, - llamaModel: llamaModel, - llamaModelLock: &sync.Mutex{}, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: f.KV().BlockCount() + 1, - loadStart: time.Now(), - done: make(chan error, 1), + port: port, + cmd: cmd, + status: status, + options: opts, + modelPath: modelPath, + extraModelPaths: extraModelPaths, + loadRequest: loadRequest, + llamaModel: llamaModel, + llamaModelLock: &sync.Mutex{}, + sem: semaphore.NewWeighted(int64(numParallel)), + totalLayers: f.KV().BlockCount() + 1, + loadStart: time.Now(), + done: make(chan error, 1), } if err != nil { @@ -279,7 +306,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st } } -func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) { +func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) { var exe string exe, err = os.Executable() if err != nil { @@ 
-309,6 +336,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W if modelPath != "" { params = append(params, "--model", modelPath) } + for i := range extraModelPaths { + params = append(params, "--model", extraModelPaths[i]) + } params = append(params, "--port", strconv.Itoa(port)) var pathEnv string @@ -403,6 +433,10 @@ func (s *llmServer) ModelPath() string { return s.modelPath } +func (s *llmServer) ExtraModelPaths() []string { + return s.extraModelPaths +} + type LoadOperation int // The order of these constants are significant because we iterate over the operations. They @@ -478,7 +512,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention) // Use the size of one layer as a buffer - layers := s.ggml.Tensors().GroupLayers() + layers := s.ggml.Tensors.GroupLayers() if blk0, ok := layers["blk.0"]; ok { for i := range gpus { gpus[i].FreeMemory -= blk0.Size() + kv[0] diff --git a/ml/backend.go b/ml/backend.go index 4d930fe43..0c7f83320 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -77,9 +77,9 @@ type BackendParams struct { FlashAttention bool } -var backends = make(map[string]func(string, BackendParams) (Backend, error)) +var backends = make(map[string]func(string, []string, BackendParams) (Backend, error)) -func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) { +func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) { if _, ok := backends[name]; ok { panic("backend: backend already registered") } @@ -87,9 +87,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error) backends[name] = f } -func NewBackend(modelPath string, params BackendParams) (Backend, error) { +func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) { if backend, ok := backends["ggml"]; ok { - return backend(modelPath, params) + return backend(modelPath, extraModelPaths, params) } return nil, fmt.Errorf("unsupported backend") diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 520d95cb0..33f683dfd 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -77,7 +77,7 @@ type Backend struct { // modelPath is the location of the model data modelPath string - meta *fsggml.GGML + meta *fsggml.MetaGGML // allocMemory means that memory should be allocated for tensors and not // just a dry run @@ -120,17 +120,38 @@ type Backend struct { var once sync.Once -func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { +func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) { r, err := os.Open(modelPath) if err != nil { return nil, err } defer r.Close() - meta, err := fsggml.Decode(r, -1) + smallmeta, err := fsggml.Decode(r, -1) if err != nil { return nil, err } + var meta fsggml.MetaGGML + if smallmeta.KV().GGUFSplitInfo() != nil { + loadedGgml := []fsggml.GGML{*smallmeta} + for i := range extraModelPaths { + extraModel := extraModelPaths[i] + f, err := os.Open(extraModel) + if err != nil { + return nil, err + } + defer f.Close() + + smallmeta, err := fsggml.Decode(f, -1) + if err != nil { + return nil, err + } + loadedGgml = append(loadedGgml, *smallmeta) + } + meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...)) + } else { + meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath}) + } once.Do(func() { slog.Info( 
@@ -139,7 +160,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { "file_type", meta.KV().FileType(), "name", meta.KV().String("general.name"), "description", meta.KV().String("general.description"), - "num_tensors", len(meta.Tensors().Items()), + "num_tensors", len(meta.Tensors.Items()), "num_key_values", len(meta.KV()), ) }) @@ -227,7 +248,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { // outputs are assigned iff allowed by splits and configured number of gpu layers output := assignLayer(blocks) - maxTensors := len(meta.Tensors().Items()) + maxTensors := len(meta.Tensors.Items()) maxTensors += 1 // each layer has at most 2 extra tensors for rope operations maxTensors += blocks * 2 @@ -303,11 +324,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { return false } - for _, t := range meta.Tensors().Items() { + for _, t := range meta.Tensors.Items() { switch { case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"): createTensor(tensor{source: t}, input.bts, -1) - if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { + if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks) } case contains(t.Name, "cls", "output", "output_norm", @@ -378,7 +399,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } } - maxGraphNodes := max(1024, len(meta.Tensors().Items())*8) + maxGraphNodes := max(1024, len(meta.Tensors.Items())*8) sched := C.ggml_backend_sched_new_ext( (*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])), @@ -423,7 +444,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { modelPath: modelPath, allocMemory: params.AllocMemory, flashAttention: params.FlashAttention, - meta: meta, + meta: &meta, tensorLoadTargets: targets, tensors: tensors, sched: sched, @@ -494,11 +515,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error { slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1)) var doneBytes atomic.Uint64 - totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset + totalBytes := b.meta.TotalTensorBytes() g, ctx := errgroup.WithContext(ctx) g.SetLimit(runtime.GOMAXPROCS(0)) - for _, t := range b.meta.Tensors().Items() { + for i := range b.meta.Tensors { + t := b.meta.Tensors[i] g.Go(func() error { tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name]))) for i := range tts { @@ -517,13 +539,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error { // Create a new FD for each goroutine so that each FD is read sequentially, rather than // seeking around within an FD shared between all goroutines. 
- file, err := os.Open(b.modelPath) + file, err := os.Open(t.ModelPath) if err != nil { - slog.Warn("file open error", "file", b.modelPath, "error", err) + slog.Warn("file open error", "file", t.ModelPath, "error", err) return err } defer file.Close() - sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size())) + sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size())) if t.Kind == 4 && tts[0]._type == 39 { // source is mxfp4, target is ggml mxfp4 diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go index efd3a455c..0d568dda2 100644 --- a/ml/backend/ggml/ggml_test.go +++ b/ml/backend/ggml/ggml_test.go @@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context { tb.Fatal(err) } - b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true}) + b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true}) if err != nil { tb.Fatal(err) } diff --git a/model/model.go b/model/model.go index 0af16da80..d45e03111 100644 --- a/model/model.go +++ b/model/model.go @@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) { } // New initializes a new model instance with the provided configuration based on the metadata in the model file -func New(modelPath string, params ml.BackendParams) (Model, error) { - b, err := ml.NewBackend(modelPath, params) +func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) { + b, err := ml.NewBackend(modelPath, extraModelPaths, params) if err != nil { return nil, err } diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index a23ddd61a..5aaaf5eb0 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -4,7 +4,6 @@ import ( "context" "encoding/json" "errors" - "flag" "fmt" "log" "log/slog" @@ -19,6 +18,7 @@ import ( "time" "unicode/utf8" + "github.com/spf13/pflag" "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" @@ -256,6 +256,8 @@ type Server struct { // modelPath is the location of the model to be loaded modelPath string + extraModelPaths []string + // loadMu prevents more than one load attempt from occurring at a time loadMu sync.Mutex @@ -827,6 +829,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { func (s *Server) loadModel( params llama.ModelParams, mpath string, + empath []string, lpath []string, ppath string, kvSize int, @@ -836,7 +839,7 @@ func (s *Server) loadModel( multiUserCache bool, ) { var err error - s.model, err = llama.LoadModelFromFile(mpath, params) + s.model, err = llama.LoadModelFromFile(mpath, empath, params) if err != nil { panic(err) } @@ -929,7 +932,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) { } s.status = llm.ServerStatusLoadingModel - go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache) + go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache) case llm.LoadOperationClose: // No-op for us @@ -947,13 +950,14 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) { } func Execute(args []string) error { - fs := flag.NewFlagSet("runner", flag.ExitOnError) - mpath := fs.String("model", "", "Path to model binary file") + fs := pflag.NewFlagSet("runner", pflag.ExitOnError) + mpath := fs.StringArray("model", []string{""}, "Path to model binary file. 
May be specified repeatedly to provide the remaining splits of the model binary.") port := fs.Int("port", 8080, "Port to expose the server on") _ = fs.Bool("verbose", false, "verbose output (default: disabled)") fs.Usage = func() { - fmt.Fprintf(fs.Output(), "Runner usage\n") + // pflag does not expose out(), so fall back to os.Stderr, which behaves the same since fs.output is never set + fmt.Fprintf(os.Stderr, "Runner usage\n") fs.PrintDefaults() } if err := fs.Parse(args); err != nil { @@ -965,8 +969,9 @@ func Execute(args []string) error { llama.BackendInit() server := &Server{ - modelPath: *mpath, - status: llm.ServerStatusLaunched, + modelPath: (*mpath)[0], + extraModelPaths: (*mpath)[1:], + status: llm.ServerStatusLaunched, } server.ready.Add(1) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 153390868..ad810ca5d 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -5,7 +5,6 @@ import ( "context" "encoding/json" "errors" - "flag" "fmt" "hash/maphash" "image" @@ -23,6 +22,7 @@ import ( "time" "unicode/utf8" + "github.com/spf13/pflag" "golang.org/x/image/bmp" "golang.org/x/sync/semaphore" @@ -331,6 +331,8 @@ type Server struct { // modelPath is the location of the model to be loaded modelPath string + extraModelPaths []string + // loadMu prevents more than one load attempt from occurring at a time loadMu sync.Mutex @@ -1168,6 +1170,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error { // based on the given parameters func (s *Server) allocModel( mpath string, + empath []string, params ml.BackendParams, loraPath []string, parallel int, @@ -1192,7 +1195,7 @@ func (s *Server) allocModel( }() var err error - s.model, err = model.New(mpath, params) + s.model, err = model.New(mpath, empath, params) if err != nil { return err } @@ -1295,7 +1298,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) { s.batchSize = req.BatchSize - err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache) + err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache) if err != nil { s.closeModel() @@ -1365,7 +1368,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) { return } - m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}}) + m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}}) if err != nil { http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError) return @@ -1382,13 +1385,14 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) { } func Execute(args []string) error { - fs := flag.NewFlagSet("runner", flag.ExitOnError) - mpath := fs.String("model", "", "Path to model binary file") + fs := pflag.NewFlagSet("runner", pflag.ExitOnError) + mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.") port := fs.Int("port", 8080, "Port to expose the server on") _ = fs.Bool("verbose", false, "verbose output (default: disabled)") fs.Usage = func() { - fmt.Fprintf(fs.Output(), "Runner usage\n") + // pflag does not expose out(). 
Fallback to os.Stderr which should perform identically as we don't set fs.output + fmt.Fprintf(os.Stderr, "Runner usage\n") fs.PrintDefaults() } if err := fs.Parse(args); err != nil { @@ -1401,8 +1405,9 @@ func Execute(args []string) error { defer cancel() server := &Server{ - modelPath: *mpath, - status: llm.ServerStatusLaunched, + modelPath: (*mpath)[0], + extraModelPaths: (*mpath)[1:], + status: llm.ServerStatusLaunched, } server.cond = sync.NewCond(&server.mu) diff --git a/server/images.go b/server/images.go index d3bd9ffaf..e4fbd8897 100644 --- a/server/images.go +++ b/server/images.go @@ -53,18 +53,19 @@ type registryOptions struct { } type Model struct { - Name string `json:"name"` - Config ConfigV2 - ShortName string - ModelPath string - ParentModel string - AdapterPaths []string - ProjectorPaths []string - System string - License []string - Digest string - Options map[string]any - Messages []api.Message + Name string `json:"name"` + Config ConfigV2 + ShortName string + ModelPath string + ExtraModelPaths []string + ParentModel string + AdapterPaths []string + ProjectorPaths []string + System string + License []string + Digest string + Options map[string]any + Messages []api.Message Template *template.Template } @@ -190,6 +191,13 @@ func (m *Model) String() string { Args: m.ModelPath, }) + for _, extraModels := range m.ExtraModelPaths { + modelfile.Commands = append(modelfile.Commands, parser.Command{ + Name: "model", + Args: extraModels, + }) + } + for _, adapter := range m.AdapterPaths { modelfile.Commands = append(modelfile.Commands, parser.Command{ Name: "adapter", @@ -348,6 +356,8 @@ func GetModel(name string) (*Model, error) { } } + readMainModelFlag := false + for _, layer := range manifest.Layers { filename, err := GetBlobsPath(layer.Digest) if err != nil { @@ -356,8 +366,13 @@ func GetModel(name string) (*Model, error) { switch layer.MediaType { case "application/vnd.ollama.image.model": - model.ModelPath = filename - model.ParentModel = layer.From + if !readMainModelFlag { + model.ModelPath = filename + model.ParentModel = layer.From + readMainModelFlag = true + } else { + model.ExtraModelPaths = append(model.ExtraModelPaths, filename) + } case "application/vnd.ollama.image.embed": // Deprecated in versions > 0.1.2 // TODO: remove this warning in a future version diff --git a/server/routes.go b/server/routes.go index 16df3f4fc..a2b2810ca 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1182,14 +1182,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { return resp, nil } -func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) { +func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) { maxArraySize := 0 if verbose { maxArraySize = -1 } - data, err := llm.LoadModel(digest, maxArraySize) + data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize) if err != nil { - return nil, ggml.Tensors{}, err + return nil, make(ggml.ForeignTensors, 0), err } kv := data.KV() @@ -1202,7 +1202,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) { } } - return kv, data.Tensors(), nil + return kv, data.Tensors, nil } func (s *Server) ListHandler(c *gin.Context) { diff --git a/server/routes_debug_test.go b/server/routes_debug_test.go index 6f9104c39..8002c752c 100644 --- a/server/routes_debug_test.go +++ b/server/routes_debug_test.go @@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * 
time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ diff --git a/server/routes_generate_renderer_test.go b/server/routes_generate_renderer_test.go index e6473e087..06336d027 100644 --- a/server/routes_generate_renderer_test.go +++ b/server/routes_generate_renderer_test.go @@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ llama: &mock, @@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ llama: &mock, diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index a9931ea24..f5b448c46 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error return } -func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { - return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { +func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { + return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { return mock, nil } } @@ -159,7 +159,7 @@ func TestGenerateChat(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -786,7 +786,7 @@ func TestGenerate(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ 
ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -1270,7 +1270,7 @@ func TestGenerateLogprobs(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{llama: mock} return false }, @@ -1450,7 +1450,7 @@ func TestChatLogprobs(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{llama: mock} return false }, @@ -1560,7 +1560,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{llama: mock} return false diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go index 1fb41ff48..e660a3b36 100644 --- a/server/routes_harmony_streaming_test.go +++ b/server/routes_harmony_streaming_test.go @@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 100 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } @@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 100 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } @@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } diff --git a/server/sched.go b/server/sched.go index c5bc6692d..11f702d75 100644 --- a/server/sched.go +++ b/server/sched.go @@ -49,8 +49,8 @@ type Scheduler struct { activeLoading llm.LlamaServer loaded map[string]*runnerRef - loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool - newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) + loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool + 
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo getSystemInfoFn func() ml.SystemInfo waitForRecovery time.Duration @@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // Load model for fitting logutil.Trace("loading model metadata", "model", pending.model.ModelPath) - ggml, err := llm.LoadModel(pending.model.ModelPath, 1024) + ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024) if err != nil { pending.errCh <- err break @@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs // (if any). Returns whether the scheduler needs to evict a model to make this one fit. -func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool { +func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool { numParallel := max(int(envconfig.NumParallel()), 1) // Embedding models should always be loaded with parallel=1 @@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo if llama == nil { var err error - llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) + llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to diff --git a/server/sched_test.go b/server/sched_test.go index 678be954f..69546242c 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) { defer done() s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond - var f *ggml.GGML // value not used in tests + var f *ggml.MetaGGML // value not used in tests req := &LlmRequest{ ctx: ctx, model: &Model{ModelPath: "foo"}, @@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Second}, } // Fail to load model first - s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return nil, errors.New("something failed to load model blah") } gpus := []ml.DeviceInfo{} @@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) { require.Contains(t, err.Error(), "this model may be incompatible") server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} - s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + 
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { server.modelPath = model return server, nil } @@ -103,10 +103,10 @@ type reqBundle struct { ctxDone func() srv *mockLlm req *LlmRequest - f *ggml.GGML + f *ggml.MetaGGML } -func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { +func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { scenario.srv.modelPath = model return scenario.srv, nil } @@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra }) model := &Model{Name: modelName, ModelPath: p} - f, err := llm.LoadModel(model.ModelPath, 0) + f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0) if err != nil { t.Fatal(err) } @@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Minute}, } - var f *ggml.GGML + var f *ggml.MetaGGML gpus := []ml.DeviceInfo{} systemInfo := ml.SystemInfo{} server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} - s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { server.modelPath = model return server, nil }