diff --git a/discover/runner.go b/discover/runner.go index c963de6f8..e0b658237 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -441,6 +441,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map cmd, port, err := llm.StartRunner( true, // ollama engine "", // no model + make([]string, 0), ollamaLibDirs, out, extraEnvs, diff --git a/docs/api.md b/docs/api.md index 7c32c9597..7a1bd53f2 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1176,7 +1176,7 @@ Create a model from: - another model; - a safetensors directory; or -- a GGUF file. +- a GGUF file or directory. If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field. @@ -1270,6 +1270,7 @@ A stream of JSON objects is returned: #### Create a model from GGUF Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API. +For a model stored as multiple split GGUF files, include every split in the `files` parameter, each with its file name and SHA256 digest. Providing the files in split-number order is recommended, although Ollama will sort them automatically. ##### Request diff --git a/docs/import.mdx b/docs/import.mdx index b19596894..26a636564 100644 --- a/docs/import.mdx +++ b/docs/import.mdx @@ -88,6 +88,10 @@ To import a GGUF model, create a `Modelfile` containing: ```dockerfile FROM /path/to/file.gguf ``` +Or: +```dockerfile +FROM /path/to/gguf/split/directory +``` For a GGUF adapter, create the `Modelfile` with: diff --git a/docs/modelfile.mdx b/docs/modelfile.mdx index ce91bbf69..0a0ca47ec 100644 --- a/docs/modelfile.mdx +++ b/docs/modelfile.mdx @@ -12,7 +12,7 @@ A Modelfile is the blueprint to create and share customized models using Ollama. - [FROM (Required)](#from-required) - [Build from existing model](#build-from-existing-model) - [Build from a Safetensors model](#build-from-a-safetensors-model) - - [Build from a GGUF file](#build-from-a-gguf-file) + - [Build from a GGUF model](#build-from-a-gguf-model) - [PARAMETER](#parameter) - [Valid Parameters and Values](#valid-parameters-and-values) - [TEMPLATE](#template) @@ -130,7 +130,7 @@ Currently supported model architectures: - Gemma (including Gemma 1 and Gemma 2) - Phi3 -#### Build from a GGUF file +#### Build from a GGUF model ``` FROM ./ollama-model.gguf @@ -138,6 +138,14 @@ FROM ./ollama-model.gguf The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location. +For a GGUF model split into multiple files: + +``` +FROM /path/to/gguf/split/directory +``` + +The directory should contain only the split GGUF files of a single model. + ### PARAMETER The `PARAMETER` instruction defines a parameter that can be set when the model is run.
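To make the split-GGUF create flow documented above concrete, here is a minimal sketch of a client that registers a model from three split files through the Go `api` package, mirroring the `CreateRequest`/`Files` usage exercised by the tests later in this diff. The model name, file names, and digests are placeholders, and each blob is assumed to have already been pushed via `/api/blobs/:digest` as the documentation describes.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// One entry per split. Ollama orders the splits by their split.no metadata,
	// so the map order here is only for readability.
	req := &api.CreateRequest{
		Model: "my-split-model", // placeholder name
		Files: map[string]string{
			"model-00001-of-00003.gguf": "sha256:aaaa...", // placeholder digests;
			"model-00002-of-00003.gguf": "sha256:bbbb...", // each blob must already
			"model-00003-of-00003.gguf": "sha256:cccc...", // exist on the server
		},
	}

	if err := client.Create(context.Background(), req, func(resp api.ProgressResponse) error {
		fmt.Println(resp.Status)
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}
```

Over raw HTTP the shape is the same: POST the JSON body to `/api/create` with every split listed in `files`.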
diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 44a48511c..5db408aad 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "log/slog" + "maps" "math" "slices" "strings" @@ -27,6 +28,18 @@ type model interface { Tensors() Tensors } +type MetaGGML struct { + Shards []GGML + ShardPaths []string + Tensors ForeignTensors + kv KV +} + +type GGUFSplitInfo struct { + No uint16 + Count uint16 +} + type KV map[string]any func (kv KV) Architecture() string { @@ -50,6 +63,18 @@ func (kv KV) FileType() FileType { return FileTypeUnknown } +func (kv KV) GGUFSplitInfo() *GGUFSplitInfo { + no, found := keyValue(kv, "split.no", uint16(0)) + if !found { + return nil + } + count, _ := keyValue(kv, "split.count", uint16(0)) + return &GGUFSplitInfo{ + No: no, + Count: count, + } +} + func (kv KV) BlockCount() uint64 { return uint64(kv.Uint("block_count")) } @@ -271,7 +296,7 @@ type arrayValueTypes interface { } func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) { - if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") { + if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") { key = kv.Architecture() + "." + key } @@ -288,6 +313,14 @@ type Tensors struct { Offset uint64 } +type ForeignTensor struct { + *Tensor + ModelPath string + TensorRegionOffset uint64 +} + +type ForeignTensors []ForeignTensor + func (s Tensors) Items(prefix ...string) []*Tensor { if len(prefix) == 0 { return s.items @@ -326,6 +359,41 @@ func (ts Tensors) GroupLayers() map[string]Layer { return layers } +func (s ForeignTensors) Items(prefix ...string) []*Tensor { + var items []*Tensor + for i := range s { + if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) { + items = append(items, s[i].Tensor) + } + } + + return items +} + +func (ts ForeignTensors) GroupLayers() map[string]Layer { + layers := make(map[string]Layer) + for i := range ts { + t := ts[i].Tensor + parts := strings.Split(t.Name, ".") + if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 { + if len(parts) > index+2 { + // blk and mm should have a number after them, join it + parts = append( + []string{strings.Join(parts[:index+2], ".")}, + parts[index+2:]...) 
+ } + } + + if _, ok := layers[parts[0]]; !ok { + layers[parts[0]] = make(Layer) + } + + layers[parts[0]][strings.Join(parts[1:], ".")] = t + } + + return layers +} + type Layer map[string]*Tensor func (l Layer) Size() (size uint64) { @@ -553,7 +621,93 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) { }, nil } +func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) { + if len(shards) != len(shardsPaths) { + return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths)) + } + li := make(ForeignTensors, 0) + for i := range shards { + gs := shards[i] + tensors := gs.Tensors() + for k := range tensors.items { + tensor := tensors.items[k] + li = append(li, ForeignTensor{ + Tensor: tensor, + ModelPath: shardsPaths[i], + TensorRegionOffset: tensors.Offset, + }) + } + } + return &li, nil +} + +func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML { + type wrapper struct { + ggml GGML + path string + weight int + } + var wrappers []wrapper + for i := range ggmls { + iSplitInfo := ggmls[i].KV().GGUFSplitInfo() + var weight int = 0 + if iSplitInfo == nil { + weight = -1 + } else { + weight = int(iSplitInfo.No) + } + wrappers = append(wrappers, wrapper{ + ggml: ggmls[i], + path: ggmlPaths[i], + weight: weight, + }) + } + slices.SortStableFunc(wrappers, func(a, b wrapper) int { + return cmp.Compare(a.weight, b.weight) + }) + metaGgml := MetaGGML{} + var paramCount uint64 = 0 + for i := range wrappers { + paramCount += wrappers[i].ggml.KV().ParameterCount() + if i == 0 { + kv := maps.Clone(wrappers[i].ggml.KV()) + // drop the per-split keys from the merged KV; add more here if needed. + delete(kv, "split.no") + delete(kv, "split.count") + delete(kv, "split.tensors.count") + delete(kv, "general.parameter_count") + metaGgml.kv = kv + } + metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml) + metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path) + } + metaGgml.kv["general.parameter_count"] = paramCount + ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths) + metaGgml.Tensors = *ft + return metaGgml +} + +func simpleWrapGGML(ggml GGML) MetaGGML { + // wrap a single GGML without building foreign tensors + return MetaGGML{ + Shards: []GGML{ggml}, + ShardPaths: []string{""}, + kv: ggml.KV(), + } +} + +func WrapGGML(ggml GGML) MetaGGML { + metaggml := simpleWrapGGML(ggml) + ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths) + metaggml.Tensors = *ft + return metaggml +} + func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) { + return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention) +} + +func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) { context *= uint64(numParallel) embedding := f.KV().EmbeddingLength() @@ -567,7 +721,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri embeddingHeadsK := f.KV().EmbeddingHeadCountK() embeddingHeadsV := f.KV().EmbeddingHeadCountV() - layers := f.Tensors().GroupLayers() + layers := f.Tensors.GroupLayers() bytesPerElement := kvCacheBytesPerElement(kvCacheType) @@ -665,7 +819,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri ) var ropeFreqsCount uint64 - if
ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok { + if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok { if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { ropeFreqsCount = ropeFreqsWeights.Elements() } @@ -805,6 +959,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { + return simpleWrapGGML(f).SupportsKVCacheType(cacheType) +} +func (f MetaGGML) SupportsKVCacheType(cacheType string) bool { if cacheType == "" || cacheType == "f16" { return true } @@ -822,6 +979,10 @@ func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool { // SupportsFlashAttention checks if the model supports flash attention func (f GGML) SupportsFlashAttention() bool { + return simpleWrapGGML(f).SupportsFlashAttention() +} + +func (f MetaGGML) SupportsFlashAttention() bool { _, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())] if isEmbedding { return false @@ -839,6 +1000,10 @@ func (f GGML) SupportsFlashAttention() bool { // FlashAttention checks if the model should enable flash attention func (f GGML) FlashAttention() bool { + return simpleWrapGGML(f).FlashAttention() +} + +func (f MetaGGML) FlashAttention() bool { return slices.Contains([]string{ "bert", "gemma3", @@ -863,3 +1028,15 @@ func kvCacheBytesPerElement(cacheType string) float64 { return 2 // f16 (default) } } + +func (f MetaGGML) KV() KV { + return f.kv +} + +func (f MetaGGML) TotalTensorBytes() uint64 { + totalBytes := uint64(0) + for i := range f.Shards { + totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset + } + return totalBytes +} diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index e093efea1..b88ec9597 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -582,7 +582,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error { if !strings.HasPrefix(k, arch+".") && !strings.HasPrefix(k, "general.") && !strings.HasPrefix(k, "adapter.") && - !strings.HasPrefix(k, "tokenizer.") { + !strings.HasPrefix(k, "tokenizer.") && + !strings.HasPrefix(k, "split.") { k = arch + "." 
+ k } @@ -597,6 +598,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error { var err error switch v := v.(type) { + case uint16: + err = writeGGUF(ws, ggufTypeUint16, v) case int32: err = writeGGUF(ws, ggufTypeInt32, v) case int64: diff --git a/go.mod b/go.mod index b912a9a0a..0f7bca5f2 100644 --- a/go.mod +++ b/go.mod @@ -77,7 +77,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.5 github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect golang.org/x/arch v0.8.0 // indirect diff --git a/llama/llama.go b/llama/llama.go index 87844f2a2..c4c4e9e91 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -261,7 +261,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool { return true } -func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { +func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) { cparams := C.llama_model_default_params() cparams.n_gpu_layers = C.int(params.NumGpuLayers) cparams.main_gpu = C.int32_t(params.MainGpu) @@ -305,7 +305,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { cparams.progress_callback_user_data = unsafe.Pointer(&handle) } - m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)} + var splitPaths []*C.char + mp := C.CString(modelPath) + defer C.free(unsafe.Pointer(mp)) + splitPaths = append(splitPaths, mp) + for i := range extraModelPaths { + mp := C.CString(extraModelPaths[i]) + defer C.free(unsafe.Pointer(mp)) + splitPaths = append(splitPaths, mp) + } + + m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)} if m.c == nil { return nil, fmt.Errorf("unable to load model: %s", modelPath) } diff --git a/llm/server.go b/llm/server.go index c83bd5a40..ee74c3cf4 100644 --- a/llm/server.go +++ b/llm/server.go @@ -84,12 +84,13 @@ type LlamaServer interface { // llmServer is an instance of a runner hosting a single model type llmServer struct { - port int - cmd *exec.Cmd - done chan error // Channel to signal when the process exits - status *StatusWriter - options api.Options - modelPath string + port int + cmd *exec.Cmd + done chan error // Channel to signal when the process exits + status *StatusWriter + options api.Options + modelPath string + extraModelPaths []string loadRequest LoadRequest // Parameters used to initialize the runner mem *ml.BackendMemory // Memory allocations for this model @@ -109,7 +110,7 @@ type llmServer struct { type llamaServer struct { llmServer - ggml *ggml.GGML + ggml *ggml.MetaGGML } type ollamaServer struct { @@ -123,7 +124,7 @@ type ollamaServer struct { // It collects array values for arrays with a size less than or equal to // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If // the maxArraySize is negative, all arrays are collected. 
-func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { +func LoadModel(model string, extraModels []string, maxArraySize int, reliefSplitConstrain bool) (*ggml.MetaGGML, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -134,12 +135,55 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { } defer f.Close() - ggml, err := ggml.Decode(f, maxArraySize) - return ggml, err + ggml1, err := ggml.Decode(f, maxArraySize) + if err != nil { + return nil, err + } + if ggml1.KV().GGUFSplitInfo() != nil { + if ggml1.KV().GGUFSplitInfo().No != 0 { + return nil, errors.New("not the first split of model") + } + loadedGgml := []ggml.GGML{*ggml1} + visitedSplitNo := []uint16{ggml1.KV().GGUFSplitInfo().No} + for i := range extraModels { + extraModel := extraModels[i] + f, err := os.Open(extraModel) + if err != nil { + return nil, err + } + defer f.Close() + + ggml1, err := ggml.Decode(f, maxArraySize) + if err != nil { + return nil, err + } + if ggml1.KV().GGUFSplitInfo() == nil { + return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf") + } + visitedSplitNo = append(visitedSplitNo, ggml1.KV().GGUFSplitInfo().No) + loadedGgml = append(loadedGgml, *ggml1) + } + if !reliefSplitConstrain { + if len(visitedSplitNo) != int(ggml1.KV().GGUFSplitInfo().Count) { + return nil, errors.New("mismatch split gguf count") + } + slices.Sort(visitedSplitNo) + for i := 0; i < len(visitedSplitNo)-1; i++ { + if visitedSplitNo[i] != visitedSplitNo[i+1]-1 { + return nil, errors.New("repeated or skipped split found") + } + } + } + metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...)) + return &metaggml, nil + } else { + metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model}) + return &metaggml, nil + } } // NewLlamaServer will run a server for the given GPUs -func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { +func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var llamaModel *llama.Model var textProcessor model.TextProcessor var err error @@ -155,7 +199,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st } } if textProcessor == nil { - llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true}) + llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true}) if err != nil { return nil, err } @@ -262,24 +306,26 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st cmd, port, err := StartRunner( textProcessor != nil, modelPath, + extraModelPaths, gpuLibs, status, ml.GetVisibleDevicesEnv(gpus, false), ) s := llmServer{ - port: port, - cmd: cmd, - status: status, - options: opts, - modelPath: modelPath, - loadRequest: loadRequest, - llamaModel: llamaModel, - llamaModelLock: &sync.Mutex{}, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: f.KV().BlockCount() + 1, - loadStart: time.Now(), - done: make(chan error, 1), + port: port, + cmd: cmd, + status: status, + options: opts, + modelPath: modelPath, + extraModelPaths: extraModelPaths, + loadRequest: loadRequest, + llamaModel: llamaModel, + llamaModelLock: &sync.Mutex{}, + sem: 
semaphore.NewWeighted(int64(numParallel)), + totalLayers: f.KV().BlockCount() + 1, + loadStart: time.Now(), + done: make(chan error, 1), } if err != nil { @@ -316,7 +362,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st } } -func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) { +func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) { var exe string exe, err = os.Executable() if err != nil { @@ -346,6 +392,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W if modelPath != "" { params = append(params, "--model", modelPath) } + for i := range extraModelPaths { + params = append(params, "--model", extraModelPaths[i]) + } params = append(params, "--port", strconv.Itoa(port)) var pathEnv string @@ -440,6 +489,10 @@ func (s *llmServer) ModelPath() string { return s.modelPath } +func (s *llmServer) ExtraModelPaths() []string { + return s.extraModelPaths +} + type LoadOperation int // The order of these constants are significant because we iterate over the operations. They @@ -522,7 +575,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention) // Use the size of one layer as a buffer - layers := s.ggml.Tensors().GroupLayers() + layers := s.ggml.Tensors.GroupLayers() if blk0, ok := layers["blk.0"]; ok { buffer := blk0.Size() + kv[0] for i := range gpus { diff --git a/ml/backend.go b/ml/backend.go index f287db6af..aadd003a8 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -73,9 +73,9 @@ type BackendParams struct { FlashAttention FlashAttentionType } -var backends = make(map[string]func(string, BackendParams) (Backend, error)) +var backends = make(map[string]func(string, []string, BackendParams) (Backend, error)) -func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) { +func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) { if _, ok := backends[name]; ok { panic("backend: backend already registered") } @@ -83,9 +83,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error) backends[name] = f } -func NewBackend(modelPath string, params BackendParams) (Backend, error) { +func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) { if backend, ok := backends["ggml"]; ok { - return backend(modelPath, params) + return backend(modelPath, extraModelPaths, params) } return nil, fmt.Errorf("unsupported backend") diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index ebcc1d86f..4d61053d0 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -77,7 +77,7 @@ type Backend struct { // modelPath is the location of the model data modelPath string - meta *fsggml.GGML + meta *fsggml.MetaGGML // allocMemory means that memory should be allocated for tensors and not // just a dry run @@ -120,17 +120,55 @@ type Backend struct { var once sync.Once -func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { +func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) { r, err := os.Open(modelPath) if err != nil { return nil, err } defer r.Close() - meta, err := fsggml.Decode(r, -1) + smallmeta, err := 
fsggml.Decode(r, -1) if err != nil { return nil, err } + var meta fsggml.MetaGGML + if smallmeta.KV().GGUFSplitInfo() != nil { + if smallmeta.KV().GGUFSplitInfo().No != 0 { + return nil, errors.New("not the first split of model") + } + loadedGgml := []fsggml.GGML{*smallmeta} + visitedSplitNo := []uint16{smallmeta.KV().GGUFSplitInfo().No} + for i := range extraModelPaths { + extraModel := extraModelPaths[i] + f, err := os.Open(extraModel) + if err != nil { + return nil, err + } + defer f.Close() + + smallmeta, err := fsggml.Decode(f, -1) + if err != nil { + return nil, err + } + if smallmeta.KV().GGUFSplitInfo() == nil { + return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf") + } + visitedSplitNo = append(visitedSplitNo, smallmeta.KV().GGUFSplitInfo().No) + loadedGgml = append(loadedGgml, *smallmeta) + } + if len(visitedSplitNo) != int(smallmeta.KV().GGUFSplitInfo().Count) { + return nil, errors.New("mismatch split gguf count") + } + slices.Sort(visitedSplitNo) + for i := 0; i < len(visitedSplitNo)-1; i++ { + if visitedSplitNo[i] != visitedSplitNo[i+1]-1 { + return nil, errors.New("repeated or skipped split found") + } + } + meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...)) + } else { + meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath}) + } once.Do(func() { slog.Info( @@ -139,7 +177,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { "file_type", meta.KV().FileType(), "name", meta.KV().String("general.name"), "description", meta.KV().String("general.description"), - "num_tensors", len(meta.Tensors().Items()), + "num_tensors", len(meta.Tensors.Items()), "num_key_values", len(meta.KV()), ) }) @@ -227,7 +265,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { // outputs are assigned iff allowed by splits and configured number of gpu layers output := assignLayer(blocks) - maxTensors := len(meta.Tensors().Items()) + maxTensors := len(meta.Tensors.Items()) maxTensors += 1 // each layer has at most 2 extra tensors for rope operations maxTensors += blocks * 2 @@ -303,11 +341,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { return false } - for _, t := range meta.Tensors().Items() { + for _, t := range meta.Tensors.Items() { switch { case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"): createTensor(tensor{source: t}, input.bts, -1) - if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { + if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks) } case contains(t.Name, "cls", "output", "output_norm", @@ -378,7 +416,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } } - maxGraphNodes := max(1024, len(meta.Tensors().Items())*8) + maxGraphNodes := max(1024, len(meta.Tensors.Items())*8) sched := C.ggml_backend_sched_new_ext( (*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])), @@ -423,7 +461,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { modelPath: modelPath, allocMemory: params.AllocMemory, flashAttention: params.FlashAttention, - meta: meta, + meta: &meta, tensorLoadTargets: targets, tensors: tensors, sched: sched, @@ -494,11 +532,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error { slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", 
gpuLayers, len(b.layers)+1)) var doneBytes atomic.Uint64 - totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset + totalBytes := b.meta.TotalTensorBytes() g, ctx := errgroup.WithContext(ctx) g.SetLimit(runtime.GOMAXPROCS(0)) - for _, t := range b.meta.Tensors().Items() { + for i := range b.meta.Tensors { + t := b.meta.Tensors[i] g.Go(func() error { tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name]))) for i := range tts { @@ -517,13 +556,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error { // Create a new FD for each goroutine so that each FD is read sequentially, rather than // seeking around within an FD shared between all goroutines. - file, err := os.Open(b.modelPath) + file, err := os.Open(t.ModelPath) if err != nil { - slog.Warn("file open error", "file", b.modelPath, "error", err) + slog.Warn("file open error", "file", t.ModelPath, "error", err) return err } defer file.Close() - sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size())) + sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size())) if t.Kind == 4 && tts[0]._type == 39 { // source is mxfp4, target is ggml mxfp4 diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go index efd3a455c..0d568dda2 100644 --- a/ml/backend/ggml/ggml_test.go +++ b/ml/backend/ggml/ggml_test.go @@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context { tb.Fatal(err) } - b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true}) + b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true}) if err != nil { tb.Fatal(err) } diff --git a/model/model.go b/model/model.go index 0af16da80..d45e03111 100644 --- a/model/model.go +++ b/model/model.go @@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) { } // New initializes a new model instance with the provided configuration based on the metadata in the model file -func New(modelPath string, params ml.BackendParams) (Model, error) { - b, err := ml.NewBackend(modelPath, params) +func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) { + b, err := ml.NewBackend(modelPath, extraModelPaths, params) if err != nil { return nil, err } diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index de9d718b3..b937d50c0 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -4,7 +4,6 @@ import ( "context" "encoding/json" "errors" - "flag" "fmt" "log" "log/slog" @@ -19,6 +18,7 @@ import ( "time" "unicode/utf8" + "github.com/spf13/pflag" "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" @@ -257,6 +257,8 @@ type Server struct { // modelPath is the location of the model to be loaded modelPath string + extraModelPaths []string + // loadMu prevents more than one load attempt from occurring at a time loadMu sync.Mutex @@ -829,6 +831,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { func (s *Server) loadModel( params llama.ModelParams, mpath string, + empath []string, lpath []string, ppath string, kvSize int, @@ -838,7 +841,7 @@ func (s *Server) loadModel( multiUserCache bool, ) { var err error - s.model, err = llama.LoadModelFromFile(mpath, params) + s.model, err = llama.LoadModelFromFile(mpath, empath, params) if err != nil { panic(err) } @@ -931,7 +934,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) { } s.status = llm.ServerStatusLoadingModel - go s.loadModel(params, s.modelPath, 
req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache) + go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache) case llm.LoadOperationClose: // No-op for us @@ -949,13 +952,14 @@ } func Execute(args []string) error { - fs := flag.NewFlagSet("runner", flag.ExitOnError) - mpath := fs.String("model", "", "Path to model binary file") + fs := pflag.NewFlagSet("runner", pflag.ExitOnError) + mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified multiple times to provide the remaining splits of a split model.") port := fs.Int("port", 8080, "Port to expose the server on") _ = fs.Bool("verbose", false, "verbose output (default: disabled)") fs.Usage = func() { - fmt.Fprintf(fs.Output(), "Runner usage\n") + // pflag does not expose out(); fall back to os.Stderr, which behaves identically since the flag set's output is never set + fmt.Fprintf(os.Stderr, "Runner usage\n") fs.PrintDefaults() } if err := fs.Parse(args); err != nil { @@ -967,8 +971,9 @@ llama.BackendInit() server := &Server{ - modelPath: *mpath, - status: llm.ServerStatusLaunched, + modelPath: (*mpath)[0], + extraModelPaths: (*mpath)[1:], + status: llm.ServerStatusLaunched, } server.ready.Add(1) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index a756cba23..a62dd4ca9 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -5,7 +5,6 @@ import ( "context" "encoding/json" "errors" - "flag" "fmt" "hash/maphash" "image" @@ -23,6 +22,7 @@ import ( "time" "unicode/utf8" + "github.com/spf13/pflag" "golang.org/x/image/bmp" "golang.org/x/sync/semaphore" @@ -331,6 +331,8 @@ type Server struct { // modelPath is the location of the model to be loaded modelPath string + extraModelPaths []string + // loadMu prevents more than one load attempt from occurring at a time loadMu sync.Mutex @@ -1169,6 +1171,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error { // based on the given parameters func (s *Server) allocModel( mpath string, + empath []string, params ml.BackendParams, loraPath []string, parallel int, @@ -1193,7 +1196,7 @@ func (s *Server) allocModel( }() var err error - s.model, err = model.New(mpath, params) + s.model, err = model.New(mpath, empath, params) if err != nil { return err } @@ -1302,7 +1305,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) { s.batchSize = req.BatchSize - err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache) + err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache) if err != nil { s.closeModel() @@ -1372,7 +1375,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) { return } - m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}}) + m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}}) if err != nil { http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError) return @@ -1389,13 +1392,14 @@ } func Execute(args []string)
error { - fs := flag.NewFlagSet("runner", flag.ExitOnError) - mpath := fs.String("model", "", "Path to model binary file") + fs := pflag.NewFlagSet("runner", pflag.ExitOnError) + mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified multiple times to provide the remaining splits of a split model.") port := fs.Int("port", 8080, "Port to expose the server on") _ = fs.Bool("verbose", false, "verbose output (default: disabled)") fs.Usage = func() { - fmt.Fprintf(fs.Output(), "Runner usage\n") + // pflag does not expose out(); fall back to os.Stderr, which behaves identically since the flag set's output is never set + fmt.Fprintf(os.Stderr, "Runner usage\n") fs.PrintDefaults() } if err := fs.Parse(args); err != nil { @@ -1408,8 +1412,9 @@ func Execute(args []string) error { defer cancel() server := &Server{ - modelPath: *mpath, - status: llm.ServerStatusLaunched, + modelPath: (*mpath)[0], + extraModelPaths: (*mpath)[1:], + status: llm.ServerStatusLaunched, } server.cond = sync.NewCond(&server.mu) diff --git a/server/create.go b/server/create.go index 15e364e1e..6f3eaa05f 100644 --- a/server/create.go +++ b/server/create.go @@ -39,8 +39,97 @@ var ( errUnknownType = errors.New("unknown type") errNeitherFromOrFiles = errors.New("neither 'from' or 'files' was specified") errFilePath = errors.New("file path must be relative") + errIncompleteShardedGGUF = errors.New("missing some GGUF splits") + errExtraShardedGGUF = errors.New("extra GGUF splits found") ) +func broadcastKV(main *ggml.GGML, subs ...*ggml.GGML) { + // broadcast the merged KV values to the other shards; used only for the manifest + ggmls := []ggml.GGML{*main} + for i := range subs { + ggmls = append(ggmls, *subs[i]) + } + metaggml := ggml.MakeMetaGGML(ggmls, make([]string, len(ggmls))) + mainKV := main.KV() + mainKV["general.parameter_count"] = metaggml.KV().ParameterCount() + for i := range subs { + subKV := subs[i].KV() + for k, v := range metaggml.KV() { + subKV[k] = v + } + } +} + +func sortAndValidateBaseLayers(baseLayers *[]*layerGGML) error { + slices.SortStableFunc(*baseLayers, func(a, b *layerGGML) int { + var aScore, bScore int + if a.GGML == nil { + // layers without GGML metadata (chat template, parameters) sort last via a very large score + aScore = 0x7fffffff + } else { + aSplit := a.GGML.KV().GGUFSplitInfo() + if aSplit == nil { + aScore = -1 + } else { + aScore = int(aSplit.No) + } + } + if b.GGML == nil { + bScore = 0x7fffffff + } else { + bSplit := b.GGML.KV().GGUFSplitInfo() + if bSplit == nil { + bScore = -1 + } else { + bScore = int(bSplit.No) + } + } + return cmp.Compare(aScore, bScore) + }) + // sanity-check the split sequence + { + ggmlPtrs := make([]*ggml.GGML, 0, len(*baseLayers)) + firstSplitCount := -1 + foundSplitNos := make([]uint16, 0) + for i, layer := range *baseLayers { + if i == 0 { + if layer.GGML == nil { + // after sorting, the first layer must be a GGUF layer + return errNoFilesProvided + } + } + if layer.GGML != nil && layer.GGML.KV().GGUFSplitInfo() != nil { + if firstSplitCount == -1 { + if layer.GGML.KV().GGUFSplitInfo().No != 0 { + return errIncompleteShardedGGUF + } + firstSplitCount = int(layer.GGML.KV().GGUFSplitInfo().Count) + foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No) + } else if firstSplitCount != int(layer.KV().GGUFSplitInfo().Count) { + return errExtraShardedGGUF + } else { + if foundSplitNos[len(foundSplitNos)-1] == layer.KV().GGUFSplitInfo().No { + return errExtraShardedGGUF + } else if foundSplitNos[len(foundSplitNos)-1] != layer.KV().GGUFSplitInfo().No-1 { + return errIncompleteShardedGGUF + } else { + foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No) + } + } + // collect only the GGUF split layers + ggmlPtrs = append(ggmlPtrs, layer.GGML) + } + } + if firstSplitCount != -1 && len(foundSplitNos) != firstSplitCount { + return errIncompleteShardedGGUF + } + if len(ggmlPtrs) > 1 { + broadcastKV(ggmlPtrs[0], ggmlPtrs[1:]...) + } + } + return nil +} + func (s *Server) CreateHandler(c *gin.Context) { config := &model.ConfigV2{ OS: "linux", @@ -161,6 +250,14 @@ func (s *Server) CreateHandler(c *gin.Context) { ch <- gin.H{"error": errNeitherFromOrFiles.Error(), "status": http.StatusBadRequest} return } + // Sort baseLayers so that the splits of a split model are stored in the correct order + if !remote { + err := sortAndValidateBaseLayers(&baseLayers) + if err != nil { + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + return + } + } var adapterLayers []*layerGGML if !remote && r.Adapters != nil { diff --git a/server/images.go b/server/images.go index 951f7ac6e..fa4a5f639 100644 --- a/server/images.go +++ b/server/images.go @@ -53,18 +53,19 @@ type registryOptions struct { } type Model struct { - Name string `json:"name"` - Config model.ConfigV2 - ShortName string - ModelPath string - ParentModel string - AdapterPaths []string - ProjectorPaths []string - System string - License []string - Digest string - Options map[string]any - Messages []api.Message + Name string `json:"name"` + Config model.ConfigV2 + ShortName string + ModelPath string + ExtraModelPaths []string + ParentModel string + AdapterPaths []string + ProjectorPaths []string + System string + License []string + Digest string + Options map[string]any + Messages []api.Message Template *template.Template } @@ -190,6 +191,13 @@ func (m *Model) String() string { Args: m.ModelPath, }) + for _, extraModelPath := range m.ExtraModelPaths { + modelfile.Commands = append(modelfile.Commands, parser.Command{ + Name: "model", + Args: extraModelPath, + }) + } + for _, adapter := range m.AdapterPaths { modelfile.Commands = append(modelfile.Commands, parser.Command{ Name: "adapter", @@ -319,6 +327,8 @@ func GetModel(name string) (*Model, error) { } } + readMainModelFlag := false
+ for _, layer := range manifest.Layers { filename, err := GetBlobsPath(layer.Digest) if err != nil { @@ -327,8 +337,13 @@ func GetModel(name string) (*Model, error) { switch layer.MediaType { case "application/vnd.ollama.image.model": - model.ModelPath = filename - model.ParentModel = layer.From + if !readMainModelFlag { + model.ModelPath = filename + model.ParentModel = layer.From + readMainModelFlag = true + } else { + model.ExtraModelPaths = append(model.ExtraModelPaths, filename) + } case "application/vnd.ollama.image.embed": // Deprecated in versions > 0.1.2 // TODO: remove this warning in a future version diff --git a/server/routes.go b/server/routes.go index 977a13ff2..68787eb4b 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1201,14 +1201,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { return resp, nil } -func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) { +func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) { maxArraySize := 0 if verbose { maxArraySize = -1 } - data, err := llm.LoadModel(digest, maxArraySize) + data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize, true) if err != nil { - return nil, ggml.Tensors{}, err + return nil, make(ggml.ForeignTensors, 0), err } kv := data.KV() @@ -1221,7 +1221,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) { } } - return kv, data.Tensors(), nil + return kv, data.Tensors, nil } func (s *Server) ListHandler(c *gin.Context) { diff --git a/server/routes_create_test.go b/server/routes_create_test.go index b1b1a2882..4dbc3d151 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -954,3 +954,236 @@ func TestDetectModelTypeFromFiles(t *testing.T) { } }) } + +func TestShardedGGUF(t *testing.T) { + gin.SetMode(gin.TestMode) + p := t.TempDir() + t.Setenv("OLLAMA_MODELS", p) + + _, fullDigest := createBinFile(t, ggml.KV{}, []*ggml.Tensor{}) + _, splitDigest1 := createBinFile(t, ggml.KV{ + "split.no": uint16(0), + "split.count": uint16(3), + }, []*ggml.Tensor{}) + _, splitDigest2 := createBinFile(t, ggml.KV{ + "split.no": uint16(1), + "split.count": uint16(3), + }, []*ggml.Tensor{}) + _, splitDigest3 := createBinFile(t, ggml.KV{ + "split.no": uint16(2), + "split.count": uint16(3), + }, []*ggml.Tensor{}) + _, splitDigest4 := createBinFile(t, ggml.KV{ + "split.no": uint16(0), + "split.count": uint16(4), + }, []*ggml.Tensor{}) + _, splitDigest5 := createBinFile(t, ggml.KV{ + "general.architecture": "test1", + "split.no": uint16(1), + "split.count": uint16(3), + }, []*ggml.Tensor{}) + + var s Server + + t.Run("single full gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-single-full", + Files: map[string]string{"test.gguf": fullDigest}, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + fmt.Println(w) + t.Fatalf("expected status code 200, actual %d", w.Code) + } + + manifest, err := ParseNamedManifest(model.ParseName("test-single-full")) + if err != nil { + t.Fatalf("parse manifest: %v", err) + } + for i, layer := range manifest.Layers { + if i != 0 { + t.Fatalf("expect 1 layer, actually found layer with index %d", i) + } else if layer.Digest != fullDigest { + t.Fatalf("expect digest %s, actual %s", fullDigest, layer.Digest) + } + } + }) + + t.Run("complete split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-complete-split", + Files: map[string]string{ + 
"test-00001-of-00003.gguf": splitDigest1, + "test-00002-of-00003.gguf": splitDigest2, + "test-00003-of-00003.gguf": splitDigest3, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + fmt.Println(w) + t.Fatalf("expected status code 200, actual %d", w.Code) + } + + correctOrder := []string{ + splitDigest1, splitDigest2, splitDigest3, + } + + manifest, err := ParseNamedManifest(model.ParseName("test-complete-split")) + if err != nil { + t.Fatalf("parse manifest: %v", err) + } + for i, layer := range manifest.Layers { + if i >= 3 { + t.Fatalf("expect 3 layers, actually found layer with index %d", i) + } else if layer.Digest != correctOrder[i] { + t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest) + } + } + }) + + t.Run("complete split misordered gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-complete-split-misorder", + Files: map[string]string{ + "test-00003-of-00003.gguf": splitDigest3, + "test-00001-of-00003.gguf": splitDigest1, + "test-00002-of-00003.gguf": splitDigest2, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + fmt.Println(w) + t.Fatalf("expected status code 200, actual %d", w.Code) + } + + correctOrder := []string{ + splitDigest1, splitDigest2, splitDigest3, + } + + manifest, err := ParseNamedManifest(model.ParseName("test-complete-split-misorder")) + if err != nil { + t.Fatalf("parse manifest: %v", err) + } + for i, layer := range manifest.Layers { + if i >= 3 { + t.Fatalf("expect 3 layers, actually found layer with index %d", i) + } else if layer.Digest != correctOrder[i] { + t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest) + } + } + }) + + t.Run("mixed full and split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-full-split-mixing", + Files: map[string]string{ + "test-00002-of-00003.gguf": splitDigest2, + "test-00003-of-00003.gguf": splitDigest3, + "test1.gguf": fullDigest, + "test-00001-of-00003.gguf": splitDigest1, + }, + Stream: &stream, + }) + + if w.Code != http.StatusOK { + fmt.Println(w) + t.Fatalf("expected status code 200, actual %d", w.Code) + } + + correctOrder := []string{ + fullDigest, splitDigest1, splitDigest2, splitDigest3, + } + + manifest, err := ParseNamedManifest(model.ParseName("test-full-split-mixing")) + if err != nil { + t.Fatalf("parse manifest: %v", err) + } + for i, layer := range manifest.Layers { + if i >= 4 { + t.Fatalf("expect 4 layers, actually found layer with index %d", i) + } else if layer.Digest != correctOrder[i] { + t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest) + } + } + }) + + t.Run("mixed wrong split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-extra-split", + Files: map[string]string{ + "test-00002-of-00003.gguf": splitDigest2, + "test-00003-of-00003.gguf": splitDigest3, + "test-00001-of-00003.gguf": splitDigest1, + "test1-00001-of-00004.gguf": splitDigest4, + }, + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + + t.Run("mixed same count wrong split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-extra-split", + Files: map[string]string{ + "test-00002-of-00003.gguf": splitDigest2, + "test-00003-of-00003.gguf": splitDigest3, + "test-00001-of-00003.gguf": splitDigest1, + "test1-00002-of-00003.gguf": splitDigest5, + }, + Stream: &stream, + }) + + if 
w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + t.Run("missing head split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-extra-split", + Files: map[string]string{ + "test-00002-of-00003.gguf": splitDigest2, + "test-00003-of-00003.gguf": splitDigest3, + }, + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + t.Run("missing mid split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-extra-split", + Files: map[string]string{ + "test-00001-of-00003.gguf": splitDigest1, + "test-00003-of-00003.gguf": splitDigest3, + }, + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + t.Run("missing tail split gguf", func(t *testing.T) { + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: "test-extra-split", + Files: map[string]string{ + "test-00001-of-00003.gguf": splitDigest1, + "test-00002-of-00003.gguf": splitDigest2, + }, + Stream: &stream, + }) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected status code 400, actual %d", w.Code) + } + }) + +} diff --git a/server/routes_debug_test.go b/server/routes_debug_test.go index 6f9104c39..8002c752c 100644 --- a/server/routes_debug_test.go +++ b/server/routes_debug_test.go @@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ diff --git a/server/routes_generate_renderer_test.go b/server/routes_generate_renderer_test.go index e6473e087..06336d027 100644 --- a/server/routes_generate_renderer_test.go +++ b/server/routes_generate_renderer_test.go @@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ llama: &mock, @@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ llama: &mock, diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go 
index 111a9678a..169804bc8 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -71,8 +71,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error return } -func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { - return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { +func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { + return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { return mock, nil } } @@ -182,7 +182,7 @@ func TestGenerateChat(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -898,7 +898,7 @@ func TestGenerate(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -1382,7 +1382,7 @@ func TestGenerateLogprobs(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{llama: mock} return false }, @@ -1562,7 +1562,7 @@ func TestChatLogprobs(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{llama: mock} return false }, @@ -1672,7 +1672,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{llama: mock} return false diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go index de130c8c8..5c034f8b6 100644 --- a/server/routes_harmony_streaming_test.go +++ b/server/routes_harmony_streaming_test.go @@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 100 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req 
*LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } @@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 100 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } @@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) { getGpuFn: getGpuFn, getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } diff --git a/server/sched.go b/server/sched.go index c5bc6692d..814cc7828 100644 --- a/server/sched.go +++ b/server/sched.go @@ -49,8 +49,8 @@ type Scheduler struct { activeLoading llm.LlamaServer loaded map[string]*runnerRef - loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool - newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) + loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool + newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo getSystemInfoFn func() ml.SystemInfo waitForRecovery time.Duration @@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // Load model for fitting logutil.Trace("loading model metadata", "model", pending.model.ModelPath) - ggml, err := llm.LoadModel(pending.model.ModelPath, 1024) + ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024, false) if err != nil { pending.errCh <- err break @@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs // (if any). Returns whether the scheduler needs to evict a model to make this one fit. 
-func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool { +func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool { numParallel := max(int(envconfig.NumParallel()), 1) // Embedding models should always be loaded with parallel=1 @@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo if llama == nil { var err error - llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) + llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to diff --git a/server/sched_test.go b/server/sched_test.go index 480aafa4e..e79bcd335 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) { defer done() s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond - var f *ggml.GGML // value not used in tests + var f *ggml.MetaGGML // value not used in tests req := &LlmRequest{ ctx: ctx, model: &Model{ModelPath: "foo"}, @@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Second}, } // Fail to load model first - s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return nil, errors.New("something failed to load model blah") } gpus := []ml.DeviceInfo{} @@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) { require.Contains(t, err.Error(), "this model may be incompatible") server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} - s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { server.modelPath = model return server, nil } @@ -103,10 +103,10 @@ type reqBundle struct { ctxDone func() srv *mockLlm req *LlmRequest - f *ggml.GGML + f *ggml.MetaGGML } -func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { +func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { scenario.srv.modelPath = model return scenario.srv, nil } @@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra }) model := &Model{Name: modelName, ModelPath: p} - 
f, err := llm.LoadModel(model.ModelPath, 0) + f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0, true) if err != nil { t.Fatal(err) } @@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Minute}, } - var f *ggml.GGML + var f *ggml.MetaGGML gpus := []ml.DeviceInfo{} systemInfo := ml.SystemInfo{} server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} - s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { server.modelPath = model return server, nil }