cvrunmin 2026-01-06 08:25:48 +01:00 committed by GitHub
commit 9e70f0fcfd
25 changed files with 770 additions and 119 deletions

View File

@ -441,6 +441,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
cmd, port, err := llm.StartRunner(
true, // ollama engine
"", // no model
make([]string, 0),
ollamaLibDirs,
out,
extraEnvs,

View File

@ -1176,7 +1176,7 @@ Create a model from:
- another model;
- a safetensors directory; or
- a GGUF file.
- a GGUF file or directory.
If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
@ -1270,6 +1270,7 @@ A stream of JSON objects is returned:
#### Create a model from GGUF
Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
For a model stored as multiple split GGUF files, include every split in the `files` parameter, each with its file name and SHA256 digest. It is recommended to list the files in split-number order, although Ollama will sort them itself.
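As an illustrative sketch (not part of the documented request format), the `files` map for a three-way split could be assembled with the Go client types from `github.com/ollama/ollama/api`; the digests below are placeholders for the SHA256 values returned when pushing each blob:
```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	stream := false
	req := api.CreateRequest{
		Name: "my-split-model", // hypothetical model name
		Files: map[string]string{
			// Listed in split-number order; Ollama re-sorts them regardless.
			"model-00001-of-00003.gguf": "sha256:aaaa...", // placeholder digest
			"model-00002-of-00003.gguf": "sha256:bbbb...", // placeholder digest
			"model-00003-of-00003.gguf": "sha256:cccc...", // placeholder digest
		},
		Stream: &stream,
	}
	body, _ := json.MarshalIndent(req, "", "  ")
	fmt.Println(string(body)) // body of the POST to /api/create
}
```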
##### Request

View File

@ -88,6 +88,10 @@ To import a GGUF model, create a `Modelfile` containing:
```dockerfile
FROM /path/to/file.gguf
```
Or:
```dockerfile
FROM /path/to/gguf/split/directory
```
For a GGUF adapter, create the `Modelfile` with:

View File

@ -12,7 +12,7 @@ A Modelfile is the blueprint to create and share customized models using Ollama.
- [FROM (Required)](#from-required)
- [Build from existing model](#build-from-existing-model)
- [Build from a Safetensors model](#build-from-a-safetensors-model)
- [Build from a GGUF file](#build-from-a-gguf-file)
- [Build from a GGUF model](#build-from-a-gguf-model)
- [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values)
- [TEMPLATE](#template)
@ -130,7 +130,7 @@ Currently supported model architectures:
- Gemma (including Gemma 1 and Gemma 2)
- Phi3
#### Build from a GGUF file
#### Build from a GGUF model
```
FROM ./ollama-model.gguf
@ -138,6 +138,14 @@ FROM ./ollama-model.gguf
The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
For a GGUF model split into multiple files:
```
FROM <model directory>
```
The model directory should contain only the split GGUF files of a single model.
### PARAMETER
The `PARAMETER` instruction defines a parameter that can be set when the model is run.

View File

@ -7,6 +7,7 @@ import (
"fmt"
"io"
"log/slog"
"maps"
"math"
"slices"
"strings"
@ -27,6 +28,18 @@ type model interface {
Tensors() Tensors
}
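// MetaGGML aggregates one or more GGUF shards that make up a single model,
// exposing a merged KV map and a flattened, shard-aware tensor list.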
type MetaGGML struct {
Shards []GGML
ShardPaths []string
Tensors ForeignTensors
kv KV
}
type GGUFSplitInfo struct {
No uint16
Count uint16
}
type KV map[string]any
func (kv KV) Architecture() string {
@ -50,6 +63,18 @@ func (kv KV) FileType() FileType {
return FileTypeUnknown
}
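// GGUFSplitInfo returns the split metadata ("split.no", "split.count") of a
// sharded GGUF file, or nil when the file is not part of a split model.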
func (kv KV) GGUFSplitInfo() *GGUFSplitInfo {
no, found := keyValue(kv, "split.no", uint16(0))
if !found {
return nil
}
count, _ := keyValue(kv, "split.count", uint16(0))
return &GGUFSplitInfo{
No: no,
Count: count,
}
}
func (kv KV) BlockCount() uint64 {
return uint64(kv.Uint("block_count"))
}
@ -271,7 +296,7 @@ type arrayValueTypes interface {
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") {
key = kv.Architecture() + "." + key
}
@ -288,6 +313,14 @@ type Tensors struct {
Offset uint64
}
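// ForeignTensor annotates a Tensor with the shard file it comes from and the
// offset of that shard's tensor data region.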
type ForeignTensor struct {
*Tensor
ModelPath string
TensorRegionOffset uint64
}
type ForeignTensors []ForeignTensor
func (s Tensors) Items(prefix ...string) []*Tensor {
if len(prefix) == 0 {
return s.items
@ -326,6 +359,41 @@ func (ts Tensors) GroupLayers() map[string]Layer {
return layers
}
func (s ForeignTensors) Items(prefix ...string) []*Tensor {
var items []*Tensor
for i := range s {
if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) {
items = append(items, s[i].Tensor)
}
}
return items
}
func (ts ForeignTensors) GroupLayers() map[string]Layer {
layers := make(map[string]Layer)
for i := range ts {
t := ts[i].Tensor
parts := strings.Split(t.Name, ".")
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
if len(parts) > index+2 {
// blk and mm should have a number after them, join it
parts = append(
[]string{strings.Join(parts[:index+2], ".")},
parts[index+2:]...)
}
}
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
}
type Layer map[string]*Tensor
func (l Layer) Size() (size uint64) {
@ -553,7 +621,93 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
}, nil
}
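// BuildForeignTensors flattens the tensors of every shard into a single list,
// tagging each tensor with its shard path and that shard's tensor data offset.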
func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) {
if len(shards) != len(shardsPaths) {
return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths))
}
li := make(ForeignTensors, 0)
for i := range shards {
gs := shards[i]
tensors := gs.Tensors()
for k := range tensors.items {
tensor := tensors.items[k]
li = append(li, ForeignTensor{
Tensor: tensor,
ModelPath: shardsPaths[i],
TensorRegionOffset: tensors.Offset,
})
}
}
return &li, nil
}
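// MakeMetaGGML merges decoded shards into a single MetaGGML: shards are sorted
// by split number, the first shard's KV is cloned minus the per-split keys,
// general.parameter_count is summed across shards, and the flattened tensor
// list is built with BuildForeignTensors.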
func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML {
type wrapper struct {
ggml GGML
path string
weight int
}
var wrappers []wrapper
for i := range ggmls {
iSplitInfo := ggmls[i].KV().GGUFSplitInfo()
weight := -1
if iSplitInfo != nil {
weight = int(iSplitInfo.No)
}
wrappers = append(wrappers, wrapper{
ggml: ggmls[i],
path: ggmlPaths[i],
weight: weight,
})
}
slices.SortStableFunc(wrappers, func(a, b wrapper) int {
return cmp.Compare(a.weight, b.weight)
})
metaGgml := MetaGGML{}
var paramCount uint64
for i := range wrappers {
paramCount += wrappers[i].ggml.KV().ParameterCount()
if i == 0 {
kv := maps.Clone(wrappers[i].ggml.KV())
// remove the per-split keys so the merged KV describes the whole model; add more if needed
delete(kv, "split.no")
delete(kv, "split.count")
delete(kv, "split.tensors.count")
delete(kv, "general.parameter_count")
metaGgml.kv = kv
}
metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml)
metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path)
}
metaGgml.kv["general.parameter_count"] = paramCount
ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths)
metaGgml.Tensors = *ft
return metaGgml
}
func simpleWrapGGML(ggml GGML) MetaGGML {
// simply wrap single GGML, without creating foreign tensors
return MetaGGML{
Shards: []GGML{ggml},
ShardPaths: []string{""},
kv: ggml.KV(),
}
}
func WrapGGML(ggml GGML) MetaGGML {
metaggml := simpleWrapGGML(ggml)
ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths)
metaggml.Tensors = *ft
return metaggml
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention)
}
func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
embedding := f.KV().EmbeddingLength()
@ -567,7 +721,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
layers := f.Tensors().GroupLayers()
layers := f.Tensors.GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
@ -665,7 +819,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
)
var ropeFreqsCount uint64
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok {
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
ropeFreqsCount = ropeFreqsWeights.Elements()
}
@ -805,6 +959,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
return simpleWrapGGML(f).SupportsKVCacheType(cacheType)
}
func (f MetaGGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
@ -822,6 +979,10 @@ func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
return simpleWrapGGML(f).SupportsFlashAttention()
}
func (f MetaGGML) SupportsFlashAttention() bool {
_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
if isEmbedding {
return false
@ -839,6 +1000,10 @@ func (f GGML) SupportsFlashAttention() bool {
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
return simpleWrapGGML(f).FlashAttention()
}
func (f MetaGGML) FlashAttention() bool {
return slices.Contains([]string{
"bert",
"gemma3",
@ -863,3 +1028,15 @@ func kvCacheBytesPerElement(cacheType string) float64 {
return 2 // f16 (default)
}
}
func (f MetaGGML) KV() KV {
return f.kv
}
func (f MetaGGML) TotalTensorBytes() uint64 {
totalBytes := uint64(0)
for i := range f.Shards {
totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset
}
return totalBytes
}
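For orientation, here is a minimal sketch (not part of the change) of how the new helpers compose when loading a split model; it assumes the import path `github.com/ollama/ollama/fs/ggml` and two hypothetical shard files:
```go
package main

import (
	"fmt"
	"os"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

// decodeShard decodes a single GGUF file, collecting arrays up to 1024 entries.
func decodeShard(path string) (*fsggml.GGML, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return fsggml.Decode(f, 1024)
}

func main() {
	// Hypothetical shard paths for a model split into two GGUF files.
	paths := []string{
		"model-00001-of-00002.gguf",
		"model-00002-of-00002.gguf",
	}

	var shards []fsggml.GGML
	for _, p := range paths {
		g, err := decodeShard(p)
		if err != nil {
			panic(err)
		}
		shards = append(shards, *g)
	}

	// MakeMetaGGML orders the shards by split.no, merges their KV maps, sums
	// general.parameter_count, and flattens all tensors with their shard paths
	// attached so each tensor can be read from the correct file.
	meta := fsggml.MakeMetaGGML(shards, paths)
	fmt.Println("tensors:", len(meta.Tensors.Items()), "params:", meta.KV().ParameterCount())
}
```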

View File

@ -582,7 +582,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
if !strings.HasPrefix(k, arch+".") &&
!strings.HasPrefix(k, "general.") &&
!strings.HasPrefix(k, "adapter.") &&
!strings.HasPrefix(k, "tokenizer.") {
!strings.HasPrefix(k, "tokenizer.") &&
!strings.HasPrefix(k, "split.") {
k = arch + "." + k
}
@ -597,6 +598,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
var err error
switch v := v.(type) {
case uint16:
err = writeGGUF(ws, ggufTypeUint16, v)
case int32:
err = writeGGUF(ws, ggufTypeInt32, v)
case int64:

go.mod
View File

@ -77,7 +77,7 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/spf13/pflag v1.0.5
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect

View File

@ -261,7 +261,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
return true
}
func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) {
cparams := C.llama_model_default_params()
cparams.n_gpu_layers = C.int(params.NumGpuLayers)
cparams.main_gpu = C.int32_t(params.MainGpu)
@ -305,7 +305,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.progress_callback_user_data = unsafe.Pointer(&handle)
}
m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
var splitPaths []*C.char
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
splitPaths = append(splitPaths, mp)
for i := range extraModelPaths {
mp := C.CString(extraModelPaths[i])
defer C.free(unsafe.Pointer(mp))
splitPaths = append(splitPaths, mp)
}
m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)}
if m.c == nil {
return nil, fmt.Errorf("unable to load model: %s", modelPath)
}

View File

@ -84,12 +84,13 @@ type LlamaServer interface {
// llmServer is an instance of a runner hosting a single model
type llmServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
extraModelPaths []string
loadRequest LoadRequest // Parameters used to initialize the runner
mem *ml.BackendMemory // Memory allocations for this model
@ -109,7 +110,7 @@ type llmServer struct {
type llamaServer struct {
llmServer
ggml *ggml.GGML
ggml *ggml.MetaGGML
}
type ollamaServer struct {
@ -123,7 +124,7 @@ type ollamaServer struct {
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
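// extraModels lists the remaining shards of a split GGUF model; when
// reliefSplitConstrain is true the completeness checks on the split sequence
// are skipped.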
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
func LoadModel(model string, extraModels []string, maxArraySize int, reliefSplitConstrain bool) (*ggml.MetaGGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@ -134,12 +135,55 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}
defer f.Close()
ggml, err := ggml.Decode(f, maxArraySize)
return ggml, err
ggml1, err := ggml.Decode(f, maxArraySize)
if err != nil {
return nil, err
}
if ggml1.KV().GGUFSplitInfo() != nil {
if ggml1.KV().GGUFSplitInfo().No != 0 {
return nil, errors.New("not the first split of model")
}
loadedGgml := []ggml.GGML{*ggml1}
visitedSplitNo := []uint16{ggml1.KV().GGUFSplitInfo().No}
for i := range extraModels {
extraModel := extraModels[i]
f, err := os.Open(extraModel)
if err != nil {
return nil, err
}
defer f.Close()
ggml1, err := ggml.Decode(f, maxArraySize)
if err != nil {
return nil, err
}
if ggml1.KV().GGUFSplitInfo() == nil {
return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf")
}
visitedSplitNo = append(visitedSplitNo, ggml1.KV().GGUFSplitInfo().No)
loadedGgml = append(loadedGgml, *ggml1)
}
if !reliefSplitConstrain {
if len(visitedSplitNo) != int(ggml1.KV().GGUFSplitInfo().Count) {
return nil, errors.New("mismatch split gguf count")
}
slices.Sort(visitedSplitNo)
for i := 0; i < len(visitedSplitNo)-1; i++ {
if visitedSplitNo[i] != visitedSplitNo[i+1]-1 {
return nil, errors.New("repeated or skipped split found")
}
}
}
metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...))
return &metaggml, nil
} else {
metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model})
return &metaggml, nil
}
}
// NewLlamaServer will run a server for the given GPUs
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var llamaModel *llama.Model
var textProcessor model.TextProcessor
var err error
@ -155,7 +199,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}
if textProcessor == nil {
llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true})
if err != nil {
return nil, err
}
@ -262,24 +306,26 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
cmd, port, err := StartRunner(
textProcessor != nil,
modelPath,
extraModelPaths,
gpuLibs,
status,
ml.GetVisibleDevicesEnv(gpus, false),
)
s := llmServer{
port: port,
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
port: port,
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
extraModelPaths: extraModelPaths,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
}
if err != nil {
@ -316,7 +362,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
var exe string
exe, err = os.Executable()
if err != nil {
@ -346,6 +392,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
if modelPath != "" {
params = append(params, "--model", modelPath)
}
for i := range extraModelPaths {
params = append(params, "--model", extraModelPaths[i])
}
params = append(params, "--port", strconv.Itoa(port))
var pathEnv string
@ -440,6 +489,10 @@ func (s *llmServer) ModelPath() string {
return s.modelPath
}
func (s *llmServer) ExtraModelPaths() []string {
return s.extraModelPaths
}
type LoadOperation int
// The order of these constants are significant because we iterate over the operations. They
@ -522,7 +575,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
// Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers()
layers := s.ggml.Tensors.GroupLayers()
if blk0, ok := layers["blk.0"]; ok {
buffer := blk0.Size() + kv[0]
for i := range gpus {

View File

@ -73,9 +73,9 @@ type BackendParams struct {
FlashAttention FlashAttentionType
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
var backends = make(map[string]func(string, []string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
@ -83,9 +83,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
backends[name] = f
}
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(modelPath, params)
return backend(modelPath, extraModelPaths, params)
}
return nil, fmt.Errorf("unsupported backend")

View File

@ -77,7 +77,7 @@ type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
meta *fsggml.MetaGGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
@ -120,17 +120,55 @@ type Backend struct {
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
smallmeta, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
var meta fsggml.MetaGGML
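// If the first file is a split shard, decode every extra path, verify that the
// shard numbers form the complete 0..count-1 sequence, and merge everything
// into one MetaGGML; otherwise wrap the single file.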
if smallmeta.KV().GGUFSplitInfo() != nil {
if smallmeta.KV().GGUFSplitInfo().No != 0 {
return nil, errors.New("not the first split of model")
}
loadedGgml := []fsggml.GGML{*smallmeta}
visitedSplitNo := []uint16{smallmeta.KV().GGUFSplitInfo().No}
for i := range extraModelPaths {
extraModel := extraModelPaths[i]
f, err := os.Open(extraModel)
if err != nil {
return nil, err
}
defer f.Close()
smallmeta, err := fsggml.Decode(f, -1)
if err != nil {
return nil, err
}
if smallmeta.KV().GGUFSplitInfo() == nil {
return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf")
}
visitedSplitNo = append(visitedSplitNo, smallmeta.KV().GGUFSplitInfo().No)
loadedGgml = append(loadedGgml, *smallmeta)
}
if len(visitedSplitNo) != int(smallmeta.KV().GGUFSplitInfo().Count) {
return nil, errors.New("mismatch split gguf count")
}
slices.Sort(visitedSplitNo)
for i := 0; i < len(visitedSplitNo)-1; i++ {
if visitedSplitNo[i] != visitedSplitNo[i+1]-1 {
return nil, errors.New("repeated or skipped split found")
}
}
meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...))
} else {
meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath})
}
once.Do(func() {
slog.Info(
@ -139,7 +177,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_tensors", len(meta.Tensors.Items()),
"num_key_values", len(meta.KV()),
)
})
@ -227,7 +265,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
// outputs are assigned iff allowed by splits and configured number of gpu layers
output := assignLayer(blocks)
maxTensors := len(meta.Tensors().Items())
maxTensors := len(meta.Tensors.Items())
maxTensors += 1
// each layer has at most 2 extra tensors for rope operations
maxTensors += blocks * 2
@ -303,11 +341,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return false
}
for _, t := range meta.Tensors().Items() {
for _, t := range meta.Tensors.Items() {
switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts, -1)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
}
case contains(t.Name, "cls", "output", "output_norm",
@ -378,7 +416,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
}
maxGraphNodes := max(1024, len(meta.Tensors().Items())*8)
maxGraphNodes := max(1024, len(meta.Tensors.Items())*8)
sched := C.ggml_backend_sched_new_ext(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
@ -423,7 +461,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention,
meta: meta,
meta: &meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: sched,
@ -494,11 +532,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
totalBytes := b.meta.TotalTensorBytes()
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range b.meta.Tensors().Items() {
for i := range b.meta.Tensors {
t := b.meta.Tensors[i]
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
for i := range tts {
@ -517,13 +556,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(b.modelPath)
file, err := os.Open(t.ModelPath)
if err != nil {
slog.Warn("file open error", "file", b.modelPath, "error", err)
slog.Warn("file open error", "file", t.ModelPath, "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
if t.Kind == 4 && tts[0]._type == 39 {
// source is mxfp4, target is ggml mxfp4

View File

@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context {
tb.Fatal(err)
}
b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true})
if err != nil {
tb.Fatal(err)
}

View File

@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(modelPath string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, params)
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, extraModelPaths, params)
if err != nil {
return nil, err
}

View File

@ -4,7 +4,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"log/slog"
@ -19,6 +18,7 @@ import (
"time"
"unicode/utf8"
"github.com/spf13/pflag"
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
@ -257,6 +257,8 @@ type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
extraModelPaths []string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
@ -829,6 +831,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
func (s *Server) loadModel(
params llama.ModelParams,
mpath string,
empath []string,
lpath []string,
ppath string,
kvSize int,
@ -838,7 +841,7 @@ func (s *Server) loadModel(
multiUserCache bool,
) {
var err error
s.model, err = llama.LoadModelFromFile(mpath, params)
s.model, err = llama.LoadModelFromFile(mpath, empath, params)
if err != nil {
panic(err)
}
@ -931,7 +934,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
}
s.status = llm.ServerStatusLoadingModel
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
case llm.LoadOperationClose:
// No-op for us
@ -949,13 +952,14 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
// pflag does not expose out(); fall back to os.Stderr, which behaves identically since we never set fs.output
fmt.Fprintf(os.Stderr, "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
@ -967,8 +971,9 @@ func Execute(args []string) error {
llama.BackendInit()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
modelPath: (*mpath)[0],
extraModelPaths: (*mpath)[1:],
status: llm.ServerStatusLaunched,
}
server.ready.Add(1)

View File

@ -5,7 +5,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/maphash"
"image"
@ -23,6 +22,7 @@ import (
"time"
"unicode/utf8"
"github.com/spf13/pflag"
"golang.org/x/image/bmp"
"golang.org/x/sync/semaphore"
@ -331,6 +331,8 @@ type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
extraModelPaths []string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
@ -1169,6 +1171,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
// based on the given parameters
func (s *Server) allocModel(
mpath string,
empath []string,
params ml.BackendParams,
loraPath []string,
parallel int,
@ -1193,7 +1196,7 @@ func (s *Server) allocModel(
}()
var err error
s.model, err = model.New(mpath, params)
s.model, err = model.New(mpath, empath, params)
if err != nil {
return err
}
@ -1302,7 +1305,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.batchSize = req.BatchSize
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
if err != nil {
s.closeModel()
@ -1372,7 +1375,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
return
}
m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
if err != nil {
http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
return
@ -1389,13 +1392,14 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
// pflag does not expose out(); fall back to os.Stderr, which behaves identically since we never set fs.output
fmt.Fprintf(os.Stderr, "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
@ -1408,8 +1412,9 @@ func Execute(args []string) error {
defer cancel()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
modelPath: (*mpath)[0],
extraModelPaths: (*mpath)[1:],
status: llm.ServerStatusLaunched,
}
server.cond = sync.NewCond(&server.mu)

View File

@ -39,8 +39,97 @@ var (
errUnknownType = errors.New("unknown type")
errNeitherFromOrFiles = errors.New("neither 'from' or 'files' was specified")
errFilePath = errors.New("file path must be relative")
errIncompleteShardedGGUF = errors.New("missing some GGUF splits")
errExtraShardedGGUF = errors.New("extra GGUF splits found")
)
func broadcastKV(main *ggml.GGML, subs ...*ggml.GGML) {
// broadcastKV copies the merged KV values to the other shards; used only for manifest purposes
ggmls := []ggml.GGML{*main}
for i := range subs {
ggmls = append(ggmls, *subs[i])
}
metaggml := ggml.MakeMetaGGML(ggmls, make([]string, len(ggmls)))
mainKV := main.KV()
mainKV["general.parameter_count"] = metaggml.KV().ParameterCount()
for i := range subs {
subKV := subs[i].KV()
for k, v := range metaggml.KV() {
subKV[k] = v
}
}
}
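// baseLayerSortNCheckSan sorts base layers so that GGUF splits come first in
// split-number order (layers without GGML sort last) and verifies that the
// splits form exactly one complete, gap-free sequence.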
func baseLayerSortNCheckSan(baseLayers *[]*layerGGML) error {
slices.SortStableFunc(*baseLayers, func(a, b *layerGGML) int {
var aScore, bScore int
if a.GGML == nil {
// chat template and parameter can be added here. use very big number to move them at last
aScore = 0x7fffffff
} else {
aSplit := a.GGML.KV().GGUFSplitInfo()
if aSplit == nil {
aScore = -1
} else {
aScore = int(aSplit.No)
}
}
if b.GGML == nil {
bScore = 0x7fffffff
} else {
bSplit := b.GGML.KV().GGUFSplitInfo()
if bSplit == nil {
bScore = -1
} else {
bScore = int(bSplit.No)
}
}
return cmp.Compare(aScore, bScore)
})
// sanity check for layers
{
ggmlPtrs := make([]*ggml.GGML, 0, len(*baseLayers))
firstSplitCount := -1
foundSplitNos := make([]uint16, 0)
for i, layer := range *baseLayers {
if i == 0 {
if layer.GGML == nil {
// First item should be GGUF after sorting
return errNoFilesProvided
}
}
if layer.GGML != nil && layer.GGML.KV().GGUFSplitInfo() != nil {
if firstSplitCount == -1 {
if layer.GGML.KV().GGUFSplitInfo().No != 0 {
return errIncompleteShardedGGUF
}
firstSplitCount = int(layer.GGML.KV().GGUFSplitInfo().Count)
foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No)
} else if firstSplitCount != int(layer.KV().GGUFSplitInfo().Count) {
return errExtraShardedGGUF
} else {
if foundSplitNos[len(foundSplitNos)-1] == layer.KV().GGUFSplitInfo().No {
return errExtraShardedGGUF
} else if foundSplitNos[len(foundSplitNos)-1] != layer.KV().GGUFSplitInfo().No-1 {
return errIncompleteShardedGGUF
} else {
foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No)
}
}
// only gguf splits should be included
ggmlPtrs = append(ggmlPtrs, layer.GGML)
}
}
if firstSplitCount != -1 && len(foundSplitNos) != firstSplitCount {
return errIncompleteShardedGGUF
}
if len(ggmlPtrs) > 1 {
broadcastKV(ggmlPtrs[0], ggmlPtrs[1:]...)
}
}
return nil
}
func (s *Server) CreateHandler(c *gin.Context) {
config := &model.ConfigV2{
OS: "linux",
@ -161,6 +250,14 @@ func (s *Server) CreateHandler(c *gin.Context) {
ch <- gin.H{"error": errNeitherFromOrFiles.Error(), "status": http.StatusBadRequest}
return
}
// Sort baseLayers so that the shards of a split model are stored in the correct order
if !remote {
err := baseLayerSortNCheckSan(&baseLayers)
if err != nil {
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
return
}
}
var adapterLayers []*layerGGML
if !remote && r.Adapters != nil {

View File

@ -53,18 +53,19 @@ type registryOptions struct {
}
type Model struct {
Name string `json:"name"`
Config model.ConfigV2
ShortName string
ModelPath string
ParentModel string
AdapterPaths []string
ProjectorPaths []string
System string
License []string
Digest string
Options map[string]any
Messages []api.Message
Name string `json:"name"`
Config model.ConfigV2
ShortName string
ModelPath string
ExtraModelPaths []string
ParentModel string
AdapterPaths []string
ProjectorPaths []string
System string
License []string
Digest string
Options map[string]any
Messages []api.Message
Template *template.Template
}
@ -190,6 +191,13 @@ func (m *Model) String() string {
Args: m.ModelPath,
})
for _, extraModels := range m.ExtraModelPaths {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "model",
Args: extraModels,
})
}
for _, adapter := range m.AdapterPaths {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "adapter",
@ -319,6 +327,8 @@ func GetModel(name string) (*Model, error) {
}
}
readMainModelFlag := false
for _, layer := range manifest.Layers {
filename, err := GetBlobsPath(layer.Digest)
if err != nil {
@ -327,8 +337,13 @@ func GetModel(name string) (*Model, error) {
switch layer.MediaType {
case "application/vnd.ollama.image.model":
model.ModelPath = filename
model.ParentModel = layer.From
if !readMainModelFlag {
model.ModelPath = filename
model.ParentModel = layer.From
readMainModelFlag = true
} else {
model.ExtraModelPaths = append(model.ExtraModelPaths, filename)
}
case "application/vnd.ollama.image.embed":
// Deprecated in versions > 0.1.2
// TODO: remove this warning in a future version

View File

@ -1201,14 +1201,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
return resp, nil
}
func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) {
maxArraySize := 0
if verbose {
maxArraySize = -1
}
data, err := llm.LoadModel(digest, maxArraySize)
data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize, true)
if err != nil {
return nil, ggml.Tensors{}, err
return nil, make(ggml.ForeignTensors, 0), err
}
kv := data.KV()
@ -1221,7 +1221,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
}
}
return kv, data.Tensors(), nil
return kv, data.Tensors, nil
}
func (s *Server) ListHandler(c *gin.Context) {

View File

@ -954,3 +954,236 @@ func TestDetectModelTypeFromFiles(t *testing.T) {
}
})
}
func TestShardedGGUF(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
_, fullDigest := createBinFile(t, ggml.KV{}, []*ggml.Tensor{})
_, splitDigest1 := createBinFile(t, ggml.KV{
"split.no": uint16(0),
"split.count": uint16(3),
}, []*ggml.Tensor{})
_, splitDigest2 := createBinFile(t, ggml.KV{
"split.no": uint16(1),
"split.count": uint16(3),
}, []*ggml.Tensor{})
_, splitDigest3 := createBinFile(t, ggml.KV{
"split.no": uint16(2),
"split.count": uint16(3),
}, []*ggml.Tensor{})
_, splitDigest4 := createBinFile(t, ggml.KV{
"split.no": uint16(0),
"split.count": uint16(4),
}, []*ggml.Tensor{})
_, splitDigest5 := createBinFile(t, ggml.KV{
"general.architecture": "test1",
"split.no": uint16(1),
"split.count": uint16(3),
}, []*ggml.Tensor{})
var s Server
t.Run("single full gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-single-full",
Files: map[string]string{"test.gguf": fullDigest},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
manifest, err := ParseNamedManifest(model.ParseName("test-single-full"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i != 0 {
t.Fatalf("expect 1 layer, actually found layer with index %d", i)
} else if layer.Digest != fullDigest {
t.Fatalf("expect digest %s, actual %s", fullDigest, layer.Digest)
}
}
})
t.Run("complete split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-complete-split",
Files: map[string]string{
"test-00001-of-00003.gguf": splitDigest1,
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
correctOrder := []string{
splitDigest1, splitDigest2, splitDigest3,
}
manifest, err := ParseNamedManifest(model.ParseName("test-complete-split"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i >= 3 {
t.Fatalf("expect 3 layers, actually found layer with index %d", i)
} else if layer.Digest != correctOrder[i] {
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
}
}
})
t.Run("complete split misordered gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-complete-split-misorder",
Files: map[string]string{
"test-00003-of-00003.gguf": splitDigest3,
"test-00001-of-00003.gguf": splitDigest1,
"test-00002-of-00003.gguf": splitDigest2,
},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
correctOrder := []string{
splitDigest1, splitDigest2, splitDigest3,
}
manifest, err := ParseNamedManifest(model.ParseName("test-complete-split-misorder"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i >= 3 {
t.Fatalf("expect 3 layers, actually found layer with index %d", i)
} else if layer.Digest != correctOrder[i] {
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
}
}
})
t.Run("mixed full and split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-full-split-mixing",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
"test1.gguf": fullDigest,
"test-00001-of-00003.gguf": splitDigest1,
},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
correctOrder := []string{
fullDigest, splitDigest1, splitDigest2, splitDigest3,
}
manifest, err := ParseNamedManifest(model.ParseName("test-full-split-mixing"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i >= 4 {
t.Fatalf("expect 4 layers, actually found layer with index %d", i)
} else if layer.Digest != correctOrder[i] {
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
}
}
})
t.Run("mixed wrong split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
"test-00001-of-00003.gguf": splitDigest1,
"test1-00001-of-00004.gguf": splitDigest4,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("mixed same count wrong split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
"test-00001-of-00003.gguf": splitDigest1,
"test1-00002-of-00003.gguf": splitDigest5,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("missing head split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("missing mid split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00001-of-00003.gguf": splitDigest1,
"test-00003-of-00003.gguf": splitDigest3,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("missing tail split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00001-of-00003.gguf": splitDigest1,
"test-00002-of-00003.gguf": splitDigest2,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
}

View File

@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

View File

@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,

View File

@ -71,8 +71,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return
}
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil
}
}
@ -182,7 +182,7 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -898,7 +898,7 @@ func TestGenerate(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -1382,7 +1382,7 @@ func TestGenerateLogprobs(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{llama: mock}
return false
},
@ -1562,7 +1562,7 @@ func TestChatLogprobs(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{llama: mock}
return false
},
@ -1672,7 +1672,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false

View File

@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}

View File

@ -49,8 +49,8 @@ type Scheduler struct {
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getSystemInfoFn func() ml.SystemInfo
waitForRecovery time.Duration
@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
// Load model for fitting
logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024, false)
if err != nil {
pending.errCh <- err
break
@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1)
// Embedding models should always be loaded with parallel=1
@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
if llama == nil {
var err error
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to

View File

@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) {
defer done()
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
var f *ggml.GGML // value not used in tests
var f *ggml.MetaGGML // value not used in tests
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second},
}
// Fail to load model first
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah")
}
gpus := []ml.DeviceInfo{}
@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
@ -103,10 +103,10 @@ type reqBundle struct {
ctxDone func()
srv *mockLlm
req *LlmRequest
f *ggml.GGML
f *ggml.MetaGGML
}
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}
@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
})
model := &Model{Name: modelName, ModelPath: p}
f, err := llm.LoadModel(model.ModelPath, 0)
f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0, true)
if err != nil {
t.Fatal(err)
}
@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
}
var f *ggml.GGML
var f *ggml.MetaGGML
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}