Merge 328675151b into 6c3faafed2
This commit is contained in: commit 9e70f0fcfd
@ -441,6 +441,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
|
|||
cmd, port, err := llm.StartRunner(
|
||||
true, // ollama engine
|
||||
"", // no model
|
||||
make([]string, 0),
|
||||
ollamaLibDirs,
|
||||
out,
|
||||
extraEnvs,
|
||||
|
|
|
|||
|
|
@ -1176,7 +1176,7 @@ Create a model from:
|
|||
|
||||
- another model;
|
||||
- a safetensors directory; or
|
||||
- a GGUF file.
|
||||
- a GGUF file or directory.
|
||||
|
||||
If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
|
||||
|
||||
|
|
@ -1270,6 +1270,7 @@ A stream of JSON objects is returned:
|
|||
#### Create a model from GGUF
|
||||
|
||||
Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
|
||||
For a model stored in multiple split GGUF files, include all of the split GGUF files in the `files` parameter, each with its file name and SHA256 digest. It is recommended to provide the files in split-number order, even though Ollama will sort them itself.
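To make the split workflow concrete, here is a minimal Go sketch (not an official client, and not part of this change) that pushes each split file as a blob and then calls the create endpoint. The split file names, the model name, and the server address are illustrative assumptions; the `files` shape (file name mapped to a `sha256:` digest) and the blob-push endpoint follow the description above.

```go
package main

import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

// pushBlob uploads one local GGUF split to the server's blob store and returns
// its "sha256:<hex>" digest, which the create request then refers to.
func pushBlob(server, path string) (string, error) {
	data, err := os.ReadFile(path) // fine for a sketch; stream for very large files
	if err != nil {
		return "", err
	}
	sum := sha256.Sum256(data)
	digest := "sha256:" + hex.EncodeToString(sum[:])
	resp, err := http.Post(server+"/api/blobs/"+digest, "application/octet-stream", bytes.NewReader(data))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("blob push failed: %s", resp.Status)
	}
	return digest, nil
}

func main() {
	server := "http://localhost:11434" // assumed local Ollama address

	// Hypothetical split GGUF files, listed in split-number order.
	splits := []string{"model-00001-of-00002.gguf", "model-00002-of-00002.gguf"}

	files := map[string]string{}
	for _, name := range splits {
		digest, err := pushBlob(server, name)
		if err != nil {
			panic(err)
		}
		files[name] = digest
	}

	// "my-split-model" is an illustrative model name.
	body, _ := json.Marshal(map[string]any{"model": "my-split-model", "files": files})
	resp, err := http.Post(server+"/api/create", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("create status:", resp.Status)
}
```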
|
||||
|
||||
##### Request
|
||||
|
||||
|
|
|
|||
|
|
@ -88,6 +88,10 @@ To import a GGUF model, create a `Modelfile` containing:
|
|||
```dockerfile
|
||||
FROM /path/to/file.gguf
|
||||
```
|
||||
Or:
|
||||
```dockerfile
|
||||
FROM /path/to/gguf/split/directory
|
||||
```
|
||||
|
||||
For a GGUF adapter, create the `Modelfile` with:
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ A Modelfile is the blueprint to create and share customized models using Ollama.
|
|||
- [FROM (Required)](#from-required)
|
||||
- [Build from existing model](#build-from-existing-model)
|
||||
- [Build from a Safetensors model](#build-from-a-safetensors-model)
|
||||
- [Build from a GGUF file](#build-from-a-gguf-file)
|
||||
- [Build from a GGUF model](#build-from-a-gguf-model)
|
||||
- [PARAMETER](#parameter)
|
||||
- [Valid Parameters and Values](#valid-parameters-and-values)
|
||||
- [TEMPLATE](#template)
|
||||
|
|
@ -130,7 +130,7 @@ Currently supported model architectures:
|
|||
- Gemma (including Gemma 1 and Gemma 2)
|
||||
- Phi3
|
||||
|
||||
#### Build from a GGUF file
|
||||
#### Build from a GGUF model
|
||||
|
||||
```
|
||||
FROM ./ollama-model.gguf
|
||||
|
|
@ -138,6 +138,14 @@ FROM ./ollama-model.gguf
|
|||
|
||||
The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
|
||||
|
||||
For a GGUF model split into multiple files:
|
||||
|
||||
```
|
||||
FROM <model directory>
|
||||
```
|
||||
|
||||
The model directory should contain only the split GGUF weights of a single model.
|
||||
|
||||
### PARAMETER
|
||||
|
||||
The `PARAMETER` instruction defines a parameter that can be set when the model is run.
|
||||
|
|
|
|||
183
fs/ggml/ggml.go
183
fs/ggml/ggml.go
|
|
@ -7,6 +7,7 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"math"
|
||||
"slices"
|
||||
"strings"
|
||||
|
|
@ -27,6 +28,18 @@ type model interface {
|
|||
Tensors() Tensors
|
||||
}
|
||||
|
||||
type MetaGGML struct {
|
||||
Shards []GGML
|
||||
ShardPaths []string
|
||||
Tensors ForeignTensors
|
||||
kv KV
|
||||
}
|
||||
|
||||
type GGUFSplitInfo struct {
|
||||
No uint16
|
||||
Count uint16
|
||||
}
|
||||
|
||||
type KV map[string]any
|
||||
|
||||
func (kv KV) Architecture() string {
|
||||
|
|
@ -50,6 +63,18 @@ func (kv KV) FileType() FileType {
|
|||
return FileTypeUnknown
|
||||
}
|
||||
|
||||
func (kv KV) GGUFSplitInfo() *GGUFSplitInfo {
|
||||
no, found := keyValue(kv, "split.no", uint16(0))
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
count, _ := keyValue(kv, "split.count", uint16(0))
|
||||
return &GGUFSplitInfo{
|
||||
No: no,
|
||||
Count: count,
|
||||
}
|
||||
}
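// Illustrative sketch (not part of this change): how the split metadata above can be
// validated once every shard has been decoded, mirroring the checks performed later in
// llm.LoadModel — collect the observed split numbers and require a complete, consecutive
// set starting at zero. The helper name and error texts are assumptions; it relies only
// on fmt and slices, which this file already imports.
func validateSplitSet(kvs []KV) error {
	nos := make([]uint16, 0, len(kvs))
	var count uint16
	for _, kv := range kvs {
		si := kv.GGUFSplitInfo()
		if si == nil {
			return fmt.Errorf("non-split gguf mixed into a split set")
		}
		nos = append(nos, si.No)
		count = si.Count
	}
	if len(nos) != int(count) {
		return fmt.Errorf("expected %d splits, got %d", count, len(nos))
	}
	slices.Sort(nos)
	for i := range nos {
		if nos[i] != uint16(i) {
			return fmt.Errorf("repeated or skipped split %d", nos[i])
		}
	}
	return nil
}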
|
||||
|
||||
func (kv KV) BlockCount() uint64 {
|
||||
return uint64(kv.Uint("block_count"))
|
||||
}
|
||||
|
|
@ -271,7 +296,7 @@ type arrayValueTypes interface {
|
|||
}
|
||||
|
||||
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
|
||||
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
|
||||
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") {
|
||||
key = kv.Architecture() + "." + key
|
||||
}
|
||||
|
||||
|
|
@ -288,6 +313,14 @@ type Tensors struct {
|
|||
Offset uint64
|
||||
}
|
||||
|
||||
type ForeignTensor struct {
|
||||
*Tensor
|
||||
ModelPath string
|
||||
TensorRegionOffset uint64
|
||||
}
|
||||
|
||||
type ForeignTensors []ForeignTensor
|
||||
|
||||
func (s Tensors) Items(prefix ...string) []*Tensor {
|
||||
if len(prefix) == 0 {
|
||||
return s.items
|
||||
|
|
@ -326,6 +359,41 @@ func (ts Tensors) GroupLayers() map[string]Layer {
|
|||
return layers
|
||||
}
|
||||
|
||||
func (s ForeignTensors) Items(prefix ...string) []*Tensor {
|
||||
var items []*Tensor
|
||||
for i := range s {
|
||||
if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) {
|
||||
items = append(items, s[i].Tensor)
|
||||
}
|
||||
}
|
||||
|
||||
return items
|
||||
}
|
||||
|
||||
func (ts ForeignTensors) GroupLayers() map[string]Layer {
|
||||
layers := make(map[string]Layer)
|
||||
for i := range ts {
|
||||
t := ts[i].Tensor
|
||||
parts := strings.Split(t.Name, ".")
|
||||
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
|
||||
if len(parts) > index+2 {
|
||||
// blk and mm should have a number after them, join it
|
||||
parts = append(
|
||||
[]string{strings.Join(parts[:index+2], ".")},
|
||||
parts[index+2:]...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := layers[parts[0]]; !ok {
|
||||
layers[parts[0]] = make(Layer)
|
||||
}
|
||||
|
||||
layers[parts[0]][strings.Join(parts[1:], ".")] = t
|
||||
}
|
||||
|
||||
return layers
|
||||
}
|
||||
|
||||
type Layer map[string]*Tensor
|
||||
|
||||
func (l Layer) Size() (size uint64) {
|
||||
|
|
@ -553,7 +621,93 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
|||
}, nil
|
||||
}
|
||||
|
||||
func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) {
|
||||
if len(shards) != len(shardsPaths) {
|
||||
return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths))
|
||||
}
|
||||
li := make(ForeignTensors, 0)
|
||||
for i := range shards {
|
||||
gs := shards[i]
|
||||
tensors := gs.Tensors()
|
||||
for k := range tensors.items {
|
||||
tensor := tensors.items[k]
|
||||
li = append(li, ForeignTensor{
|
||||
Tensor: tensor,
|
||||
ModelPath: shardsPaths[i],
|
||||
TensorRegionOffset: tensors.Offset,
|
||||
})
|
||||
}
|
||||
}
|
||||
return &li, nil
|
||||
}
|
||||
|
||||
func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML {
|
||||
type wrapper struct {
|
||||
ggml GGML
|
||||
path string
|
||||
weight int
|
||||
}
|
||||
var wrappers []wrapper
|
||||
for i := range ggmls {
|
||||
iSplitInfo := ggmls[i].KV().GGUFSplitInfo()
|
||||
weight := -1
if iSplitInfo != nil {
weight = int(iSplitInfo.No)
}
|
||||
wrappers = append(wrappers, wrapper{
|
||||
ggml: ggmls[i],
|
||||
path: ggmlPaths[i],
|
||||
weight: weight,
|
||||
})
|
||||
}
|
||||
slices.SortStableFunc(wrappers, func(a, b wrapper) int {
|
||||
return cmp.Compare(a.weight, b.weight)
|
||||
})
|
||||
metaGgml := MetaGGML{}
|
||||
var param_counts uint64 = 0
|
||||
for i := range wrappers {
|
||||
param_counts += wrappers[i].ggml.KV().ParameterCount()
|
||||
if i == 0 {
|
||||
kv := maps.Clone(wrappers[i].ggml.KV())
|
||||
// remove the split bookkeeping keys from the merged KV; add more if needed.
delete(kv, "split.no")
delete(kv, "split.count")
delete(kv, "split.tensors.count")
|
||||
delete(kv, "general.parameter_count")
|
||||
metaGgml.kv = kv
|
||||
}
|
||||
metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml)
|
||||
metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path)
|
||||
}
|
||||
metaGgml.kv["general.parameter_count"] = param_counts
|
||||
ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths)
|
||||
metaGgml.Tensors = *ft
|
||||
return metaGgml
|
||||
}
|
||||
|
||||
func simpleWrapGGML(ggml GGML) MetaGGML {
|
||||
// simply wrap single GGML, without creating foreign tensors
|
||||
return MetaGGML{
|
||||
Shards: []GGML{ggml},
|
||||
ShardPaths: []string{""},
|
||||
kv: ggml.KV(),
|
||||
}
|
||||
}
|
||||
|
||||
func WrapGGML(ggml GGML) MetaGGML {
|
||||
metaggml := simpleWrapGGML(ggml)
|
||||
ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths)
|
||||
metaggml.Tensors = *ft
|
||||
return metaggml
|
||||
}
|
||||
|
||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention)
|
||||
}
|
||||
|
||||
func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
context *= uint64(numParallel)
|
||||
|
||||
embedding := f.KV().EmbeddingLength()
|
||||
|
|
@ -567,7 +721,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
|
||||
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
|
||||
|
||||
layers := f.Tensors().GroupLayers()
|
||||
layers := f.Tensors.GroupLayers()
|
||||
|
||||
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
|
||||
|
||||
|
|
@ -665,7 +819,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||
)
|
||||
|
||||
var ropeFreqsCount uint64
|
||||
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
|
||||
if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok {
|
||||
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
|
||||
ropeFreqsCount = ropeFreqsWeights.Elements()
|
||||
}
|
||||
|
|
@ -805,6 +959,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||
|
||||
// SupportsKVCacheType checks if the requested cache type is supported
|
||||
func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
||||
return simpleWrapGGML(f).SupportsKVCacheType(cacheType)
|
||||
}
|
||||
func (f MetaGGML) SupportsKVCacheType(cacheType string) bool {
|
||||
if cacheType == "" || cacheType == "f16" {
|
||||
return true
|
||||
}
|
||||
|
|
@ -822,6 +979,10 @@ func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
|
|||
|
||||
// SupportsFlashAttention checks if the model supports flash attention
|
||||
func (f GGML) SupportsFlashAttention() bool {
|
||||
return simpleWrapGGML(f).SupportsFlashAttention()
|
||||
}
|
||||
|
||||
func (f MetaGGML) SupportsFlashAttention() bool {
|
||||
_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
|
||||
if isEmbedding {
|
||||
return false
|
||||
|
|
@ -839,6 +1000,10 @@ func (f GGML) SupportsFlashAttention() bool {
|
|||
|
||||
// FlashAttention checks if the model should enable flash attention
|
||||
func (f GGML) FlashAttention() bool {
|
||||
return simpleWrapGGML(f).FlashAttention()
|
||||
}
|
||||
|
||||
func (f MetaGGML) FlashAttention() bool {
|
||||
return slices.Contains([]string{
|
||||
"bert",
|
||||
"gemma3",
|
||||
|
|
@ -863,3 +1028,15 @@ func kvCacheBytesPerElement(cacheType string) float64 {
|
|||
return 2 // f16 (default)
|
||||
}
|
||||
}
|
||||
|
||||
func (f MetaGGML) KV() KV {
|
||||
return f.kv
|
||||
}
|
||||
|
||||
func (f MetaGGML) TotalTensorBytes() uint64 {
|
||||
totalBytes := uint64(0)
|
||||
for i := range f.Shards {
|
||||
totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset
|
||||
}
|
||||
return totalBytes
|
||||
}
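// Illustrative sketch (not part of this change): reading one tensor's raw bytes from
// whichever split file it lives in, using the per-tensor ModelPath and
// TensorRegionOffset fields — the same pattern the ggml backend's Load uses with
// io.NewSectionReader. The function name is an assumption, and it needs the os package,
// which this file does not currently import.
func readTensorBytes(t ForeignTensor) ([]byte, error) {
	f, err := os.Open(t.ModelPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// TensorRegionOffset marks the start of the tensor-data region in that shard;
	// t.Offset is the tensor's offset within that region.
	sr := io.NewSectionReader(f, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
	buf := make([]byte, t.Size())
	if _, err := io.ReadFull(sr, buf); err != nil {
		return nil, err
	}
	return buf, nil
}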
|
||||
|
|
|
|||
|
|
@ -582,7 +582,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
|||
if !strings.HasPrefix(k, arch+".") &&
|
||||
!strings.HasPrefix(k, "general.") &&
|
||||
!strings.HasPrefix(k, "adapter.") &&
|
||||
!strings.HasPrefix(k, "tokenizer.") {
|
||||
!strings.HasPrefix(k, "tokenizer.") &&
|
||||
!strings.HasPrefix(k, "split.") {
|
||||
k = arch + "." + k
|
||||
}
|
||||
|
||||
|
|
@ -597,6 +598,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
|||
|
||||
var err error
|
||||
switch v := v.(type) {
|
||||
case uint16:
|
||||
err = writeGGUF(ws, ggufTypeUint16, v)
|
||||
case int32:
|
||||
err = writeGGUF(ws, ggufTypeInt32, v)
|
||||
case int64:
|
||||
|
|
|
|||
2
go.mod
2
go.mod
|
|
@ -77,7 +77,7 @@ require (
|
|||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
github.com/spf13/pflag v1.0.5
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
|
||||
github.com/ugorji/go/codec v1.2.12 // indirect
|
||||
golang.org/x/arch v0.8.0 // indirect
|
||||
|
|
|
|||
|
|
@ -261,7 +261,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
|
|||
return true
|
||||
}
|
||||
|
||||
func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
|
||||
func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) {
|
||||
cparams := C.llama_model_default_params()
|
||||
cparams.n_gpu_layers = C.int(params.NumGpuLayers)
|
||||
cparams.main_gpu = C.int32_t(params.MainGpu)
|
||||
|
|
@ -305,7 +305,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
|
|||
cparams.progress_callback_user_data = unsafe.Pointer(&handle)
|
||||
}
|
||||
|
||||
m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
|
||||
var splitPaths []*C.char
|
||||
mp := C.CString(modelPath)
|
||||
defer C.free(unsafe.Pointer(mp))
|
||||
splitPaths = append(splitPaths, mp)
|
||||
for i := range extraModelPaths {
|
||||
mp := C.CString(extraModelPaths[i])
|
||||
defer C.free(unsafe.Pointer(mp))
|
||||
splitPaths = append(splitPaths, mp)
|
||||
}
|
||||
|
||||
m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)}
|
||||
if m.c == nil {
|
||||
return nil, fmt.Errorf("unable to load model: %s", modelPath)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -90,6 +90,7 @@ type llmServer struct {
|
|||
status *StatusWriter
|
||||
options api.Options
|
||||
modelPath string
|
||||
extraModelPaths []string
|
||||
|
||||
loadRequest LoadRequest // Parameters used to initialize the runner
|
||||
mem *ml.BackendMemory // Memory allocations for this model
|
||||
|
|
@ -109,7 +110,7 @@ type llmServer struct {
|
|||
type llamaServer struct {
|
||||
llmServer
|
||||
|
||||
ggml *ggml.GGML
|
||||
ggml *ggml.MetaGGML
|
||||
}
|
||||
|
||||
type ollamaServer struct {
|
||||
|
|
@ -123,7 +124,7 @@ type ollamaServer struct {
|
|||
// It collects array values for arrays with a size less than or equal to
|
||||
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||
// the maxArraySize is negative, all arrays are collected.
|
||||
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
||||
func LoadModel(model string, extraModels []string, maxArraySize int, reliefSplitConstrain bool) (*ggml.MetaGGML, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -134,12 +135,55 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
|||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml, err := ggml.Decode(f, maxArraySize)
|
||||
return ggml, err
|
||||
ggml1, err := ggml.Decode(f, maxArraySize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if ggml1.KV().GGUFSplitInfo() != nil {
|
||||
if ggml1.KV().GGUFSplitInfo().No != 0 {
|
||||
return nil, errors.New("not the first split of model")
|
||||
}
|
||||
loadedGgml := []ggml.GGML{*ggml1}
|
||||
visitedSplitNo := []uint16{ggml1.KV().GGUFSplitInfo().No}
|
||||
for i := range extraModels {
|
||||
extraModel := extraModels[i]
|
||||
f, err := os.Open(extraModel)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml1, err := ggml.Decode(f, maxArraySize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if ggml1.KV().GGUFSplitInfo() == nil {
|
||||
return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf")
|
||||
}
|
||||
visitedSplitNo = append(visitedSplitNo, ggml1.KV().GGUFSplitInfo().No)
|
||||
loadedGgml = append(loadedGgml, *ggml1)
|
||||
}
|
||||
if !reliefSplitConstrain {
|
||||
if len(visitedSplitNo) != int(ggml1.KV().GGUFSplitInfo().Count) {
|
||||
return nil, errors.New("mismatch split gguf count")
|
||||
}
|
||||
slices.Sort(visitedSplitNo)
|
||||
for i := 0; i < len(visitedSplitNo)-1; i++ {
|
||||
if visitedSplitNo[i] != visitedSplitNo[i+1]-1 {
|
||||
return nil, errors.New("repeated or skipped split found")
|
||||
}
|
||||
}
|
||||
}
|
||||
metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...))
|
||||
return &metaggml, nil
|
||||
} else {
|
||||
metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model})
|
||||
return &metaggml, nil
|
||||
}
|
||||
}
|
||||
|
||||
// NewLlamaServer will run a server for the given GPUs
|
||||
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||
var llamaModel *llama.Model
|
||||
var textProcessor model.TextProcessor
|
||||
var err error
|
||||
|
|
@ -155,7 +199,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
|||
}
|
||||
}
|
||||
if textProcessor == nil {
|
||||
llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
|
||||
llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -262,6 +306,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
|||
cmd, port, err := StartRunner(
|
||||
textProcessor != nil,
|
||||
modelPath,
|
||||
extraModelPaths,
|
||||
gpuLibs,
|
||||
status,
|
||||
ml.GetVisibleDevicesEnv(gpus, false),
|
||||
|
|
@ -273,6 +318,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
|||
status: status,
|
||||
options: opts,
|
||||
modelPath: modelPath,
|
||||
extraModelPaths: extraModelPaths,
|
||||
loadRequest: loadRequest,
|
||||
llamaModel: llamaModel,
|
||||
llamaModelLock: &sync.Mutex{},
|
||||
|
|
@ -316,7 +362,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
|||
}
|
||||
}
|
||||
|
||||
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
|
||||
func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
|
||||
var exe string
|
||||
exe, err = os.Executable()
|
||||
if err != nil {
|
||||
|
|
@ -346,6 +392,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
|
|||
if modelPath != "" {
|
||||
params = append(params, "--model", modelPath)
|
||||
}
|
||||
for i := range extraModelPaths {
|
||||
params = append(params, "--model", extraModelPaths[i])
|
||||
}
|
||||
params = append(params, "--port", strconv.Itoa(port))
|
||||
|
||||
var pathEnv string
|
||||
|
|
@ -440,6 +489,10 @@ func (s *llmServer) ModelPath() string {
|
|||
return s.modelPath
|
||||
}
|
||||
|
||||
func (s *llmServer) ExtraModelPaths() []string {
|
||||
return s.extraModelPaths
|
||||
}
|
||||
|
||||
type LoadOperation int
|
||||
|
||||
// The order of these constants are significant because we iterate over the operations. They
|
||||
|
|
@ -522,7 +575,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
|
|||
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
|
||||
|
||||
// Use the size of one layer as a buffer
|
||||
layers := s.ggml.Tensors().GroupLayers()
|
||||
layers := s.ggml.Tensors.GroupLayers()
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
buffer := blk0.Size() + kv[0]
|
||||
for i := range gpus {
|
||||
|
|
|
|||
|
|
@ -73,9 +73,9 @@ type BackendParams struct {
|
|||
FlashAttention FlashAttentionType
|
||||
}
|
||||
|
||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
||||
var backends = make(map[string]func(string, []string, BackendParams) (Backend, error))
|
||||
|
||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
||||
func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) {
|
||||
if _, ok := backends[name]; ok {
|
||||
panic("backend: backend already registered")
|
||||
}
|
||||
|
|
@ -83,9 +83,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
|
|||
backends[name] = f
|
||||
}
|
||||
|
||||
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
||||
func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) {
|
||||
if backend, ok := backends["ggml"]; ok {
|
||||
return backend(modelPath, params)
|
||||
return backend(modelPath, extraModelPaths, params)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported backend")
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ type Backend struct {
|
|||
// modelPath is the location of the model data
|
||||
modelPath string
|
||||
|
||||
meta *fsggml.GGML
|
||||
meta *fsggml.MetaGGML
|
||||
|
||||
// allocMemory means that memory should be allocated for tensors and not
|
||||
// just a dry run
|
||||
|
|
@ -120,17 +120,55 @@ type Backend struct {
|
|||
|
||||
var once sync.Once
|
||||
|
||||
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) {
|
||||
r, err := os.Open(modelPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
meta, err := fsggml.Decode(r, -1)
|
||||
smallmeta, err := fsggml.Decode(r, -1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var meta fsggml.MetaGGML
|
||||
if smallmeta.KV().GGUFSplitInfo() != nil {
|
||||
if smallmeta.KV().GGUFSplitInfo().No != 0 {
|
||||
return nil, errors.New("not the first split of model")
|
||||
}
|
||||
loadedGgml := []fsggml.GGML{*smallmeta}
|
||||
visitedSplitNo := []uint16{smallmeta.KV().GGUFSplitInfo().No}
|
||||
for i := range extraModelPaths {
|
||||
extraModel := extraModelPaths[i]
|
||||
f, err := os.Open(extraModel)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
smallmeta, err := fsggml.Decode(f, -1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if smallmeta.KV().GGUFSplitInfo() == nil {
|
||||
return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf")
|
||||
}
|
||||
visitedSplitNo = append(visitedSplitNo, smallmeta.KV().GGUFSplitInfo().No)
|
||||
loadedGgml = append(loadedGgml, *smallmeta)
|
||||
}
|
||||
if len(visitedSplitNo) != int(smallmeta.KV().GGUFSplitInfo().Count) {
|
||||
return nil, errors.New("mismatch split gguf count")
|
||||
}
|
||||
slices.Sort(visitedSplitNo)
|
||||
for i := 0; i < len(visitedSplitNo)-1; i++ {
|
||||
if visitedSplitNo[i] != visitedSplitNo[i+1]-1 {
|
||||
return nil, errors.New("repeated or skipped split found")
|
||||
}
|
||||
}
|
||||
meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...))
|
||||
} else {
|
||||
meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath})
|
||||
}
|
||||
|
||||
once.Do(func() {
|
||||
slog.Info(
|
||||
|
|
@ -139,7 +177,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||
"file_type", meta.KV().FileType(),
|
||||
"name", meta.KV().String("general.name"),
|
||||
"description", meta.KV().String("general.description"),
|
||||
"num_tensors", len(meta.Tensors().Items()),
|
||||
"num_tensors", len(meta.Tensors.Items()),
|
||||
"num_key_values", len(meta.KV()),
|
||||
)
|
||||
})
|
||||
|
|
@ -227,7 +265,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||
// outputs are assigned iff allowed by splits and configured number of gpu layers
|
||||
output := assignLayer(blocks)
|
||||
|
||||
maxTensors := len(meta.Tensors().Items())
|
||||
maxTensors := len(meta.Tensors.Items())
|
||||
maxTensors += 1
|
||||
// each layer has at most 2 extra tensors for rope operations
|
||||
maxTensors += blocks * 2
|
||||
|
|
@ -303,11 +341,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||
return false
|
||||
}
|
||||
|
||||
for _, t := range meta.Tensors().Items() {
|
||||
for _, t := range meta.Tensors.Items() {
|
||||
switch {
|
||||
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
|
||||
createTensor(tensor{source: t}, input.bts, -1)
|
||||
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
||||
if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
||||
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
|
||||
}
|
||||
case contains(t.Name, "cls", "output", "output_norm",
|
||||
|
|
@ -378,7 +416,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||
}
|
||||
}
|
||||
|
||||
maxGraphNodes := max(1024, len(meta.Tensors().Items())*8)
|
||||
maxGraphNodes := max(1024, len(meta.Tensors.Items())*8)
|
||||
|
||||
sched := C.ggml_backend_sched_new_ext(
|
||||
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
|
||||
|
|
@ -423,7 +461,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
|||
modelPath: modelPath,
|
||||
allocMemory: params.AllocMemory,
|
||||
flashAttention: params.FlashAttention,
|
||||
meta: meta,
|
||||
meta: &meta,
|
||||
tensorLoadTargets: targets,
|
||||
tensors: tensors,
|
||||
sched: sched,
|
||||
|
|
@ -494,11 +532,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
|
|||
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
|
||||
|
||||
var doneBytes atomic.Uint64
|
||||
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
|
||||
totalBytes := b.meta.TotalTensorBytes()
|
||||
|
||||
g, ctx := errgroup.WithContext(ctx)
|
||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
||||
for _, t := range b.meta.Tensors().Items() {
|
||||
for i := range b.meta.Tensors {
|
||||
t := b.meta.Tensors[i]
|
||||
g.Go(func() error {
|
||||
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
|
||||
for i := range tts {
|
||||
|
|
@ -517,13 +556,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
|
|||
|
||||
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
|
||||
// seeking around within an FD shared between all goroutines.
|
||||
file, err := os.Open(b.modelPath)
|
||||
file, err := os.Open(t.ModelPath)
|
||||
if err != nil {
|
||||
slog.Warn("file open error", "file", b.modelPath, "error", err)
|
||||
slog.Warn("file open error", "file", t.ModelPath, "error", err)
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
|
||||
sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
|
||||
|
||||
if t.Kind == 4 && tts[0]._type == 39 {
|
||||
// source is mxfp4, target is ggml mxfp4
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context {
|
|||
tb.Fatal(err)
|
||||
}
|
||||
|
||||
b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
|
||||
b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true})
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
|
|||
}
|
||||
|
||||
// New initializes a new model instance with the provided configuration based on the metadata in the model file
|
||||
func New(modelPath string, params ml.BackendParams) (Model, error) {
|
||||
b, err := ml.NewBackend(modelPath, params)
|
||||
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) {
|
||||
b, err := ml.NewBackend(modelPath, extraModelPaths, params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"log/slog"
|
||||
|
|
@ -19,6 +18,7 @@ import (
|
|||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
|
|
@ -257,6 +257,8 @@ type Server struct {
|
|||
// modelPath is the location of the model to be loaded
|
||||
modelPath string
|
||||
|
||||
extraModelPaths []string
|
||||
|
||||
// loadMu prevents more than one load attempt from occurring at a time
|
||||
loadMu sync.Mutex
|
||||
|
||||
|
|
@ -829,6 +831,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
|
|||
func (s *Server) loadModel(
|
||||
params llama.ModelParams,
|
||||
mpath string,
|
||||
empath []string,
|
||||
lpath []string,
|
||||
ppath string,
|
||||
kvSize int,
|
||||
|
|
@ -838,7 +841,7 @@ func (s *Server) loadModel(
|
|||
multiUserCache bool,
|
||||
) {
|
||||
var err error
|
||||
s.model, err = llama.LoadModelFromFile(mpath, params)
|
||||
s.model, err = llama.LoadModelFromFile(mpath, empath, params)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
|
@ -931,7 +934,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
|
|||
}
|
||||
|
||||
s.status = llm.ServerStatusLoadingModel
|
||||
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
|
||||
go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
|
||||
|
||||
case llm.LoadOperationClose:
|
||||
// No-op for us
|
||||
|
|
@ -949,13 +952,14 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
|
|||
}
|
||||
|
||||
func Execute(args []string) error {
|
||||
fs := flag.NewFlagSet("runner", flag.ExitOnError)
|
||||
mpath := fs.String("model", "", "Path to model binary file")
|
||||
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
|
||||
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.")
|
||||
port := fs.Int("port", 8080, "Port to expose the server on")
|
||||
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
|
||||
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(fs.Output(), "Runner usage\n")
|
||||
// sadly pflag does not expose out(); fall back to os.Stderr, which behaves identically since we never set fs.output
|
||||
fmt.Fprintf(os.Stderr, "Runner usage\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
|
|
@ -967,7 +971,8 @@ func Execute(args []string) error {
|
|||
llama.BackendInit()
|
||||
|
||||
server := &Server{
|
||||
modelPath: *mpath,
|
||||
modelPath: (*mpath)[0],
|
||||
extraModelPaths: (*mpath)[1:],
|
||||
status: llm.ServerStatusLaunched,
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"hash/maphash"
|
||||
"image"
|
||||
|
|
@ -23,6 +22,7 @@ import (
|
|||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"golang.org/x/image/bmp"
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
|
|
@ -331,6 +331,8 @@ type Server struct {
|
|||
// modelPath is the location of the model to be loaded
|
||||
modelPath string
|
||||
|
||||
extraModelPaths []string
|
||||
|
||||
// loadMu prevents more than one load attempt from occurring at a time
|
||||
loadMu sync.Mutex
|
||||
|
||||
|
|
@ -1169,6 +1171,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
|
|||
// based on the given parameters
|
||||
func (s *Server) allocModel(
|
||||
mpath string,
|
||||
empath []string,
|
||||
params ml.BackendParams,
|
||||
loraPath []string,
|
||||
parallel int,
|
||||
|
|
@ -1193,7 +1196,7 @@ func (s *Server) allocModel(
|
|||
}()
|
||||
|
||||
var err error
|
||||
s.model, err = model.New(mpath, params)
|
||||
s.model, err = model.New(mpath, empath, params)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
@ -1302,7 +1305,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
|
|||
|
||||
s.batchSize = req.BatchSize
|
||||
|
||||
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
|
||||
err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
|
||||
if err != nil {
|
||||
s.closeModel()
|
||||
|
||||
|
|
@ -1372,7 +1375,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
|
|||
return
|
||||
}
|
||||
|
||||
m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
|
||||
m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
|
||||
if err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
|
|
@ -1389,13 +1392,14 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
|
|||
}
|
||||
|
||||
func Execute(args []string) error {
|
||||
fs := flag.NewFlagSet("runner", flag.ExitOnError)
|
||||
mpath := fs.String("model", "", "Path to model binary file")
|
||||
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
|
||||
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.")
|
||||
port := fs.Int("port", 8080, "Port to expose the server on")
|
||||
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
|
||||
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(fs.Output(), "Runner usage\n")
|
||||
// sadly pflag does not expose out(); fall back to os.Stderr, which behaves identically since we never set fs.output
|
||||
fmt.Fprintf(os.Stderr, "Runner usage\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
|
|
@ -1408,7 +1412,8 @@ func Execute(args []string) error {
|
|||
defer cancel()
|
||||
|
||||
server := &Server{
|
||||
modelPath: *mpath,
|
||||
modelPath: (*mpath)[0],
|
||||
extraModelPaths: (*mpath)[1:],
|
||||
status: llm.ServerStatusLaunched,
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -39,8 +39,97 @@ var (
|
|||
errUnknownType = errors.New("unknown type")
|
||||
errNeitherFromOrFiles = errors.New("neither 'from' or 'files' was specified")
|
||||
errFilePath = errors.New("file path must be relative")
|
||||
errIncompleteShardedGGUF = errors.New("missing some GGUF splits")
|
||||
errExtraShardedGGUF = errors.New("extra GGUF splits found")
|
||||
)
|
||||
|
||||
func broadcastKV(main *ggml.GGML, subs ...*ggml.GGML) {
|
||||
// broadcast merged KV values to the other shards; only needed for the manifest
|
||||
ggmls := []ggml.GGML{*main}
|
||||
for i := range subs {
|
||||
ggmls = append(ggmls, *subs[i])
|
||||
}
|
||||
metaggml := ggml.MakeMetaGGML(ggmls, make([]string, len(ggmls)))
|
||||
mainKV := main.KV()
|
||||
mainKV["general.parameter_count"] = metaggml.KV().ParameterCount()
|
||||
for i := range subs {
|
||||
subKV := subs[i].KV()
|
||||
for k, v := range metaggml.KV() {
|
||||
subKV[k] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func baseLayerSortNCheckSan(baseLayers *[]*layerGGML) error {
|
||||
slices.SortStableFunc(*baseLayers, func(a, b *layerGGML) int {
|
||||
var aScore, bScore int
|
||||
if a.GGML == nil {
|
||||
// chat template and parameter layers can be added here; use a very large score to sort them last
|
||||
aScore = 0x7fffffff
|
||||
} else {
|
||||
aSplit := a.GGML.KV().GGUFSplitInfo()
|
||||
if aSplit == nil {
|
||||
aScore = -1
|
||||
} else {
|
||||
aScore = int(aSplit.No)
|
||||
}
|
||||
}
|
||||
if b.GGML == nil {
|
||||
bScore = 0x7fffffff
|
||||
} else {
|
||||
bSplit := b.GGML.KV().GGUFSplitInfo()
|
||||
if bSplit == nil {
|
||||
bScore = -1
|
||||
} else {
|
||||
bScore = int(bSplit.No)
|
||||
}
|
||||
}
|
||||
return cmp.Compare(aScore, bScore)
|
||||
})
|
||||
// sanity check for layers
|
||||
{
|
||||
ggmlPtrs := make([]*ggml.GGML, 0, len(*baseLayers))
|
||||
firstSplitCount := -1
|
||||
foundSplitNos := make([]uint16, 0)
|
||||
for i, layer := range *baseLayers {
|
||||
if i == 0 {
|
||||
if layer.GGML == nil {
|
||||
// First item should be GGUF after sorting
|
||||
return errNoFilesProvided
|
||||
}
|
||||
}
|
||||
if layer.GGML != nil && layer.GGML.KV().GGUFSplitInfo() != nil {
|
||||
if firstSplitCount == -1 {
|
||||
if layer.GGML.KV().GGUFSplitInfo().No != 0 {
|
||||
return errIncompleteShardedGGUF
|
||||
}
|
||||
firstSplitCount = int(layer.GGML.KV().GGUFSplitInfo().Count)
|
||||
foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No)
|
||||
} else if firstSplitCount != int(layer.KV().GGUFSplitInfo().Count) {
|
||||
return errExtraShardedGGUF
|
||||
} else {
|
||||
if foundSplitNos[len(foundSplitNos)-1] == layer.KV().GGUFSplitInfo().No {
|
||||
return errExtraShardedGGUF
|
||||
} else if foundSplitNos[len(foundSplitNos)-1] != layer.KV().GGUFSplitInfo().No-1 {
|
||||
return errIncompleteShardedGGUF
|
||||
} else {
|
||||
foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No)
|
||||
}
|
||||
}
|
||||
// only gguf splits should be included
|
||||
ggmlPtrs = append(ggmlPtrs, layer.GGML)
|
||||
}
|
||||
}
|
||||
if firstSplitCount != -1 && len(foundSplitNos) != firstSplitCount {
|
||||
return errIncompleteShardedGGUF
|
||||
}
|
||||
if len(ggmlPtrs) > 1 {
|
||||
broadcastKV(ggmlPtrs[0], ggmlPtrs[1:]...)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Server) CreateHandler(c *gin.Context) {
|
||||
config := &model.ConfigV2{
|
||||
OS: "linux",
|
||||
|
|
@ -161,6 +250,14 @@ func (s *Server) CreateHandler(c *gin.Context) {
|
|||
ch <- gin.H{"error": errNeitherFromOrFiles.Error(), "status": http.StatusBadRequest}
|
||||
return
|
||||
}
|
||||
// Sort baseLayers here to ensure that split models are correctly ordered
|
||||
if !remote {
|
||||
err := baseLayerSortNCheckSan(&baseLayers)
|
||||
if err != nil {
|
||||
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var adapterLayers []*layerGGML
|
||||
if !remote && r.Adapters != nil {
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ type Model struct {
|
|||
Config model.ConfigV2
|
||||
ShortName string
|
||||
ModelPath string
|
||||
ExtraModelPaths []string
|
||||
ParentModel string
|
||||
AdapterPaths []string
|
||||
ProjectorPaths []string
|
||||
|
|
@ -190,6 +191,13 @@ func (m *Model) String() string {
|
|||
Args: m.ModelPath,
|
||||
})
|
||||
|
||||
for _, extraModels := range m.ExtraModelPaths {
|
||||
modelfile.Commands = append(modelfile.Commands, parser.Command{
|
||||
Name: "model",
|
||||
Args: extraModels,
|
||||
})
|
||||
}
|
||||
|
||||
for _, adapter := range m.AdapterPaths {
|
||||
modelfile.Commands = append(modelfile.Commands, parser.Command{
|
||||
Name: "adapter",
|
||||
|
|
@ -319,6 +327,8 @@ func GetModel(name string) (*Model, error) {
|
|||
}
|
||||
}
|
||||
|
||||
readMainModelFlag := false
|
||||
|
||||
for _, layer := range manifest.Layers {
|
||||
filename, err := GetBlobsPath(layer.Digest)
|
||||
if err != nil {
|
||||
|
|
@ -327,8 +337,13 @@ func GetModel(name string) (*Model, error) {
|
|||
|
||||
switch layer.MediaType {
|
||||
case "application/vnd.ollama.image.model":
|
||||
if !readMainModelFlag {
|
||||
model.ModelPath = filename
|
||||
model.ParentModel = layer.From
|
||||
readMainModelFlag = true
|
||||
} else {
|
||||
model.ExtraModelPaths = append(model.ExtraModelPaths, filename)
|
||||
}
|
||||
case "application/vnd.ollama.image.embed":
|
||||
// Deprecated in versions > 0.1.2
|
||||
// TODO: remove this warning in a future version
|
||||
|
|
|
|||
|
|
@ -1201,14 +1201,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
|||
return resp, nil
|
||||
}
|
||||
|
||||
func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
|
||||
func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) {
|
||||
maxArraySize := 0
|
||||
if verbose {
|
||||
maxArraySize = -1
|
||||
}
|
||||
data, err := llm.LoadModel(digest, maxArraySize)
|
||||
data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize, true)
|
||||
if err != nil {
|
||||
return nil, ggml.Tensors{}, err
|
||||
return nil, make(ggml.ForeignTensors, 0), err
|
||||
}
|
||||
|
||||
kv := data.KV()
|
||||
|
|
@ -1221,7 +1221,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
|
|||
}
|
||||
}
|
||||
|
||||
return kv, data.Tensors(), nil
|
||||
return kv, data.Tensors, nil
|
||||
}
|
||||
|
||||
func (s *Server) ListHandler(c *gin.Context) {
|
||||
|
|
|
|||
|
|
@ -954,3 +954,236 @@ func TestDetectModelTypeFromFiles(t *testing.T) {
|
|||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestShardedGGUF(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
p := t.TempDir()
|
||||
t.Setenv("OLLAMA_MODELS", p)
|
||||
|
||||
_, fullDigest := createBinFile(t, ggml.KV{}, []*ggml.Tensor{})
|
||||
_, splitDigest1 := createBinFile(t, ggml.KV{
|
||||
"split.no": uint16(0),
|
||||
"split.count": uint16(3),
|
||||
}, []*ggml.Tensor{})
|
||||
_, splitDigest2 := createBinFile(t, ggml.KV{
|
||||
"split.no": uint16(1),
|
||||
"split.count": uint16(3),
|
||||
}, []*ggml.Tensor{})
|
||||
_, splitDigest3 := createBinFile(t, ggml.KV{
|
||||
"split.no": uint16(2),
|
||||
"split.count": uint16(3),
|
||||
}, []*ggml.Tensor{})
|
||||
_, splitDigest4 := createBinFile(t, ggml.KV{
|
||||
"split.no": uint16(0),
|
||||
"split.count": uint16(4),
|
||||
}, []*ggml.Tensor{})
|
||||
_, splitDigest5 := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "test1",
|
||||
"split.no": uint16(1),
|
||||
"split.count": uint16(3),
|
||||
}, []*ggml.Tensor{})
|
||||
|
||||
var s Server
|
||||
|
||||
t.Run("single full gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-single-full",
|
||||
Files: map[string]string{"test.gguf": fullDigest},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
fmt.Println(w)
|
||||
t.Fatalf("expected status code 200, actual %d", w.Code)
|
||||
}
|
||||
|
||||
manifest, err := ParseNamedManifest(model.ParseName("test-single-full"))
|
||||
if err != nil {
|
||||
t.Fatalf("parse manifest: %v", err)
|
||||
}
|
||||
for i, layer := range manifest.Layers {
|
||||
if i != 0 {
|
||||
t.Fatalf("expect 1 layer, actually found layer with index %d", i)
|
||||
} else if layer.Digest != fullDigest {
|
||||
t.Fatalf("expect digest %s, actual %s", fullDigest, layer.Digest)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("complete split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-complete-split",
|
||||
Files: map[string]string{
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
fmt.Println(w)
|
||||
t.Fatalf("expected status code 200, actual %d", w.Code)
|
||||
}
|
||||
|
||||
correctOrder := []string{
|
||||
splitDigest1, splitDigest2, splitDigest3,
|
||||
}
|
||||
|
||||
manifest, err := ParseNamedManifest(model.ParseName("test-complete-split"))
|
||||
if err != nil {
|
||||
t.Fatalf("parse manifest: %v", err)
|
||||
}
|
||||
for i, layer := range manifest.Layers {
|
||||
if i >= 3 {
|
||||
t.Fatalf("expect 3 layers, actually found layer with index %d", i)
|
||||
} else if layer.Digest != correctOrder[i] {
|
||||
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("complete split misordered gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-complete-split-misorder",
|
||||
Files: map[string]string{
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
fmt.Println(w)
|
||||
t.Fatalf("expected status code 200, actual %d", w.Code)
|
||||
}
|
||||
|
||||
correctOrder := []string{
|
||||
splitDigest1, splitDigest2, splitDigest3,
|
||||
}
|
||||
|
||||
manifest, err := ParseNamedManifest(model.ParseName("test-complete-split-misorder"))
|
||||
if err != nil {
|
||||
t.Fatalf("parse manifest: %v", err)
|
||||
}
|
||||
for i, layer := range manifest.Layers {
|
||||
if i >= 3 {
|
||||
t.Fatalf("expect 3 layers, actually found layer with index %d", i)
|
||||
} else if layer.Digest != correctOrder[i] {
|
||||
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("mixed full and split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-full-split-mixing",
|
||||
Files: map[string]string{
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
"test1.gguf": fullDigest,
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
fmt.Println(w)
|
||||
t.Fatalf("expected status code 200, actual %d", w.Code)
|
||||
}
|
||||
|
||||
correctOrder := []string{
|
||||
fullDigest, splitDigest1, splitDigest2, splitDigest3,
|
||||
}
|
||||
|
||||
manifest, err := ParseNamedManifest(model.ParseName("test-full-split-mixing"))
|
||||
if err != nil {
|
||||
t.Fatalf("parse manifest: %v", err)
|
||||
}
|
||||
for i, layer := range manifest.Layers {
|
||||
if i >= 4 {
|
||||
t.Fatalf("expect 4 layers, actually found layer with index %d", i)
|
||||
} else if layer.Digest != correctOrder[i] {
|
||||
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("mixed wrong split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-extra-split",
|
||||
Files: map[string]string{
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
"test1-00001-of-00004.gguf": splitDigest4,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected status code 400, actual %d", w.Code)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("mixed same count wrong split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-extra-split",
|
||||
Files: map[string]string{
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
"test1-00002-of-00003.gguf": splitDigest5,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected status code 400, actual %d", w.Code)
|
||||
}
|
||||
})
|
||||
t.Run("missing head split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-extra-split",
|
||||
Files: map[string]string{
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected status code 400, actual %d", w.Code)
|
||||
}
|
||||
})
|
||||
t.Run("missing mid split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-extra-split",
|
||||
Files: map[string]string{
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
"test-00003-of-00003.gguf": splitDigest3,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected status code 400, actual %d", w.Code)
|
||||
}
|
||||
})
|
||||
t.Run("missing tail split gguf", func(t *testing.T) {
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "test-extra-split",
|
||||
Files: map[string]string{
|
||||
"test-00001-of-00003.gguf": splitDigest1,
|
||||
"test-00002-of-00003.gguf": splitDigest2,
|
||||
},
|
||||
Stream: &stream,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected status code 400, actual %d", w.Code)
|
||||
}
|
||||
})
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
|
|
@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
|
|
@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
|
|
|
|||
|
|
@ -71,8 +71,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
|
|||
return
|
||||
}
|
||||
|
||||
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
|
||||
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
|
||||
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
|
||||
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
|
||||
return mock, nil
|
||||
}
|
||||
}
|
||||
|
|
@ -182,7 +182,7 @@ func TestGenerateChat(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
|
|
@ -898,7 +898,7 @@ func TestGenerate(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
|
|
@ -1382,7 +1382,7 @@ func TestGenerateLogprobs(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
req.successCh <- &runnerRef{llama: mock}
|
||||
return false
|
||||
},
|
||||
|
|
@ -1562,7 +1562,7 @@ func TestChatLogprobs(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
req.successCh <- &runnerRef{llama: mock}
|
||||
return false
|
||||
},
|
||||
|
|
@ -1672,7 +1672,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{llama: mock}
|
||||
return false
|
||||
|
|
|
|||
|
|
@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 100 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
|
|
@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 100 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
|
|
@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
|
|||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
req.successCh <- &runnerRef{
|
||||
llama: &mock,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,8 +49,8 @@ type Scheduler struct {
|
|||
activeLoading llm.LlamaServer
|
||||
loaded map[string]*runnerRef
|
||||
|
||||
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
|
||||
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
|
||||
loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
|
||||
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
|
||||
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
|
||||
getSystemInfoFn func() ml.SystemInfo
|
||||
waitForRecovery time.Duration
|
||||
|
|
@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
|||
|
||||
// Load model for fitting
|
||||
logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024, false)
|
||||
if err != nil {
|
||||
pending.errCh <- err
|
||||
break
|
||||
|
|
@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
|
|||
|
||||
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
|
||||
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
|
||||
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
|
||||
func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
|
||||
numParallel := max(int(envconfig.NumParallel()), 1)
|
||||
|
||||
// Embedding models should always be loaded with parallel=1
|
||||
|
|
@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
|
|||
|
||||
if llama == nil {
|
||||
var err error
|
||||
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
if err != nil {
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) {
|
|||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
s.waitForRecovery = 10 * time.Millisecond
|
||||
var f *ggml.GGML // value not used in tests
|
||||
var f *ggml.MetaGGML // value not used in tests
|
||||
req := &LlmRequest{
|
||||
ctx: ctx,
|
||||
model: &Model{ModelPath: "foo"},
|
||||
|
|
@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) {
|
|||
sessionDuration: &api.Duration{Duration: 2 * time.Second},
|
||||
}
|
||||
// Fail to load model first
|
||||
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
return nil, errors.New("something failed to load model blah")
|
||||
}
|
||||
gpus := []ml.DeviceInfo{}
|
||||
|
|
@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) {
|
|||
require.Contains(t, err.Error(), "this model may be incompatible")
|
||||
|
||||
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
|
||||
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
server.modelPath = model
|
||||
return server, nil
|
||||
}
|
||||
|
|
@ -103,10 +103,10 @@ type reqBundle struct {
|
|||
ctxDone func()
|
||||
srv *mockLlm
|
||||
req *LlmRequest
|
||||
f *ggml.GGML
|
||||
f *ggml.MetaGGML
|
||||
}
|
||||
|
||||
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
scenario.srv.modelPath = model
|
||||
return scenario.srv, nil
|
||||
}
|
||||
|
|
@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
|
|||
})
|
||||
|
||||
model := &Model{Name: modelName, ModelPath: p}
|
||||
f, err := llm.LoadModel(model.ModelPath, 0)
|
||||
f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0, true)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) {
|
|||
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
|
||||
}
|
||||
|
||||
var f *ggml.GGML
|
||||
var f *ggml.MetaGGML
|
||||
gpus := []ml.DeviceInfo{}
|
||||
systemInfo := ml.SystemInfo{}
|
||||
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
|
||||
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
server.modelPath = model
|
||||
return server, nil
|
||||
}