gguf: add split gguf loading
This commit is contained in:
parent d70e935526
commit efdd9b76da
@@ -426,6 +426,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
 	cmd, port, err := llm.StartRunner(
 		true, // ollama engine
 		"",   // no model
+		make([]string, 0),
 		ollamaLibDirs,
 		out,
 		extraEnvs,
fs/ggml/ggml.go | 179
@@ -7,8 +7,10 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"maps"
 	"math"
 	"slices"
+	"sort"
 	"strings"
 
 	"github.com/ollama/ollama/format"

@@ -26,6 +28,18 @@ type model interface {
 	Tensors() Tensors
 }
 
+type MetaGGML struct {
+	Shards     []GGML
+	ShardPaths []string
+	Tensors    ForeignTensors
+	kv         KV
+}
+
+type GGUFSplitInfo struct {
+	no    uint32
+	count uint32
+}
+
 type KV map[string]any
 
 func (kv KV) Architecture() string {
@@ -49,6 +63,17 @@ func (kv KV) FileType() FileType {
 	return FileTypeUnknown
 }
 
+func (kv KV) GGUFSplitInfo() *GGUFSplitInfo {
+	no := kv.Uint("split.no", 0xffffffff)
+	if no == 0xffffffff {
+		return nil
+	}
+	return &GGUFSplitInfo{
+		no:    no,
+		count: kv.Uint("split.count"),
+	}
+}
+
 func (kv KV) BlockCount() uint64 {
 	return uint64(kv.Uint("block_count"))
 }
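The split.no / split.count keys mirror the metadata written by llama.cpp's gguf-split tool, and 0xffffffff acts as a sentinel default so a file without the key is treated as a single, non-split model. A minimal sketch of how code inside the fs/ggml package could use the new accessor (the helper name is hypothetical, not part of this change):

// Sketch (package ggml): classify a decoded GGUF via the new accessor.
// shardInfo is a hypothetical helper, not introduced by this commit.
func shardInfo(kv KV) (isShard bool, index, count uint32) {
	if si := kv.GGUFSplitInfo(); si != nil {
		return true, si.no, si.count
	}
	return false, 0, 0
}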
@@ -268,7 +293,7 @@ type arrayValueTypes interface {
 }
 
 func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
-	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
+	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") {
 		key = kv.Architecture() + "." + key
 	}
 

@@ -285,6 +310,14 @@ type Tensors struct {
 	Offset uint64
 }
 
+type ForeignTensor struct {
+	*Tensor
+	ModelPath          string
+	TensorRegionOffset uint64
+}
+
+type ForeignTensors []ForeignTensor
+
 func (s Tensors) Items(prefix ...string) []*Tensor {
 	if len(prefix) == 0 {
 		return s.items
@@ -323,6 +356,41 @@ func (ts Tensors) GroupLayers() map[string]Layer {
 	return layers
 }
 
+func (s ForeignTensors) Items(prefix ...string) []*Tensor {
+	var items []*Tensor
+	for i := range s {
+		if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) {
+			items = append(items, s[i].Tensor)
+		}
+	}
+
+	return items
+}
+
+func (ts ForeignTensors) GroupLayers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for i := range ts {
+		t := ts[i].Tensor
+		parts := strings.Split(t.Name, ".")
+		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
+			if len(parts) > index+2 {
+				// blk and mm should have a number after them, join it
+				parts = append(
+					[]string{strings.Join(parts[:index+2], ".")},
+					parts[index+2:]...)
+			}
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
 type Layer map[string]*Tensor
 
 func (l Layer) Size() (size uint64) {
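ForeignTensors.GroupLayers mirrors the grouping already done by Tensors.GroupLayers, so layer lookups keep working across shards. For illustration (tensor names below are typical GGUF names used as assumed examples, not taken from a specific model):

// "blk.0.attn_q.weight" -> layers["blk.0"]["attn_q.weight"]
// "mm.0.bias"           -> layers["mm.0"]["bias"]
// "output_norm.weight"  -> layers["output_norm"]["weight"]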
@@ -550,7 +618,89 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	}, nil
 }
 
+func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) {
+	if len(shards) != len(shardsPaths) {
+		return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths))
+	}
+	li := make(ForeignTensors, 0)
+	for i := range shards {
+		gs := shards[i]
+		tensors := gs.Tensors()
+		for k := range tensors.items {
+			tensor := tensors.items[k]
+			li = append(li, ForeignTensor{
+				Tensor:             tensor,
+				ModelPath:          shardsPaths[i],
+				TensorRegionOffset: tensors.Offset,
+			})
+		}
+	}
+	return &li, nil
+}
+
+func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML {
+	type wrapper struct {
+		ggml   GGML
+		path   string
+		weight int64
+	}
+	var wrappers []wrapper
+	for i := range ggmls {
+		iSplitInfo := ggmls[i].KV().GGUFSplitInfo()
+		var weight int64 = 0
+		if iSplitInfo == nil {
+			weight = -1
+		} else {
+			weight = int64((*iSplitInfo).no)
+		}
+		wrappers = append(wrappers, wrapper{
+			ggml:   ggmls[i],
+			path:   ggmlPaths[i],
+			weight: weight,
+		})
+	}
+	sort.SliceStable(wrappers, func(i, j int) bool {
+		return wrappers[i].weight < wrappers[j].weight
+	})
+	metaGgml := MetaGGML{}
+	for i := range wrappers {
+		if i == 0 {
+			kv := maps.Clone(wrappers[i].ggml.KV())
+			// remove the keys contained in split gguf files. add more if needed.
+			delete(kv, "slice.no")
+			delete(kv, "slice.count")
+			delete(kv, "slice.tensors.count")
+			metaGgml.kv = kv
+		}
+		metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml)
+		metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path)
+	}
+	ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths)
+	metaGgml.Tensors = *ft
+	return metaGgml
+}
+
+func simpleWrapGGML(ggml GGML) MetaGGML {
+	// simply wrap single GGML, without creating foreign tensors
+	return MetaGGML{
+		Shards:     []GGML{ggml},
+		ShardPaths: []string{""},
+		kv:         ggml.KV(),
+	}
+}
+
+func WrapGGML(ggml GGML) MetaGGML {
+	metaggml := simpleWrapGGML(ggml)
+	ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths)
+	metaggml.Tensors = *ft
+	return metaggml
+}
+
 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
+	return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention)
+}
+
+func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
 	context *= uint64(numParallel)
 
 	embedding := f.KV().EmbeddingLength()
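MakeMetaGGML orders shards by their split.no (a file without split metadata gets weight -1 and sorts first), takes the key/value metadata from the first entry, and flattens every shard's tensors into ForeignTensors that remember which file they came from. A rough assembly sketch, assuming the code lives in package ggml and using placeholder file names:

// Sketch: decode each shard file and build a MetaGGML; error handling abbreviated.
func loadSplitModel(paths []string) (MetaGGML, error) {
	var shards []GGML
	for _, p := range paths {
		f, err := os.Open(p)
		if err != nil {
			return MetaGGML{}, err
		}
		g, err := Decode(f, -1) // -1: collect all array values
		f.Close()
		if err != nil {
			return MetaGGML{}, err
		}
		shards = append(shards, *g)
	}
	// paths[i] must correspond to shards[i]; MakeMetaGGML reorders the pairs by split.no itself.
	return MakeMetaGGML(shards, paths), nil
}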
@@ -564,7 +714,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
 	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
 
-	layers := f.Tensors().GroupLayers()
+	layers := f.Tensors.GroupLayers()
 
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
 

@@ -662,7 +812,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	)
 
 	var ropeFreqsCount uint64
-	if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
+	if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok {
 		if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
 			ropeFreqsCount = ropeFreqsWeights.Elements()
 		}

@@ -802,6 +952,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 
 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+	return simpleWrapGGML(f).SupportsKVCacheType(cacheType)
+}
+func (f MetaGGML) SupportsKVCacheType(cacheType string) bool {
 	if cacheType == "" || cacheType == "f16" {
 		return true
 	}

@@ -811,6 +964,10 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
 
 // SupportsFlashAttention checks if the model supports flash attention
 func (f GGML) SupportsFlashAttention() bool {
+	return simpleWrapGGML(f).SupportsFlashAttention()
+}
+
+func (f MetaGGML) SupportsFlashAttention() bool {
 	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 	if isEmbedding {
 		return false

@@ -828,6 +985,10 @@ func (f GGML) SupportsFlashAttention() bool {
 
 // FlashAttention checks if the model should enable flash attention
 func (f GGML) FlashAttention() bool {
+	return simpleWrapGGML(f).FlashAttention()
+}
+
+func (f MetaGGML) FlashAttention() bool {
 	return slices.Contains([]string{
 		"gemma3",
 		"gptoss", "gpt-oss",

@@ -849,3 +1010,15 @@ func kvCacheBytesPerElement(cacheType string) float64 {
 		return 2 // f16 (default)
 	}
 }
+
+func (f MetaGGML) KV() KV {
+	return f.kv
+}
+
+func (f MetaGGML) TotalTensorBytes() uint64 {
+	totalBytes := uint64(0)
+	for i := range f.Shards {
+		totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset
+	}
+	return totalBytes
+}
@@ -138,7 +138,7 @@ func (llm *gguf) numKV() uint64 {
 	}
 }
 
-func (llm *gguf) Decode(rs io.ReadSeeker) error {
+func (llm *gguf) Decode(rs io.ReadSeeker, mainKV ...KV) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.numKV(); i++ {
 		k, err := readGGUFString(llm, rs)
@@ -235,7 +235,11 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	// patch KV with parameter count
 	llm.kv["general.parameter_count"] = llm.parameters
 
-	alignment := llm.kv.Uint("general.alignment", 32)
+	alignment := llm.kv.Uint("general.alignment", 0xffffffff)
+	if alignment == 0xffffffff {
+		// try to get alignment from main shard instead.
+		alignment = append(mainKV, make(KV))[0].Uint("general.alignment", 32)
+	}
 
 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
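The append(mainKV, make(KV))[0] expression is a compact default for the variadic mainKV parameter: the caller's main-shard KV is used if one was passed, otherwise an empty KV falls through to the 32-byte default. Written out long-hand, as a sketch of the same logic (llm and mainKV refer to the surrounding Decode call; this is not part of the patch):

alignment := llm.kv.Uint("general.alignment", 0xffffffff)
if alignment == 0xffffffff {
	kv := make(KV)
	if len(mainKV) > 0 {
		kv = mainKV[0]
	}
	alignment = kv.Uint("general.alignment", 32)
}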
go.mod | 2
@@ -72,7 +72,7 @@ require (
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
-	github.com/spf13/pflag v1.0.5 // indirect
+	github.com/spf13/pflag v1.0.5
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
@@ -256,7 +256,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
 	return true
 }
 
-func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
+func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) {
 	cparams := C.llama_model_default_params()
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
@@ -300,7 +300,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 		cparams.progress_callback_user_data = unsafe.Pointer(&handle)
 	}
 
-	m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
+	var splitPaths []*C.char
+	mp := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(mp))
+	splitPaths = append(splitPaths, mp)
+	for i := range extraModelPaths {
+		mp := C.CString(extraModelPaths[i])
+		defer C.free(unsafe.Pointer(mp))
+		splitPaths = append(splitPaths, mp)
+	}
+
+	m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)}
 	if m.c == nil {
 		return nil, fmt.Errorf("unable to load model: %s", modelPath)
 	}
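All shard paths, main path first, are handed to llama_model_load_from_splits, which expects the complete set of split files. A hedged usage sketch of the updated Go wrapper; the file names are placeholders in the usual *-NNNNN-of-NNNNN.gguf split naming and are not taken from the patch:

// Sketch: loading a three-way split through the new wrapper signature.
func loadSplitVocab() (*llama.Model, error) {
	return llama.LoadModelFromFile(
		"qwen-00001-of-00003.gguf",
		[]string{"qwen-00002-of-00003.gguf", "qwen-00003-of-00003.gguf"},
		llama.ModelParams{VocabOnly: true},
	)
}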
@@ -84,12 +84,13 @@ type LlamaServer interface {
 
 // llmServer is an instance of a runner hosting a single model
 type llmServer struct {
 	port      int
 	cmd       *exec.Cmd
 	done      chan error // Channel to signal when the process exits
 	status    *StatusWriter
 	options   api.Options
 	modelPath string
+	extraModelPaths []string
 
 	loadRequest LoadRequest       // Parameters used to initialize the runner
 	mem         *ml.BackendMemory // Memory allocations for this model

@@ -109,7 +110,7 @@ type llmServer struct {
 type llamaServer struct {
 	llmServer
 
-	ggml *ggml.GGML
+	ggml *ggml.MetaGGML
 }
 
 type ollamaServer struct {

@@ -123,7 +124,7 @@ type ollamaServer struct {
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
+func LoadModel(model string, extraModels []string, maxArraySize int) (*ggml.MetaGGML, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -134,12 +135,36 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	}
 	defer f.Close()
 
-	ggml, err := ggml.Decode(f, maxArraySize)
-	return ggml, err
+	ggml1, err := ggml.Decode(f, maxArraySize)
+	if err != nil {
+		return nil, err
+	}
+	if ggml1.KV().GGUFSplitInfo() != nil {
+		loadedGgml := []ggml.GGML{*ggml1}
+		for i := range extraModels {
+			extraModel := extraModels[i]
+			f, err := os.Open(extraModel)
+			if err != nil {
+				return nil, err
+			}
+			defer f.Close()
+
+			ggml1, err := ggml.Decode(f, maxArraySize)
+			if err != nil {
+				return nil, err
+			}
+			loadedGgml = append(loadedGgml, *ggml1)
+		}
+		metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...))
+		return &metaggml, nil
+	} else {
+		metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model})
+		return &metaggml, nil
+	}
 }
 
 // NewLlamaServer will run a server for the given GPUs
-func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var llamaModel *llama.Model
 	var textProcessor model.TextProcessor
 	var err error
@@ -155,7 +180,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 		}
 	}
 	if textProcessor == nil {
-		llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
+		llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true})
 		if err != nil {
 			return nil, err
 		}

@@ -225,24 +250,26 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	cmd, port, err := StartRunner(
 		textProcessor != nil,
 		modelPath,
+		extraModelPaths,
 		gpuLibs,
 		status,
 		ml.GetVisibleDevicesEnv(gpus),
 	)
 
 	s := llmServer{
 		port:      port,
 		cmd:       cmd,
 		status:    status,
 		options:   opts,
 		modelPath: modelPath,
-		loadRequest:    loadRequest,
-		llamaModel:     llamaModel,
-		llamaModelLock: &sync.Mutex{},
-		sem:            semaphore.NewWeighted(int64(numParallel)),
-		totalLayers:    f.KV().BlockCount() + 1,
-		loadStart:      time.Now(),
-		done:           make(chan error, 1),
+		extraModelPaths: extraModelPaths,
+		loadRequest:     loadRequest,
+		llamaModel:      llamaModel,
+		llamaModelLock:  &sync.Mutex{},
+		sem:             semaphore.NewWeighted(int64(numParallel)),
+		totalLayers:     f.KV().BlockCount() + 1,
+		loadStart:       time.Now(),
+		done:            make(chan error, 1),
 	}
 
 	if err != nil {

@@ -279,7 +306,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	}
 }
 
-func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
+func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
 	var exe string
 	exe, err = os.Executable()
 	if err != nil {
@@ -309,6 +336,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
 	if modelPath != "" {
 		params = append(params, "--model", modelPath)
 	}
+	for i := range extraModelPaths {
+		params = append(params, "--model", extraModelPaths[i])
+	}
 	params = append(params, "--port", strconv.Itoa(port))
 
 	var pathEnv string
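With the extra paths appended, the spawned runner simply sees --model once per shard. The relevant portion of the argument list would look roughly like the sketch below (file names and port are illustrative; other flags are omitted):

// Sketch: runner arguments for a three-way split.
args := []string{
	"--model", "model-00001-of-00003.gguf",
	"--model", "model-00002-of-00003.gguf",
	"--model", "model-00003-of-00003.gguf",
	"--port", "37443",
}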
@@ -403,6 +433,10 @@ func (s *llmServer) ModelPath() string {
 	return s.modelPath
 }
 
+func (s *llmServer) ExtraModelPaths() []string {
+	return s.extraModelPaths
+}
+
 type LoadOperation int
 
 // The order of these constants are significant because we iterate over the operations. They

@@ -478,7 +512,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 		s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
 
 	// Use the size of one layer as a buffer
-	layers := s.ggml.Tensors().GroupLayers()
+	layers := s.ggml.Tensors.GroupLayers()
 	if blk0, ok := layers["blk.0"]; ok {
 		for i := range gpus {
 			gpus[i].FreeMemory -= blk0.Size() + kv[0]
@@ -77,9 +77,9 @@ type BackendParams struct {
 	FlashAttention bool
 }
 
-var backends = make(map[string]func(string, BackendParams) (Backend, error))
+var backends = make(map[string]func(string, []string, BackendParams) (Backend, error))
 
-func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
+func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}

@@ -87,9 +87,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
 	backends[name] = f
 }
 
-func NewBackend(modelPath string, params BackendParams) (Backend, error) {
+func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(modelPath, params)
+		return backend(modelPath, extraModelPaths, params)
 	}
 
 	return nil, fmt.Errorf("unsupported backend")
@@ -77,7 +77,7 @@ type Backend struct {
 	// modelPath is the location of the model data
 	modelPath string
 
-	meta *fsggml.GGML
+	meta *fsggml.MetaGGML
 
 	// allocMemory means that memory should be allocated for tensors and not
 	// just a dry run

@@ -120,17 +120,38 @@ type Backend struct {
 
 var once sync.Once
 
-func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
+func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) {
 	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
 	defer r.Close()
 
-	meta, err := fsggml.Decode(r, -1)
+	smallmeta, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
+	var meta fsggml.MetaGGML
+	if smallmeta.KV().GGUFSplitInfo() != nil {
+		loadedGgml := []fsggml.GGML{*smallmeta}
+		for i := range extraModelPaths {
+			extraModel := extraModelPaths[i]
+			f, err := os.Open(extraModel)
+			if err != nil {
+				return nil, err
+			}
+			defer f.Close()
+
+			smallmeta, err := fsggml.Decode(f, -1)
+			if err != nil {
+				return nil, err
+			}
+			loadedGgml = append(loadedGgml, *smallmeta)
+		}
+		meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...))
+	} else {
+		meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath})
+	}
 
 	once.Do(func() {
 		slog.Info(
@@ -139,7 +160,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			"file_type", meta.KV().FileType(),
 			"name", meta.KV().String("general.name"),
 			"description", meta.KV().String("general.description"),
-			"num_tensors", len(meta.Tensors().Items()),
+			"num_tensors", len(meta.Tensors.Items()),
 			"num_key_values", len(meta.KV()),
 		)
 	})

@@ -227,7 +248,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	// outputs are assigned iff allowed by splits and configured number of gpu layers
 	output := assignLayer(blocks)
 
-	maxTensors := len(meta.Tensors().Items())
+	maxTensors := len(meta.Tensors.Items())
 	maxTensors += 1
 	// each layer has at most 2 extra tensors for rope operations
 	maxTensors += blocks * 2

@@ -303,11 +324,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		return false
 	}
 
-	for _, t := range meta.Tensors().Items() {
+	for _, t := range meta.Tensors.Items() {
 		switch {
 		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
 			createTensor(tensor{source: t}, input.bts, -1)
-			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
+			if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
 				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
 			}
 		case contains(t.Name, "cls", "output", "output_norm",

@@ -378,7 +399,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}
 	}
 
-	maxGraphNodes := max(1024, len(meta.Tensors().Items())*8)
+	maxGraphNodes := max(1024, len(meta.Tensors.Items())*8)
 
 	sched := C.ggml_backend_sched_new_ext(
 		(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),

@@ -423,7 +444,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		modelPath:         modelPath,
 		allocMemory:       params.AllocMemory,
 		flashAttention:    params.FlashAttention,
-		meta:              meta,
+		meta:              &meta,
 		tensorLoadTargets: targets,
 		tensors:           tensors,
 		sched:             sched,
@@ -494,11 +515,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
 	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
 
 	var doneBytes atomic.Uint64
-	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
+	totalBytes := b.meta.TotalTensorBytes()
 
 	g, ctx := errgroup.WithContext(ctx)
 	g.SetLimit(runtime.GOMAXPROCS(0))
-	for _, t := range b.meta.Tensors().Items() {
+	for i := range b.meta.Tensors {
+		t := b.meta.Tensors[i]
 		g.Go(func() error {
 			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
 			for i := range tts {
@@ -517,13 +539,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
 
 			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
 			// seeking around within an FD shared between all goroutines.
-			file, err := os.Open(b.modelPath)
+			file, err := os.Open(t.ModelPath)
 			if err != nil {
-				slog.Warn("file open error", "file", b.modelPath, "error", err)
+				slog.Warn("file open error", "file", t.ModelPath, "error", err)
 				return err
 			}
 			defer file.Close()
-			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
+			sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
 
 			if t.Kind == 4 && tts[0]._type == 39 {
 				// source is mxfp4, target is ggml mxfp4
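Because each ForeignTensor records its source file (ModelPath) and where that file's tensor region starts (TensorRegionOffset), the loader keeps streaming tensors with a plain SectionReader; only the base offset changes per shard. A sketch of the read window, as a hypothetical helper outside this patch:

// Sketch: the byte window read for one tensor from its own shard file.
func tensorReader(file *os.File, t fsggml.ForeignTensor) *io.SectionReader {
	// t.Offset is relative to the shard's tensor region; t.Size() is the tensor's byte size.
	return io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
}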
@@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context {
 		tb.Fatal(err)
 	}
 
-	b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
+	b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true})
 	if err != nil {
 		tb.Fatal(err)
 	}

@@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
 }
 
 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string, params ml.BackendParams) (Model, error) {
-	b, err := ml.NewBackend(modelPath, params)
+func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) {
+	b, err := ml.NewBackend(modelPath, extraModelPaths, params)
 	if err != nil {
 		return nil, err
 	}
@@ -4,7 +4,6 @@ import (
 	"context"
 	"encoding/json"
 	"errors"
-	"flag"
 	"fmt"
 	"log"
 	"log/slog"

@@ -19,6 +18,7 @@ import (
 	"time"
 	"unicode/utf8"
 
+	"github.com/spf13/pflag"
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"

@@ -256,6 +256,8 @@ type Server struct {
 	// modelPath is the location of the model to be loaded
 	modelPath string
 
+	extraModelPaths []string
+
 	// loadMu prevents more than one load attempt from occurring at a time
 	loadMu sync.Mutex
 

@@ -827,6 +829,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
+	empath []string,
 	lpath []string,
 	ppath string,
 	kvSize int,

@@ -836,7 +839,7 @@ func (s *Server) loadModel(
 	multiUserCache bool,
 ) {
 	var err error
-	s.model, err = llama.LoadModelFromFile(mpath, params)
+	s.model, err = llama.LoadModelFromFile(mpath, empath, params)
 	if err != nil {
 		panic(err)
 	}

@@ -929,7 +932,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
 		}
 
 		s.status = llm.ServerStatusLoadingModel
-		go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
+		go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
 
 	case llm.LoadOperationClose:
 		// No-op for us
@@ -947,13 +950,14 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
 }
 
 func Execute(args []string) error {
-	fs := flag.NewFlagSet("runner", flag.ExitOnError)
-	mpath := fs.String("model", "", "Path to model binary file")
+	fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
+	mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May repeatedly specified to provide other split of models binary.")
 	port := fs.Int("port", 8080, "Port to expose the server on")
 	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
 
 	fs.Usage = func() {
-		fmt.Fprintf(fs.Output(), "Runner usage\n")
+		// sadly pflag does not expose out(). Fallback to os.Stderr which should perform identically as we don't set fs.output
+		fmt.Fprintf(os.Stderr, "Runner usage\n")
 		fs.PrintDefaults()
 	}
 	if err := fs.Parse(args); err != nil {
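pflag's StringArray keeps the []string{""} default only until the first --model is parsed; the first occurrence replaces the default and later occurrences append. That is what makes (*mpath)[0] the main model and (*mpath)[1:] the extra shards below. A small standalone sketch of that behaviour (flag values are illustrative):

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

func main() {
	fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
	mpath := fs.StringArray("model", []string{""}, "may be repeated")
	_ = fs.Parse([]string{"--model", "a-00001-of-00002.gguf", "--model", "a-00002-of-00002.gguf"})

	// First --model replaces the default, later ones append:
	// (*mpath)[0] is the main shard, (*mpath)[1:] are the extras.
	// With no --model at all, *mpath stays [""], i.e. empty main path and no extras.
	fmt.Println((*mpath)[0], (*mpath)[1:])
}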
@@ -965,8 +969,9 @@ func Execute(args []string) error {
 	llama.BackendInit()
 
 	server := &Server{
-		modelPath: *mpath,
-		status:    llm.ServerStatusLaunched,
+		modelPath:       (*mpath)[0],
+		extraModelPaths: (*mpath)[1:],
+		status:          llm.ServerStatusLaunched,
 	}
 
 	server.ready.Add(1)

@@ -5,7 +5,6 @@ import (
 	"context"
 	"encoding/json"
 	"errors"
-	"flag"
 	"fmt"
 	"hash/maphash"
 	"image"

@@ -23,6 +22,7 @@ import (
 	"time"
 	"unicode/utf8"
 
+	"github.com/spf13/pflag"
 	"golang.org/x/image/bmp"
 	"golang.org/x/sync/semaphore"
 

@@ -331,6 +331,8 @@ type Server struct {
 	// modelPath is the location of the model to be loaded
 	modelPath string
 
+	extraModelPaths []string
+
 	// loadMu prevents more than one load attempt from occurring at a time
 	loadMu sync.Mutex
 
@@ -1168,6 +1170,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
 // based on the given parameters
 func (s *Server) allocModel(
 	mpath string,
+	empath []string,
 	params ml.BackendParams,
 	loraPath []string,
 	parallel int,

@@ -1192,7 +1195,7 @@ func (s *Server) allocModel(
 	}()
 
 	var err error
-	s.model, err = model.New(mpath, params)
+	s.model, err = model.New(mpath, empath, params)
 	if err != nil {
 		return err
 	}

@@ -1295,7 +1298,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
 
 		s.batchSize = req.BatchSize
 
-		err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
+		err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
 		if err != nil {
 			s.closeModel()
 

@@ -1365,7 +1368,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
+	m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
 	if err != nil {
 		http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
 		return

@@ -1382,13 +1385,14 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
 }
 
 func Execute(args []string) error {
-	fs := flag.NewFlagSet("runner", flag.ExitOnError)
-	mpath := fs.String("model", "", "Path to model binary file")
+	fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
+	mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May repeatedly specified to provide other split of models binary.")
 	port := fs.Int("port", 8080, "Port to expose the server on")
 	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
 
 	fs.Usage = func() {
-		fmt.Fprintf(fs.Output(), "Runner usage\n")
+		// sadly pflag does not expose out(). Fallback to os.Stderr which should perform identically as we don't set fs.output
+		fmt.Fprintf(os.Stderr, "Runner usage\n")
 		fs.PrintDefaults()
 	}
 	if err := fs.Parse(args); err != nil {
@@ -1401,8 +1405,9 @@ func Execute(args []string) error {
 	defer cancel()
 
 	server := &Server{
-		modelPath: *mpath,
-		status:    llm.ServerStatusLaunched,
+		modelPath:       (*mpath)[0],
+		extraModelPaths: (*mpath)[1:],
+		status:          llm.ServerStatusLaunched,
 	}
 
 	server.cond = sync.NewCond(&server.mu)

@@ -53,18 +53,19 @@ type registryOptions struct {
 }
 
 type Model struct {
 	Name      string `json:"name"`
 	Config    ConfigV2
 	ShortName string
 	ModelPath string
-	ParentModel    string
-	AdapterPaths   []string
-	ProjectorPaths []string
-	System         string
-	License        []string
-	Digest         string
-	Options        map[string]any
-	Messages       []api.Message
+	ExtraModelPaths []string
+	ParentModel     string
+	AdapterPaths    []string
+	ProjectorPaths  []string
+	System          string
+	License         []string
+	Digest          string
+	Options         map[string]any
+	Messages        []api.Message
 
 	Template *template.Template
 }
@@ -190,6 +191,13 @@ func (m *Model) String() string {
 		Args: m.ModelPath,
 	})
 
+	for _, extraModels := range m.ExtraModelPaths {
+		modelfile.Commands = append(modelfile.Commands, parser.Command{
+			Name: "model",
+			Args: extraModels,
+		})
+	}
+
 	for _, adapter := range m.AdapterPaths {
 		modelfile.Commands = append(modelfile.Commands, parser.Command{
 			Name: "adapter",

@@ -348,6 +356,8 @@ func GetModel(name string) (*Model, error) {
 		}
 	}
 
+	readMainModelFlag := false
+
 	for _, layer := range manifest.Layers {
 		filename, err := GetBlobsPath(layer.Digest)
 		if err != nil {

@@ -356,8 +366,13 @@ func GetModel(name string) (*Model, error) {
 
 		switch layer.MediaType {
 		case "application/vnd.ollama.image.model":
-			model.ModelPath = filename
-			model.ParentModel = layer.From
+			if !readMainModelFlag {
+				model.ModelPath = filename
+				model.ParentModel = layer.From
+				readMainModelFlag = true
+			} else {
+				model.ExtraModelPaths = append(model.ExtraModelPaths, filename)
+			}
 		case "application/vnd.ollama.image.embed":
 			// Deprecated in versions > 0.1.2
 			// TODO: remove this warning in a future version
@@ -1182,14 +1182,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	return resp, nil
 }
 
-func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
+func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) {
 	maxArraySize := 0
 	if verbose {
 		maxArraySize = -1
 	}
-	data, err := llm.LoadModel(digest, maxArraySize)
+	data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize)
 	if err != nil {
-		return nil, ggml.Tensors{}, err
+		return nil, make(ggml.ForeignTensors, 0), err
 	}
 
 	kv := data.KV()

@@ -1202,7 +1202,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
 		}
 	}
 
-	return kv, data.Tensors(), nil
+	return kv, data.Tensors, nil
 }
 
 func (s *Server) ListHandler(c *gin.Context) {
@@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			// add small delay to simulate loading
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{

@@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			// add small delay to simulate loading
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{

@@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{
 				llama: &mock,

@@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{
 				llama: &mock,

@@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }
 
-func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
@@ -159,7 +159,7 @@ func TestGenerateChat(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			// add small delay to simulate loading
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{

@@ -786,7 +786,7 @@ func TestGenerate(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			// add small delay to simulate loading
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{

@@ -1270,7 +1270,7 @@ func TestGenerateLogprobs(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{llama: mock}
 			return false
 		},

@@ -1450,7 +1450,7 @@ func TestChatLogprobs(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{llama: mock}
 			return false
 		},

@@ -1560,7 +1560,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{llama: mock}
 			return false
@@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 100 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{
 				llama: &mock,
 			}

@@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 100 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{
 				llama: &mock,
 			}

@@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
 		getGpuFn:        getGpuFn,
 		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{
 				llama: &mock,
 			}
@@ -49,8 +49,8 @@ type Scheduler struct {
 	activeLoading llm.LlamaServer
 	loaded map[string]*runnerRef
 
-	loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
-	newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
+	newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 	getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
 	getSystemInfoFn func() ml.SystemInfo
 	waitForRecovery time.Duration
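The two hook fields above are the seams the tests override. A minimal sketch of a stub that satisfies the widened newServerFn shape — the stub name is hypothetical, only the signature comes from this commit, and the ollama-internal types (ml, ggml, llm, api, mockLlm) are assumed to be in scope as in the surrounding test package:

```go
// Hypothetical test stub; only the signature is dictated by this commit.
func stubNewServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
	// For a split GGUF the scheduler hands over the primary shard in model and the
	// remaining shard paths in extraModelPaths; a stub can record them and return a canned server.
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
	server.modelPath = model
	return server, nil
}
```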
@@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
 			// Load model for fitting
 			logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
-			ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
+			ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024)
 			if err != nil {
 				pending.errCh <- err
 				break
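For orientation, a small self-contained sketch of how a caller could assemble the extra-path argument for a split GGUF. The "-%05d-of-%05d.gguf" suffix is the common llama.cpp split-file naming convention; the helper itself is illustrative and not part of this commit, which only threads the extra paths through LoadModel.

```go
package main

import (
	"fmt"
	"strings"
)

// extraShardPaths lists the shards that accompany a primary
// "<name>-00001-of-000NN.gguf" file, in ascending order.
// Illustrative only: the commit does not prescribe how callers discover these paths.
func extraShardPaths(primary string, count int) []string {
	base := strings.TrimSuffix(primary, fmt.Sprintf("-00001-of-%05d.gguf", count))
	extras := make([]string, 0, count-1)
	for i := 2; i <= count; i++ {
		extras = append(extras, fmt.Sprintf("%s-%05d-of-%05d.gguf", base, i, count))
	}
	return extras
}

func main() {
	primary := "/models/llama-00001-of-00003.gguf" // hypothetical path
	fmt.Println(extraShardPaths(primary, 3))
	// With the signature shown above, the result would be passed as:
	//   ggml, err := llm.LoadModel(primary, extraShardPaths(primary, 3), 1024)
}
```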
@@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
-func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
+func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
 	numParallel := max(int(envconfig.NumParallel()), 1)
 
 	// Embedding models should always be loaded with parallel=1
@@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
 
 	if llama == nil {
 		var err error
-		llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
+		llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) {
 	defer done()
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	var f *ggml.GGML // value not used in tests
+	var f *ggml.MetaGGML // value not used in tests
 	req := &LlmRequest{
 		ctx: ctx,
 		model: &Model{ModelPath: "foo"},
@@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
 	gpus := []ml.DeviceInfo{}
@@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
@@ -103,10 +103,10 @@ type reqBundle struct {
 	ctxDone func()
 	srv *mockLlm
 	req *LlmRequest
-	f *ggml.GGML
+	f *ggml.MetaGGML
 }
 
-func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	scenario.srv.modelPath = model
 	return scenario.srv, nil
 }
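A hedged sketch, not in this commit, of how a future scenario could drive a split model through the same bundle. The shard file names and the model name are invented; the field names are the ones visible in the hunks above, and the surrounding sched_test.go types are assumed to be in scope:

```go
// Hypothetical scenario setup for a split model.
b := &reqBundle{
	srv: &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}},
	req: &LlmRequest{
		model: &Model{
			Name:            "split-model",                              // hypothetical
			ModelPath:       "/tmp/split-00001-of-00002.gguf",           // hypothetical primary shard
			ExtraModelPaths: []string{"/tmp/split-00002-of-00002.gguf"}, // remaining shard(s)
		},
	},
}
// b.newServer would then receive the primary shard as model and the rest in extraModelPaths.
_ = b
```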
@@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
 	})
 
 	model := &Model{Name: modelName, ModelPath: p}
-	f, err := llm.LoadModel(model.ModelPath, 0)
+	f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0)
 	if err != nil {
 		t.Fatal(err)
 	}
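Single-file models simply pass an empty slice for the extra paths, as the test above does. If the two-path form becomes noisy at call sites, a thin wrapper is one option — a hypothetical convenience, not something this commit adds; the integer parameter name is assumed, and the return type is inferred from the hunks above:

```go
// loadSingleFile wraps the widened LoadModel call for the common unsplit case,
// mirroring how the tests pass an empty extra-path slice.
// Hypothetical helper; only the (path, extraPaths, int) shape comes from this commit.
func loadSingleFile(path string, maxArraySize int) (*ggml.MetaGGML, error) {
	return llm.LoadModel(path, make([]string, 0), maxArraySize)
}
```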
@@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}
 
-	var f *ggml.GGML
+	var f *ggml.MetaGGML
 	gpus := []ml.DeviceInfo{}
 	systemInfo := ml.SystemInfo{}
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}