gguf: add split gguf loading

cvrunmin 2025-11-20 16:43:47 +08:00
parent d70e935526
commit efdd9b76da
20 changed files with 386 additions and 117 deletions

View File

@ -426,6 +426,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
cmd, port, err := llm.StartRunner(
true, // ollama engine
"", // no model
make([]string, 0),
ollamaLibDirs,
out,
extraEnvs,

View File

@ -7,8 +7,10 @@ import (
"fmt"
"io"
"log/slog"
"maps"
"math"
"slices"
"sort"
"strings"
"github.com/ollama/ollama/format"
@ -26,6 +28,18 @@ type model interface {
Tensors() Tensors
}
type MetaGGML struct {
Shards []GGML
ShardPaths []string
Tensors ForeignTensors
kv KV
}
type GGUFSplitInfo struct {
no uint32
count uint32
}
type KV map[string]any
func (kv KV) Architecture() string {
@ -49,6 +63,17 @@ func (kv KV) FileType() FileType {
return FileTypeUnknown
}
func (kv KV) GGUFSplitInfo() *GGUFSplitInfo {
no := kv.Uint("split.no", 0xffffffff)
if no == 0xffffffff {
return nil
}
return &GGUFSplitInfo{
no: no,
count: kv.Uint("split.count"),
}
}
func (kv KV) BlockCount() uint64 {
return uint64(kv.Uint("block_count"))
}
@ -268,7 +293,7 @@ type arrayValueTypes interface {
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") {
key = kv.Architecture() + "." + key
}
@ -285,6 +310,14 @@ type Tensors struct {
Offset uint64
}
type ForeignTensor struct {
*Tensor
ModelPath string
TensorRegionOffset uint64
}
type ForeignTensors []ForeignTensor
func (s Tensors) Items(prefix ...string) []*Tensor {
if len(prefix) == 0 {
return s.items
@ -323,6 +356,41 @@ func (ts Tensors) GroupLayers() map[string]Layer {
return layers
}
func (s ForeignTensors) Items(prefix ...string) []*Tensor {
var items []*Tensor
for i := range s {
if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) {
items = append(items, s[i].Tensor)
}
}
return items
}
func (ts ForeignTensors) GroupLayers() map[string]Layer {
layers := make(map[string]Layer)
for i := range ts {
t := ts[i].Tensor
parts := strings.Split(t.Name, ".")
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
if len(parts) > index+2 {
// blk and mm should have a number after them, join it
parts = append(
[]string{strings.Join(parts[:index+2], ".")},
parts[index+2:]...)
}
}
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
}
type Layer map[string]*Tensor
func (l Layer) Size() (size uint64) {
@ -550,7 +618,89 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
}, nil
}
func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) {
if len(shards) != len(shardsPaths) {
return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths))
}
li := make(ForeignTensors, 0)
for i := range shards {
gs := shards[i]
tensors := gs.Tensors()
for k := range tensors.items {
tensor := tensors.items[k]
li = append(li, ForeignTensor{
Tensor: tensor,
ModelPath: shardsPaths[i],
TensorRegionOffset: tensors.Offset,
})
}
}
return &li, nil
}
func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML {
type wrapper struct {
ggml GGML
path string
weight int64
}
var wrappers []wrapper
for i := range ggmls {
iSplitInfo := ggmls[i].KV().GGUFSplitInfo()
weight := int64(-1)
if iSplitInfo != nil {
    weight = int64(iSplitInfo.no)
}
wrappers = append(wrappers, wrapper{
ggml: ggmls[i],
path: ggmlPaths[i],
weight: weight,
})
}
sort.SliceStable(wrappers, func(i, j int) bool {
return wrappers[i].weight < wrappers[j].weight
})
metaGgml := MetaGGML{}
for i := range wrappers {
if i == 0 {
kv := maps.Clone(wrappers[i].ggml.KV())
// remove the split-specific keys from the merged KV. add more if needed.
delete(kv, "split.no")
delete(kv, "split.count")
delete(kv, "split.tensors.count")
metaGgml.kv = kv
}
metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml)
metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path)
}
ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths)
metaGgml.Tensors = *ft
return metaGgml
}
func simpleWrapGGML(ggml GGML) MetaGGML {
// simply wrap a single GGML without building foreign tensors
return MetaGGML{
Shards: []GGML{ggml},
ShardPaths: []string{""},
kv: ggml.KV(),
}
}
func WrapGGML(ggml GGML) MetaGGML {
metaggml := simpleWrapGGML(ggml)
ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths)
metaggml.Tensors = *ft
return metaggml
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention)
}
func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
embedding := f.KV().EmbeddingLength()
@ -564,7 +714,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
layers := f.Tensors().GroupLayers()
layers := f.Tensors.GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
@ -662,7 +812,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
)
var ropeFreqsCount uint64
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok {
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
ropeFreqsCount = ropeFreqsWeights.Elements()
}
@ -802,6 +952,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
return simpleWrapGGML(f).SupportsKVCacheType(cacheType)
}
func (f MetaGGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
@ -811,6 +964,10 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
return simpleWrapGGML(f).SupportsFlashAttention()
}
func (f MetaGGML) SupportsFlashAttention() bool {
_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
if isEmbedding {
return false
@ -828,6 +985,10 @@ func (f GGML) SupportsFlashAttention() bool {
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
return simpleWrapGGML(f).FlashAttention()
}
func (f MetaGGML) FlashAttention() bool {
return slices.Contains([]string{
"gemma3",
"gptoss", "gpt-oss",
@ -849,3 +1010,15 @@ func kvCacheBytesPerElement(cacheType string) float64 {
return 2 // f16 (default)
}
}
func (f MetaGGML) KV() KV {
return f.kv
}
func (f MetaGGML) TotalTensorBytes() uint64 {
totalBytes := uint64(0)
for i := range f.Shards {
totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset
}
return totalBytes
}
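
For reference, a minimal standalone sketch (not part of the commit) of the shard-ordering idea used by MakeMetaGGML: shards are sorted stably by their split index, a shard without split.* metadata is given weight -1 so it sorts first, and the merged KV is taken from that first shard with the split-specific keys removed. The shard type below is a simplified stand-in, not the real GGML.

package main

import (
    "fmt"
    "sort"
)

// shard is a simplified stand-in for a decoded GGUF split.
type shard struct {
    path    string
    splitNo int64 // -1 when the file carries no split.* metadata
}

// orderShards mirrors the weight-based stable sort above.
func orderShards(in []shard) []shard {
    out := append([]shard(nil), in...)
    sort.SliceStable(out, func(i, j int) bool { return out[i].splitNo < out[j].splitNo })
    return out
}

func main() {
    shards := []shard{
        {path: "model-00002-of-00003.gguf", splitNo: 1},
        {path: "model-00003-of-00003.gguf", splitNo: 2},
        {path: "model-00001-of-00003.gguf", splitNo: 0},
    }
    for _, s := range orderShards(shards) {
        fmt.Println(s.path) // prints the shards in split order
    }
}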

View File

@ -138,7 +138,7 @@ func (llm *gguf) numKV() uint64 {
}
}
func (llm *gguf) Decode(rs io.ReadSeeker) error {
func (llm *gguf) Decode(rs io.ReadSeeker, mainKV ...KV) error {
// decode key-values
for i := 0; uint64(i) < llm.numKV(); i++ {
k, err := readGGUFString(llm, rs)
@ -235,7 +235,11 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
// patch KV with parameter count
llm.kv["general.parameter_count"] = llm.parameters
alignment := llm.kv.Uint("general.alignment", 32)
alignment := llm.kv.Uint("general.alignment", 0xffffffff)
if alignment == 0xffffffff {
// try to get alignment from main shard instead.
alignment = append(mainKV, make(KV))[0].Uint("general.alignment", 32)
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {

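The append(mainKV, make(KV))[0] expression above turns the variadic mainKV parameter into an optional argument: a later shard that omits general.alignment inherits it from the main shard's KV when one was passed, otherwise an empty map is consulted and Uint falls back to its default of 32. A standalone sketch of that idiom, using a simplified kv type rather than the real KV:

package main

import "fmt"

type kv map[string]any

// uintOr returns the uint32 stored under key, or def when absent.
func (m kv) uintOr(key string, def uint32) uint32 {
    if v, ok := m[key].(uint32); ok {
        return v
    }
    return def
}

// alignment prefers the local value, then the optional main KV, then 32.
func alignment(local kv, mainKV ...kv) uint32 {
    if a := local.uintOr("general.alignment", 0xffffffff); a != 0xffffffff {
        return a
    }
    return append(mainKV, kv{})[0].uintOr("general.alignment", 32)
}

func main() {
    main0 := kv{"general.alignment": uint32(64)}
    fmt.Println(alignment(kv{}, main0)) // 64: inherited from the main shard
    fmt.Println(alignment(kv{}))        // 32: no main KV supplied
}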
go.mod
View File

@ -72,7 +72,7 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/spf13/pflag v1.0.5
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect

View File

@ -256,7 +256,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
return true
}
func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) {
cparams := C.llama_model_default_params()
cparams.n_gpu_layers = C.int(params.NumGpuLayers)
cparams.main_gpu = C.int32_t(params.MainGpu)
@ -300,7 +300,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.progress_callback_user_data = unsafe.Pointer(&handle)
}
m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
var splitPaths []*C.char
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
splitPaths = append(splitPaths, mp)
for i := range extraModelPaths {
mp := C.CString(extraModelPaths[i])
defer C.free(unsafe.Pointer(mp))
splitPaths = append(splitPaths, mp)
}
m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)}
if m.c == nil {
return nil, fmt.Errorf("unable to load model: %s", modelPath)
}
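
A rough usage sketch of the new signature (the paths are hypothetical, and building this needs the ollama source tree with its CGO llama bindings): the main path always becomes the first entry of the split list, so single-file models keep working with an empty extra list.

package main

import (
    "fmt"

    "github.com/ollama/ollama/llama"
)

func main() {
    llama.BackendInit()

    // Hypothetical shard paths, for illustration only.
    extra := []string{"/models/llama-00002-of-00002.gguf"}
    m, err := llama.LoadModelFromFile("/models/llama-00001-of-00002.gguf", extra,
        llama.ModelParams{VocabOnly: true})
    if err != nil {
        fmt.Println("load failed:", err)
        return
    }
    _ = m // use the model...
}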

View File

@ -84,12 +84,13 @@ type LlamaServer interface {
// llmServer is an instance of a runner hosting a single model
type llmServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
extraModelPaths []string
loadRequest LoadRequest // Parameters used to initialize the runner
mem *ml.BackendMemory // Memory allocations for this model
@ -109,7 +110,7 @@ type llmServer struct {
type llamaServer struct {
llmServer
ggml *ggml.GGML
ggml *ggml.MetaGGML
}
type ollamaServer struct {
@ -123,7 +124,7 @@ type ollamaServer struct {
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
func LoadModel(model string, extraModels []string, maxArraySize int) (*ggml.MetaGGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@ -134,12 +135,36 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}
defer f.Close()
ggml, err := ggml.Decode(f, maxArraySize)
return ggml, err
ggml1, err := ggml.Decode(f, maxArraySize)
if err != nil {
return nil, err
}
if ggml1.KV().GGUFSplitInfo() != nil {
loadedGgml := []ggml.GGML{*ggml1}
for i := range extraModels {
extraModel := extraModels[i]
f, err := os.Open(extraModel)
if err != nil {
return nil, err
}
defer f.Close()
ggml1, err := ggml.Decode(f, maxArraySize)
if err != nil {
return nil, err
}
loadedGgml = append(loadedGgml, *ggml1)
}
metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...))
return &metaggml, nil
} else {
metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model})
return &metaggml, nil
}
}
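
A hedged sketch of calling the updated loader (the blob paths are hypothetical; in the server they come from the model manifest): the extra paths are only decoded when the first file actually carries split.* metadata, so passing an empty slice preserves the old single-file behaviour.

package main

import (
    "fmt"

    "github.com/ollama/ollama/llm"
)

func main() {
    meta, err := llm.LoadModel("/blobs/main.gguf", []string{"/blobs/shard2.gguf"}, 1024)
    if err != nil {
        fmt.Println("load failed:", err)
        return
    }
    fmt.Println(meta.KV().Architecture(), meta.TotalTensorBytes())
}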
// NewLlamaServer will run a server for the given GPUs
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var llamaModel *llama.Model
var textProcessor model.TextProcessor
var err error
@ -155,7 +180,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}
if textProcessor == nil {
llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true})
if err != nil {
return nil, err
}
@ -225,24 +250,26 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
cmd, port, err := StartRunner(
textProcessor != nil,
modelPath,
extraModelPaths,
gpuLibs,
status,
ml.GetVisibleDevicesEnv(gpus),
)
s := llmServer{
port: port,
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
port: port,
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
extraModelPaths: extraModelPaths,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
}
if err != nil {
@ -279,7 +306,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
var exe string
exe, err = os.Executable()
if err != nil {
@ -309,6 +336,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
if modelPath != "" {
params = append(params, "--model", modelPath)
}
for i := range extraModelPaths {
params = append(params, "--model", extraModelPaths[i])
}
params = append(params, "--port", strconv.Itoa(port))
var pathEnv string
@ -403,6 +433,10 @@ func (s *llmServer) ModelPath() string {
return s.modelPath
}
func (s *llmServer) ExtraModelPaths() []string {
return s.extraModelPaths
}
type LoadOperation int
// The order of these constants are significant because we iterate over the operations. They
@ -478,7 +512,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
// Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers()
layers := s.ggml.Tensors.GroupLayers()
if blk0, ok := layers["blk.0"]; ok {
for i := range gpus {
gpus[i].FreeMemory -= blk0.Size() + kv[0]

View File

@ -77,9 +77,9 @@ type BackendParams struct {
FlashAttention bool
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
var backends = make(map[string]func(string, []string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
@ -87,9 +87,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
backends[name] = f
}
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(modelPath, params)
return backend(modelPath, extraModelPaths, params)
}
return nil, fmt.Errorf("unsupported backend")
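
For reference, a minimal sketch of what a registration looks like with the widened callback ("stub" and its error are made up; the real registration lives in the ggml backend package): backends now receive the extra split paths alongside the main model path.

package main

import (
    "errors"

    "github.com/ollama/ollama/ml"
)

func init() {
    ml.RegisterBackend("stub", func(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) {
        // A real backend would open modelPath plus each entry of extraModelPaths here.
        return nil, errors.New("stub backend: not implemented")
    })
}

func main() {}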

View File

@ -77,7 +77,7 @@ type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
meta *fsggml.MetaGGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
@ -120,17 +120,38 @@ type Backend struct {
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
smallmeta, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
var meta fsggml.MetaGGML
if smallmeta.KV().GGUFSplitInfo() != nil {
loadedGgml := []fsggml.GGML{*smallmeta}
for i := range extraModelPaths {
extraModel := extraModelPaths[i]
f, err := os.Open(extraModel)
if err != nil {
return nil, err
}
defer f.Close()
smallmeta, err := fsggml.Decode(f, -1)
if err != nil {
return nil, err
}
loadedGgml = append(loadedGgml, *smallmeta)
}
meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...))
} else {
meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath})
}
once.Do(func() {
slog.Info(
@ -139,7 +160,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_tensors", len(meta.Tensors.Items()),
"num_key_values", len(meta.KV()),
)
})
@ -227,7 +248,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
// outputs are assigned iff allowed by splits and configured number of gpu layers
output := assignLayer(blocks)
maxTensors := len(meta.Tensors().Items())
maxTensors := len(meta.Tensors.Items())
maxTensors += 1
// each layer has at most 2 extra tensors for rope operations
maxTensors += blocks * 2
@ -303,11 +324,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return false
}
for _, t := range meta.Tensors().Items() {
for _, t := range meta.Tensors.Items() {
switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts, -1)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
}
case contains(t.Name, "cls", "output", "output_norm",
@ -378,7 +399,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
}
maxGraphNodes := max(1024, len(meta.Tensors().Items())*8)
maxGraphNodes := max(1024, len(meta.Tensors.Items())*8)
sched := C.ggml_backend_sched_new_ext(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
@ -423,7 +444,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention,
meta: meta,
meta: &meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: sched,
@ -494,11 +515,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
totalBytes := b.meta.TotalTensorBytes()
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range b.meta.Tensors().Items() {
for i := range b.meta.Tensors {
t := b.meta.Tensors[i]
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
for i := range tts {
@ -517,13 +539,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(b.modelPath)
file, err := os.Open(t.ModelPath)
if err != nil {
slog.Warn("file open error", "file", b.modelPath, "error", err)
slog.Warn("file open error", "file", t.ModelPath, "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
if t.Kind == 4 && tts[0]._type == 39 {
// source is mxfp4, target is ggml mxfp4

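The load path above now takes both the file and the base offset from the tensor itself instead of from a single top-level GGUF. A standalone sketch of that per-shard read (the helper name and parameters are illustrative, not the commit's API):

package splitload

import (
    "io"
    "os"
)

// readTensor reads one tensor's bytes from its own shard file. regionOffset is
// that shard's tensor-data region offset (TensorRegionOffset above), while
// tensorOffset and size come from the tensor entry itself.
func readTensor(path string, regionOffset, tensorOffset, size uint64) ([]byte, error) {
    f, err := os.Open(path)
    if err != nil {
        return nil, err
    }
    defer f.Close()

    sr := io.NewSectionReader(f, int64(regionOffset+tensorOffset), int64(size))
    buf := make([]byte, size)
    if _, err := io.ReadFull(sr, buf); err != nil {
        return nil, err
    }
    return buf, nil
}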
View File

@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context {
tb.Fatal(err)
}
b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true})
if err != nil {
tb.Fatal(err)
}

View File

@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(modelPath string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, params)
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, extraModelPaths, params)
if err != nil {
return nil, err
}

View File

@ -4,7 +4,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"log/slog"
@ -19,6 +18,7 @@ import (
"time"
"unicode/utf8"
"github.com/spf13/pflag"
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
@ -256,6 +256,8 @@ type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
extraModelPaths []string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
@ -827,6 +829,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
func (s *Server) loadModel(
params llama.ModelParams,
mpath string,
empath []string,
lpath []string,
ppath string,
kvSize int,
@ -836,7 +839,7 @@ func (s *Server) loadModel(
multiUserCache bool,
) {
var err error
s.model, err = llama.LoadModelFromFile(mpath, params)
s.model, err = llama.LoadModelFromFile(mpath, empath, params)
if err != nil {
panic(err)
}
@ -929,7 +932,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
}
s.status = llm.ServerStatusLoadingModel
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
case llm.LoadOperationClose:
// No-op for us
@ -947,13 +950,14 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified multiple times to provide additional splits of the model.")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
// pflag does not expose out(); fall back to os.Stderr, which behaves identically since fs.SetOutput is never called
fmt.Fprintf(os.Stderr, "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
@ -965,8 +969,9 @@ func Execute(args []string) error {
llama.BackendInit()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
modelPath: (*mpath)[0],
extraModelPaths: (*mpath)[1:],
status: llm.ServerStatusLaunched,
}
server.ready.Add(1)
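
Both runners now take a repeated --model flag, and llm/server.go passes one flag per shard with the main path first. A small standalone example of the pflag.StringArray behaviour (the argument list is made up); the commit's default of a single empty string keeps index 0 valid even when no --model is supplied, since the first occurrence of the flag replaces the default and later occurrences append.

package main

import (
    "fmt"

    "github.com/spf13/pflag"
)

func main() {
    fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
    models := fs.StringArray("model", []string{""}, "path to a model file; repeat for extra splits")

    // Hypothetical arguments, for illustration only.
    fs.Parse([]string{
        "--model", "model-00001-of-00002.gguf",
        "--model", "model-00002-of-00002.gguf",
    })

    fmt.Println("main:", (*models)[0])   // model-00001-of-00002.gguf
    fmt.Println("extra:", (*models)[1:]) // [model-00002-of-00002.gguf]
}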

View File

@ -5,7 +5,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/maphash"
"image"
@ -23,6 +22,7 @@ import (
"time"
"unicode/utf8"
"github.com/spf13/pflag"
"golang.org/x/image/bmp"
"golang.org/x/sync/semaphore"
@ -331,6 +331,8 @@ type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
extraModelPaths []string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
@ -1168,6 +1170,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
// based on the given parameters
func (s *Server) allocModel(
mpath string,
empath []string,
params ml.BackendParams,
loraPath []string,
parallel int,
@ -1192,7 +1195,7 @@ func (s *Server) allocModel(
}()
var err error
s.model, err = model.New(mpath, params)
s.model, err = model.New(mpath, empath, params)
if err != nil {
return err
}
@ -1295,7 +1298,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.batchSize = req.BatchSize
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
if err != nil {
s.closeModel()
@ -1365,7 +1368,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
return
}
m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
if err != nil {
http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
return
@ -1382,13 +1385,14 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified multiple times to provide additional splits of the model.")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
// pflag does not expose out(); fall back to os.Stderr, which behaves identically since fs.SetOutput is never called
fmt.Fprintf(os.Stderr, "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
@ -1401,8 +1405,9 @@ func Execute(args []string) error {
defer cancel()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
modelPath: (*mpath)[0],
extraModelPaths: (*mpath)[1:],
status: llm.ServerStatusLaunched,
}
server.cond = sync.NewCond(&server.mu)

View File

@ -53,18 +53,19 @@ type registryOptions struct {
}
type Model struct {
Name string `json:"name"`
Config ConfigV2
ShortName string
ModelPath string
ParentModel string
AdapterPaths []string
ProjectorPaths []string
System string
License []string
Digest string
Options map[string]any
Messages []api.Message
Name string `json:"name"`
Config ConfigV2
ShortName string
ModelPath string
ExtraModelPaths []string
ParentModel string
AdapterPaths []string
ProjectorPaths []string
System string
License []string
Digest string
Options map[string]any
Messages []api.Message
Template *template.Template
}
@ -190,6 +191,13 @@ func (m *Model) String() string {
Args: m.ModelPath,
})
for _, extraModels := range m.ExtraModelPaths {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "model",
Args: extraModels,
})
}
for _, adapter := range m.AdapterPaths {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "adapter",
@ -348,6 +356,8 @@ func GetModel(name string) (*Model, error) {
}
}
readMainModelFlag := false
for _, layer := range manifest.Layers {
filename, err := GetBlobsPath(layer.Digest)
if err != nil {
@ -356,8 +366,13 @@ func GetModel(name string) (*Model, error) {
switch layer.MediaType {
case "application/vnd.ollama.image.model":
model.ModelPath = filename
model.ParentModel = layer.From
if !readMainModelFlag {
model.ModelPath = filename
model.ParentModel = layer.From
readMainModelFlag = true
} else {
model.ExtraModelPaths = append(model.ExtraModelPaths, filename)
}
case "application/vnd.ollama.image.embed":
// Deprecated in versions > 0.1.2
// TODO: remove this warning in a future version
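
With this change a split model appears in the manifest as several application/vnd.ollama.image.model layers: GetModel keeps the first as ModelPath and appends the rest to ExtraModelPaths, and Model.String() emits one model command per path. Rendered as a Modelfile that comes out roughly as repeated FROM lines (the blob paths below are hypothetical):

FROM /root/.ollama/models/blobs/sha256-1111
FROM /root/.ollama/models/blobs/sha256-2222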

View File

@ -1182,14 +1182,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
return resp, nil
}
func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) {
maxArraySize := 0
if verbose {
maxArraySize = -1
}
data, err := llm.LoadModel(digest, maxArraySize)
data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize)
if err != nil {
return nil, ggml.Tensors{}, err
return nil, make(ggml.ForeignTensors, 0), err
}
kv := data.KV()
@ -1202,7 +1202,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
}
}
return kv, data.Tensors(), nil
return kv, data.Tensors, nil
}
func (s *Server) ListHandler(c *gin.Context) {

View File

@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

View File

@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,

View File

@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return
}
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil
}
}
@ -159,7 +159,7 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -786,7 +786,7 @@ func TestGenerate(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -1270,7 +1270,7 @@ func TestGenerateLogprobs(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{llama: mock}
return false
},
@ -1450,7 +1450,7 @@ func TestChatLogprobs(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{llama: mock}
return false
},
@ -1560,7 +1560,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false

View File

@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}

View File

@ -49,8 +49,8 @@ type Scheduler struct {
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getSystemInfoFn func() ml.SystemInfo
waitForRecovery time.Duration
@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
// Load model for fitting
logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024)
if err != nil {
pending.errCh <- err
break
@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1)
// Embedding models should always be loaded with parallel=1
@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
if llama == nil {
var err error
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to

View File

@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) {
defer done()
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
var f *ggml.GGML // value not used in tests
var f *ggml.MetaGGML // value not used in tests
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second},
}
// Fail to load model first
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah")
}
gpus := []ml.DeviceInfo{}
@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
@ -103,10 +103,10 @@ type reqBundle struct {
ctxDone func()
srv *mockLlm
req *LlmRequest
f *ggml.GGML
f *ggml.MetaGGML
}
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}
@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
})
model := &Model{Name: modelName, ModelPath: p}
f, err := llm.LoadModel(model.ModelPath, 0)
f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0)
if err != nil {
t.Fatal(err)
}
@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
}
var f *ggml.GGML
var f *ggml.MetaGGML
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}