cvrunmin 2026-01-06 08:25:48 +01:00 committed by GitHub
commit 9e70f0fcfd
25 changed files with 770 additions and 119 deletions

View File

@ -441,6 +441,7 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
cmd, port, err := llm.StartRunner(
true, // ollama engine
"", // no model
make([]string, 0),
ollamaLibDirs,
out,
extraEnvs,

View File

@ -1176,7 +1176,7 @@ Create a model from:
- another model;
- a safetensors directory; or
- a GGUF file.
- a GGUF file or directory.
If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
@ -1270,6 +1270,7 @@ A stream of JSON objects is returned:
#### Create a model from GGUF
Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
For a model stored as multiple split GGUF files, include every split in the `files` parameter, each with its file name and SHA256 digest. It is recommended to list the files in split-number order, although Ollama will sort them itself.
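As an illustrative sketch (not part of the documented request format), the `files` map for a three-way split could be assembled with the Go client types from `github.com/ollama/ollama/api`; the digests below are placeholders for the SHA256 values returned when pushing each blob:
```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	stream := false
	req := api.CreateRequest{
		Name: "my-split-model", // hypothetical model name
		Files: map[string]string{
			// Listed in split-number order; Ollama re-sorts them regardless.
			"model-00001-of-00003.gguf": "sha256:aaaa...", // placeholder digest
			"model-00002-of-00003.gguf": "sha256:bbbb...", // placeholder digest
			"model-00003-of-00003.gguf": "sha256:cccc...", // placeholder digest
		},
		Stream: &stream,
	}
	body, _ := json.MarshalIndent(req, "", "  ")
	fmt.Println(string(body)) // body of the POST to /api/create
}
```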
##### Request

View File

@ -88,6 +88,10 @@ To import a GGUF model, create a `Modelfile` containing:
```dockerfile
FROM /path/to/file.gguf
```
Or:
```dockerfile
FROM /path/to/gguf/split/directory
```
For a GGUF adapter, create the `Modelfile` with:

View File

@ -12,7 +12,7 @@ A Modelfile is the blueprint to create and share customized models using Ollama.
- [FROM (Required)](#from-required)
- [Build from existing model](#build-from-existing-model)
- [Build from a Safetensors model](#build-from-a-safetensors-model)
- [Build from a GGUF file](#build-from-a-gguf-file)
- [Build from a GGUF model](#build-from-a-gguf-model)
- [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values)
- [TEMPLATE](#template)
@ -130,7 +130,7 @@ Currently supported model architectures:
- Gemma (including Gemma 1 and Gemma 2)
- Phi3
#### Build from a GGUF file
#### Build from a GGUF model
```
FROM ./ollama-model.gguf
@ -138,6 +138,14 @@ FROM ./ollama-model.gguf
The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
For a GGUF model split into multiple files:
```
FROM <model directory>
```
The model directory should contain only the split GGUF files of a single model.
### PARAMETER
The `PARAMETER` instruction defines a parameter that can be set when the model is run.

View File

@ -7,6 +7,7 @@ import (
"fmt"
"io"
"log/slog"
"maps"
"math"
"slices"
"strings"
@ -27,6 +28,18 @@ type model interface {
Tensors() Tensors
}
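// MetaGGML aggregates one or more GGUF shards that make up a single model,
// exposing a merged KV map and a flattened, shard-aware tensor list.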
type MetaGGML struct {
Shards []GGML
ShardPaths []string
Tensors ForeignTensors
kv KV
}
type GGUFSplitInfo struct {
No uint16
Count uint16
}
type KV map[string]any
func (kv KV) Architecture() string {
@ -50,6 +63,18 @@ func (kv KV) FileType() FileType {
return FileTypeUnknown
}
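// GGUFSplitInfo returns the split metadata ("split.no", "split.count") of a
// sharded GGUF file, or nil when the file is not part of a split model.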
func (kv KV) GGUFSplitInfo() *GGUFSplitInfo {
no, found := keyValue(kv, "split.no", uint16(0))
if !found {
return nil
}
count, _ := keyValue(kv, "split.count", uint16(0))
return &GGUFSplitInfo{
No: no,
Count: count,
}
}
func (kv KV) BlockCount() uint64 {
return uint64(kv.Uint("block_count"))
}
@ -271,7 +296,7 @@ type arrayValueTypes interface {
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "split.") {
key = kv.Architecture() + "." + key
}
@ -288,6 +313,14 @@ type Tensors struct {
Offset uint64
}
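// ForeignTensor annotates a Tensor with the shard file it comes from and the
// offset of that shard's tensor data region.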
type ForeignTensor struct {
*Tensor
ModelPath string
TensorRegionOffset uint64
}
type ForeignTensors []ForeignTensor
func (s Tensors) Items(prefix ...string) []*Tensor {
if len(prefix) == 0 {
return s.items
@ -326,6 +359,41 @@ func (ts Tensors) GroupLayers() map[string]Layer {
return layers
}
func (s ForeignTensors) Items(prefix ...string) []*Tensor {
var items []*Tensor
for i := range s {
if len(prefix) == 0 || strings.HasPrefix(s[i].Name, prefix[0]) {
items = append(items, s[i].Tensor)
}
}
return items
}
func (ts ForeignTensors) GroupLayers() map[string]Layer {
layers := make(map[string]Layer)
for i := range ts {
t := ts[i].Tensor
parts := strings.Split(t.Name, ".")
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
if len(parts) > index+2 {
// blk and mm should have a number after them, join it
parts = append(
[]string{strings.Join(parts[:index+2], ".")},
parts[index+2:]...)
}
}
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
}
type Layer map[string]*Tensor
func (l Layer) Size() (size uint64) {
@ -553,7 +621,93 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
}, nil
}
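// BuildForeignTensors flattens the tensors of every shard into a single list,
// tagging each tensor with its shard path and that shard's tensor data offset.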
func BuildForeignTensors(shards []GGML, shardsPaths []string) (*ForeignTensors, error) {
if len(shards) != len(shardsPaths) {
return nil, fmt.Errorf("length of shards and shardsPaths do not match: %d vs %d", len(shards), len(shardsPaths))
}
li := make(ForeignTensors, 0)
for i := range shards {
gs := shards[i]
tensors := gs.Tensors()
for k := range tensors.items {
tensor := tensors.items[k]
li = append(li, ForeignTensor{
Tensor: tensor,
ModelPath: shardsPaths[i],
TensorRegionOffset: tensors.Offset,
})
}
}
return &li, nil
}
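// MakeMetaGGML merges decoded shards into a single MetaGGML: shards are sorted
// by split number, the first shard's KV is cloned minus the per-split keys,
// general.parameter_count is summed across shards, and the flattened tensor
// list is built with BuildForeignTensors.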
func MakeMetaGGML(ggmls []GGML, ggmlPaths []string) MetaGGML {
type wrapper struct {
ggml GGML
path string
weight int
}
var wrappers []wrapper
for i := range ggmls {
iSplitInfo := ggmls[i].KV().GGUFSplitInfo()
weight := -1
if iSplitInfo != nil {
weight = int(iSplitInfo.No)
}
wrappers = append(wrappers, wrapper{
ggml: ggmls[i],
path: ggmlPaths[i],
weight: weight,
})
}
slices.SortStableFunc(wrappers, func(a, b wrapper) int {
return cmp.Compare(a.weight, b.weight)
})
metaGgml := MetaGGML{}
var paramCount uint64
for i := range wrappers {
paramCount += wrappers[i].ggml.KV().ParameterCount()
if i == 0 {
kv := maps.Clone(wrappers[i].ggml.KV())
// remove the per-split keys so the merged KV describes the whole model; add more if needed
delete(kv, "split.no")
delete(kv, "split.count")
delete(kv, "split.tensors.count")
delete(kv, "general.parameter_count")
metaGgml.kv = kv
}
metaGgml.Shards = append(metaGgml.Shards, wrappers[i].ggml)
metaGgml.ShardPaths = append(metaGgml.ShardPaths, wrappers[i].path)
}
metaGgml.kv["general.parameter_count"] = paramCount
ft, _ := BuildForeignTensors(metaGgml.Shards, metaGgml.ShardPaths)
metaGgml.Tensors = *ft
return metaGgml
}
func simpleWrapGGML(ggml GGML) MetaGGML {
// simply wrap single GGML, without creating foreign tensors
return MetaGGML{
Shards: []GGML{ggml},
ShardPaths: []string{""},
kv: ggml.KV(),
}
}
func WrapGGML(ggml GGML) MetaGGML {
metaggml := simpleWrapGGML(ggml)
ft, _ := BuildForeignTensors(metaggml.Shards, metaggml.ShardPaths)
metaggml.Tensors = *ft
return metaggml
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
return WrapGGML(f).GraphSize(context, batch, numParallel, kvCacheType, useFlashAttention)
}
func (f MetaGGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
embedding := f.KV().EmbeddingLength()
@ -567,7 +721,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
layers := f.Tensors().GroupLayers()
layers := f.Tensors.GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
@ -665,7 +819,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
)
var ropeFreqsCount uint64
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
if ropeFreqs, ok := f.Tensors.GroupLayers()["rope_freqs"]; ok {
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
ropeFreqsCount = ropeFreqsWeights.Elements()
}
@ -805,6 +959,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
return simpleWrapGGML(f).SupportsKVCacheType(cacheType)
}
func (f MetaGGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
@ -822,6 +979,10 @@ func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
return simpleWrapGGML(f).SupportsFlashAttention()
}
func (f MetaGGML) SupportsFlashAttention() bool {
_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
if isEmbedding {
return false
@ -839,6 +1000,10 @@ func (f GGML) SupportsFlashAttention() bool {
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
return simpleWrapGGML(f).FlashAttention()
}
func (f MetaGGML) FlashAttention() bool {
return slices.Contains([]string{
"bert",
"gemma3",
@ -863,3 +1028,15 @@ func kvCacheBytesPerElement(cacheType string) float64 {
return 2 // f16 (default)
}
}
func (f MetaGGML) KV() KV {
return f.kv
}
func (f MetaGGML) TotalTensorBytes() uint64 {
totalBytes := uint64(0)
for i := range f.Shards {
totalBytes += uint64(f.Shards[i].Length) - f.Shards[i].Tensors().Offset
}
return totalBytes
}
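For orientation, here is a minimal sketch (not part of the change) of how the new helpers compose when loading a split model; it assumes the import path `github.com/ollama/ollama/fs/ggml` and two hypothetical shard files:
```go
package main

import (
	"fmt"
	"os"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

// decodeShard decodes a single GGUF file, collecting arrays up to 1024 entries.
func decodeShard(path string) (*fsggml.GGML, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return fsggml.Decode(f, 1024)
}

func main() {
	// Hypothetical shard paths for a model split into two GGUF files.
	paths := []string{
		"model-00001-of-00002.gguf",
		"model-00002-of-00002.gguf",
	}

	var shards []fsggml.GGML
	for _, p := range paths {
		g, err := decodeShard(p)
		if err != nil {
			panic(err)
		}
		shards = append(shards, *g)
	}

	// MakeMetaGGML orders the shards by split.no, merges their KV maps, sums
	// general.parameter_count, and flattens all tensors with their shard paths
	// attached so each tensor can be read from the correct file.
	meta := fsggml.MakeMetaGGML(shards, paths)
	fmt.Println("tensors:", len(meta.Tensors.Items()), "params:", meta.KV().ParameterCount())
}
```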

View File

@ -582,7 +582,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
if !strings.HasPrefix(k, arch+".") &&
!strings.HasPrefix(k, "general.") &&
!strings.HasPrefix(k, "adapter.") &&
!strings.HasPrefix(k, "tokenizer.") {
!strings.HasPrefix(k, "tokenizer.") &&
!strings.HasPrefix(k, "split.") {
k = arch + "." + k
}
@ -597,6 +598,8 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
var err error
switch v := v.(type) {
case uint16:
err = writeGGUF(ws, ggufTypeUint16, v)
case int32:
err = writeGGUF(ws, ggufTypeInt32, v)
case int64:

go.mod
View File

@ -77,7 +77,7 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/spf13/pflag v1.0.5
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect

View File

@ -261,7 +261,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
return true
}
func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
func LoadModelFromFile(modelPath string, extraModelPaths []string, params ModelParams) (*Model, error) {
cparams := C.llama_model_default_params()
cparams.n_gpu_layers = C.int(params.NumGpuLayers)
cparams.main_gpu = C.int32_t(params.MainGpu)
@ -305,7 +305,17 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.progress_callback_user_data = unsafe.Pointer(&handle)
}
m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
var splitPaths []*C.char
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
splitPaths = append(splitPaths, mp)
for i := range extraModelPaths {
mp := C.CString(extraModelPaths[i])
defer C.free(unsafe.Pointer(mp))
splitPaths = append(splitPaths, mp)
}
m := Model{c: C.llama_model_load_from_splits(&splitPaths[0], C.size_t(len(splitPaths)), cparams)}
if m.c == nil {
return nil, fmt.Errorf("unable to load model: %s", modelPath)
}

View File

@ -84,12 +84,13 @@ type LlamaServer interface {
// llmServer is an instance of a runner hosting a single model
type llmServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
extraModelPaths []string
loadRequest LoadRequest // Parameters used to initialize the runner
mem *ml.BackendMemory // Memory allocations for this model
@ -109,7 +110,7 @@ type llmServer struct {
type llamaServer struct {
llmServer
ggml *ggml.GGML
ggml *ggml.MetaGGML
}
type ollamaServer struct {
@ -123,7 +124,7 @@ type ollamaServer struct {
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
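// extraModels lists the remaining shards of a split GGUF model; when
// reliefSplitConstrain is true the completeness checks on the split sequence
// are skipped.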
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
func LoadModel(model string, extraModels []string, maxArraySize int, reliefSplitConstrain bool) (*ggml.MetaGGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@ -134,12 +135,55 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}
defer f.Close()
ggml, err := ggml.Decode(f, maxArraySize)
return ggml, err
ggml1, err := ggml.Decode(f, maxArraySize)
if err != nil {
return nil, err
}
if ggml1.KV().GGUFSplitInfo() != nil {
if ggml1.KV().GGUFSplitInfo().No != 0 {
return nil, errors.New("not the first split of model")
}
loadedGgml := []ggml.GGML{*ggml1}
visitedSplitNo := []uint16{ggml1.KV().GGUFSplitInfo().No}
for i := range extraModels {
extraModel := extraModels[i]
f, err := os.Open(extraModel)
if err != nil {
return nil, err
}
defer f.Close()
ggml1, err := ggml.Decode(f, maxArraySize)
if err != nil {
return nil, err
}
if ggml1.KV().GGUFSplitInfo() == nil {
return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf")
}
visitedSplitNo = append(visitedSplitNo, ggml1.KV().GGUFSplitInfo().No)
loadedGgml = append(loadedGgml, *ggml1)
}
if !reliefSplitConstrain {
if len(visitedSplitNo) != int(ggml1.KV().GGUFSplitInfo().Count) {
return nil, errors.New("mismatch split gguf count")
}
slices.Sort(visitedSplitNo)
for i := 0; i < len(visitedSplitNo)-1; i++ {
if visitedSplitNo[i] != visitedSplitNo[i+1]-1 {
return nil, errors.New("repeated or skipped split found")
}
}
}
metaggml := ggml.MakeMetaGGML(loadedGgml, append([]string{model}, extraModels...))
return &metaggml, nil
} else {
metaggml := ggml.MakeMetaGGML([]ggml.GGML{*ggml1}, []string{model})
return &metaggml, nil
}
}
// NewLlamaServer will run a server for the given GPUs
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, extraModelPaths []string, f *ggml.MetaGGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var llamaModel *llama.Model
var textProcessor model.TextProcessor
var err error
@ -155,7 +199,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}
if textProcessor == nil {
llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
llamaModel, err = llama.LoadModelFromFile(modelPath, extraModelPaths, llama.ModelParams{VocabOnly: true})
if err != nil {
return nil, err
}
@ -262,24 +306,26 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
cmd, port, err := StartRunner(
textProcessor != nil,
modelPath,
extraModelPaths,
gpuLibs,
status,
ml.GetVisibleDevicesEnv(gpus, false),
)
s := llmServer{
port: port,
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
port: port,
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
extraModelPaths: extraModelPaths,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
}
if err != nil {
@ -316,7 +362,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
func StartRunner(ollamaEngine bool, modelPath string, extraModelPaths []string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
var exe string
exe, err = os.Executable()
if err != nil {
@ -346,6 +392,9 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
if modelPath != "" {
params = append(params, "--model", modelPath)
}
for i := range extraModelPaths {
params = append(params, "--model", extraModelPaths[i])
}
params = append(params, "--port", strconv.Itoa(port))
var pathEnv string
@ -440,6 +489,10 @@ func (s *llmServer) ModelPath() string {
return s.modelPath
}
func (s *llmServer) ExtraModelPaths() []string {
return s.extraModelPaths
}
type LoadOperation int
// The order of these constants are significant because we iterate over the operations. They
@ -522,7 +575,7 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
// Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers()
layers := s.ggml.Tensors.GroupLayers()
if blk0, ok := layers["blk.0"]; ok {
buffer := blk0.Size() + kv[0]
for i := range gpus {

View File

@ -73,9 +73,9 @@ type BackendParams struct {
FlashAttention FlashAttentionType
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
var backends = make(map[string]func(string, []string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
func RegisterBackend(name string, f func(string, []string, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
@ -83,9 +83,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
backends[name] = f
}
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
func NewBackend(modelPath string, extraModelPaths []string, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(modelPath, params)
return backend(modelPath, extraModelPaths, params)
}
return nil, fmt.Errorf("unsupported backend")

View File

@ -77,7 +77,7 @@ type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
meta *fsggml.MetaGGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
@ -120,17 +120,55 @@ type Backend struct {
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
smallmeta, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
var meta fsggml.MetaGGML
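// If the first file is a split shard, decode every extra path, verify that the
// shard numbers form the complete 0..count-1 sequence, and merge everything
// into one MetaGGML; otherwise wrap the single file.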
if smallmeta.KV().GGUFSplitInfo() != nil {
if smallmeta.KV().GGUFSplitInfo().No != 0 {
return nil, errors.New("not the first split of model")
}
loadedGgml := []fsggml.GGML{*smallmeta}
visitedSplitNo := []uint16{smallmeta.KV().GGUFSplitInfo().No}
for i := range extraModelPaths {
extraModel := extraModelPaths[i]
f, err := os.Open(extraModel)
if err != nil {
return nil, err
}
defer f.Close()
smallmeta, err := fsggml.Decode(f, -1)
if err != nil {
return nil, err
}
if smallmeta.KV().GGUFSplitInfo() == nil {
return nil, errors.New("non-split gguf in extra model paths while main model path is split gguf")
}
visitedSplitNo = append(visitedSplitNo, smallmeta.KV().GGUFSplitInfo().No)
loadedGgml = append(loadedGgml, *smallmeta)
}
if len(visitedSplitNo) != int(smallmeta.KV().GGUFSplitInfo().Count) {
return nil, errors.New("mismatch split gguf count")
}
slices.Sort(visitedSplitNo)
for i := 0; i < len(visitedSplitNo)-1; i++ {
if visitedSplitNo[i] != visitedSplitNo[i+1]-1 {
return nil, errors.New("repeated or skipped split found")
}
}
meta = fsggml.MakeMetaGGML(loadedGgml, append([]string{modelPath}, extraModelPaths...))
} else {
meta = fsggml.MakeMetaGGML([]fsggml.GGML{*smallmeta}, []string{modelPath})
}
once.Do(func() {
slog.Info(
@ -139,7 +177,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_tensors", len(meta.Tensors.Items()),
"num_key_values", len(meta.KV()),
)
})
@ -227,7 +265,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
// outputs are assigned iff allowed by splits and configured number of gpu layers
output := assignLayer(blocks)
maxTensors := len(meta.Tensors().Items())
maxTensors := len(meta.Tensors.Items())
maxTensors += 1
// each layer has at most 2 extra tensors for rope operations
maxTensors += blocks * 2
@ -303,11 +341,11 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return false
}
for _, t := range meta.Tensors().Items() {
for _, t := range meta.Tensors.Items() {
switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts, -1)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
if _, ok := meta.Tensors.GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
}
case contains(t.Name, "cls", "output", "output_norm",
@ -378,7 +416,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
}
maxGraphNodes := max(1024, len(meta.Tensors().Items())*8)
maxGraphNodes := max(1024, len(meta.Tensors.Items())*8)
sched := C.ggml_backend_sched_new_ext(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
@ -423,7 +461,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention,
meta: meta,
meta: &meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: sched,
@ -494,11 +532,12 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
totalBytes := b.meta.TotalTensorBytes()
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range b.meta.Tensors().Items() {
for i := range b.meta.Tensors {
t := b.meta.Tensors[i]
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
for i := range tts {
@ -517,13 +556,13 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(b.modelPath)
file, err := os.Open(t.ModelPath)
if err != nil {
slog.Warn("file open error", "file", b.modelPath, "error", err)
slog.Warn("file open error", "file", t.ModelPath, "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
sr := io.NewSectionReader(file, int64(t.TensorRegionOffset+t.Offset), int64(t.Size()))
if t.Kind == 4 && tts[0]._type == 39 {
// source is mxfp4, target is ggml mxfp4

View File

@ -24,7 +24,7 @@ func setup(tb testing.TB) ml.Context {
tb.Fatal(err)
}
b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
b, err := ml.NewBackend(f.Name(), make([]string, 0), ml.BackendParams{AllocMemory: true})
if err != nil {
tb.Fatal(err)
}

View File

@ -102,8 +102,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(modelPath string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, params)
func New(modelPath string, extraModelPaths []string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, extraModelPaths, params)
if err != nil {
return nil, err
}

View File

@ -4,7 +4,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"log/slog"
@ -19,6 +18,7 @@ import (
"time"
"unicode/utf8"
"github.com/spf13/pflag"
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
@ -257,6 +257,8 @@ type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
extraModelPaths []string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
@ -829,6 +831,7 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
func (s *Server) loadModel(
params llama.ModelParams,
mpath string,
empath []string,
lpath []string,
ppath string,
kvSize int,
@ -838,7 +841,7 @@ func (s *Server) loadModel(
multiUserCache bool,
) {
var err error
s.model, err = llama.LoadModelFromFile(mpath, params)
s.model, err = llama.LoadModelFromFile(mpath, empath, params)
if err != nil {
panic(err)
}
@ -931,7 +934,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
}
s.status = llm.ServerStatusLoadingModel
go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
go s.loadModel(params, s.modelPath, s.extraModelPaths, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
case llm.LoadOperationClose:
// No-op for us
@ -949,13 +952,14 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
// pflag does not expose out(); fall back to os.Stderr, which behaves identically since we never set fs.output
fmt.Fprintf(os.Stderr, "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
@ -967,8 +971,9 @@ func Execute(args []string) error {
llama.BackendInit()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
modelPath: (*mpath)[0],
extraModelPaths: (*mpath)[1:],
status: llm.ServerStatusLaunched,
}
server.ready.Add(1)

View File

@ -5,7 +5,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/maphash"
"image"
@ -23,6 +22,7 @@ import (
"time"
"unicode/utf8"
"github.com/spf13/pflag"
"golang.org/x/image/bmp"
"golang.org/x/sync/semaphore"
@ -331,6 +331,8 @@ type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
extraModelPaths []string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
@ -1169,6 +1171,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
// based on the given parameters
func (s *Server) allocModel(
mpath string,
empath []string,
params ml.BackendParams,
loraPath []string,
parallel int,
@ -1193,7 +1196,7 @@ func (s *Server) allocModel(
}()
var err error
s.model, err = model.New(mpath, params)
s.model, err = model.New(mpath, empath, params)
if err != nil {
return err
}
@ -1302,7 +1305,7 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.batchSize = req.BatchSize
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
err := s.allocModel(s.modelPath, s.extraModelPaths, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
if err != nil {
s.closeModel()
@ -1372,7 +1375,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
return
}
m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
m, err = model.New(f.Name(), make([]string, 0), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
if err != nil {
http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
return
@ -1389,13 +1392,14 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
fs := pflag.NewFlagSet("runner", pflag.ExitOnError)
mpath := fs.StringArray("model", []string{""}, "Path to model binary file. May be specified repeatedly to provide the remaining splits of the model binary.")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
// pflag does not expose out(); fall back to os.Stderr, which behaves identically since we never set fs.output
fmt.Fprintf(os.Stderr, "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
@ -1408,8 +1412,9 @@ func Execute(args []string) error {
defer cancel()
server := &Server{
modelPath: *mpath,
status: llm.ServerStatusLaunched,
modelPath: (*mpath)[0],
extraModelPaths: (*mpath)[1:],
status: llm.ServerStatusLaunched,
}
server.cond = sync.NewCond(&server.mu)

View File

@ -39,8 +39,97 @@ var (
errUnknownType = errors.New("unknown type")
errNeitherFromOrFiles = errors.New("neither 'from' or 'files' was specified")
errFilePath = errors.New("file path must be relative")
errIncompleteShardedGGUF = errors.New("missing some GGUF splits")
errExtraShardedGGUF = errors.New("extra GGUF splits found")
)
func broadcastKV(main *ggml.GGML, subs ...*ggml.GGML) {
// broadcastKV copies the merged KV values to the other shards; used only for manifest purposes
ggmls := []ggml.GGML{*main}
for i := range subs {
ggmls = append(ggmls, *subs[i])
}
metaggml := ggml.MakeMetaGGML(ggmls, make([]string, len(ggmls)))
mainKV := main.KV()
mainKV["general.parameter_count"] = metaggml.KV().ParameterCount()
for i := range subs {
subKV := subs[i].KV()
for k, v := range metaggml.KV() {
subKV[k] = v
}
}
}
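// baseLayerSortNCheckSan sorts base layers so that GGUF splits come first in
// split-number order (layers without GGML sort last) and verifies that the
// splits form exactly one complete, gap-free sequence.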
func baseLayerSortNCheckSan(baseLayers *[]*layerGGML) error {
slices.SortStableFunc(*baseLayers, func(a, b *layerGGML) int {
var aScore, bScore int
if a.GGML == nil {
// chat template and parameter can be added here. use very big number to move them at last
aScore = 0x7fffffff
} else {
aSplit := a.GGML.KV().GGUFSplitInfo()
if aSplit == nil {
aScore = -1
} else {
aScore = int(aSplit.No)
}
}
if b.GGML == nil {
bScore = 0x7fffffff
} else {
bSplit := b.GGML.KV().GGUFSplitInfo()
if bSplit == nil {
bScore = -1
} else {
bScore = int(bSplit.No)
}
}
return cmp.Compare(aScore, bScore)
})
// sanity check for layers
{
ggmlPtrs := make([]*ggml.GGML, 0, len(*baseLayers))
firstSplitCount := -1
foundSplitNos := make([]uint16, 0)
for i, layer := range *baseLayers {
if i == 0 {
if layer.GGML == nil {
// First item should be GGUF after sorting
return errNoFilesProvided
}
}
if layer.GGML != nil && layer.GGML.KV().GGUFSplitInfo() != nil {
if firstSplitCount == -1 {
if layer.GGML.KV().GGUFSplitInfo().No != 0 {
return errIncompleteShardedGGUF
}
firstSplitCount = int(layer.GGML.KV().GGUFSplitInfo().Count)
foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No)
} else if firstSplitCount != int(layer.KV().GGUFSplitInfo().Count) {
return errExtraShardedGGUF
} else {
if foundSplitNos[len(foundSplitNos)-1] == layer.KV().GGUFSplitInfo().No {
return errExtraShardedGGUF
} else if foundSplitNos[len(foundSplitNos)-1] != layer.KV().GGUFSplitInfo().No-1 {
return errIncompleteShardedGGUF
} else {
foundSplitNos = append(foundSplitNos, layer.KV().GGUFSplitInfo().No)
}
}
// only gguf splits should be included
ggmlPtrs = append(ggmlPtrs, layer.GGML)
}
}
if firstSplitCount != -1 && len(foundSplitNos) != firstSplitCount {
return errIncompleteShardedGGUF
}
if len(ggmlPtrs) > 1 {
broadcastKV(ggmlPtrs[0], ggmlPtrs[1:]...)
}
}
return nil
}
func (s *Server) CreateHandler(c *gin.Context) {
config := &model.ConfigV2{
OS: "linux",
@ -161,6 +250,14 @@ func (s *Server) CreateHandler(c *gin.Context) {
ch <- gin.H{"error": errNeitherFromOrFiles.Error(), "status": http.StatusBadRequest}
return
}
// Sort baseLayers so that the shards of a split model are stored in the correct order
if !remote {
err := baseLayerSortNCheckSan(&baseLayers)
if err != nil {
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
return
}
}
var adapterLayers []*layerGGML
if !remote && r.Adapters != nil {

View File

@ -53,18 +53,19 @@ type registryOptions struct {
}
type Model struct {
Name string `json:"name"`
Config model.ConfigV2
ShortName string
ModelPath string
ParentModel string
AdapterPaths []string
ProjectorPaths []string
System string
License []string
Digest string
Options map[string]any
Messages []api.Message
Name string `json:"name"`
Config model.ConfigV2
ShortName string
ModelPath string
ExtraModelPaths []string
ParentModel string
AdapterPaths []string
ProjectorPaths []string
System string
License []string
Digest string
Options map[string]any
Messages []api.Message
Template *template.Template
}
@ -190,6 +191,13 @@ func (m *Model) String() string {
Args: m.ModelPath,
})
for _, extraModels := range m.ExtraModelPaths {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "model",
Args: extraModels,
})
}
for _, adapter := range m.AdapterPaths {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "adapter",
@ -319,6 +327,8 @@ func GetModel(name string) (*Model, error) {
}
}
readMainModelFlag := false
for _, layer := range manifest.Layers {
filename, err := GetBlobsPath(layer.Digest)
if err != nil {
@ -327,8 +337,13 @@ func GetModel(name string) (*Model, error) {
switch layer.MediaType {
case "application/vnd.ollama.image.model":
model.ModelPath = filename
model.ParentModel = layer.From
if !readMainModelFlag {
model.ModelPath = filename
model.ParentModel = layer.From
readMainModelFlag = true
} else {
model.ExtraModelPaths = append(model.ExtraModelPaths, filename)
}
case "application/vnd.ollama.image.embed":
// Deprecated in versions > 0.1.2
// TODO: remove this warning in a future version

View File

@ -1201,14 +1201,14 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
return resp, nil
}
func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
func getModelData(digest string, verbose bool) (ggml.KV, ggml.ForeignTensors, error) {
maxArraySize := 0
if verbose {
maxArraySize = -1
}
data, err := llm.LoadModel(digest, maxArraySize)
data, err := llm.LoadModel(digest, make([]string, 0), maxArraySize, true)
if err != nil {
return nil, ggml.Tensors{}, err
return nil, make(ggml.ForeignTensors, 0), err
}
kv := data.KV()
@ -1221,7 +1221,7 @@ func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
}
}
return kv, data.Tensors(), nil
return kv, data.Tensors, nil
}
func (s *Server) ListHandler(c *gin.Context) {

View File

@ -954,3 +954,236 @@ func TestDetectModelTypeFromFiles(t *testing.T) {
}
})
}
func TestShardedGGUF(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
_, fullDigest := createBinFile(t, ggml.KV{}, []*ggml.Tensor{})
_, splitDigest1 := createBinFile(t, ggml.KV{
"split.no": uint16(0),
"split.count": uint16(3),
}, []*ggml.Tensor{})
_, splitDigest2 := createBinFile(t, ggml.KV{
"split.no": uint16(1),
"split.count": uint16(3),
}, []*ggml.Tensor{})
_, splitDigest3 := createBinFile(t, ggml.KV{
"split.no": uint16(2),
"split.count": uint16(3),
}, []*ggml.Tensor{})
_, splitDigest4 := createBinFile(t, ggml.KV{
"split.no": uint16(0),
"split.count": uint16(4),
}, []*ggml.Tensor{})
_, splitDigest5 := createBinFile(t, ggml.KV{
"general.architecture": "test1",
"split.no": uint16(1),
"split.count": uint16(3),
}, []*ggml.Tensor{})
var s Server
t.Run("single full gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-single-full",
Files: map[string]string{"test.gguf": fullDigest},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
manifest, err := ParseNamedManifest(model.ParseName("test-single-full"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i != 0 {
t.Fatalf("expect 1 layer, actually found layer with index %d", i)
} else if layer.Digest != fullDigest {
t.Fatalf("expect digest %s, actual %s", fullDigest, layer.Digest)
}
}
})
t.Run("complete split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-complete-split",
Files: map[string]string{
"test-00001-of-00003.gguf": splitDigest1,
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
correctOrder := []string{
splitDigest1, splitDigest2, splitDigest3,
}
manifest, err := ParseNamedManifest(model.ParseName("test-complete-split"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i >= 3 {
t.Fatalf("expect 3 layers, actually found layer with index %d", i)
} else if layer.Digest != correctOrder[i] {
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
}
}
})
t.Run("complete split misordered gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-complete-split-misorder",
Files: map[string]string{
"test-00003-of-00003.gguf": splitDigest3,
"test-00001-of-00003.gguf": splitDigest1,
"test-00002-of-00003.gguf": splitDigest2,
},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
correctOrder := []string{
splitDigest1, splitDigest2, splitDigest3,
}
manifest, err := ParseNamedManifest(model.ParseName("test-complete-split-misorder"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i >= 3 {
t.Fatalf("expect 3 layers, actually found layer with index %d", i)
} else if layer.Digest != correctOrder[i] {
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
}
}
})
t.Run("mixed full and split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-full-split-mixing",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
"test1.gguf": fullDigest,
"test-00001-of-00003.gguf": splitDigest1,
},
Stream: &stream,
})
if w.Code != http.StatusOK {
fmt.Println(w)
t.Fatalf("expected status code 200, actual %d", w.Code)
}
correctOrder := []string{
fullDigest, splitDigest1, splitDigest2, splitDigest3,
}
manifest, err := ParseNamedManifest(model.ParseName("test-full-split-mixing"))
if err != nil {
t.Fatalf("parse manifest: %v", err)
}
for i, layer := range manifest.Layers {
if i >= 4 {
t.Fatalf("expect 4 layers, actually found layer with index %d", i)
} else if layer.Digest != correctOrder[i] {
t.Fatalf("expect digest %s, actual %s", correctOrder[i], layer.Digest)
}
}
})
t.Run("mixed wrong split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
"test-00001-of-00003.gguf": splitDigest1,
"test1-00001-of-00004.gguf": splitDigest4,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("mixed same count wrong split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
"test-00001-of-00003.gguf": splitDigest1,
"test1-00002-of-00003.gguf": splitDigest5,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("missing head split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00002-of-00003.gguf": splitDigest2,
"test-00003-of-00003.gguf": splitDigest3,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("missing mid split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00001-of-00003.gguf": splitDigest1,
"test-00003-of-00003.gguf": splitDigest3,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("missing tail split gguf", func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test-extra-split",
Files: map[string]string{
"test-00001-of-00003.gguf": splitDigest1,
"test-00002-of-00003.gguf": splitDigest2,
},
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
}

View File

@ -39,7 +39,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -232,7 +232,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

View File

@ -44,7 +44,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
@ -228,7 +228,7 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,

View File

@ -71,8 +71,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return
}
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, []string, *ggml.MetaGGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ []string, _ *ggml.MetaGGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil
}
}
@ -182,7 +182,7 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -898,7 +898,7 @@ func TestGenerate(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -1382,7 +1382,7 @@ func TestGenerateLogprobs(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{llama: mock}
return false
},
@ -1562,7 +1562,7 @@ func TestChatLogprobs(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{llama: mock}
return false
},
@ -1672,7 +1672,7 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false

View File

@ -265,7 +265,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -416,7 +416,7 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -598,7 +598,7 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
getGpuFn: getGpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.MetaGGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}

View File

@ -49,8 +49,8 @@ type Scheduler struct {
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
loadFn func(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getSystemInfoFn func() ml.SystemInfo
waitForRecovery time.Duration
@ -196,7 +196,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
// Load model for fitting
logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
ggml, err := llm.LoadModel(pending.model.ModelPath, pending.model.ExtraModelPaths, 1024, false)
if err != nil {
pending.errCh <- err
break
@ -389,7 +389,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
func (s *Scheduler) load(req *LlmRequest, f *ggml.MetaGGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1)
// Embedding models should always be loaded with parallel=1
@ -414,7 +414,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
if llama == nil {
var err error
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, req.model.ExtraModelPaths, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to

View File

@ -39,7 +39,7 @@ func TestSchedLoad(t *testing.T) {
defer done()
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
var f *ggml.GGML // value not used in tests
var f *ggml.MetaGGML // value not used in tests
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
@ -49,7 +49,7 @@ func TestSchedLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second},
}
// Fail to load model first
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah")
}
gpus := []ml.DeviceInfo{}
@ -64,7 +64,7 @@ func TestSchedLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
@ -103,10 +103,10 @@ type reqBundle struct {
ctxDone func()
srv *mockLlm
req *LlmRequest
f *ggml.GGML
f *ggml.MetaGGML
}
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}
@ -132,7 +132,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
})
model := &Model{Name: modelName, ModelPath: p}
f, err := llm.LoadModel(model.ModelPath, 0)
f, err := llm.LoadModel(model.ModelPath, make([]string, 0), 0, true)
if err != nil {
t.Fatal(err)
}
@ -462,11 +462,11 @@ func TestSchedExpireRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
}
var f *ggml.GGML
var f *ggml.MetaGGML
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, extraModelPaths []string, f *ggml.MetaGGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}