Merge remote-tracking branch 'upstream/main' into VulkanV3Update
@@ -195,7 +195,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
         slog.Warn("model missing blk.0 layer size")
     }

-    useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
+    useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
         (discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
         f.SupportsFlashAttention()

@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     }

     // on metal there's no partial offload overhead
-    if gpus[0].Library == "metal" {
+    if gpus[0].Library == "Metal" {
         graphPartialOffload = graphFullOffload
     } else if len(gpus) > 1 {
         // multigpu should always use the partial graph size
@@ -12,6 +12,7 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/fs/ggml"
+    "github.com/ollama/ollama/ml"
 )

 func TestEstimateGPULayers(t *testing.T) {
@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
     // Simple CPU scenario
     gpus := []discover.GpuInfo{
         {
-            Library: "cpu",
+            DeviceID: ml.DeviceID{
+                Library: "cpu",
+            },
         },
     }
     projectors := []string{}
@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
     gpuMinimumMemory := uint64(2048)
     gpus = []discover.GpuInfo{
         {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
             MinimumMemory: gpuMinimumMemory,
         },
         {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
             MinimumMemory: gpuMinimumMemory,
         },
     }
llm/server.go
@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {

 type LlamaServer interface {
     ModelPath() string
-    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
+    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
     Ping(ctx context.Context) error
     WaitUntilRunning(ctx context.Context) error
     Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,9 @@ type LlamaServer interface {
     Close() error
     VRAMSize() uint64 // Total VRAM across all GPUs
     TotalSize() uint64
-    VRAMByGPU(gpuID string) uint64
+    VRAMByGPU(id ml.DeviceID) uint64
     Pid() int
+    GetPort() int
     GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
     HasExited() bool
 }
@@ -195,14 +196,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         loadRequest.ProjectorPath = projectors[0]
     }

+    fa := envconfig.FlashAttention(f.FlashAttention())
+
     // This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
     // that can handle it.
-    fa := envconfig.FlashAttention()
-    if f.FlashAttention() {
-        slog.Info("model wants flash attention")
-        fa = true
-    }
-
     if fa && !gpus.FlashAttentionSupported() {
         slog.Warn("flash attention enabled but not supported by gpu")
         fa = false
@@ -303,22 +300,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         pathEnv = "LD_LIBRARY_PATH"
     }

-    s := llmServer{
-        port: port,
-        cmd: exec.Command(exe, params...),
-        status: NewStatusWriter(os.Stderr),
-        options: opts,
-        modelPath: modelPath,
-        loadRequest: loadRequest,
-        llamaModel: llamaModel,
-        llamaModelLock: &sync.Mutex{},
-        textProcessor: textProcessor,
-        numParallel: numParallel,
-        sem: semaphore.NewWeighted(int64(numParallel)),
-        totalLayers: f.KV().BlockCount() + 1,
-        loadStart: time.Now(),
-        done: make(chan error, 1),
-    }
+    // Note: we always put our dependency paths first
+    // since these are the exact version we compiled/linked against
+    libraryPaths := []string{discover.LibOllamaPath}
+    if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+        libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+    }
+
+    ggmlPaths := []string{discover.LibOllamaPath}
+    for _, c := range compatible {
+        if libpath, ok := availableLibs[c]; ok {
+            slog.Debug("adding gpu library", "path", libpath)
+            libraryPaths = append([]string{libpath}, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, libpath)
+        }
+    }
+
+    for _, gpu := range gpus {
+        if gpu.DependencyPath != nil {
+            slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
+            libraryPaths = append(gpu.DependencyPath, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
+        }
+    }
+
+    // finally, add the root library path
+    libraryPaths = append(libraryPaths, discover.LibOllamaPath)
+
+    s := llmServer{
+        port: port,
+        cmd: exec.Command(exe, params...),
+        status: NewStatusWriter(os.Stderr),
+        options: opts,
+        modelPath: modelPath,
+        loadRequest: loadRequest,
+        llamaModel: llamaModel,
+        llamaModelLock: &sync.Mutex{},
+        textProcessor: textProcessor,
+        numParallel: numParallel,
+        sem: semaphore.NewWeighted(int64(numParallel)),
+        totalLayers: f.KV().BlockCount() + 1,
+        loadStart: time.Now(),
+        done: make(chan error, 1),
+    }

     s.cmd.Env = os.Environ()
     s.cmd.Stdout = os.Stdout
@@ -327,9 +351,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a

     s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))

-    // Always filter down the set of GPUs in case there are any unsupported devices that might crash
-    envWorkarounds := gpus.GetVisibleDevicesEnv()
-    pathEnvVal := strings.Join(gpuLibs, string(filepath.ListSeparator))
+    // Always filter down the set of GPUs in case there are any unsupported devices that might crash
+    envWorkarounds := gpus.GetVisibleDevicesEnv()
+    pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))

     // Update or add the path variable with our adjusted version
     pathNeeded := true
@@ -452,7 +476,7 @@ type LoadResponse struct {

 var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")

-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
     systemInfo := discover.GetSystemInfo()
     systemTotalMemory := systemInfo.System.TotalMemory
     systemFreeMemory := systemInfo.System.FreeMemory
@@ -465,7 +489,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
             g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
         } else {
             slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
-            return ErrLoadRequiredFull
+            return nil, ErrLoadRequiredFull
         }
     }

@@ -474,11 +498,11 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

     if len(gpus) > 1 || gpus[0].Library != "cpu" {
         switch {
-        case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+        case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
             // disable partial offloading when model is greater than total system memory as this
             // can lead to locking up the system
             s.options.NumGPU = 0
-        case gpus[0].Library != "metal" && s.estimate.Layers == 0:
+        case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
             // Don't bother loading into the GPU if no layers can fit
             gpus = discover.GpuInfoList{discover.GetCPUInfo()}
         case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
@@ -493,7 +517,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
         available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
         if systemMemoryRequired > available {
             slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-            return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+            return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
         }
     }

@@ -508,7 +532,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

     // mmap has issues with partial offloading on metal
     for _, g := range gpus {
-        if g.Library == "metal" &&
+        if g.Library == "Metal" &&
             uint64(s.options.NumGPU) > 0 &&
             uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
             s.options.UseMMap = new(bool)
@@ -530,12 +554,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
     }

     if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
     }

     resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
     if err != nil {
-        return err
+        return nil, err
     }

     // On the Ollama engine, we can print out a summary of the memory allocations.
@@ -546,16 +570,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

     if !resp.Success {
         slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
-        return errors.New("failed to allocate memory for model")
+        return nil, errors.New("failed to allocate memory for model")
     }

     // The llama engine does its memory allocations together with model loading, so we
     // need to wait until it is done to ensure that we have accurate memory data before
     // loading the next model
     if s.textProcessor == nil {
-        return s.WaitUntilRunning(ctx)
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
     } else {
-        return nil
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
     }
 }

@@ -568,7 +592,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu

     gpuLayers := make(ml.GPULayersList, len(gpus))
     for i := range gpuLayers {
-        gpuLayers[i].ID = gpus[i].ID
+        gpuLayers[i].DeviceID = gpus[i].DeviceID
     }

     var sum float32
@@ -616,7 +640,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
 //
 // This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
 // allowing for faster iteration, but may return less information.
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+//
+// Returns the list of GPU IDs that were used in the final allocation on success
+func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
     var success bool
     defer func() {
         if !success {
@@ -641,7 +667,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
         if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
             available = 0
         }
-        slog.Info("gpu memory", "id", gpu.ID,
+        slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
             "available", format.HumanBytes2(available),
             "free", format.HumanBytes2(gpu.FreeMemory),
             "minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -654,11 +680,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ

     gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
     if err != nil {
-        return err
+        return nil, err
     }

     if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
     }

 nextOperation:
@@ -668,7 +694,7 @@ nextOperation:
         s.loadRequest.GPULayers = gpuLayers
         resp, err := s.initModel(ctx, s.loadRequest, operation)
         if err != nil {
-            return err
+            return nil, err
         }

         resp.Memory.Log(slog.LevelDebug)
@@ -680,7 +706,7 @@ nextOperation:
         for {
             newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
             if err != nil {
-                return err
+                return nil, err
             }

             slog.Debug("new layout created", "layers", newGPULayers)
@@ -714,7 +740,7 @@ nextOperation:
                 newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
                 s.options.NumGPU = -1
                 if err != nil {
-                    return err
+                    return nil, err
                 }

                 slog.Debug("new layout created", "layers", newGPULayers)
@@ -722,7 +748,7 @@ nextOperation:
             s.loadRequest.GPULayers = newGPULayers
             resp, err = s.initModel(ctx, s.loadRequest, operation)
             if err != nil {
-                return err
+                return nil, err
             }

             resp.Memory.Log(slog.LevelDebug)
@@ -731,7 +757,7 @@ nextOperation:
             if resp.Success {
                 verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
                 if err != nil {
-                    return err
+                    return nil, err
                 }

                 slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -756,7 +782,7 @@ nextOperation:
             }

             if s.options.NumGPU >= 0 {
-                return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
+                return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
             }

             // Memory allocation failed even though we created a layout that we thought should
@@ -766,7 +792,7 @@ nextOperation:
             // space.
             if backoff > 1 {
                 slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
-                return errors.New("memory layout cannot be allocated")
+                return nil, errors.New("memory layout cannot be allocated")
             } else if backoff == 0 {
                 backoff = 0.01
             } else {
@@ -781,7 +807,7 @@ nextOperation:
     s.loadRequest.GPULayers = gpuLayers
     resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
     if err != nil {
-        return err
+        return nil, err
     }

     success = resp.Success
@@ -789,10 +815,27 @@

     if !success {
         slog.Warn("failed to commit memory for model", "memory", resp.Memory)
-        return errors.New("failed to commit memory for model")
+        return nil, errors.New("failed to commit memory for model")
     }

-    return nil
+    return uniqueDeviceIDs(gpuLayers), nil
 }
+
+func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
+    devices := []ml.DeviceID{}
+    for _, layer := range gpuLayers {
+        new := true
+        for _, ID := range devices {
+            if layer.DeviceID == ID {
+                new = false
+                break
+            }
+        }
+        if new {
+            devices = append(devices, layer.DeviceID)
+        }
+    }
+    return devices
+}

 // createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
@@ -811,19 +854,19 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d

     if memory == nil {
         memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
-            Weights: make([]ml.Memory, s.totalLayers),
-            Cache: make([]ml.Memory, s.totalLayers),
+            Weights: make([]uint64, s.totalLayers),
+            Cache: make([]uint64, s.totalLayers),
         }}
     }

     layers := make([]uint64, len(memory.CPU.Weights))
     for i := range layers {
         for j := range memory.GPUs {
-            layers[i] += memory.GPUs[j].Weights[i].Size
-            layers[i] += memory.GPUs[j].Cache[i].Size
+            layers[i] += memory.GPUs[j].Weights[i]
+            layers[i] += memory.GPUs[j].Cache[i]
         }
-        layers[i] += memory.CPU.Weights[i].Size
-        layers[i] += memory.CPU.Cache[i].Size
+        layers[i] += memory.CPU.Weights[i]
+        layers[i] += memory.CPU.Cache[i]
         logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
     }

@@ -837,23 +880,23 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
     for i := range gl {
         found := false
         for j := range memory.GPUs {
-            if gl[i].ID == memory.GPUs[j].ID {
-                if memory.GPUs[j].Graph.Size != 0 {
+            if gl[i].DeviceID == memory.GPUs[j].DeviceID {
+                if memory.GPUs[j].Graph != 0 {
                     lastUsedGPU = i
                 }

-                reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph.Size
+                reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
                 if gl[i].FreeMemory > reserved {
                     gl[i].FreeMemory -= reserved
                 } else {
                     gl[i].FreeMemory = 0
                 }

-                slog.Debug("available gpu", "id", gl[i].ID,
+                slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
                     "available layer vram", format.HumanBytes2(gl[i].FreeMemory),
                     "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
                     "overhead", format.HumanBytes2(envconfig.GpuOverhead()),
-                    "graph", format.HumanBytes2(memory.GPUs[j].Graph.Size))
+                    "graph", format.HumanBytes2(memory.GPUs[j].Graph))

                 found = true
                 break
@@ -872,12 +915,12 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
     }

     // These sizes will only increase as we go through additional iterations and get additional information.
-    cpuSize := memory.InputWeights.Size + memory.CPU.Graph.Size
+    cpuSize := memory.InputWeights + memory.CPU.Graph
     var vramSize uint64
     for _, gl := range gpuLayers {
         for _, gpu := range memory.GPUs {
-            if gl.ID == gpu.ID {
-                vramSize += gpu.Graph.Size
+            if gl.DeviceID == gpu.DeviceID {
+                vramSize += gpu.Graph
                 break
             }
         }
@@ -997,7 +1040,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
 // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
 func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
     device := len(gpus) - 1
-    gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
+    gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
     freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
     for i := len(layers) - 1; i >= 0; i-- {
         if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
@@ -1015,7 +1058,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
             if device < 0 {
                 return gpuLayers
             }
-            gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
+            gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
             freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
         }
     }
@@ -1270,30 +1313,6 @@ func (s *llmServer) Pid() int {
     return -1
 }

-func (s *llmServer) GetPort() int {
-    return s.port
-}
-
-func (s *llmServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
-    // llama engine does not currently support VRAM query, short circuit
-    if s.textProcessor == nil {
-        slog.Debug("llamarunner free vram reporting not supported")
-        return nil
-    }
-    devices, err := discover.GetDevicesFromRunner(ctx, s)
-    if err != nil {
-        slog.Debug("failure refreshing GPU information", "error", err)
-    }
-    return devices
-}
-
-func (s *llmServer) HasExited() bool {
-    if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
-        return true
-    }
-    return false
-}
-
 var grammarJSON = `
 root ::= object
 value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1368,7 +1387,7 @@ type CompletionResponse struct {

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
     slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
-    slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
+    logutil.Trace("completion request", "prompt", req.Prompt)

     if len(req.Format) > 0 {
         switch string(req.Format) {
@@ -1534,7 +1553,7 @@ type EmbeddingResponse struct {
 }

 func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
-    slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
+    logutil.Trace("embedding request", "input", input)

     if err := s.sem.Acquire(ctx, 1); err != nil {
         if errors.Is(err, context.Canceled) {
@@ -1686,9 +1705,9 @@ func (s *llamaServer) TotalSize() uint64 {
     return s.estimate.TotalSize
 }

-func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
+func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
     for i, gpu := range s.gpus {
-        if gpu.ID == gpuID {
+        if gpu.DeviceID == id {
             if i < len(s.estimate.GPUSizes) {
                 return s.estimate.GPUSizes[i]
             }
@@ -1697,6 +1716,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
     return 0
 }

+func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
+    slog.Debug("llamarunner free vram reporting not supported")
+    return nil
+}
+
 func (s *ollamaServer) VRAMSize() uint64 {
     if s.mem == nil {
         return 0
@@ -1705,21 +1729,21 @@ func (s *ollamaServer) VRAMSize() uint64 {
     var mem uint64

     for _, g := range s.mem.GPUs {
-        mem += g.Allocated()
+        mem += g.Size()
     }

     // Some elements are always on CPU. However, if we have allocated all layers
     // on the GPU then include the CPU components as well, to represent complete offloading.
     noCPULayers := true
     for i := range s.mem.CPU.Weights {
-        if s.mem.CPU.Weights[i].Size != 0 || s.mem.CPU.Cache[i].Size != 0 {
+        if s.mem.CPU.Weights[i] != 0 || s.mem.CPU.Cache[i] != 0 {
             noCPULayers = false
             break
         }
     }
     if noCPULayers {
-        mem += s.mem.InputWeights.Size
-        mem += s.mem.CPU.Graph.Size
+        mem += s.mem.InputWeights
+        mem += s.mem.CPU.Graph
     }

     return mem
@@ -1730,25 +1754,37 @@ func (s *ollamaServer) TotalSize() uint64 {
         return 0
     }

-    mem := s.mem.InputWeights.Size
-    mem += s.mem.CPU.Allocated()
+    mem := s.mem.InputWeights
+    mem += s.mem.CPU.Size()
     for _, g := range s.mem.GPUs {
-        mem += g.Allocated()
+        mem += g.Size()
     }

     return mem
 }

-func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
+func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
     if s.mem == nil {
         return 0
     }

     for _, g := range s.mem.GPUs {
-        if g.ID == gpuID {
-            return g.Allocated()
+        if g.DeviceID == id {
+            return g.Size()
         }
     }

     return 0
 }
+
+func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
+    devices, err := discover.GetDevicesFromRunner(ctx, s)
+    if err != nil {
+        if s.cmd != nil && s.cmd.ProcessState == nil {
+            // Still running but hit an error, log
+            slog.Debug("failure refreshing GPU information", "error", err)
+        }
+        // else no longer running so suppress logging as a failure is expected
+    }
+    return devices
+}
+
@@ -16,8 +16,8 @@ import (

 func TestLLMServerFitGPU(t *testing.T) {
     type gpu struct {
-        library string
-        free int
+        id ml.DeviceID
+        free int
     }

     tests := []struct {
@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
         },
         {
             name: "Full single GPU",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
         },
         {
             name: "Partial single GPU",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
         },
         {
             name: "Single GPU with numGPU 1",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 1,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
         },
         {
             name: "Single GPU with numGPU 0",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 0,
             expected: ml.GPULayersList{},
         },
         {
             name: "Single GPU with numGPU 999",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
             numGPU: 999,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
         },
         {
             name: "Multi GPU fits on one",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
         },
         {
             name: "Multi GPU split",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
         },
         {
             name: "Multi GPU partial",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
         },
         {
             name: "Multi GPU numGPU 1",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
         },
         {
             name: "Multi GPU numGPU 2",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 2,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
         },
         {
             name: "Multi GPU numGPU 999",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 999,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
         },
         {
             name: "Multi GPU different libraries",
-            gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
         },
         {
             name: "requireFull",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
             numGPU: -1,
             requireFull: true,
@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {

             gpus := make(discover.GpuInfoList, len(tt.gpus))
             for i := range tt.gpus {
-                gpus[i].ID = fmt.Sprintf("gpu%d", i)
-                gpus[i].Library = tt.gpus[i].library
+                gpus[i].DeviceID = tt.gpus[i].id
                 gpus[i].FreeMemory = uint64(tt.gpus[i].free)
             }

@@ -155,18 +154,18 @@ func TestLLMServerFitGPU(t *testing.T) {
             }

             s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
-                Weights: make([]ml.Memory, s.totalLayers),
-                Cache: make([]ml.Memory, s.totalLayers),
+                Weights: make([]uint64, s.totalLayers),
+                Cache: make([]uint64, s.totalLayers),
             }, GPUs: make([]ml.DeviceMemory, len(gpus))}

             for i := range tt.layers {
-                s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
+                s.mem.CPU.Weights[i] = uint64(tt.layers[i])
             }

             for i := range s.mem.GPUs {
-                s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
-                s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
-                s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
+                s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
+                s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
+                s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
             }

             gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)