Merge remote-tracking branch 'upstream/main' into VulkanV3Update

Commit by Inforithmics, 2025-10-04 14:53:59 +02:00
397 changed files with 34413 additions and 21947 deletions


@@ -195,7 +195,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Warn("model missing blk.0 layer size")
}
useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
f.SupportsFlashAttention()
@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
if gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 {
// multigpu should always use the partial graph size
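The hunk above folds the model's own flash-attention preference into the envconfig lookup: instead of OR-ing envconfig.FlashAttention() with f.FlashAttention(), the model value is now passed in as the default and the environment variable only overrides it. A minimal sketch of a helper with that behavior, assuming the OLLAMA_FLASH_ATTENTION variable and standard-library parsing; the real envconfig package may implement this differently:

package envconfig

import (
	"os"
	"strconv"
)

// FlashAttention returns defaultVal unless OLLAMA_FLASH_ATTENTION is set to a
// parseable boolean, in which case the environment wins.
// Sketch only: it mirrors the call sites in the hunks above, not the real code.
func FlashAttention(defaultVal bool) bool {
	if raw := os.Getenv("OLLAMA_FLASH_ATTENTION"); raw != "" {
		if v, err := strconv.ParseBool(raw); err == nil {
			return v
		}
	}
	return defaultVal
}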


@@ -12,6 +12,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
func TestEstimateGPULayers(t *testing.T) {
@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
// Simple CPU scenario
gpus := []discover.GpuInfo{
{
Library: "cpu",
DeviceID: ml.DeviceID{
Library: "cpu",
},
},
}
projectors := []string{}
@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
gpuMinimumMemory := uint64(2048)
gpus = []discover.GpuInfo{
{
Library: "cuda",
DeviceID: ml.DeviceID{
Library: "cuda",
},
MinimumMemory: gpuMinimumMemory,
},
{
Library: "cuda",
DeviceID: ml.DeviceID{
Library: "cuda",
},
MinimumMemory: gpuMinimumMemory,
},
}
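These test updates replace the flat Library string on discover.GpuInfo with a nested ml.DeviceID value. A sketch of the identifier type as it can be inferred from the literals in this diff (ID and Library fields); the real definition may carry more fields, and GpuInfo presumably embeds it so that gpu.ID and gpu.Library keep resolving:

package ml

// DeviceID identifies a compute device by backend library and per-library ID,
// e.g. {Library: "CUDA", ID: "gpu0"} or {Library: "cpu"}.
// Field set inferred from this diff; the real struct may differ.
type DeviceID struct {
	ID      string
	Library string
}

Because DeviceID is a plain value type, equality checks such as gl[i].DeviceID == memory.GPUs[j].DeviceID later in this diff compare both fields at once.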


@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {
type LlamaServer interface {
ModelPath() string
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,9 @@ type LlamaServer interface {
Close() error
VRAMSize() uint64 // Total VRAM across all GPUs
TotalSize() uint64
VRAMByGPU(gpuID string) uint64
VRAMByGPU(id ml.DeviceID) uint64
Pid() int
GetPort() int
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
HasExited() bool
}
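Load now reports which devices the model actually landed on, and VRAMByGPU is keyed by ml.DeviceID rather than a bare ID string. A hypothetical caller sketch (helper name invented, not part of this diff) showing how the new return value can be consumed from code in the same package as the interface:

// logPlacement is a hypothetical helper: it loads the model and logs
// per-device VRAM use for every device that received layers.
func logPlacement(ctx context.Context, s LlamaServer, gpus discover.GpuInfoList) ([]ml.DeviceID, error) {
	deviceIDs, err := s.Load(ctx, gpus, false)
	if err != nil {
		return nil, err
	}
	for _, id := range deviceIDs {
		slog.Info("model placed on device",
			"library", id.Library, "id", id.ID,
			"vram", format.HumanBytes2(s.VRAMByGPU(id)))
	}
	return deviceIDs, nil
}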
@@ -195,14 +196,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
loadRequest.ProjectorPath = projectors[0]
}
fa := envconfig.FlashAttention(f.FlashAttention())
// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
// that can handle it.
fa := envconfig.FlashAttention()
if f.FlashAttention() {
slog.Info("model wants flash attention")
fa = true
}
if fa && !gpus.FlashAttentionSupported() {
slog.Warn("flash attention enabled but not supported by gpu")
fa = false
@@ -303,22 +300,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
pathEnv = "LD_LIBRARY_PATH"
}
s := llmServer{
port: port,
cmd: exec.Command(exe, params...),
status: NewStatusWriter(os.Stderr),
options: opts,
modelPath: modelPath,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
textProcessor: textProcessor,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := []string{discover.LibOllamaPath}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
ggmlPaths := []string{discover.LibOllamaPath}
for _, c := range compatible {
if libpath, ok := availableLibs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append([]string{libpath}, libraryPaths...)
ggmlPaths = append(ggmlPaths, libpath)
}
}
for _, gpu := range gpus {
if gpu.DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
libraryPaths = append(gpu.DependencyPath, libraryPaths...)
ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
}
}
// finally, add the root library path
libraryPaths = append(libraryPaths, discover.LibOllamaPath)
s := llmServer{
port: port,
cmd: exec.Command(exe, params...),
status: NewStatusWriter(os.Stderr),
options: opts,
modelPath: modelPath,
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
textProcessor: textProcessor,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
done: make(chan error, 1),
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
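An illustration of the path ordering rule stated in the hunk above (dependency dirs first, any pre-existing value of the PATH-style variable preserved, the root library dir appended again at the end). All paths are invented stand-ins and a Linux-style ":" list separator is assumed:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	libOllamaPath := "/usr/lib/ollama"         // stand-in for discover.LibOllamaPath
	existing := "/opt/rocm/lib:/usr/local/lib" // stand-in for the current $LD_LIBRARY_PATH
	gpuLib := "/usr/lib/ollama/cuda_v13"       // stand-in for a compatible GPU library dir

	libraryPaths := []string{libOllamaPath}
	libraryPaths = append(libraryPaths, filepath.SplitList(existing)...)
	libraryPaths = append([]string{gpuLib}, libraryPaths...) // GPU-specific dir takes precedence
	libraryPaths = append(libraryPaths, libOllamaPath)       // finally, the root library path

	fmt.Println(strings.Join(libraryPaths, string(filepath.ListSeparator)))
	// On Linux this prints:
	// /usr/lib/ollama/cuda_v13:/usr/lib/ollama:/opt/rocm/lib:/usr/local/lib:/usr/lib/ollama
}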
@@ -327,9 +351,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(gpuLibs, string(filepath.ListSeparator))
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version
pathNeeded := true
@@ -452,7 +476,7 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
systemInfo := discover.GetSystemInfo()
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
@@ -465,7 +489,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
return nil, ErrLoadRequiredFull
}
}
@@ -474,11 +498,11 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
case gpus[0].Library != "metal" && s.estimate.Layers == 0:
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
gpus = discover.GpuInfoList{discover.GetCPUInfo()}
case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
@@ -493,7 +517,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
@@ -508,7 +532,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "metal" &&
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
s.options.UseMMap = new(bool)
@@ -530,12 +554,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return err
return nil, err
}
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return err
return nil, err
}
// On the Ollama engine, we can print out a summary of the memory allocations.
@@ -546,16 +570,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !resp.Success {
slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
return errors.New("failed to allocate memory for model")
return nil, errors.New("failed to allocate memory for model")
}
// The llama engine does its memory allocations together with model loading, so we
// need to wait until it is done to ensure that we have accurate memory data before
// loading the next model
if s.textProcessor == nil {
return s.WaitUntilRunning(ctx)
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
} else {
return nil
return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
}
}
@@ -568,7 +592,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
gpuLayers := make(ml.GPULayersList, len(gpus))
for i := range gpuLayers {
gpuLayers[i].ID = gpus[i].ID
gpuLayers[i].DeviceID = gpus[i].DeviceID
}
var sum float32
@@ -616,7 +640,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
//
// This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
// allowing for faster iteration, but may return less information.
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
//
// Returns the list of GPU IDs that were used in the final allocation on success
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
var success bool
defer func() {
if !success {
@@ -641,7 +667,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
available = 0
}
slog.Info("gpu memory", "id", gpu.ID,
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
"available", format.HumanBytes2(available),
"free", format.HumanBytes2(gpu.FreeMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -654,11 +680,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
if err != nil {
return err
return nil, err
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return err
return nil, err
}
nextOperation:
@@ -668,7 +694,7 @@ nextOperation:
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, operation)
if err != nil {
return err
return nil, err
}
resp.Memory.Log(slog.LevelDebug)
@@ -680,7 +706,7 @@ nextOperation:
for {
newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
if err != nil {
return err
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
@@ -714,7 +740,7 @@ nextOperation:
newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
s.options.NumGPU = -1
if err != nil {
return err
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
@@ -722,7 +748,7 @@ nextOperation:
s.loadRequest.GPULayers = newGPULayers
resp, err = s.initModel(ctx, s.loadRequest, operation)
if err != nil {
return err
return nil, err
}
resp.Memory.Log(slog.LevelDebug)
@@ -731,7 +757,7 @@ nextOperation:
if resp.Success {
verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
if err != nil {
return err
return nil, err
}
slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -756,7 +782,7 @@ nextOperation:
}
if s.options.NumGPU >= 0 {
return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
}
// Memory allocation failed even though we created a layout that we thought should
@@ -766,7 +792,7 @@ nextOperation:
// space.
if backoff > 1 {
slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
return errors.New("memory layout cannot be allocated")
return nil, errors.New("memory layout cannot be allocated")
} else if backoff == 0 {
backoff = 0.01
} else {
@@ -781,7 +807,7 @@ nextOperation:
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return err
return nil, err
}
success = resp.Success
@@ -789,10 +815,27 @@ nextOperation:
if !success {
slog.Warn("failed to commit memory for model", "memory", resp.Memory)
return errors.New("failed to commit memory for model")
return nil, errors.New("failed to commit memory for model")
}
return nil
return uniqueDeviceIDs(gpuLayers), nil
}
func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
devices := []ml.DeviceID{}
for _, layer := range gpuLayers {
new := true
for _, ID := range devices {
if layer.DeviceID == ID {
new = false
break
}
}
if new {
devices = append(devices, layer.DeviceID)
}
}
return devices
}
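uniqueDeviceIDs flattens the per-group device assignments into a deduplicated device list, preserving first-seen order. A small usage sketch with invented values, using the GPULayersList shape shown in the tests below:

// Two layer groups on the same CUDA device and one group on a second device
// collapse to two DeviceIDs.
layers := ml.GPULayersList{
	{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, Layers: []int{0, 1}},
	{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, Layers: []int{2}},
	{DeviceID: ml.DeviceID{Library: "Vulkan", ID: "gpu1"}, Layers: []int{3}},
}
ids := uniqueDeviceIDs(layers)
// ids == []ml.DeviceID{{Library: "CUDA", ID: "gpu0"}, {Library: "Vulkan", ID: "gpu1"}}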
// createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
@@ -811,19 +854,19 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
if memory == nil {
memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]ml.Memory, s.totalLayers),
Cache: make([]ml.Memory, s.totalLayers),
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}}
}
layers := make([]uint64, len(memory.CPU.Weights))
for i := range layers {
for j := range memory.GPUs {
layers[i] += memory.GPUs[j].Weights[i].Size
layers[i] += memory.GPUs[j].Cache[i].Size
layers[i] += memory.GPUs[j].Weights[i]
layers[i] += memory.GPUs[j].Cache[i]
}
layers[i] += memory.CPU.Weights[i].Size
layers[i] += memory.CPU.Cache[i].Size
layers[i] += memory.CPU.Weights[i]
layers[i] += memory.CPU.Cache[i]
logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
}
@@ -837,23 +880,23 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
for i := range gl {
found := false
for j := range memory.GPUs {
if gl[i].ID == memory.GPUs[j].ID {
if memory.GPUs[j].Graph.Size != 0 {
if gl[i].DeviceID == memory.GPUs[j].DeviceID {
if memory.GPUs[j].Graph != 0 {
lastUsedGPU = i
}
reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph.Size
reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
if gl[i].FreeMemory > reserved {
gl[i].FreeMemory -= reserved
} else {
gl[i].FreeMemory = 0
}
slog.Debug("available gpu", "id", gl[i].ID,
slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
"graph", format.HumanBytes2(memory.GPUs[j].Graph.Size))
"graph", format.HumanBytes2(memory.GPUs[j].Graph))
found = true
break
@@ -872,12 +915,12 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
}
// These sizes will only increase as we go through additional iterations and get additional information.
cpuSize := memory.InputWeights.Size + memory.CPU.Graph.Size
cpuSize := memory.InputWeights + memory.CPU.Graph
var vramSize uint64
for _, gl := range gpuLayers {
for _, gpu := range memory.GPUs {
if gl.ID == gpu.ID {
vramSize += gpu.Graph.Size
if gl.DeviceID == gpu.DeviceID {
vramSize += gpu.Graph
break
}
}
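Throughout createLayout (and the VRAMSize/TotalSize accessors further down), per-layer memory drops from ml.Memory values with a .Size field to plain uint64 byte counts, and devices are matched by DeviceID. A sketch of the simplified bookkeeping implied by these hunks, reusing the DeviceID sketched earlier; the field set is inferred from this diff and the real ml.DeviceMemory may differ:

package ml

// DeviceMemory tracks how much memory a single device needs for the model.
// Sizes are plain byte counts rather than structs carrying a Size field.
type DeviceMemory struct {
	DeviceID          // which device this accounting belongs to
	Weights  []uint64 // per-layer weight bytes
	Cache    []uint64 // per-layer KV cache bytes
	Graph    uint64   // compute graph bytes
}

// Size sums everything attributed to the device. Hypothetical helper in the
// spirit of the Size() calls that replace Allocated() in this diff.
func (d DeviceMemory) Size() uint64 {
	var total uint64
	for i := range d.Weights {
		total += d.Weights[i] + d.Cache[i]
	}
	return total + d.Graph
}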
@@ -997,7 +1040,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
device := len(gpus) - 1
gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
for i := len(layers) - 1; i >= 0; i-- {
if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
@@ -1015,7 +1058,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
if device < 0 {
return gpuLayers
}
gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
}
}
@@ -1270,30 +1313,6 @@ func (s *llmServer) Pid() int {
return -1
}
func (s *llmServer) GetPort() int {
return s.port
}
func (s *llmServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
// llama engine does not currently support VRAM query, short circuit
if s.textProcessor == nil {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
devices, err := discover.GetDevicesFromRunner(ctx, s)
if err != nil {
slog.Debug("failure refreshing GPU information", "error", err)
}
return devices
}
func (s *llmServer) HasExited() bool {
if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
return true
}
return false
}
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1368,7 +1387,7 @@ type CompletionResponse struct {
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
logutil.Trace("completion request", "prompt", req.Prompt)
if len(req.Format) > 0 {
switch string(req.Format) {
@@ -1534,7 +1553,7 @@ type EmbeddingResponse struct {
}
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
logutil.Trace("embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
@@ -1686,9 +1705,9 @@ func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.ID == gpuID {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
@@ -1697,6 +1716,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *ollamaServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
@@ -1705,21 +1729,21 @@ func (s *ollamaServer) VRAMSize() uint64 {
var mem uint64
for _, g := range s.mem.GPUs {
mem += g.Allocated()
mem += g.Size()
}
// Some elements are always on CPU. However, if we have allocated all layers
// on the GPU then include the CPU components as well, to represent complete offloading.
noCPULayers := true
for i := range s.mem.CPU.Weights {
if s.mem.CPU.Weights[i].Size != 0 || s.mem.CPU.Cache[i].Size != 0 {
if s.mem.CPU.Weights[i] != 0 || s.mem.CPU.Cache[i] != 0 {
noCPULayers = false
break
}
}
if noCPULayers {
mem += s.mem.InputWeights.Size
mem += s.mem.CPU.Graph.Size
mem += s.mem.InputWeights
mem += s.mem.CPU.Graph
}
return mem
@@ -1730,25 +1754,37 @@ func (s *ollamaServer) TotalSize() uint64 {
return 0
}
mem := s.mem.InputWeights.Size
mem += s.mem.CPU.Allocated()
mem := s.mem.InputWeights
mem += s.mem.CPU.Size()
for _, g := range s.mem.GPUs {
mem += g.Allocated()
mem += g.Size()
}
return mem
}
func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}
for _, g := range s.mem.GPUs {
if g.ID == gpuID {
return g.Allocated()
if g.DeviceID == id {
return g.Size()
}
}
return 0
}
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
devices, err := discover.GetDevicesFromRunner(ctx, s)
if err != nil {
if s.cmd != nil && s.cmd.ProcessState == nil {
// Still running but hit an error, log
slog.Debug("failure refreshing GPU information", "error", err)
}
// else no longer running so suppress logging as a failure is expected
}
return devices
}


@@ -16,8 +16,8 @@ import (
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
library string
free int
id ml.DeviceID
free int
}
tests := []struct {
@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
},
{
name: "Full single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].ID = fmt.Sprintf("gpu%d", i)
gpus[i].Library = tt.gpus[i].library
gpus[i].DeviceID = tt.gpus[i].id
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
@@ -155,18 +154,18 @@ func TestLLMServerFitGPU(t *testing.T) {
}
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]ml.Memory, s.totalLayers),
Cache: make([]ml.Memory, s.totalLayers),
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range tt.layers {
s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
s.mem.CPU.Weights[i] = uint64(tt.layers[i])
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
}
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
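Since this merge targets the VulkanV3Update branch, a natural follow-up is a table case exercising a Vulkan device. A hypothetical entry in the same shape as the cases above (not part of this commit); the expectation mirrors the CUDA/ROCm case, assuming library names only matter for grouping:

{
	name:     "Multi GPU mixed Vulkan",
	gpus:     []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "Vulkan", ID: "gpu1"}, free: 256 * format.MebiByte}},
	layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
	numGPU:   -1,
	expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "Vulkan"}, Layers: []int{0, 1}}},
},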