Merge remote-tracking branch 'upstream/main' into VulkanV3Update
@@ -195,7 +195,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
         slog.Warn("model missing blk.0 layer size")
     }

-    useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
+    useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
         (discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
         f.SupportsFlashAttention()

@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     }

     // on metal there's no partial offload overhead
-    if gpus[0].Library == "metal" {
+    if gpus[0].Library == "Metal" {
         graphPartialOffload = graphFullOffload
     } else if len(gpus) > 1 {
         // multigpu should always use the partial graph size
@@ -12,6 +12,7 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/fs/ggml"
+    "github.com/ollama/ollama/ml"
 )

 func TestEstimateGPULayers(t *testing.T) {
@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
     // Simple CPU scenario
     gpus := []discover.GpuInfo{
         {
-            Library: "cpu",
+            DeviceID: ml.DeviceID{
+                Library: "cpu",
+            },
         },
     }
     projectors := []string{}
@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
     gpuMinimumMemory := uint64(2048)
     gpus = []discover.GpuInfo{
         {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
             MinimumMemory: gpuMinimumMemory,
         },
         {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
             MinimumMemory: gpuMinimumMemory,
         },
     }
llm/server.go
@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {

 type LlamaServer interface {
     ModelPath() string
-    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
+    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
     Ping(ctx context.Context) error
     WaitUntilRunning(ctx context.Context) error
     Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,9 @@ type LlamaServer interface {
     Close() error
     VRAMSize() uint64 // Total VRAM across all GPUs
     TotalSize() uint64
-    VRAMByGPU(gpuID string) uint64
+    VRAMByGPU(id ml.DeviceID) uint64
     Pid() int
+    GetPort() int
     GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
     HasExited() bool
 }
@@ -195,14 +196,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         loadRequest.ProjectorPath = projectors[0]
     }

+    fa := envconfig.FlashAttention(f.FlashAttention())
+
     // This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
     // that can handle it.
-    fa := envconfig.FlashAttention()
-    if f.FlashAttention() {
-        slog.Info("model wants flash attention")
-        fa = true
-    }
-
     if fa && !gpus.FlashAttentionSupported() {
         slog.Warn("flash attention enabled but not supported by gpu")
         fa = false
@@ -303,22 +300,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         pathEnv = "LD_LIBRARY_PATH"
     }

-    s := llmServer{
-        port: port,
-        cmd: exec.Command(exe, params...),
-        status: NewStatusWriter(os.Stderr),
-        options: opts,
-        modelPath: modelPath,
-        loadRequest: loadRequest,
-        llamaModel: llamaModel,
-        llamaModelLock: &sync.Mutex{},
-        textProcessor: textProcessor,
-        numParallel: numParallel,
-        sem: semaphore.NewWeighted(int64(numParallel)),
-        totalLayers: f.KV().BlockCount() + 1,
-        loadStart: time.Now(),
-        done: make(chan error, 1),
-    }
+    // Note: we always put our dependency paths first
+    // since these are the exact version we compiled/linked against
+    libraryPaths := []string{discover.LibOllamaPath}
+    if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+        libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+    }
+
+    ggmlPaths := []string{discover.LibOllamaPath}
+    for _, c := range compatible {
+        if libpath, ok := availableLibs[c]; ok {
+            slog.Debug("adding gpu library", "path", libpath)
+            libraryPaths = append([]string{libpath}, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, libpath)
+        }
+    }
+
+    for _, gpu := range gpus {
+        if gpu.DependencyPath != nil {
+            slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
+            libraryPaths = append(gpu.DependencyPath, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
+        }
+    }
+
+    // finally, add the root library path
+    libraryPaths = append(libraryPaths, discover.LibOllamaPath)
+
+    s := llmServer{
+        port: port,
+        cmd: exec.Command(exe, params...),
+        status: NewStatusWriter(os.Stderr),
+        options: opts,
+        modelPath: modelPath,
+        loadRequest: loadRequest,
+        llamaModel: llamaModel,
+        llamaModelLock: &sync.Mutex{},
+        textProcessor: textProcessor,
+        numParallel: numParallel,
+        sem: semaphore.NewWeighted(int64(numParallel)),
+        totalLayers: f.KV().BlockCount() + 1,
+        loadStart: time.Now(),
+        done: make(chan error, 1),
+    }

     s.cmd.Env = os.Environ()
     s.cmd.Stdout = os.Stdout
@@ -327,9 +351,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a

     s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))

-    // Always filter down the set of GPUs in case there are any unsupported devices that might crash
-    envWorkarounds := gpus.GetVisibleDevicesEnv()
-    pathEnvVal := strings.Join(gpuLibs, string(filepath.ListSeparator))
+    // Always filter down the set of GPUs in case there are any unsupported devices that might crash
+    envWorkarounds := gpus.GetVisibleDevicesEnv()
+    pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))

     // Update or add the path variable with our adjusted version
     pathNeeded := true
@@ -452,7 +476,7 @@ type LoadResponse struct {

 var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")

-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
     systemInfo := discover.GetSystemInfo()
     systemTotalMemory := systemInfo.System.TotalMemory
     systemFreeMemory := systemInfo.System.FreeMemory
@@ -465,7 +489,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
             g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
         } else {
             slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
-            return ErrLoadRequiredFull
+            return nil, ErrLoadRequiredFull
         }
     }

@@ -474,11 +498,11 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

     if len(gpus) > 1 || gpus[0].Library != "cpu" {
         switch {
-        case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+        case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
             // disable partial offloading when model is greater than total system memory as this
             // can lead to locking up the system
             s.options.NumGPU = 0
-        case gpus[0].Library != "metal" && s.estimate.Layers == 0:
+        case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
             // Don't bother loading into the GPU if no layers can fit
             gpus = discover.GpuInfoList{discover.GetCPUInfo()}
         case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
@@ -493,7 +517,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
         available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
         if systemMemoryRequired > available {
             slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-            return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+            return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
         }
     }

@@ -508,7 +532,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

     // mmap has issues with partial offloading on metal
     for _, g := range gpus {
-        if g.Library == "metal" &&
+        if g.Library == "Metal" &&
             uint64(s.options.NumGPU) > 0 &&
             uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
             s.options.UseMMap = new(bool)
@@ -530,12 +554,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
     }

     if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
     }

     resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
     if err != nil {
-        return err
+        return nil, err
     }

     // On the Ollama engine, we can print out a summary of the memory allocations.
@@ -546,16 +570,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi

     if !resp.Success {
         slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
-        return errors.New("failed to allocate memory for model")
+        return nil, errors.New("failed to allocate memory for model")
     }

     // The llama engine does its memory allocations together with model loading, so we
     // need to wait until it is done to ensure that we have accurate memory data before
     // loading the next model
     if s.textProcessor == nil {
-        return s.WaitUntilRunning(ctx)
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
     } else {
-        return nil
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
     }
 }

@@ -568,7 +592,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu

     gpuLayers := make(ml.GPULayersList, len(gpus))
     for i := range gpuLayers {
-        gpuLayers[i].ID = gpus[i].ID
+        gpuLayers[i].DeviceID = gpus[i].DeviceID
     }

     var sum float32
@@ -616,7 +640,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
 //
 // This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
 // allowing for faster iteration, but may return less information.
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+//
+// Returns the list of GPU IDs that were used in the final allocation on success
+func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
     var success bool
     defer func() {
         if !success {
@@ -641,7 +667,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
         if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
             available = 0
         }
-        slog.Info("gpu memory", "id", gpu.ID,
+        slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
             "available", format.HumanBytes2(available),
             "free", format.HumanBytes2(gpu.FreeMemory),
             "minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -654,11 +680,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ

     gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
     if err != nil {
-        return err
+        return nil, err
     }

     if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
     }

 nextOperation:
@@ -668,7 +694,7 @@ nextOperation:
         s.loadRequest.GPULayers = gpuLayers
         resp, err := s.initModel(ctx, s.loadRequest, operation)
         if err != nil {
-            return err
+            return nil, err
         }

         resp.Memory.Log(slog.LevelDebug)
@@ -680,7 +706,7 @@ nextOperation:
         for {
             newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
             if err != nil {
-                return err
+                return nil, err
             }

             slog.Debug("new layout created", "layers", newGPULayers)
@@ -714,7 +740,7 @@ nextOperation:
                 newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
                 s.options.NumGPU = -1
                 if err != nil {
-                    return err
+                    return nil, err
                 }

                 slog.Debug("new layout created", "layers", newGPULayers)
@@ -722,7 +748,7 @@ nextOperation:
             s.loadRequest.GPULayers = newGPULayers
             resp, err = s.initModel(ctx, s.loadRequest, operation)
             if err != nil {
-                return err
+                return nil, err
             }

             resp.Memory.Log(slog.LevelDebug)
@@ -731,7 +757,7 @@ nextOperation:
             if resp.Success {
                 verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
                 if err != nil {
-                    return err
+                    return nil, err
                 }

                 slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -756,7 +782,7 @@ nextOperation:
             }

             if s.options.NumGPU >= 0 {
-                return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
+                return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
             }

             // Memory allocation failed even though we created a layout that we thought should
@@ -766,7 +792,7 @@ nextOperation:
             // space.
             if backoff > 1 {
                 slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
-                return errors.New("memory layout cannot be allocated")
+                return nil, errors.New("memory layout cannot be allocated")
             } else if backoff == 0 {
                 backoff = 0.01
             } else {
@@ -781,7 +807,7 @@ nextOperation:
     s.loadRequest.GPULayers = gpuLayers
     resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
     if err != nil {
-        return err
+        return nil, err
     }

     success = resp.Success
@@ -789,10 +815,27 @@

     if !success {
         slog.Warn("failed to commit memory for model", "memory", resp.Memory)
-        return errors.New("failed to commit memory for model")
+        return nil, errors.New("failed to commit memory for model")
     }

-    return nil
+    return uniqueDeviceIDs(gpuLayers), nil
 }
+
+func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
+    devices := []ml.DeviceID{}
+    for _, layer := range gpuLayers {
+        new := true
+        for _, ID := range devices {
+            if layer.DeviceID == ID {
+                new = false
+                break
+            }
+        }
+        if new {
+            devices = append(devices, layer.DeviceID)
+        }
+    }
+    return devices
+}

 // createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
@@ -811,19 +854,19 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d

     if memory == nil {
         memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
-            Weights: make([]ml.Memory, s.totalLayers),
-            Cache: make([]ml.Memory, s.totalLayers),
+            Weights: make([]uint64, s.totalLayers),
+            Cache: make([]uint64, s.totalLayers),
         }}
     }

     layers := make([]uint64, len(memory.CPU.Weights))
     for i := range layers {
         for j := range memory.GPUs {
-            layers[i] += memory.GPUs[j].Weights[i].Size
-            layers[i] += memory.GPUs[j].Cache[i].Size
+            layers[i] += memory.GPUs[j].Weights[i]
+            layers[i] += memory.GPUs[j].Cache[i]
         }
-        layers[i] += memory.CPU.Weights[i].Size
-        layers[i] += memory.CPU.Cache[i].Size
+        layers[i] += memory.CPU.Weights[i]
+        layers[i] += memory.CPU.Cache[i]
         logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
     }

@@ -837,23 +880,23 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
     for i := range gl {
         found := false
         for j := range memory.GPUs {
-            if gl[i].ID == memory.GPUs[j].ID {
-                if memory.GPUs[j].Graph.Size != 0 {
+            if gl[i].DeviceID == memory.GPUs[j].DeviceID {
+                if memory.GPUs[j].Graph != 0 {
                     lastUsedGPU = i
                 }

-                reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph.Size
+                reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
                 if gl[i].FreeMemory > reserved {
                     gl[i].FreeMemory -= reserved
                 } else {
                     gl[i].FreeMemory = 0
                 }

-                slog.Debug("available gpu", "id", gl[i].ID,
+                slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
                     "available layer vram", format.HumanBytes2(gl[i].FreeMemory),
                     "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
                     "overhead", format.HumanBytes2(envconfig.GpuOverhead()),
-                    "graph", format.HumanBytes2(memory.GPUs[j].Graph.Size))
+                    "graph", format.HumanBytes2(memory.GPUs[j].Graph))

                 found = true
                 break
@@ -872,12 +915,12 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
     }

     // These sizes will only increase as we go through additional iterations and get additional information.
-    cpuSize := memory.InputWeights.Size + memory.CPU.Graph.Size
+    cpuSize := memory.InputWeights + memory.CPU.Graph
     var vramSize uint64
     for _, gl := range gpuLayers {
         for _, gpu := range memory.GPUs {
-            if gl.ID == gpu.ID {
-                vramSize += gpu.Graph.Size
+            if gl.DeviceID == gpu.DeviceID {
+                vramSize += gpu.Graph
                 break
             }
         }
@@ -997,7 +1040,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
 // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
 func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
     device := len(gpus) - 1
-    gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
+    gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
     freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
     for i := len(layers) - 1; i >= 0; i-- {
         if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
@@ -1015,7 +1058,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
             if device < 0 {
                 return gpuLayers
             }
-            gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
+            gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
             freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
         }
     }
@@ -1270,30 +1313,6 @@ func (s *llmServer) Pid() int {
     return -1
 }

-func (s *llmServer) GetPort() int {
-    return s.port
-}
-
-func (s *llmServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
-    // llama engine does not currently support VRAM query, short circuit
-    if s.textProcessor == nil {
-        slog.Debug("llamarunner free vram reporting not supported")
-        return nil
-    }
-    devices, err := discover.GetDevicesFromRunner(ctx, s)
-    if err != nil {
-        slog.Debug("failure refreshing GPU information", "error", err)
-    }
-    return devices
-}
-
-func (s *llmServer) HasExited() bool {
-    if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
-        return true
-    }
-    return false
-}
-
 var grammarJSON = `
 root ::= object
 value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1368,7 +1387,7 @@ type CompletionResponse struct {

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
     slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
-    slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
+    logutil.Trace("completion request", "prompt", req.Prompt)

     if len(req.Format) > 0 {
         switch string(req.Format) {
@@ -1534,7 +1553,7 @@ type EmbeddingResponse struct {
 }

 func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
-    slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
+    logutil.Trace("embedding request", "input", input)

     if err := s.sem.Acquire(ctx, 1); err != nil {
         if errors.Is(err, context.Canceled) {
@@ -1686,9 +1705,9 @@ func (s *llamaServer) TotalSize() uint64 {
     return s.estimate.TotalSize
 }

-func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
+func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
     for i, gpu := range s.gpus {
-        if gpu.ID == gpuID {
+        if gpu.DeviceID == id {
             if i < len(s.estimate.GPUSizes) {
                 return s.estimate.GPUSizes[i]
             }
@@ -1697,6 +1716,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
     return 0
 }

+func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
+    slog.Debug("llamarunner free vram reporting not supported")
+    return nil
+}
+
 func (s *ollamaServer) VRAMSize() uint64 {
     if s.mem == nil {
         return 0
@@ -1705,21 +1729,21 @@ func (s *ollamaServer) VRAMSize() uint64 {
     var mem uint64

     for _, g := range s.mem.GPUs {
-        mem += g.Allocated()
+        mem += g.Size()
     }

     // Some elements are always on CPU. However, if we have allocated all layers
     // on the GPU then include the CPU components as well, to represent complete offloading.
     noCPULayers := true
     for i := range s.mem.CPU.Weights {
-        if s.mem.CPU.Weights[i].Size != 0 || s.mem.CPU.Cache[i].Size != 0 {
+        if s.mem.CPU.Weights[i] != 0 || s.mem.CPU.Cache[i] != 0 {
             noCPULayers = false
             break
         }
     }
     if noCPULayers {
-        mem += s.mem.InputWeights.Size
-        mem += s.mem.CPU.Graph.Size
+        mem += s.mem.InputWeights
+        mem += s.mem.CPU.Graph
     }

     return mem
@@ -1730,25 +1754,37 @@ func (s *ollamaServer) TotalSize() uint64 {
         return 0
     }

-    mem := s.mem.InputWeights.Size
-    mem += s.mem.CPU.Allocated()
+    mem := s.mem.InputWeights
+    mem += s.mem.CPU.Size()
     for _, g := range s.mem.GPUs {
-        mem += g.Allocated()
+        mem += g.Size()
     }

     return mem
 }

-func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
+func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
     if s.mem == nil {
         return 0
     }

     for _, g := range s.mem.GPUs {
-        if g.ID == gpuID {
-            return g.Allocated()
+        if g.DeviceID == id {
+            return g.Size()
         }
     }

     return 0
 }
+
+func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
+    devices, err := discover.GetDevicesFromRunner(ctx, s)
+    if err != nil {
+        if s.cmd != nil && s.cmd.ProcessState == nil {
+            // Still running but hit an error, log
+            slog.Debug("failure refreshing GPU information", "error", err)
+        }
+        // else no longer running so suppress logging as a failure is expected
+    }
+    return devices
+}
+
@@ -16,8 +16,8 @@ import (

 func TestLLMServerFitGPU(t *testing.T) {
     type gpu struct {
-        library string
-        free int
+        id ml.DeviceID
+        free int
     }

     tests := []struct {
@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
         },
         {
             name: "Full single GPU",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
         },
         {
             name: "Partial single GPU",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
         },
         {
             name: "Single GPU with numGPU 1",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 1,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
         },
         {
             name: "Single GPU with numGPU 0",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 0,
             expected: ml.GPULayersList{},
         },
         {
             name: "Single GPU with numGPU 999",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
             numGPU: 999,
-            expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
         },
         {
             name: "Multi GPU fits on one",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
         },
         {
             name: "Multi GPU split",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
         },
         {
             name: "Multi GPU partial",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
         },
         {
             name: "Multi GPU numGPU 1",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
         },
         {
             name: "Multi GPU numGPU 2",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 2,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
         },
         {
             name: "Multi GPU numGPU 999",
-            gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
             numGPU: 999,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
         },
         {
             name: "Multi GPU different libraries",
-            gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
             layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
             numGPU: -1,
-            expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
+            expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
         },
         {
             name: "requireFull",
-            gpus: []gpu{{free: 256 * format.MebiByte}},
+            gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
             layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
             numGPU: -1,
             requireFull: true,
@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {

             gpus := make(discover.GpuInfoList, len(tt.gpus))
             for i := range tt.gpus {
-                gpus[i].ID = fmt.Sprintf("gpu%d", i)
-                gpus[i].Library = tt.gpus[i].library
+                gpus[i].DeviceID = tt.gpus[i].id
                 gpus[i].FreeMemory = uint64(tt.gpus[i].free)
             }

@@ -155,18 +154,18 @@ func TestLLMServerFitGPU(t *testing.T) {
             }

             s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
-                Weights: make([]ml.Memory, s.totalLayers),
-                Cache: make([]ml.Memory, s.totalLayers),
+                Weights: make([]uint64, s.totalLayers),
+                Cache: make([]uint64, s.totalLayers),
             }, GPUs: make([]ml.DeviceMemory, len(gpus))}

             for i := range tt.layers {
-                s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
+                s.mem.CPU.Weights[i] = uint64(tt.layers[i])
             }

             for i := range s.mem.GPUs {
-                s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
-                s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
-                s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
+                s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
+                s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
+                s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
             }

             gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)