From 17b7186cd759337fa98b626e82de150f3789b040 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 6 May 2024 17:47:52 -0700 Subject: [PATCH 01/18] Enable concurrency by default This adjusts our default settings to enable multiple models and parallel requests to a single model. Users can still override these by the same env var settings as before. Parallel has a direct impact on num_ctx, which in turn can have a significant impact on small VRAM GPUs so this change also refines the algorithm so that when parallel is not explicitly set by the user, we try to find a reasonable default that fits the model on their GPU(s). As before, multiple models will only load concurrently if they fully fit in VRAM. --- envconfig/config.go | 16 ++++---- llm/server.go | 13 ++---- server/sched.go | 98 +++++++++++++++++++++++++++++++++----------- server/sched_test.go | 80 +++++++++++++++++++++++------------- 4 files changed, 135 insertions(+), 72 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index e86f72e6a..cb456448c 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar { "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, - "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models (default 1)"}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default 4)"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, - "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, @@ -129,8 +129,8 @@ func clean(key string) string { func init() { // default values - NumParallel = 1 - MaxRunners = 1 + NumParallel = 0 + MaxRunners = 4 MaxQueuedRequests = 512 LoadConfig() @@ -205,8 +205,8 @@ func LoadConfig() { if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { val, err := strconv.Atoi(onp) - if err != nil || val <= 0 { - slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err) + if err != nil { + slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err) } else { NumParallel = val } @@ -251,7 +251,7 @@ func LoadConfig() { if maxRunners != "" { m, err := strconv.Atoi(maxRunners) if err != nil { - slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err) + slog.Error("invalid setting, ignoring", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err) } else { MaxRunners = m } @@ -260,7 +260,7 @@ 
func LoadConfig() { if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" { p, err := strconv.Atoi(onp) if err != nil || p <= 0 { - slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err) + slog.Error("invalid setting, ignoring", "OLLAMA_MAX_QUEUE", onp, "error", err) } else { MaxQueuedRequests = p } diff --git a/llm/server.go b/llm/server.go index da83416ee..3cb5ac1f0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -77,7 +77,7 @@ func LoadModel(model string) (*GGML, error) { // NewLlamaServer will run a server for the given GPUs // The gpu list must be a single family. -func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) { +func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var err error var cpuRunner string var estimate MemoryEstimate @@ -213,8 +213,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // Windows CUDA should not use mmap for best performance // Linux with a model larger than free space, mmap leads to thrashing + // For CPU loads we want the memory to be allocated, not FS cache if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) || (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) || + (gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) || opts.UseMMap == api.TriStateFalse { params = append(params, "--no-mmap") } @@ -227,15 +229,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--numa") } - numParallel := envconfig.NumParallel - - // TODO (jmorganca): multimodal models don't support parallel yet - // see https://github.com/ollama/ollama/issues/4165 - if len(projectors) > 0 { - numParallel = 1 - slog.Warn("multimodal models don't support parallel requests yet") - } - params = append(params, "--parallel", fmt.Sprintf("%d", numParallel)) if estimate.TensorSplit != "" { diff --git a/server/sched.go b/server/sched.go index 424395544..31ef560f5 100644 --- a/server/sched.go +++ b/server/sched.go @@ -23,6 +23,7 @@ type LlmRequest struct { ctx context.Context //nolint:containedctx model *Model opts api.Options + origNumCTX int // Track the initial ctx request sessionDuration time.Duration successCh chan *runnerRef errCh chan error @@ -38,8 +39,8 @@ type Scheduler struct { loaded map[string]*runnerRef loadedMu sync.Mutex - loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) - newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) + loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) + newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) getGpuFn func() gpu.GpuInfoList getCpuFn func() gpu.GpuInfoList reschedDelay time.Duration @@ -65,13 +66,10 @@ func InitScheduler(ctx context.Context) *Scheduler { // context must be canceled to decrement ref count and release the runner func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { - // allocate a large enough kv cache for all parallel requests if opts.NumCtx < 4 { opts.NumCtx = 4 } - 
opts.NumCtx *= envconfig.NumParallel - req := &LlmRequest{ ctx: c, model: model, @@ -102,6 +100,7 @@ func (s *Scheduler) Run(ctx context.Context) { } func (s *Scheduler) processPending(ctx context.Context) { + maxRunnerFactor := 1 // number of GPUs or 1 for { select { case <-ctx.Done(): @@ -110,11 +109,25 @@ func (s *Scheduler) processPending(ctx context.Context) { case pending := <-s.pendingReqCh: // Block other requests until we get this pending request running pending.schedAttempts++ + if pending.origNumCTX == 0 { + pending.origNumCTX = pending.opts.NumCtx + } if pending.ctx.Err() != nil { slog.Debug("pending request cancelled or timed out, skipping scheduling") continue } + numParallel := envconfig.NumParallel + // TODO (jmorganca): multimodal models don't support parallel yet + // see https://github.com/ollama/ollama/issues/4165 + if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 { + numParallel = 1 + slog.Warn("multimodal models don't support parallel requests yet") + } + // Keep NumCtx and numParallel in sync + if numParallel > 1 { + pending.opts.NumCtx = pending.origNumCTX * numParallel + } for { var runnerToExpire *runnerRef @@ -130,7 +143,7 @@ func (s *Scheduler) processPending(ctx context.Context) { pending.useLoadedRunner(runner, s.finishedReqCh) break } - } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners { + } else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) runnerToExpire = s.findRunnerToUnload() } else { @@ -142,6 +155,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } else { gpus = s.getGpuFn() } + maxRunnerFactor = max(len(gpus), 1) // Load model for fitting ggml, err := llm.LoadModel(pending.model.ModelPath) @@ -152,26 +166,32 @@ func (s *Scheduler) processPending(ctx context.Context) { // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { + // simplifying assumption of defaultParallel when in CPU mode + if numParallel <= 0 { + numParallel = defaultParallel + pending.opts.NumCtx = pending.origNumCTX * numParallel + } + if loadedCount == 0 { slog.Debug("cpu mode with first model, loading") - s.loadFn(pending, ggml, gpus) + s.loadFn(pending, ggml, gpus, numParallel) break } runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus) if runnerToExpire == nil { slog.Debug("cpu mode with available system memory or first model, loading") - s.loadFn(pending, ggml, gpus) + s.loadFn(pending, ggml, gpus, numParallel) break } // else we need to expire a runner } else if loadedCount == 0 { // No models loaded. Load the model but prefer the best fit. 
slog.Debug("loading first model", "model", pending.model.ModelPath) - g := pickBestFitGPUs(pending, ggml, gpus) + g := pickBestFitGPUs(pending, ggml, gpus, &numParallel) if g != nil { gpus = g } - s.loadFn(pending, ggml, gpus) + s.loadFn(pending, ggml, gpus, numParallel) break } @@ -186,10 +206,10 @@ func (s *Scheduler) processPending(ctx context.Context) { // Update free memory from currently loaded models s.updateFreeSpace(availGpus) - fitGpus := pickBestFitGPUs(pending, ggml, availGpus) + fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel) if fitGpus != nil { slog.Debug("new model fits with existing models, loading") - s.loadFn(pending, ggml, fitGpus) + s.loadFn(pending, ggml, fitGpus, numParallel) break } @@ -350,8 +370,11 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm }() } -func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) { - llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) +func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) { + if numParallel < 1 { + numParallel = 1 + } + llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to @@ -375,6 +398,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) loading: true, refCount: 1, } + runner.numParallel = numParallel runner.refMu.Lock() s.loadedMu.Lock() @@ -483,8 +507,9 @@ type runnerRef struct { expireTimer *time.Timer expiresAt time.Time - model *Model - modelPath string + model *Model + modelPath string + numParallel int *api.Options } @@ -525,6 +550,9 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool optsNew.NumGPU = -1 } + // Normalize the NumCtx for parallelism + optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel + ctx, cancel := context.WithTimeout(ctx, timeout) defer cancel() if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed? @@ -611,22 +639,38 @@ func (a ByDuration) Less(i, j int) bool { // pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits // If the model can not be fit fully within the available GPU(s) nil is returned -func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList { +// If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust +// opts.NumCtx accordingly +func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { var estimatedVRAM uint64 + + var numParallelToTry []int + if *numParallel <= 0 { + // If no specific parallel setting was provided, try larger then smaller, always end with 1 + numParallelToTry = append(numParallelToTry, 4, 1) + } else { + numParallelToTry = []int{*numParallel} + } + for _, gl := range gpus.ByLibrary() { var ok bool sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...) // TODO - potentially sort by performance capability, existing models loaded, etc. 
+ // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl))) // First attempt to fit the model into a single GPU - if !envconfig.SchedSpread { - for _, g := range sgl { - if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { - slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) - return []gpu.GpuInfo{g} + for _, p := range numParallelToTry { + req.opts.NumCtx = req.origNumCTX * p + if !envconfig.SchedSpread { + for _, g := range sgl { + if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) + *numParallel = p + return []gpu.GpuInfo{g} + } } } } @@ -636,9 +680,13 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu. // - try subsets of GPUs instead of just falling back to 1 or all in a family // Now try all the GPUs - if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { - slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM)) - return sgl + for _, p := range numParallelToTry { + req.opts.NumCtx = req.origNumCTX * p + if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) + *numParallel = p + return sgl + } } } return nil diff --git a/server/sched_test.go b/server/sched_test.go index 953288347..5e5913a7c 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -47,11 +47,11 @@ func TestLoad(t *testing.T) { sessionDuration: 2, } // Fail to load model first - s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) { + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return nil, fmt.Errorf("something failed to load model blah") } gpus := gpu.GpuInfoList{} - s.load(req, ggml, gpus) + s.load(req, ggml, gpus, 0) require.Empty(t, req.successCh) require.Len(t, req.errCh, 1) s.loadedMu.Lock() @@ -61,10 +61,10 @@ func TestLoad(t *testing.T) { require.Contains(t, err.Error(), "this model may be incompatible") server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} - s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) { + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return server, nil } - s.load(req, ggml, gpus) + s.load(req, 
ggml, gpus, 0) select { case err := <-req.errCh: require.NoError(t, err) @@ -78,12 +78,12 @@ func TestLoad(t *testing.T) { req.model.ModelPath = "dummy_model_path" server.waitResp = fmt.Errorf("wait failure") - s.load(req, ggml, gpus) + s.load(req, ggml, gpus, 0) select { case err := <-req.errCh: require.Contains(t, err.Error(), "wait failure") case resp := <-req.successCh: - t.Errorf("unexpected success %v", resp) + t.Fatalf("unexpected success %v", resp) } s.loadedMu.Lock() runner := s.loaded["dummy_model_path"] @@ -102,7 +102,7 @@ type bundle struct { ggml *llm.GGML } -func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) { +func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return scenario.srv, nil } @@ -200,7 +200,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario1a.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } // Same runner as first request due to not needing a reload @@ -213,7 +213,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario1b.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } // Trigger a reload @@ -231,7 +231,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario2a.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } envconfig.MaxRunners = 1 @@ -247,7 +247,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3a.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } s.loadedMu.Lock() require.Len(t, s.loaded, 1) @@ -263,7 +263,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3b.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } s.loadedMu.Lock() require.Len(t, s.loaded, 2) @@ -279,7 +279,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3c.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } s.loadedMu.Lock() require.Len(t, s.loaded, 3) @@ -306,7 +306,7 @@ func TestRequests(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, scenario3d.req.errCh) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } s.loadedMu.Lock() require.Len(t, s.loaded, 2) @@ -349,7 +349,7 @@ func TestGetRunner(t *testing.T) { require.Empty(t, s.pendingReqCh) require.Empty(t, errCh1a) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } scenario1a.ctxDone() s.loadedMu.Lock() @@ -400,7 +400,7 @@ func TestPrematureExpired(t *testing.T) { slog.Info("sending premature expired event now") s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } time.Sleep(scenario1a.req.sessionDuration) scenario1a.ctxDone() @@ -427,7 +427,7 @@ func TestUseLoadedRunner(t *testing.T) { } finished := make(chan *LlmRequest) llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} - r1 := &runnerRef{llama: llm1, sessionDuration: 1} + r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1} req.useLoadedRunner(r1, finished) require.Equal(t, uint(1), r1.refCount) require.Equal(t, time.Duration(2), r1.sessionDuration) @@ -435,7 +435,7 @@ func 
TestUseLoadedRunner(t *testing.T) { case success := <-req.successCh: require.Equal(t, r1, success) case <-ctx.Done(): - t.Errorf("timeout") + t.Fatal("timeout") } done() fin := <-finished @@ -461,8 +461,8 @@ func TestUpdateFreeSpace(t *testing.T) { gpus[1].FreeMemory = 1900 llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}} llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}} - r1 := &runnerRef{llama: llm1, gpus: gpus} - r2 := &runnerRef{llama: llm2, gpus: gpus} + r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1} + r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1} s := InitScheduler(ctx) s.loadedMu.Lock() @@ -513,8 +513,8 @@ func TestFindRunnerToUnload(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) defer done() - r1 := &runnerRef{refCount: 1, sessionDuration: 1} - r2 := &runnerRef{sessionDuration: 2} + r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1} + r2 := &runnerRef{sessionDuration: 2, numParallel: 1} s := InitScheduler(ctx) s.loadedMu.Lock() @@ -536,9 +536,13 @@ func TestNeedsReload(t *testing.T) { llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} do := api.DefaultOptions() runner := &runnerRef{ - model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}}, - Options: &do, - llama: llm, + model: &Model{ + AdapterPaths: []string{"adapter1"}, + ProjectorPaths: []string{"projector1"}, + }, + Options: &do, + llama: llm, + numParallel: 1, } req := &LlmRequest{ model: &Model{ @@ -581,8 +585,8 @@ func TestUnloadAllRunners(t *testing.T) { s := InitScheduler(ctx) s.unloadAllRunners() - r1 := &runnerRef{llama: llm1} - r2 := &runnerRef{llama: llm2} + r1 := &runnerRef{llama: llm1, numParallel: 1} + r2 := &runnerRef{llama: llm2, numParallel: 1} s.loadedMu.Lock() s.loaded["a"] = r1 @@ -596,14 +600,32 @@ func TestUnloadAllRunners(t *testing.T) { func TestUnload(t *testing.T) { llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} - r1 := &runnerRef{llama: llm1} - r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}} + r1 := &runnerRef{llama: llm1, numParallel: 1} + r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1} r1.unload() require.True(t, llm1.closeCalled) r2.unload() require.Nil(t, r2.model) } +func TestAlreadyCanceled(t *testing.T) { + ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer done() + dctx, done2 := context.WithCancel(ctx) + done2() + scenario1a := newScenario(t, dctx, "ollama-model-1", 10) + scenario1a.req.sessionDuration = 0 + s := InitScheduler(ctx) + slog.Info("scenario1a") + s.pendingReqCh <- scenario1a.req + require.Len(t, s.pendingReqCh, 1) + s.Run(ctx) + time.Sleep(5 * time.Millisecond) + require.Empty(t, s.pendingReqCh) + require.Empty(t, scenario1a.req.errCh) + require.Empty(t, scenario1a.req.successCh) +} + type mockLlm struct { pingResp error waitResp error From 9929751cc8b415e7b83d5151742dad734e8b5efc Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 13:35:38 -0700 Subject: [PATCH 02/18] Disable concurrency for AMD + Windows Until ROCm v6.2 ships, we won't be able to get accurate free memory reporting on Windows, which makes automatic concurrency too risky. Users can still opt in, but they will need to pay attention to model sizes; otherwise they may thrash/page VRAM or cause OOM crashes. All other platforms and GPUs have accurate VRAM reporting wired up now, so we can turn on concurrency by default. 
--- envconfig/config.go | 8 ++++---- gpu/amd_windows.go | 5 +++-- gpu/types.go | 5 +++++ server/sched.go | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index cb456448c..0f0f7f058 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar { "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, - "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default 4)"}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, - "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, @@ -129,8 +129,8 @@ func clean(key string) string { func init() { // default values - NumParallel = 0 - MaxRunners = 4 + NumParallel = 0 // Autoselect + MaxRunners = 0 // Autoselect MaxQueuedRequests = 512 LoadConfig() diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 21585277a..8b6fabebb 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -115,8 +115,6 @@ func AMDGetGPUInfo() []RocmGPUInfo { continue } - // TODO revisit this once ROCm v6 is available on windows. - // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) gpuInfo := RocmGPUInfo{ @@ -126,6 +124,9 @@ func AMDGetGPUInfo() []RocmGPUInfo { TotalMemory: totalMemory, FreeMemory: freeMemory, }, + // Free memory reporting on Windows is not reliable until we bump to ROCm v6.2 + UnreliableFreeMemory: true, + ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices DependencyPath: libDir, MinimumMemory: rocmMinimumMemory, diff --git a/gpu/types.go b/gpu/types.go index 9920db5ff..2eaa9bae9 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -29,6 +29,11 @@ type GpuInfo struct { // Extra environment variables specific to the GPU as list of [key,value] EnvWorkarounds [][2]string `json:"envs,omitempty"` + // Set to true if we can NOT reliably discover FreeMemory. 
A value of true indicates + // the FreeMemory is best effort, and may over or under report actual memory usage + // False indicates FreeMemory can generally be trusted on this GPU + UnreliableFreeMemory bool + // GPU information ID string `json:"gpu_id"` // string to use for selection of this specific GPU Name string `json:"name"` // user friendly name if available diff --git a/server/sched.go b/server/sched.go index 31ef560f5..de8c9d281 100644 --- a/server/sched.go +++ b/server/sched.go @@ -46,6 +46,16 @@ type Scheduler struct { reschedDelay time.Duration } +// Default automatic value for number of models we allow per GPU +// Model will still need to fit in VRAM, but loading many small models +// on a large GPU can cause stalling +var defaultModelsPerGPU = 3 + +// Default automatic value for parallel setting +// Model will still need to fit in VRAM. If this setting won't fit +// we'll back off down to 1 to try to get it to fit +var defaultParallel = 4 + var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded") func InitScheduler(ctx context.Context) *Scheduler { @@ -100,7 +110,6 @@ func (s *Scheduler) Run(ctx context.Context) { } func (s *Scheduler) processPending(ctx context.Context) { - maxRunnerFactor := 1 // number of GPUs or 1 for { select { case <-ctx.Done(): @@ -143,7 +152,7 @@ func (s *Scheduler) processPending(ctx context.Context) { pending.useLoadedRunner(runner, s.finishedReqCh) break } - } else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) { + } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) runnerToExpire = s.findRunnerToUnload() } else { @@ -155,7 +164,26 @@ func (s *Scheduler) processPending(ctx context.Context) { } else { gpus = s.getGpuFn() } - maxRunnerFactor = max(len(gpus), 1) + + if envconfig.MaxRunners <= 0 { + // No user specified MaxRunners, so figure out what automatic setting to use + // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs + // if any GPU has unreliable free memory reporting, 1x the number of GPUs + allReliable := true + for _, gpu := range gpus { + if gpu.UnreliableFreeMemory { + allReliable = false + break + } + } + if allReliable { + envconfig.MaxRunners = defaultModelsPerGPU * len(gpus) + slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus)) + } else { + slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency") + envconfig.MaxRunners = len(gpus) + } + } // Load model for fitting ggml, err := llm.LoadModel(pending.model.ModelPath) @@ -647,7 +675,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP var numParallelToTry []int if *numParallel <= 0 { // If no specific parallel setting was provided, try larger then smaller, always end with 1 - numParallelToTry = append(numParallelToTry, 4, 1) + numParallelToTry = append(numParallelToTry, defaultParallel, 1) } else { numParallelToTry = []int{*numParallel} } From 642cee13426c994f90d5a95376025fe9a223891a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 21 Jun 2024 15:59:41 -0700 Subject: [PATCH 03/18] Sort the ps output Provide consistent ordering for the ps command - longest duration listed first --- server/routes.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/routes.go b/server/routes.go 
index 3d112e9f1..a7f72edc2 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1224,6 +1224,11 @@ func (s *Server) ProcessHandler(c *gin.Context) { models = append(models, mr) } + slices.SortStableFunc(models, func(i, j api.ProcessModelResponse) int { + // longest duration remaining listed first + return cmp.Compare(j.ExpiresAt.Unix(), i.ExpiresAt.Unix()) + }) + c.JSON(http.StatusOK, api.ProcessResponse{Models: models}) } From 4e986a823ca47eb16f563d15a6fe4cc393a00715 Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 27 Jun 2024 10:59:15 -0700 Subject: [PATCH 04/18] unquote, trimp space --- parser/parser.go | 9 ++++++++- parser/parser_test.go | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/parser/parser.go b/parser/parser.go index 686a1e695..fa60ebc0f 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -125,6 +125,7 @@ func ParseFile(r io.Reader) (*File, error) { // pass case stateValue: s, ok := unquote(b.String()) + if !ok || isSpace(r) { if _, err := b.WriteRune(r); err != nil { return nil, err @@ -158,7 +159,13 @@ func ParseFile(r io.Reader) (*File, error) { case stateComment, stateNil: // pass; nothing to flush case stateValue: - s, ok := unquote(b.String()) + var s string + var ok bool + if cmd.Name == "model" { + s, ok = unquote(strings.TrimSpace(b.String())) + } else { + s, ok = unquote(b.String()) + } if !ok { return nil, io.ErrUnexpectedEOF } diff --git a/parser/parser_test.go b/parser/parser_test.go index 7123e53bf..35556515d 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -48,6 +48,26 @@ func TestParseFileFrom(t *testing.T) { expected []Command err error }{ + { + "FROM \"FOO BAR \"", + []Command{{Name: "model", Args: "FOO BAR "}}, + nil, + }, + { + "FROM \"FOO BAR\"\nPARAMETER param1 value1", + []Command{{Name: "model", Args: "FOO BAR"}, {Name: "param1", Args: "value1"}}, + nil, + }, + { + "FROM FOOO BAR ", + []Command{{Name: "model", Args: "FOOO BAR"}}, + nil, + }, + { + "FROM /what/is/the path ", + []Command{{Name: "model", Args: "/what/is/the path"}}, + nil, + }, { "FROM foo", []Command{{Name: "model", Args: "foo"}}, @@ -86,6 +106,11 @@ func TestParseFileFrom(t *testing.T) { []Command{{Name: "param1", Args: "value1"}, {Name: "model", Args: "foo"}}, nil, }, + { + "PARAMETER what the \nFROM lemons make lemonade ", + []Command{{Name: "what", Args: "the "}, {Name: "model", Args: "lemons make lemonade"}}, + nil, + }, } for _, c := range cases { From 9bd00041fa1c82881299f34a5950f9edc2a7e66c Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 27 Jun 2024 11:18:38 -0700 Subject: [PATCH 05/18] trim all params --- parser/parser.go | 11 ++--------- parser/parser_test.go | 4 ++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index fa60ebc0f..7f566da4e 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -124,8 +124,7 @@ func ParseFile(r io.Reader) (*File, error) { case stateComment, stateNil: // pass case stateValue: - s, ok := unquote(b.String()) - + s, ok := unquote(strings.TrimSpace(b.String())) if !ok || isSpace(r) { if _, err := b.WriteRune(r); err != nil { return nil, err @@ -159,13 +158,7 @@ func ParseFile(r io.Reader) (*File, error) { case stateComment, stateNil: // pass; nothing to flush case stateValue: - var s string - var ok bool - if cmd.Name == "model" { - s, ok = unquote(strings.TrimSpace(b.String())) - } else { - s, ok = unquote(b.String()) - } + s, ok := unquote(strings.TrimSpace(b.String())) if !ok { return nil, io.ErrUnexpectedEOF } diff --git 
a/parser/parser_test.go b/parser/parser_test.go index 35556515d..3dc592239 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -108,7 +108,7 @@ func TestParseFileFrom(t *testing.T) { }, { "PARAMETER what the \nFROM lemons make lemonade ", - []Command{{Name: "what", Args: "the "}, {Name: "model", Args: "lemons make lemonade"}}, + []Command{{Name: "what", Args: "the"}, {Name: "model", Args: "lemons make lemonade"}}, nil, }, } @@ -424,7 +424,7 @@ func TestParseFileParameters(t *testing.T) { "mirostat_eta 1.0": {"mirostat_eta", "1.0"}, "penalize_newline true": {"penalize_newline", "true"}, "stop ### User:": {"stop", "### User:"}, - "stop ### User: ": {"stop", "### User: "}, + "stop ### User: ": {"stop", "### User:"}, "stop \"### User:\"": {"stop", "### User:"}, "stop \"### User: \"": {"stop", "### User: "}, "stop \"\"\"### User:\"\"\"": {"stop", "### User:"}, From 6d4219083c56ec4b031f0fda67e9ef2c09ad9888 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Fri, 28 Jun 2024 09:58:14 -0700 Subject: [PATCH 06/18] Update docs (#5312) --- docs/openai.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/openai.md b/docs/openai.md index 59e7d6405..81b967eb7 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \ #### Notes -- `finish_reason` will always be `stop` - `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached ## Models From b910fa90101038d09ca9cbbea16701831fafaffb Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:30:16 -0700 Subject: [PATCH 07/18] Ollama Show: Check for Projector Type (#5307) * Check exists projtype * Maintain Ordering --- cmd/cmd.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 909e8e4b2..debb39218 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -672,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error { projectorData := [][]string{ {"arch", "clip"}, {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}, - {"projector type", resp.ProjectorInfo["clip.projector_type"].(string)}, - {"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))}, - {"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))}, } + if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok { + projectorData = append(projectorData, []string{"projector type", projectorType.(string)}) + } + + projectorData = append(projectorData, + []string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))}, + []string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))}, + ) + mainTableData = append(mainTableData, []string{"Projector"}, []string{renderSubTable(projectorData, false)}, From 5f034f5b63cab3a5eb61104118727b088cceea21 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Fri, 28 Jun 2024 13:15:52 -0700 Subject: [PATCH 08/18] Include Show Info in Interactive (#5342) --- cmd/cmd.go | 24 +++++++++++------------- cmd/interactive.go | 10 +--------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index debb39218..c898c7db6 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -624,13 +624,13 @@ func ShowHandler(cmd 
*cobra.Command, args []string) error { return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified") } - if flagsSet == 1 { - req := api.ShowRequest{Name: args[0]} - resp, err := client.Show(cmd.Context(), &req) - if err != nil { - return err - } + req := api.ShowRequest{Name: args[0]} + resp, err := client.Show(cmd.Context(), &req) + if err != nil { + return err + } + if flagsSet == 1 { switch showType { case "license": fmt.Println(resp.License) @@ -647,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return nil } - req := api.ShowRequest{Name: args[0]} - resp, err := client.Show(cmd.Context(), &req) - if err != nil { - return err - } + showInfo(resp) + return nil +} + +func showInfo(resp *api.ShowResponse) { arch := resp.ModelInfo["general.architecture"].(string) modelData := [][]string{ @@ -711,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error { } table.Render() - - return nil } func renderSubTable(data [][]string, file bool) string { diff --git a/cmd/interactive.go b/cmd/interactive.go index 0a2f429b6..9214f2db5 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -404,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { switch args[1] { case "info": - fmt.Println("Model details:") - if len(resp.Details.Families) > 0 { - fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", ")) - } else if resp.Details.Family != "" { - fmt.Printf("Family %s\n", resp.Details.Family) - } - fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize) - fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel) - fmt.Println("") + showInfo(resp) case "license": if resp.License == "" { fmt.Println("No license was specified for this model.") From aae56abb7cc96b8495a1c761a08b92cfd136d9d2 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 28 Jun 2024 13:15:57 -0700 Subject: [PATCH 09/18] Document concurrent behavior and settings --- docs/faq.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/faq.md b/docs/faq.md index b50a3138c..841f1d13d 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -257,3 +257,17 @@ If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` AP ## How do I manage the maximum number of requests the Ollama server can queue? If too many requests are sent to the server, it will respond with a 503 error indicating the server is overloaded. You can adjust how many requests may be queue by setting `OLLAMA_MAX_QUEUE`. + +## How does Ollama handle concurrent requests? + +Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing. + +If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads. + +Parallel request processing for a given model results in increasing the context size by the number of parallel requests. 
For example, a 2K context with 4 parallel requests will result in an 8K context and additional memory allocation. + +The following server settings may be used to adjust how Ollama handles concurrent requests: + +- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 * the number of GPUs or 3 for CPU inference. +- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory. +- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512 From 717f7229eb4f9220d4070aae617923950643d327 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 28 Jun 2024 19:39:31 -0700 Subject: [PATCH 10/18] Do not shift context for sliding window models (#5368) * Do not shift context for sliding window models * truncate prompt > 2/3 tokens * only target gemma2 --- llm/ext_server/server.cpp | 46 +++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 492126a4f..3bc012521 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1650,26 +1650,41 @@ struct llama_server_context } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + char buf[256]; + llama_model_meta_val_str(model, "general.architecture", buf, 256); + bool gemma2 = strcmp(buf, "gemma2") == 0; + + int32_t truncate_at = slot.n_ctx; + + // truncate at 2/3 of the context length for gemma2 models + // as they do not support context shifts (from the sliding window implementation). + // this way, prompts that almost fit the context length can still generate a full + // response without a sudden stop from hitting the context limit + if (gemma2) { + truncate_at = 2 * slot.n_ctx / 3; + } + // if input prompt is too big, truncate it, if group attention self-extend is disabled - if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) + if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at) { const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + const int n_shift = n_left / 2; + const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift; std::vector new_tokens( prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); new_tokens.insert( new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, + prompt_tokens.begin() + slot.params.n_keep + n_erase, prompt_tokens.end()); - LOG_VERBOSE("input truncated", { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + LOG_INFO("input truncated", { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"n_shift", n_shift}, + {"n_erase", n_erase}, }); slot.truncated = true; prompt_tokens = new_tokens; @@ -1678,6 +1693,19 @@ struct llama_server_context GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } + // Models with sliding window attention do not work with context shifts, so + // limit their prediction to the context length + if (gemma2) { + int32_t limit = slot.n_ctx - slot.n_prompt_tokens; + slot.n_predict = limit; + slot.params.n_predict = limit; + LOG_INFO("model does 
not support sliding window, limiting generation", { + {"n_ctx", slot.n_ctx}, + {"n_prompt_tokens", slot.n_prompt_tokens}, + {"n_predict", slot.n_predict} + }); + } + if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); From c1218199cfe82eda35f5e4a8031eee28f01ebf75 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 29 Jun 2024 16:22:49 -0700 Subject: [PATCH 11/18] Update api.md --- docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 107b5211f..c577bb1a5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -26,7 +26,7 @@ All durations are returned in nanoseconds. ### Streaming responses -Certain endpoints stream responses as JSON objects and can optional return non-streamed responses. +Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing `{"stream": false}` for these endpoints. ## Generate a completion From 27402cb7a28555a3efcaa5af054b1ce2d18e5442 Mon Sep 17 00:00:00 2001 From: Eduard Date: Mon, 1 Jul 2024 03:48:51 +0200 Subject: [PATCH 12/18] Update gpu.md (#5382) Runs fine on a NVIDIA GeForce GTX 1050 Ti --- docs/gpu.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gpu.md b/docs/gpu.md index 55c41c9de..80f276c3b 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -18,7 +18,7 @@ Check your compute compatibility to see if your card is supported: | | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` | | 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` | | 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` | -| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050` | +| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050` | | | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` | | | Tesla | `P40` `P4` | | 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` | From 1963c00201958da7165a40f9d2f22b28e11be718 Mon Sep 17 00:00:00 2001 From: RAPID ARCHITECT <126218667+rapidarchitect@users.noreply.github.com> Date: Sun, 30 Jun 2024 21:00:57 -0500 Subject: [PATCH 13/18] Update README.md (#5214) * Update README.md Added Mesop example to web & desktop * Update README.md --------- Co-authored-by: Jeffrey Morgan --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 72ed8fa5e..62f5cd65c 100644 --- a/README.md +++ b/README.md @@ -292,6 +292,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama) - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) +- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) ### Terminal From 26e4e66faff20a94bb8fee9ec2bc3e17a07fb19e Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Mon, 1 Jul 2024 09:43:49 -0700 Subject: [PATCH 14/18] updated parsefile test --- parser/parser_test.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/parser/parser_test.go b/parser/parser_test.go index 3dc592239..171bd4206 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -22,7 +22,13 @@ ADAPTER adapter1 LICENSE MIT PARAMETER param1 value1 PARAMETER param2 value2 -TEMPLATE template1 +TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ .Response }}<|eot_id|>""" ` reader := strings.NewReader(input) @@ -36,7 +42,7 @@ TEMPLATE template1 {Name: "license", Args: "MIT"}, {Name: "param1", Args: "value1"}, {Name: "param2", Args: "value2"}, - {Name: "template", Args: "template1"}, + {Name: "template", Args: "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>"}, } assert.Equal(t, expectedCommands, modelfile.Commands) From cff3f44f4a4097de864d70d9a95f31c62e8ecdfa Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 1 Jul 2024 09:43:59 -0700 Subject: [PATCH 15/18] Fix case for NumCtx --- server/sched.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/server/sched.go b/server/sched.go index 87da1db47..71b535ae2 100644 --- a/server/sched.go +++ b/server/sched.go @@ -23,7 +23,7 @@ type LlmRequest struct { ctx context.Context //nolint:containedctx model *Model opts api.Options - origNumCTX int // Track the initial ctx request + origNumCtx int // Track the initial ctx request sessionDuration time.Duration successCh chan *runnerRef errCh chan error @@ -118,8 +118,8 @@ func (s *Scheduler) processPending(ctx context.Context) { case pending := <-s.pendingReqCh: // Block other requests until we get this pending request running pending.schedAttempts++ - if pending.origNumCTX == 0 { - pending.origNumCTX = pending.opts.NumCtx + if pending.origNumCtx == 0 { + pending.origNumCtx = pending.opts.NumCtx } if pending.ctx.Err() != nil { @@ -135,7 +135,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Keep NumCtx and numParallel in sync if numParallel > 1 { - pending.opts.NumCtx = pending.origNumCTX * numParallel + pending.opts.NumCtx = pending.origNumCtx * numParallel } for { @@ -197,7 +197,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // simplifying assumption of defaultParallel when in CPU mode if numParallel <= 0 { numParallel = defaultParallel - pending.opts.NumCtx = pending.origNumCTX * numParallel + pending.opts.NumCtx = pending.origNumCtx * numParallel } if loadedCount == 0 { @@ -691,7 +691,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP // First attempt to fit the 
model into a single GPU for _, p := range numParallelToTry { - req.opts.NumCtx = req.origNumCTX * p + req.opts.NumCtx = req.origNumCtx * p if !envconfig.SchedSpread { for _, g := range sgl { if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { @@ -709,7 +709,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP // Now try all the GPUs for _, p := range numParallelToTry { - req.opts.NumCtx = req.origNumCTX * p + req.opts.NumCtx = req.origNumCtx * p if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) *numParallel = p From 173b5504381a77b042f3957226a23c0569406aca Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 1 Jul 2024 09:48:05 -0700 Subject: [PATCH 16/18] Remove default auto from help message This may confuse users thinking "auto" is an acceptable string - it must be numeric --- envconfig/config.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 0f0f7f058..c02c4878e 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar { "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, - "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, - "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, From 7e571f95f0306f90e4f754e34df96ebc36f93626 Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Mon, 1 Jul 2024 11:07:48 -0700 Subject: [PATCH 17/18] trimspace test case --- parser/parser_test.go | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/parser/parser_test.go b/parser/parser_test.go index 171bd4206..2b5c4c888 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -48,6 +48,39 @@ TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> assert.Equal(t, expectedCommands, modelfile.Commands) } +func TestParseFileTrimSpace(t *testing.T) { + input := ` +FROM " model 1" 
+ADAPTER adapter3 +LICENSE "MIT " +PARAMETER param1 value1 +PARAMETER param2 value2 +TEMPLATE """ {{ if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ .Response }}<|eot_id|> """ +` + + reader := strings.NewReader(input) + + modelfile, err := ParseFile(reader) + require.NoError(t, err) + + expectedCommands := []Command{ + {Name: "model", Args: " model 1"}, + {Name: "adapter", Args: "adapter3"}, + {Name: "license", Args: "MIT "}, + {Name: "param1", Args: "value1"}, + {Name: "param2", Args: "value2"}, + {Name: "template", Args: " {{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|> "}, + } + + assert.Equal(t, expectedCommands, modelfile.Commands) +} + func TestParseFileFrom(t *testing.T) { var cases = []struct { input string From 33a65e3ba3ad5666d6ba8430efbccfa6d642d1de Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Mon, 1 Jul 2024 16:04:13 -0700 Subject: [PATCH 18/18] error --- llm/server.go | 3 +++ llm/status.go | 1 + 2 files changed, 4 insertions(+) diff --git a/llm/server.go b/llm/server.go index 61346069e..8b63cfbd5 100644 --- a/llm/server.go +++ b/llm/server.go @@ -560,6 +560,9 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { if s.status != nil && s.status.LastErrMsg != "" { msg = s.status.LastErrMsg } + if strings.Contains(msg, "unknown model") { + return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade") + } return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) default: } diff --git a/llm/status.go b/llm/status.go index 8a49bd55a..0f56b7f99 100644 --- a/llm/status.go +++ b/llm/status.go @@ -25,6 +25,7 @@ var errorPrefixes = []string{ "CUDA error", "cudaMalloc failed", "\"ERR\"", + "architecture", } func (w *StatusWriter) Write(b []byte) (int, error) {