diff --git a/cmd/cmd.go b/cmd/cmd.go
index 35074ad2b..b63dd5948 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1876,6 +1876,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_CONTEXT_LENGTH"],
 				envVars["OLLAMA_KEEP_ALIVE"],
 				envVars["OLLAMA_MAX_LOADED_MODELS"],
+				envVars["OLLAMA_NO_MODEL_EVICT"],
 				envVars["OLLAMA_MAX_QUEUE"],
 				envVars["OLLAMA_MODELS"],
 				envVars["OLLAMA_NUM_PARALLEL"],
diff --git a/docs/faq.mdx b/docs/faq.mdx
index 4237da41f..056bec41c 100644
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -312,6 +312,7 @@ Parallel request processing for a given model results in increasing the context
 The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
 
 - `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 \* the number of GPUs or 3 for CPU inference.
+- `OLLAMA_NO_MODEL_EVICT` - If set to `1`, Ollama will not unload already loaded models to make room for a new request. Requests that would require unloading another model will return an error instead.
 - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 
diff --git a/envconfig/config.go b/envconfig/config.go
index 238e5e6e1..984a464de 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -198,6 +198,8 @@ var (
 	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
 	// MultiUserCache optimizes prompt caching for multi-user scenarios
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
+	// NoEvict prevents unloading currently loaded models to make room for a new model.
+	NoEvict = Bool("OLLAMA_NO_MODEL_EVICT")
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
@@ -285,6 +287,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
 		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NO_MODEL_EVICT": {"OLLAMA_NO_MODEL_EVICT", NoEvict(), "Prevent unloading loaded models to make room for new models"},
 		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
diff --git a/server/sched.go b/server/sched.go
index c5bc6692d..2064851b4 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -62,6 +62,7 @@ type Scheduler struct {
 var defaultModelsPerGPU = 3
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
+var ErrEvictionDisabled = errors.New("unloading existing models is disabled (set OLLAMA_NO_MODEL_EVICT=0 to allow evictions)")
 
 func InitScheduler(ctx context.Context) *Scheduler {
 	maxQueue := envconfig.MaxQueue()
@@ -129,6 +130,7 @@ func (s *Scheduler) Run(ctx context.Context) {
 
 func (s *Scheduler) processPending(ctx context.Context) {
 	maxRunners := envconfig.MaxRunners()
+	preventEvictions := envconfig.NoEvict()
 
 	for {
 		select {
@@ -167,6 +169,13 @@
 					break
 				}
 			} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
+				if preventEvictions {
+					err := fmt.Errorf("%w: maximum loaded models reached while loading %s", ErrEvictionDisabled, pending.model.ModelPath)
+					slog.Info("skipping eviction because OLLAMA_NO_MODEL_EVICT is set", "runner_count", loadedCount, "model", pending.model.ModelPath)
+					pending.errCh <- err
+					s.abortActiveLoading()
+					break
+				}
 				slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 				runnerToExpire = s.findRunnerToUnload()
 			} else {
@@ -222,6 +231,14 @@
 					break
 				}
 
+				if preventEvictions {
+					err := fmt.Errorf("%w: unable to load %s without unloading a loaded model", ErrEvictionDisabled, pending.model.ModelPath)
+					slog.Info("skipping eviction because OLLAMA_NO_MODEL_EVICT is set", "model", pending.model.ModelPath)
+					pending.errCh <- err
+					s.abortActiveLoading()
+					break
+				}
+
 				runnerToExpire = s.findRunnerToUnload()
 			}
 
@@ -847,6 +864,16 @@ func (s *Scheduler) unloadAllRunners() {
 	}
 }
 
+func (s *Scheduler) abortActiveLoading() {
+	s.loadedMu.Lock()
+	defer s.loadedMu.Unlock()
+
+	if s.activeLoading != nil {
+		s.activeLoading.Close()
+		s.activeLoading = nil
+	}
+}
+
 func (s *Scheduler) expireRunner(model *Model) {
 	s.loadedMu.Lock()
 	runner, ok := s.loaded[model.ModelPath]
diff --git a/server/sched_test.go b/server/sched_test.go
index 480aafa4e..de9e5a422 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -634,6 +634,67 @@ func TestSchedFindRunnerToUnload(t *testing.T) {
 	require.Equal(t, r1, resp)
 }
 
+func TestSchedNoEvictPreventsUnload(t *testing.T) {
+	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
+	t.Setenv("OLLAMA_NO_MODEL_EVICT", "1")
+
+	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	defer done()
+
+	first := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: time.Second}, nil)
+	second := newScenarioRequest(t, ctx, "ollama-model-2", 10, &api.Duration{Duration: time.Second}, nil)
+
+	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
+	s.getGpuFn = getGpuFn
+	s.getSystemInfoFn = getSystemInfoFn
+	servers := map[string]*mockLlm{
+		first.req.model.ModelPath:  first.srv,
+		second.req.model.ModelPath: second.srv,
+	}
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		if srv, ok := servers[model]; ok {
+			srv.modelPath = model
+			return srv, nil
+		}
+		return nil, errors.New("unexpected model")
+	}
+
+	s.Run(ctx)
+
+	firstSuccess, firstErr := s.GetRunner(first.ctx, first.req.model, first.req.opts, first.req.sessionDuration)
+	select {
+	case resp := <-firstSuccess:
+		require.Equal(t, first.srv, resp.llama)
+		require.Empty(t, firstErr)
+	case err := <-firstErr:
+		t.Fatalf("unexpected error: %s", err)
+	case <-ctx.Done():
+		t.Fatal("timeout waiting for first runner")
+	}
+
+	secondSuccess, secondErr := s.GetRunner(second.ctx, second.req.model, second.req.opts, second.req.sessionDuration)
+	select {
+	case <-secondSuccess:
+		t.Fatal("expected eviction to be blocked")
+	case err := <-secondErr:
+		require.ErrorContains(t, err, "OLLAMA_NO_MODEL_EVICT")
+	case <-ctx.Done():
+		t.Fatal("timeout waiting for eviction error")
+	}
+
+	s.loadedMu.Lock()
+	require.Len(t, s.loaded, 1)
+	_, ok := s.loaded[first.req.model.ModelPath]
+	s.loadedMu.Unlock()
+	require.True(t, ok)
+	require.False(t, first.srv.closeCalled)
+	require.False(t, second.srv.closeCalled)
+
+	first.ctxDone()
+	time.Sleep(10 * time.Millisecond)
+}
+
 func TestSchedNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
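
Illustrative usage note (not part of the patch): with the server started under OLLAMA_MAX_LOADED_MODELS=1 and OLLAMA_NO_MODEL_EVICT=1, a request for a second model should now fail instead of evicting the first. The following is a minimal client-side sketch of that expectation using the public github.com/ollama/ollama/api client; the model names are placeholders and the exact error text surfaced over HTTP may differ from the scheduler's internal message.

package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/ollama/ollama/api"
)

func main() {
	// Assumes `ollama serve` is running with OLLAMA_MAX_LOADED_MODELS=1 and
	// OLLAMA_NO_MODEL_EVICT=1. Model names below are placeholders.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	generate := func(model string) error {
		return client.Generate(context.Background(), &api.GenerateRequest{
			Model:  model,
			Prompt: "hello",
		}, func(api.GenerateResponse) error { return nil })
	}

	// The first request loads a model as usual.
	if err := generate("model-a"); err != nil {
		log.Fatal(err)
	}

	// With eviction disabled and the loaded-model limit already reached, the
	// second model is expected to be rejected rather than evicting the first.
	if err := generate("model-b"); err != nil && strings.Contains(err.Error(), "OLLAMA_NO_MODEL_EVICT") {
		fmt.Println("second model rejected as expected:", err)
	}
}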