From 8a7e2055d2196df23e86ffe813c1e9287e18068e Mon Sep 17 00:00:00 2001 From: fengyuchuanshen Date: Fri, 12 Sep 2025 00:57:31 +0800 Subject: [PATCH 1/7] cmd: use slices.Contains to simplify code (#12249) --- cmd/cmd.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 8fe068655..19f1e192f 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -56,10 +56,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) if err != nil { return } - for _, cap := range resp.Capabilities { - if cap == model.CapabilityThinking { - return - } + if slices.Contains(resp.Capabilities, model.CapabilityThinking) { + return } fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name) } From feb18cd710dec1e4754ea56124238a11eb3cb90a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 11 Sep 2025 10:36:10 -0700 Subject: [PATCH 2/7] feat: add dimensions field to embed requests (#12242) * feat: add field to truncate embeddings * add openai embeddings for dimensions --- api/types.go | 4 ++++ docs/api.md | 1 + openai/openai.go | 7 ++++--- server/routes.go | 13 +++++++------ 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/api/types.go b/api/types.go index d3f6fc5a4..a7ddbc373 100644 --- a/api/types.go +++ b/api/types.go @@ -388,8 +388,12 @@ type EmbedRequest struct { // this request. KeepAlive *Duration `json:"keep_alive,omitempty"` + // Truncate truncates the input to fit the model's max sequence length. Truncate *bool `json:"truncate,omitempty"` + // Dimensions truncates the output embedding to the specified dimension. + Dimensions int `json:"dimensions,omitempty"` + // Options lists model-specific options. Options map[string]any `json:"options"` } diff --git a/docs/api.md b/docs/api.md index f11d59ed1..f47af63c6 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1708,6 +1708,7 @@ Advanced parameters: - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. 
Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) +- `dimensions`: number of dimensions for the embedding ### Examples diff --git a/openai/openai.go b/openai/openai.go index 9c7c41cb4..b6a8a95e2 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -76,8 +76,9 @@ type JsonSchema struct { } type EmbedRequest struct { - Input any `json:"input"` - Model string `json:"model"` + Input any `json:"input"` + Model string `json:"model"` + Dimensions int `json:"dimensions,omitempty"` } type StreamOptions struct { @@ -1005,7 +1006,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc { } var b bytes.Buffer - if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil { + if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error())) return } diff --git a/server/routes.go b/server/routes.go index ac4df4a46..8dd1b217a 100644 --- a/server/routes.go +++ b/server/routes.go @@ -558,7 +558,12 @@ func (s *Server) EmbedHandler(c *gin.Context) { if err != nil { return err } - embeddings[i] = normalize(embedding) + // TODO: this first normalization should be done by the model + embedding = normalize(embedding) + if req.Dimensions > 0 && req.Dimensions < len(embedding) { + embedding = normalize(embedding[:req.Dimensions]) + } + embeddings[i] = embedding return nil }) } @@ -584,11 +589,7 @@ func normalize(vec []float32) []float32 { sum += v * v } - norm := float32(0.0) - if sum > 0 { - norm = float32(1.0 / math.Sqrt(float64(sum))) - } - + norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12)) for i := range vec { vec[i] *= norm } From eb10390de96ad6f5c21bc9e61f6cd222405f627a Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 11 Sep 2025 10:30:18 -0700 Subject: [PATCH 3/7] llm: Enable new memory estimates by default New memory estimates (see #11090 for more information) are now enabled automatically for all models running on the Ollama engine, improving both stability and performance through more accurate sizing and allocation. Models running on the llama engine will continue to use the original style of memory estimation. 
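
For reference, the resulting selection in NewLlamaServer collapses to the engine check itself; a condensed sketch of the hunk below (same names as the diff, nothing new added):

    // Models that constructed an Ollama-engine text processor get the new
    // memory estimates via ollamaServer; everything else stays on the llama
    // engine and keeps the original estimation via llamaServer.
    if textProcessor != nil {
        return &ollamaServer{llmServer: s}, nil
    }
    return &llamaServer{llmServer: s, ggml: f}, nil
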
--- envconfig/config.go | 3 --- llm/server.go | 7 +------ 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 868813ae8..7fc018870 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -185,8 +185,6 @@ var ( ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096) // Auth enables authentication between the Ollama client and server UseAuth = Bool("OLLAMA_AUTH") - // Enable the new memory estimation logic - NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES") ) func String(s string) func() string { @@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, - "OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"}, // Informational "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, diff --git a/llm/server.go b/llm/server.go index a22ae9722..5caf19875 100644 --- a/llm/server.go +++ b/llm/server.go @@ -162,11 +162,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } } - newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates() - if newEstimates { - slog.Info("enabling new memory estimates") - } - // Verify the requested context size is <= the model training size trainCtx := f.KV().ContextLength() if opts.NumCtx > int(trainCtx) && trainCtx > 0 { @@ -434,7 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } }() - if newEstimates { + if textProcessor != nil { return &ollamaServer{llmServer: s}, nil } else { return &llamaServer{llmServer: s, ggml: f}, nil From aba157531521192a04d09811fac3cda20e1a8340 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 10 Sep 2025 11:03:06 -0700 Subject: [PATCH 4/7] llm: Don't try to load split vision models in the Ollama engine If a model with a split vision projector is loaded in the Ollama engine, the projector will be ignored and the model will hallucinate a response. Instead, fallback and try to load the model in the llama engine. 
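
A condensed sketch of the check added below (same names as the hunk); the effect is that a separate projector file keeps textProcessor nil, so the existing compatibility fallback to the llama engine takes over instead of silently dropping the projector:

    // Only attempt the Ollama engine's text processor when the model has no
    // separate projector file; otherwise report it as unsupported so the
    // llama-engine fallback path handles it.
    if len(projectors) == 0 {
        textProcessor, err = model.NewTextProcessor(modelPath)
    } else {
        err = errors.New("split vision models aren't supported")
    }
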
--- llm/server.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 5caf19875..9100b6978 100644 --- a/llm/server.go +++ b/llm/server.go @@ -149,7 +149,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a var textProcessor model.TextProcessor var err error if envconfig.NewEngine() || f.KV().OllamaEngineRequired() { - textProcessor, err = model.NewTextProcessor(modelPath) + if len(projectors) == 0 { + textProcessor, err = model.NewTextProcessor(modelPath) + } else { + err = errors.New("split vision models aren't supported") + } if err != nil { // To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err) From 61fb912ca46fe902180892316f6cc34adda07b67 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 11 Sep 2025 12:25:26 -0700 Subject: [PATCH 5/7] CI: fix windows cuda build (#12246) * ci: adjust cuda component list v13 has a different breakdown of the components required to build ollama * review comments --- .github/workflows/release.yaml | 15 ++++++++++++++- .github/workflows/test.yaml | 12 +++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 902fa9ccc..fc3cde9c9 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -65,6 +65,11 @@ jobs: arch: amd64 preset: 'CUDA 12' install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' cuda-version: '12.8' flags: '' runner_dir: 'cuda_v12' @@ -72,6 +77,14 @@ jobs: arch: amd64 preset: 'CUDA 13' install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' + - '"crt"' + - '"nvvm"' + - '"nvptxcompiler"' cuda-version: '13.0' flags: '' runner_dir: 'cuda_v13' @@ -105,7 +118,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait } diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a10ad37a9..e470540a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -80,6 +80,15 @@ jobs: - preset: CUDA install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' + - '"crt"' + - '"nvvm"' + - '"nvptxcompiler"' + cuda-version: '13.0' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes 
-Wno-deprecated-pragma"' @@ -102,7 +111,8 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_13.0", "nvcc_13.0", "cublas_13.0", "cublas_dev_13.0")) -NoNewWindow -Wait + $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path From 26214125e86ac1d4512dff68c983137589cfddbf Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 11 Sep 2025 13:48:51 -0700 Subject: [PATCH 6/7] ollamarunner: Suppress stack trace during memory allocation Allocation failures can be a normal part of new memory estimates, so we shouldn't print a stack trace in this case. --- runner/ollamarunner/runner.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 201d55a16..676e5186f 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -18,7 +18,6 @@ import ( "reflect" "regexp" "runtime" - "runtime/debug" "strconv" "strings" "sync" @@ -1101,9 +1100,13 @@ func (s *Server) allocModel( // Convert memory allocation panics to errors defer func() { if r := recover(); r != nil { - debug.PrintStack() if err, ok := r.(error); ok { - panicErr = err + var noMem ml.ErrNoMem + if errors.As(err, &noMem) { + panicErr = noMem + } else { + panic(r) + } } else { panic(r) } From e4ce68311a64310ece5534ae3a4820b20ea3d42f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Sep 2025 07:59:14 -0700 Subject: [PATCH 7/7] cuda: remove compression for better compatibility (#12259) This retains compatibility with driver 531 and up at the trade-off of space. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cce5e4b1..198fcdeb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ set(GGML_LLAMAFILE ON) set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128) set(GGML_CUDA_GRAPHS ON) set(GGML_CUDA_FA ON) -set(GGML_CUDA_COMPRESSION_MODE size) +set(GGML_CUDA_COMPRESSION_MODE default) if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
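
Note for downstream builders: if the smaller CUDA binaries matter more than supporting older drivers, the previous behavior can presumably be restored with a local override before the GGML targets are configured (a sketch, not part of this patch):

    # Local override sketch (assumption): trade driver compatibility back for
    # smaller CUDA fatbins by restoring the previous compression mode.
    set(GGML_CUDA_COMPRESSION_MODE size)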