From 8a7e2055d2196df23e86ffe813c1e9287e18068e Mon Sep 17 00:00:00 2001 From: fengyuchuanshen Date: Fri, 12 Sep 2025 00:57:31 +0800 Subject: [PATCH 1/7] cmd: use slices.Contains to simplify code (#12249) --- cmd/cmd.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 8fe068655..19f1e192f 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -56,10 +56,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) if err != nil { return } - for _, cap := range resp.Capabilities { - if cap == model.CapabilityThinking { - return - } + if slices.Contains(resp.Capabilities, model.CapabilityThinking) { + return } fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name) } From feb18cd710dec1e4754ea56124238a11eb3cb90a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 11 Sep 2025 10:36:10 -0700 Subject: [PATCH 2/7] feat: add dimensions field to embed requests (#12242) * feat: add field to truncate embeddings * add openai embeddings for dimensions --- api/types.go | 4 ++++ docs/api.md | 1 + openai/openai.go | 7 ++++--- server/routes.go | 13 +++++++------ 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/api/types.go b/api/types.go index d3f6fc5a4..a7ddbc373 100644 --- a/api/types.go +++ b/api/types.go @@ -388,8 +388,12 @@ type EmbedRequest struct { // this request. KeepAlive *Duration `json:"keep_alive,omitempty"` + // Truncate truncates the input to fit the model's max sequence length. Truncate *bool `json:"truncate,omitempty"` + // Dimensions truncates the output embedding to the specified dimension. + Dimensions int `json:"dimensions,omitempty"` + // Options lists model-specific options. Options map[string]any `json:"options"` } diff --git a/docs/api.md b/docs/api.md index f11d59ed1..f47af63c6 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1708,6 +1708,7 @@ Advanced parameters: - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. 
Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) +- `dimensions`: number of dimensions for the embedding ### Examples diff --git a/openai/openai.go b/openai/openai.go index 9c7c41cb4..b6a8a95e2 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -76,8 +76,9 @@ type JsonSchema struct { } type EmbedRequest struct { - Input any `json:"input"` - Model string `json:"model"` + Input any `json:"input"` + Model string `json:"model"` + Dimensions int `json:"dimensions,omitempty"` } type StreamOptions struct { @@ -1005,7 +1006,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc { } var b bytes.Buffer - if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil { + if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error())) return } diff --git a/server/routes.go b/server/routes.go index ac4df4a46..8dd1b217a 100644 --- a/server/routes.go +++ b/server/routes.go @@ -558,7 +558,12 @@ func (s *Server) EmbedHandler(c *gin.Context) { if err != nil { return err } - embeddings[i] = normalize(embedding) + // TODO: this first normalization should be done by the model + embedding = normalize(embedding) + if req.Dimensions > 0 && req.Dimensions < len(embedding) { + embedding = normalize(embedding[:req.Dimensions]) + } + embeddings[i] = embedding return nil }) } @@ -584,11 +589,7 @@ func normalize(vec []float32) []float32 { sum += v * v } - norm := float32(0.0) - if sum > 0 { - norm = float32(1.0 / math.Sqrt(float64(sum))) - } - + norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12)) for i := range vec { vec[i] *= norm } From eb10390de96ad6f5c21bc9e61f6cd222405f627a Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 11 Sep 2025 10:30:18 -0700 Subject: [PATCH 3/7] llm: Enable new memory estimates by default New memory estimates (see #11090 for more information) are now enabled automatically for all models running on the Ollama engine, improving both stability and performance through more accurate sizing and allocation. Models running on the llama engine will continue to use the original style of memory estimation. 
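
For reference, the resulting selection in NewLlamaServer collapses to the engine check itself; a condensed sketch of the hunk below (same names as the diff, nothing new added):

    // Models that constructed an Ollama-engine text processor get the new
    // memory estimates via ollamaServer; everything else stays on the llama
    // engine and keeps the original estimation via llamaServer.
    if textProcessor != nil {
        return &ollamaServer{llmServer: s}, nil
    }
    return &llamaServer{llmServer: s, ggml: f}, nil
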
--- envconfig/config.go | 3 --- llm/server.go | 7 +------ 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 868813ae8..7fc018870 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -185,8 +185,6 @@ var ( ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096) // Auth enables authentication between the Ollama client and server UseAuth = Bool("OLLAMA_AUTH") - // Enable the new memory estimation logic - NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES") ) func String(s string) func() string { @@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, - "OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"}, // Informational "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, diff --git a/llm/server.go b/llm/server.go index a22ae9722..5caf19875 100644 --- a/llm/server.go +++ b/llm/server.go @@ -162,11 +162,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } } - newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates() - if newEstimates { - slog.Info("enabling new memory estimates") - } - // Verify the requested context size is <= the model training size trainCtx := f.KV().ContextLength() if opts.NumCtx > int(trainCtx) && trainCtx > 0 { @@ -434,7 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } }() - if newEstimates { + if textProcessor != nil { return &ollamaServer{llmServer: s}, nil } else { return &llamaServer{llmServer: s, ggml: f}, nil From aba157531521192a04d09811fac3cda20e1a8340 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 10 Sep 2025 11:03:06 -0700 Subject: [PATCH 4/7] llm: Don't try to load split vision models in the Ollama engine If a model with a split vision projector is loaded in the Ollama engine, the projector will be ignored and the model will hallucinate a response. Instead, fallback and try to load the model in the llama engine. 
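
A condensed sketch of the check added below (same names as the hunk); the effect is that a separate projector file keeps textProcessor nil, so the existing compatibility fallback to the llama engine takes over instead of silently dropping the projector:

    // Only attempt the Ollama engine's text processor when the model has no
    // separate projector file; otherwise report it as unsupported so the
    // llama-engine fallback path handles it.
    if len(projectors) == 0 {
        textProcessor, err = model.NewTextProcessor(modelPath)
    } else {
        err = errors.New("split vision models aren't supported")
    }
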
--- llm/server.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 5caf19875..9100b6978 100644 --- a/llm/server.go +++ b/llm/server.go @@ -149,7 +149,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a var textProcessor model.TextProcessor var err error if envconfig.NewEngine() || f.KV().OllamaEngineRequired() { - textProcessor, err = model.NewTextProcessor(modelPath) + if len(projectors) == 0 { + textProcessor, err = model.NewTextProcessor(modelPath) + } else { + err = errors.New("split vision models aren't supported") + } if err != nil { // To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err) From 61fb912ca46fe902180892316f6cc34adda07b67 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 11 Sep 2025 12:25:26 -0700 Subject: [PATCH 5/7] CI: fix windows cuda build (#12246) * ci: adjust cuda component list v13 has a different breakdown of the components required to build ollama * review comments --- .github/workflows/release.yaml | 15 ++++++++++++++- .github/workflows/test.yaml | 12 +++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 902fa9ccc..fc3cde9c9 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -65,6 +65,11 @@ jobs: arch: amd64 preset: 'CUDA 12' install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' cuda-version: '12.8' flags: '' runner_dir: 'cuda_v12' @@ -72,6 +77,14 @@ jobs: arch: amd64 preset: 'CUDA 13' install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' + - '"crt"' + - '"nvvm"' + - '"nvptxcompiler"' cuda-version: '13.0' flags: '' runner_dir: 'cuda_v13' @@ -105,7 +118,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait } diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a10ad37a9..e470540a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -80,6 +80,15 @@ jobs: - preset: CUDA install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' + cuda-components: + - '"cudart"' + - '"nvcc"' + - '"cublas"' + - '"cublas_dev"' + - '"crt"' + - '"nvvm"' + - '"nvptxcompiler"' + cuda-version: '13.0' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes 
-Wno-deprecated-pragma"' @@ -102,7 +111,8 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_13.0", "nvcc_13.0", "cublas_13.0", "cublas_dev_13.0")) -NoNewWindow -Wait + $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path From 26214125e86ac1d4512dff68c983137589cfddbf Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 11 Sep 2025 13:48:51 -0700 Subject: [PATCH 6/7] ollamarunner: Suppress stack trace during memory allocation Allocation failures can be a normal part of new memory estimates, so we shouldn't print a stack trace in this case. --- runner/ollamarunner/runner.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 201d55a16..676e5186f 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -18,7 +18,6 @@ import ( "reflect" "regexp" "runtime" - "runtime/debug" "strconv" "strings" "sync" @@ -1101,9 +1100,13 @@ func (s *Server) allocModel( // Convert memory allocation panics to errors defer func() { if r := recover(); r != nil { - debug.PrintStack() if err, ok := r.(error); ok { - panicErr = err + var noMem ml.ErrNoMem + if errors.As(err, &noMem) { + panicErr = noMem + } else { + panic(r) + } } else { panic(r) } From e4ce68311a64310ece5534ae3a4820b20ea3d42f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Sep 2025 07:59:14 -0700 Subject: [PATCH 7/7] cuda: remove compression for better compatibility (#12259) This retains compatibility with driver 531 and up at the trade-off of space. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cce5e4b1..198fcdeb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ set(GGML_LLAMAFILE ON) set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128) set(GGML_CUDA_GRAPHS ON) set(GGML_CUDA_FA ON) -set(GGML_CUDA_COMPRESSION_MODE size) +set(GGML_CUDA_COMPRESSION_MODE default) if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
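
Note for downstream builders: if the smaller CUDA binaries matter more than supporting older drivers, the previous behavior can presumably be restored with a local override before the GGML targets are configured (a sketch, not part of this patch):

    # Local override sketch (assumption): trade driver compatibility back for
    # smaller CUDA fatbins by restoring the previous compression mode.
    set(GGML_CUDA_COMPRESSION_MODE size)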