Compare commits

...

12 Commits

Author SHA1 Message Date
Daniel Hiltgen
e4ce68311a cuda: remove compression for better compatibility (#12259)
This retains compatibility with driver 531 and up, at the cost of increased size.
2025-09-12 07:59:14 -07:00
Jesse Gross
26214125e8 ollamarunner: Suppress stack trace during memory allocation
Allocation failures can be a normal part of new memory estimates, so
we shouldn't print a stack trace in this case.
2025-09-11 14:30:31 -07:00
Daniel Hiltgen
61fb912ca4 CI: fix windows cuda build (#12246)
* ci: adjust cuda component list

v13 has a different breakdown of the components required to build ollama

* review comments
2025-09-11 12:25:26 -07:00
Jesse Gross
aba1575315 llm: Don't try to load split vision models in the Ollama engine
If a model with a split vision projector is loaded in the Ollama
engine, the projector will be ignored and the model will hallucinate
a response. Instead, fall back and try to load the model in the llama
engine.
2025-09-11 11:41:55 -07:00
Jesse Gross
eb10390de9 llm: Enable new memory estimates by default
New memory estimates (see #11090 for more information) are now
enabled automatically for all models running on the Ollama engine,
improving both stability and performance through more accurate sizing
and allocation. Models running on the llama engine will continue to
use the original style of memory estimation.
2025-09-11 11:21:53 -07:00
Michael Yang
feb18cd710 feat: add dimensions field to embed requests (#12242)
* feat: add field to truncate embeddings

* add dimensions support to the OpenAI-compatible embeddings endpoint
2025-09-11 10:36:10 -07:00
fengyuchuanshen
8a7e2055d2 cmd: use slices.Contains to simplify code (#12249) 2025-09-11 09:57:31 -07:00
Jesse Gross
29ddfc2cab ggml: Disable flash attention for gemma2
Our new engine implementation of gemma2 doesn't support flash
attention, which means that it also doesn't support KV cache
quantization. Currently, it is possible to turn these two on,
which will result in a crash.
2025-09-10 16:40:45 -07:00
Jesse Gross
71cb86af3e llm: Remove unneeded warning with flash attention enabled
If flash attention is enabled without KV cache quantization, we will
currently always get this warning:
level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
2025-09-10 16:40:45 -07:00
CarbonatedWater.org
5198956372 docs: add ollama-co2 to community integrations (#12230) 2025-09-10 16:37:10 -07:00
Daniel Hiltgen
17a023f34b Add v12 + v13 cuda support (#12000)
* Add support for upcoming NVIDIA Jetsons

The latest Jetsons with JetPack 7 are moving to an SBSA-compatible model and
will not require building a JetPack-specific variant.

* cuda: bring back dual versions

This adds back dual CUDA versions for our releases,
with v11 and v13 to cover a broad set of GPUs and
driver versions.

* win: break up native builds in build_windows.ps1

* v11 build working on windows and linux

* switch to cuda v12.8 not JIT

* Set CUDA compression to size

* enhance manual install linux docs
2025-09-10 12:05:18 -07:00
Parth Sareen
8d6fffaead runner: simplify parser entrypoints in runner (#12233) 2025-09-10 11:24:42 -07:00
22 changed files with 386 additions and 133 deletions

View File

@@ -65,14 +65,36 @@ jobs:
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
cuda-version: '12.8'
flags: ''
runner_dir: 'cuda_v12'
- os: windows
arch: amd64
preset: 'CUDA 13'
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
flags: ''
runner_dir: 'cuda_v13'
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: ''
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -96,7 +118,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
@@ -138,7 +160,7 @@ jobs:
run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
@@ -232,7 +254,7 @@ jobs:
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;

View File

@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
container: nvidia/cuda:13.0.0-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,8 +78,17 @@ jobs:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
@@ -102,7 +111,8 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path

View File

@@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
endif()
set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
@@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA

View File

@@ -18,6 +18,14 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
@@ -26,6 +34,14 @@
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 2"
}
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],
@@ -72,11 +88,21 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 12"
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 13"
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],

View File

@@ -39,15 +39,35 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8
FROM base AS cuda-11
ARG CUDA11VERSION=11.8
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 12' \
cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
&& cmake --build --parallel --preset 'CUDA 12' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-13
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
ENV PATH=/usr/local/cuda-13/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
&& cmake --build --parallel --preset 'CUDA 13' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS rocm-6
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
@@ -92,10 +112,14 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
FROM --platform=linux/arm64 scratch AS arm64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6

View File

@@ -414,6 +414,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
### Cloud

View File

@@ -388,8 +388,12 @@ type EmbedRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Truncate truncates the input to fit the model's max sequence length.
Truncate *bool `json:"truncate,omitempty"`
// Dimensions truncates the output embedding to the specified dimension.
Dimensions int `json:"dimensions,omitempty"`
// Options lists model-specific options.
Options map[string]any `json:"options"`
}
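A minimal client-side sketch of the new field, assuming the repo's `api` package and its `Client.Embed` method (model name illustrative, error handling trimmed to the essentials):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	// Ask for a 256-dimensional embedding; the server truncates the
	// model's output and re-normalizes it (model name is illustrative).
	resp, err := client.Embed(context.Background(), &api.EmbedRequest{
		Model:      "all-minilm",
		Input:      "why is the sky blue?",
		Dimensions: 256,
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(len(resp.Embeddings[0])) // 256
}
```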

View File

@@ -56,10 +56,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
if err != nil {
return
}
for _, cap := range resp.Capabilities {
if cap == model.CapabilityThinking {
return
}
if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
return
}
fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
}

View File

@@ -43,14 +43,15 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
}
return "sbsa"
}
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
// The detected driver is older than Feb 2023
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
return "v11"
if gpuInfo.DriverMajor < 13 {
// The detected driver is older than 580 (Aug 2025)
// Warn if the GPU's compute capability is v13-compatible, since upgrading the driver would improve performance
if gpuInfo.computeMajor > 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor >= 5) {
slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
}
return "v12"
}
return "v12"
return "v13"
}
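Without diff markers the removed v11 branch and the added v13 branch interleave above. As a hedged condensation (parameters flattened for illustration), the post-change selection rule is roughly:

```go
package discover

import (
	"fmt"
	"log/slog"
)

// cudaVariantSketch condenses the new rule: drivers at major version 13
// (the 580 series, Aug 2025) or newer get the v13 runner; older drivers
// fall back to v12, with a warning when the GPU's compute capability
// (7.5 or higher) means a driver upgrade would unlock v13.
func cudaVariantSketch(driverMajor, driverMinor, computeMajor, computeMinor int) string {
	if driverMajor < 13 {
		if computeMajor > 7 || (computeMajor == 7 && computeMinor >= 5) {
			slog.Warn("old CUDA driver detected - please upgrade for best performance",
				"version", fmt.Sprintf("%d.%d", driverMajor, driverMinor))
		}
		return "v12"
	}
	return "v13"
}
```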

View File

@@ -1708,6 +1708,7 @@ Advanced parameters:
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `dimensions`: number of dimensions for the embedding
### Examples

View File

@@ -11,12 +11,13 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
> [!NOTE]
> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
Download and extract the package:
```shell
curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
sudo rm -rf /usr/lib/ollama
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
```

View File

@@ -185,8 +185,6 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
// Enable the new memory estimation logic
NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
)
func String(s string) func() string {
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

View File

@@ -864,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types ", "mode", arch)
return cacheType == "f16"
slog.Warn("model only supports non-quantized cache types", "model", arch)
return false
}
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
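Reading the first hunk as a whole, the single `slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)` check is gone; as far as these lines show, the resulting function reads roughly:

```go
// Hedged reconstruction from the hunk above; imports as in the file.
func (f GGML) SupportsKVCacheType(cacheType string) bool {
	if cacheType == "" || cacheType == "f16" {
		return true // f16 (or unset) is always supported
	}
	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		// gpt-oss uses attention with sinks, which rules out quantized cache types
		slog.Warn("model only supports non-quantized cache types", "model", arch)
		return false
	}
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}
```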
@@ -879,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool {
return false
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
return false
}
// Check head counts match and are non-zero
headCountK := f.KV().EmbeddingHeadCountK()
headCountV := f.KV().EmbeddingHeadCountV()

View File

@@ -289,6 +289,7 @@ type HarmonyMessageHandler struct {
state harmonyMessageState
HarmonyParser *HarmonyParser
FunctionNameMap *FunctionNameMap
ToolParser *HarmonyToolCallAccumulator
}
// NewHarmonyMessageHandler creates a new message handler
@@ -301,12 +302,16 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler {
HeaderEndTag: "<|message|>",
},
FunctionNameMap: NewFunctionNameMap(),
ToolParser: &HarmonyToolCallAccumulator{
state: harmonyToolCallState_Normal,
currentToolName: nil,
},
}
}
// AddContent processes the content and returns the content, thinking, and tool content.
// content and thinking are already fully parsed, but tool content still needs to be passed to the tool parser
func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyToolCallAccumulator) (string, string, string) {
func (h *HarmonyMessageHandler) AddContent(content string) (string, string, string) {
contentSb := strings.Builder{}
thinkingSb := strings.Builder{}
toolContentSb := strings.Builder{}
@@ -323,14 +328,14 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
// event.Header.Recipient is the tool name, something like
// "browser.search" for a built-in, or "functions.calc" for a
// custom one
toolParser.SetToolName(event.Header.Recipient)
h.ToolParser.SetToolName(event.Header.Recipient)
} else {
h.state = harmonyMessageState_Thinking
}
case "commentary":
if event.Header.Recipient != "" {
h.state = harmonyMessageState_ToolCalling
toolParser.SetToolName(event.Header.Recipient)
h.ToolParser.SetToolName(event.Header.Recipient)
} else {
h.state = harmonyMessageState_Normal
}
@@ -353,13 +358,6 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
return contentSb.String(), thinkingSb.String(), toolContentSb.String()
}
func (h *HarmonyMessageHandler) CreateToolParser() *HarmonyToolCallAccumulator {
return &HarmonyToolCallAccumulator{
state: harmonyToolCallState_Normal,
currentToolName: nil,
}
}
type harmonyToolCallState int
const (

View File

@@ -541,7 +541,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("thinking_then_content_streams", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
type step struct {
in string
wantContent string
@@ -554,7 +554,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
{in: "<|end|>", wantContent: ""},
}
for i, s := range steps {
content, thinking, tool := handler.AddContent(s.in, tp)
content, thinking, tool := handler.AddContent(s.in)
if tool != "" {
tp.Add(tool)
}
@@ -567,7 +567,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("content_streams_as_it_arrives", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
inputs := []string{
"<|start|>assistant<|message|>Hello",
", world",
@@ -575,7 +575,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
}
var got []string
for _, in := range inputs {
content, thinking, tool := handler.AddContent(in, tp)
content, thinking, tool := handler.AddContent(in)
if tool != "" {
tp.Add(tool)
}
@@ -595,7 +595,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("thinking_streams_separately_from_content", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
inputs := []string{
"<|channel|>analysis<|message|>Thinking...",
"<|end|>",
@@ -604,7 +604,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
}
var got []string
for _, in := range inputs {
content, thinking, tool := handler.AddContent(in, tp)
content, thinking, tool := handler.AddContent(in)
if tool != "" {
tp.Add(tool)
}
@@ -624,7 +624,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("partial_tags_buffer_until_complete", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
inputs := []string{
"<|chan",
"nel|>analysis<|mess",
@@ -637,7 +637,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
var thinkingPieces []string
var contentPieces []string
for _, in := range inputs {
content, thinking, tool := handler.AddContent(in, tp)
content, thinking, tool := handler.AddContent(in)
if tool != "" {
tp.Add(tool)
}
@@ -659,7 +659,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("simple_assistant_after_analysis", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
inputs := []string{
"<|channel|>analysis<|message|>Think",
"<|end|>",
@@ -668,7 +668,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
}
var contentSb, thinkingSb strings.Builder
for _, in := range inputs {
content, thinking, tool := handler.AddContent(in, tp)
content, thinking, tool := handler.AddContent(in)
if tool != "" {
tp.Add(tool)
}
@@ -686,12 +686,12 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("tool_call_parsed_and_returned_correctly", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
inputs := []string{
"<|channel|>commentary to=functions.calculate<|message|>{\"expression\":\"2+2\"}<|end|>",
}
for _, in := range inputs {
content, thinking, tool := handler.AddContent(in, tp)
content, thinking, tool := handler.AddContent(in)
if content != "" || thinking != "" {
continue
}
@@ -711,14 +711,14 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
t.Run("tool_call_across_chunks", func(t *testing.T) {
handler := NewHarmonyMessageHandler()
handler.HarmonyParser.AddImplicitStart()
tp := handler.CreateToolParser()
tp := handler.ToolParser
inputs := []string{
"<|channel|>commentary to=functions.calculate<|message|>{\"expression\":\"2+",
"2\"}",
"<|end|>",
}
for _, in := range inputs {
content, thinking, tool := handler.AddContent(in, tp)
content, thinking, tool := handler.AddContent(in)
if content != "" || thinking != "" {
continue
}

View File

@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var kvct string
if useFlashAttention {
requested := strings.ToLower(envconfig.KvCacheType())
if requested != "" && f.SupportsKVCacheType(requested) {
if f.SupportsKVCacheType(requested) {
kvct = requested
}
}

View File

@@ -35,6 +35,7 @@ import (
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/parser"
)
type filteredEnv []string
@@ -148,7 +149,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
var textProcessor model.TextProcessor
var err error
if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
textProcessor, err = model.NewTextProcessor(modelPath)
if len(projectors) == 0 {
textProcessor, err = model.NewTextProcessor(modelPath)
} else {
err = errors.New("split vision models aren't supported")
}
if err != nil {
// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
@@ -161,11 +166,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}
newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
if newEstimates {
slog.Info("enabling new memory estimates")
}
// Verify the requested context size is <= the model training size
trainCtx := f.KV().ContextLength()
if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@@ -220,7 +220,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
if kvct != "" && f.SupportsKVCacheType(kvct) {
if f.SupportsKVCacheType(kvct) {
loadRequest.KvCacheType = kvct
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -433,7 +433,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}()
if newEstimates {
if textProcessor != nil {
return &ollamaServer{llmServer: s}, nil
} else {
return &llamaServer{llmServer: s, ggml: f}, nil
@@ -1350,7 +1350,7 @@ type CompletionRequest struct {
Options *api.Options
Grammar string // set before sending the request to the subprocess
UseHarmony bool
ParserType parser.TokenParserType
PrefillString string
}
@@ -1364,8 +1364,6 @@ const (
DoneReasonLength
// DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
DoneReasonConnectionClosed
// DoneReasonTokenRepeatLimit indicates the completion stopped due to a token repeat limit
DoneReasonTokenRepeatLimit
)
func (d DoneReason) String() string {
@@ -1374,8 +1372,6 @@ func (d DoneReason) String() string {
return "length"
case DoneReasonStop:
return "stop"
case DoneReasonTokenRepeatLimit:
return "token_repeat_limit"
default:
return "" // closed
}

View File

@@ -76,8 +76,9 @@ type JsonSchema struct {
}
type EmbedRequest struct {
Input any `json:"input"`
Model string `json:"model"`
Input any `json:"input"`
Model string `json:"model"`
Dimensions int `json:"dimensions,omitempty"`
}
type StreamOptions struct {
@@ -1005,7 +1006,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc {
}
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil {
if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}

parser/token_parser.go (new file, 126 lines)
View File

@@ -0,0 +1,126 @@
package parser
import (
"encoding/json"
"errors"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/harmony"
)
type TokenParserType int
const (
TokenParserTypeDefault TokenParserType = iota
TokenParserTypeHarmony
)
type TokenParser struct {
messageHandler MessageHandler
parserEngine ParserInternals
toolParser ToolParser
lastToken string
tokenRepeat int
repeatLimit int
}
const defaultTokenRepeatLimit = 30
type MessageHandler interface {
AddContent(token string) (content, thinking string, toolContent string)
}
type ParserInternals interface {
AddImplicitStartOrPrefill(prefillString string)
}
type ToolParser interface {
Add(token string)
Drain() (toolName *string, toolContent string)
}
// Default implementation for the TokenParser interface as a no-op passthrough
type defaultMessageHandler struct{}
func (defaultMessageHandler) AddContent(token string) (string, string, string) {
return token, "", ""
}
type defaultEngine struct{}
func (defaultEngine) AddImplicitStartOrPrefill(prefillString string) {}
type defaultToolParser struct{}
func (defaultToolParser) Add(token string) {}
func (defaultToolParser) Drain() (*string, string) { return nil, "" }
func NewTokenParser(parserType TokenParserType, prefillString string) TokenParser {
switch parserType {
case TokenParserTypeHarmony:
harmonyMessageHandler := harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(prefillString)
return TokenParser{
messageHandler: harmonyMessageHandler,
parserEngine: harmonyMessageHandler.HarmonyParser,
toolParser: harmonyMessageHandler.ToolParser,
repeatLimit: defaultTokenRepeatLimit,
}
default:
return TokenParser{
messageHandler: defaultMessageHandler{},
parserEngine: defaultEngine{},
toolParser: defaultToolParser{},
repeatLimit: 30,
}
}
}
func (p *TokenParser) AddContent(token string) (string, string, error) {
if p.repeatLimitReached(token) {
return "", "", errors.New("token repeat limit reached")
}
content, thinking, toolContent := p.messageHandler.AddContent(token)
p.toolParser.Add(toolContent)
return content, thinking, nil
}
// repeatLimitReached updates repeat counters and returns true if the repeat limit is reached.
func (p *TokenParser) repeatLimitReached(token string) bool {
if p == nil {
return false
}
trimmed := strings.TrimSpace(token)
if trimmed == p.lastToken {
p.tokenRepeat++
} else {
p.tokenRepeat = 0
}
p.lastToken = trimmed
return p.tokenRepeat >= p.repeatLimit
}
// TODO: update to work with multiple toolcalls - unmarshalling should also happen on parser level
func (p *TokenParser) Drain() []api.ToolCall {
toolName, toolContent := p.toolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
return nil
}
return []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: *toolName,
Arguments: args,
},
},
}
}
return nil
}
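A sketch of how a caller might drive the new parser, mirroring the runner changes further down (the token stream is illustrative):

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/parser"
)

func main() {
	// Harmony parsing with no prefill string.
	p := parser.NewTokenParser(parser.TokenParserTypeHarmony, "")
	for _, tok := range []string{"<|channel|>analysis<|message|>Hi", "<|end|>"} {
		content, thinking, err := p.AddContent(tok)
		if err != nil {
			log.Fatal(err) // e.g. "token repeat limit reached"
		}
		fmt.Printf("content=%q thinking=%q\n", content, thinking)
	}
	// Once the stream ends, Drain surfaces any accumulated tool call.
	for _, tc := range p.Drain() {
		fmt.Println("tool:", tc.Function.Name)
	}
}
```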

View File

@@ -18,7 +18,6 @@ import (
"reflect"
"regexp"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
@@ -30,12 +29,12 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/harmony"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/runner/common"
"github.com/ollama/ollama/sample"
@@ -782,13 +781,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
if req.UseHarmony {
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(req.PrefillString)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
tokenParser := parser.NewTokenParser(req.ParserType, req.PrefillString)
if req.Options == nil {
opts := api.DefaultOptions()
@@ -872,9 +865,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
return
}
var lastToken string
tokenRepeat := 0
const tokenRepeatLimit = 30
for {
select {
@@ -883,23 +873,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
case content, ok := <-seq.responses:
if ok {
if strings.TrimSpace(content) == lastToken {
tokenRepeat++
}
if tokenRepeat == tokenRepeatLimit {
http.Error(w, "token repeat limit reached", http.StatusInternalServerError)
seq.doneReason = llm.DoneReasonTokenRepeatLimit
var thinking string
var err error
content, thinking, err = tokenParser.AddContent(content)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
close(seq.quit)
return
}
lastToken = strings.TrimSpace(content)
var thinking string
if harmonyMessageHandler != nil {
var toolContent string
content, thinking, toolContent = harmonyMessageHandler.AddContent(content, harmonyToolParser)
harmonyToolParser.Add(toolContent)
}
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
Content: content,
@@ -912,27 +893,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
flusher.Flush()
} else {
var toolCalls []api.ToolCall
if harmonyMessageHandler != nil {
// these tools still need to be transformed to the original function name
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
http.Error(w, fmt.Sprintf("failed to unmarshal tool call function arguments: %v", err), http.StatusInternalServerError)
close(seq.quit)
return
}
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: *toolName,
Arguments: args,
},
})
}
}
toolCalls := tokenParser.Drain()
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
ToolCalls: toolCalls,
Done: true,
@@ -1139,9 +1100,13 @@ func (s *Server) allocModel(
// Convert memory allocation panics to errors
defer func() {
if r := recover(); r != nil {
debug.PrintStack()
if err, ok := r.(error); ok {
panicErr = err
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
panicErr = noMem
} else {
panic(r)
}
} else {
panic(r)
}
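The recover block above narrows which panics become errors: `ml.ErrNoMem` is converted into an ordinary return value, while everything else re-panics. The same pattern, sketched with a stand-in error type:

```go
package main

import (
	"errors"
	"fmt"
)

// ErrNoMem is a stand-in for ml.ErrNoMem: a typed allocation failure.
type ErrNoMem struct{ BytesNeeded uint64 }

func (e ErrNoMem) Error() string {
	return fmt.Sprintf("insufficient memory: need %d bytes", e.BytesNeeded)
}

func alloc() (err error) {
	defer func() {
		if r := recover(); r != nil {
			// Only allocation failures become ordinary errors;
			// any other panic keeps propagating.
			var noMem ErrNoMem
			if e, ok := r.(error); ok && errors.As(e, &noMem) {
				err = noMem
				return
			}
			panic(r)
		}
	}()
	panic(ErrNoMem{BytesNeeded: 1 << 30}) // simulated allocation failure
}

func main() {
	fmt.Println(alloc())
}
```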

View File

@@ -78,7 +78,7 @@ function checkEnv() {
}
function buildOllama() {
function buildCPU() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
@@ -90,20 +90,72 @@ function buildOllama() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component CPU --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
function buildCUDA11() {
# CUDA v11 claims to be compatible with MSVC 2022, but the latest updates are no longer compatible
# 19.40 is the last compiler version that works, but recent updates are 19.43
# So this pins to MSVC 2019 for best compatibility
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v12")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
$env:CUDAToolkit_ROOT=$hashEnv[$v12]
write-host "Building CUDA v12 backend libraries"
& cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR
if ("$script:CUDA_DIRS".Contains("v11")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v11 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildCUDA12() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v12.8")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v12 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildCUDA13() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v13")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
$env:CUDAToolkit_ROOT=$cuda
write-host "Building CUDA v13 backend libraries $cuda"
& cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildROCm() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
if ($env:HIP_PATH) {
write-host "Building ROCm backend libraries"
if (-Not (get-command -ErrorAction silent ninja)) {
@@ -129,6 +181,10 @@ function buildOllama() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildOllama() {
mkdir -Force -path "${script:DIST_DIR}\"
write-host "Building ollama CLI"
& go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -236,6 +292,10 @@ function distZip() {
checkEnv
try {
if ($($args.count) -eq 0) {
buildCPU
buildCUDA12
buildCUDA13
buildROCm
buildOllama
buildApp
gatherDependencies

View File

@@ -36,6 +36,7 @@ import (
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry"
"github.com/ollama/ollama/template"
@@ -196,6 +197,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
useHarmony := harmony.ShouldUseHarmony(m.Config.ModelFamily, m.Template) && !req.Raw
var parserType parser.TokenParserType
if useHarmony {
parserType = parser.TokenParserTypeHarmony
} else {
parserType = parser.TokenParserTypeDefault
}
var functionNameMap *harmony.FunctionNameMap
if useHarmony {
@@ -347,7 +354,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
Images: images,
Format: req.Format,
Options: opts,
UseHarmony: useHarmony,
ParserType: parserType,
}, func(cr llm.CompletionResponse) {
res := api.GenerateResponse{
Model: req.Model,
@@ -551,7 +558,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {
if err != nil {
return err
}
embeddings[i] = normalize(embedding)
// TODO: this first normalization should be done by the model
embedding = normalize(embedding)
if req.Dimensions > 0 && req.Dimensions < len(embedding) {
embedding = normalize(embedding[:req.Dimensions])
}
embeddings[i] = embedding
return nil
})
}
@@ -577,11 +589,7 @@ func normalize(vec []float32) []float32 {
sum += v * v
}
norm := float32(0.0)
if sum > 0 {
norm = float32(1.0 / math.Sqrt(float64(sum)))
}
norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
for i := range vec {
vec[i] *= norm
}
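Taken together, the two hunks above implement truncate-then-renormalize for the new `dimensions` option. A self-contained sketch of that math, with `normalize` mirroring the amended helper:

```go
package main

import (
	"fmt"
	"math"
)

// normalize scales vec to unit length, guarding against a zero vector
// with a small epsilon, as in the amended server helper.
func normalize(vec []float32) []float32 {
	sum := float32(0)
	for _, v := range vec {
		sum += v * v
	}
	norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
	for i := range vec {
		vec[i] *= norm
	}
	return vec
}

func main() {
	embedding := normalize([]float32{1, 2, 3, 4})
	// Truncation shortens the vector, so renormalize to unit length.
	if dims := 2; dims > 0 && dims < len(embedding) {
		embedding = normalize(embedding[:dims])
	}
	fmt.Println(embedding)
}
```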
@@ -1592,6 +1600,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
msgs = filterThinkTags(msgs, m)
useHarmony := harmony.ShouldUseHarmony(m.Config.ModelFamily, m.Template)
var parserType parser.TokenParserType
if useHarmony {
parserType = parser.TokenParserTypeHarmony
} else {
parserType = parser.TokenParserTypeDefault
}
processedTools := req.Tools
var functionNameMap *harmony.FunctionNameMap
@@ -1662,7 +1676,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
Images: images,
Format: req.Format,
Options: opts,
UseHarmony: useHarmony,
ParserType: parserType,
PrefillString: prefillString,
}, func(r llm.CompletionResponse) {
res := api.ChatResponse{