Compare commits


2 Commits

Author       SHA1        Message                                                Date
ParthSareen  c0aeb3531b  runner: add sync between computeBatch and completion   2025-09-10 19:16:28 -07:00
ParthSareen  1e5fecbbc3  runner/parser: allow on-the-fly grammar constraining   2025-09-10 11:50:12 -07:00
22 changed files with 119 additions and 222 deletions

View File

@@ -65,36 +65,14 @@ jobs:
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
cuda-version: '12.8'
flags: ''
runner_dir: 'cuda_v12'
- os: windows
arch: amd64
preset: 'CUDA 13'
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
flags: ''
runner_dir: 'cuda_v13'
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: ''
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -118,7 +96,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
@@ -160,7 +138,7 @@ jobs:
run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
@@ -254,7 +232,7 @@ jobs:
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;

View File

@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:13.0.0-devel-ubuntu22.04
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,17 +78,8 @@ jobs:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
@@ -111,8 +102,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path

View File

@@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
endif()
set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
@@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA

View File

@@ -18,14 +18,6 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
@@ -34,14 +26,6 @@
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 2"
}
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],
@@ -88,21 +72,11 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 12"
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 13"
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],

View File

@@ -39,35 +39,15 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8
FROM base AS cuda-11
ARG CUDA11VERSION=11.8
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
cmake --preset 'CUDA 12' \
&& cmake --build --parallel --preset 'CUDA 12' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-13
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
ENV PATH=/usr/local/cuda-13/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
&& cmake --build --parallel --preset 'CUDA 13' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS rocm-6
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
@@ -112,14 +92,10 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM --platform=linux/amd64 scratch AS amd64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama
FROM --platform=linux/arm64 scratch AS arm64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6

View File

@@ -414,7 +414,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
### Cloud

View File

@@ -388,12 +388,8 @@ type EmbedRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Truncate truncates the input to fit the model's max sequence length.
Truncate *bool `json:"truncate,omitempty"`
// Dimensions truncates the output embedding to the specified dimension.
Dimensions int `json:"dimensions,omitempty"`
// Options lists model-specific options.
Options map[string]any `json:"options"`
}
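
Note: the `Dimensions` field dropped here was implemented in `routes.go` (see the EmbedHandler hunk near the end of this compare) by truncating the normalized embedding and renormalizing. A self-contained sketch of that removed behavior, reconstructed from those hunks:

```go
package main

import (
	"fmt"
	"math"
)

// truncateEmbedding reconstructs the behavior removed in this compare:
// normalize, cut the vector to the requested dimension, then normalize
// again so the shortened embedding is still unit length.
func truncateEmbedding(embedding []float32, dimensions int) []float32 {
	embedding = normalize(embedding)
	if dimensions > 0 && dimensions < len(embedding) {
		embedding = normalize(embedding[:dimensions])
	}
	return embedding
}

// normalize matches the epsilon-clamped version on the removed side.
func normalize(vec []float32) []float32 {
	var sum float32
	for _, v := range vec {
		sum += v * v
	}
	norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
	for i := range vec {
		vec[i] *= norm
	}
	return vec
}

func main() {
	fmt.Println(truncateEmbedding([]float32{1, 2, 2, 4}, 2)) // [0.4472136 0.8944272]
}
```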

View File

@@ -56,8 +56,10 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
if err != nil {
return
}
if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
return
for _, cap := range resp.Capabilities {
if cap == model.CapabilityThinking {
return
}
}
fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
}

View File

@@ -43,15 +43,14 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
}
return "sbsa"
}
if gpuInfo.DriverMajor < 13 {
// The detected driver is older than 580 (Aug 2025)
// Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance
if gpuInfo.computeMajor > 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor >= 5) {
slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
}
return "v12"
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
// The detected driver is older than Feb 2023
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
return "v11"
}
return "v13"
return "v12"
}
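
Net effect of this hunk: variant selection keys off the driver version again instead of offering a v13 runner. A minimal runnable sketch of the resulting logic, with `cudaGPUInfo` as a reduced stand-in for the real discover type:

```go
package main

import "fmt"

// cudaGPUInfo is a stand-in for the real discover.CudaGPUInfo, reduced to
// the driver fields the check above uses.
type cudaGPUInfo struct {
	DriverMajor, DriverMinor int
}

// cudaVariantFor mirrors the branch-side selection: driver 12.0 has
// problems with the cuda v12 library, so anything at or below 12.0 runs
// the v11 build; everything newer runs v12.
func cudaVariantFor(gpu cudaGPUInfo) string {
	if gpu.DriverMajor < 12 || (gpu.DriverMajor == 12 && gpu.DriverMinor == 0) {
		return "v11"
	}
	return "v12"
}

func main() {
	fmt.Println(cudaVariantFor(cudaGPUInfo{DriverMajor: 12, DriverMinor: 0})) // v11
	fmt.Println(cudaVariantFor(cudaGPUInfo{DriverMajor: 12, DriverMinor: 4})) // v12
}
```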

View File

@@ -1708,7 +1708,6 @@ Advanced parameters:
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `dimensions`: number of dimensions for the embedding
### Examples

View File

@@ -11,13 +11,12 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
> [!NOTE]
> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
Download and extract the package:
```shell
curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
sudo rm -rf /usr/lib/ollama
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
```

View File

@@ -185,6 +185,8 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
// Enable the new memory estimation logic
NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
)
func String(s string) func() string {
@@ -270,6 +272,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
"OLLAMA_NEW_ESTIMATES": {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

View File

@@ -864,16 +864,12 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types", "model", arch)
return false
slog.Warn("model only supports non-quantized cache types ", "mode", arch)
return cacheType == "f16"
}
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
@@ -883,10 +879,6 @@ func (f GGML) SupportsFlashAttention() bool {
return false
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
return false
}
// Check head counts match and are non-zero
headCountK := f.KV().EmbeddingHeadCountK()
headCountV := f.KV().EmbeddingHeadCountV()
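
Condensed, the branch-side cache-type check behaves as below (warning log omitted); `gptoss`/`gpt-oss` models reject every quantized cache type:

```go
package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType condenses the branch-side check above: the empty
// string and f16 always pass; gpt-oss models, whose attention sinks rule
// out quantized caches, accept nothing else; other architectures also
// allow q8_0 and q4_0.
func supportsKVCacheType(arch, cacheType string) bool {
	if cacheType == "" || cacheType == "f16" {
		return true
	}
	if slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		return false
	}
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	fmt.Println(supportsKVCacheType("gptoss", "q8_0")) // false
	fmt.Println(supportsKVCacheType("llama", "q8_0"))  // true
}
```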

View File

@@ -47,12 +47,13 @@ func (s harmonyParserState) String() string {
}
type HarmonyParser struct {
state harmonyParserState
MessageStartTag string
MessageEndTag string
HeaderEndTag string
acc strings.Builder
lifetimeAcc strings.Builder
state harmonyParserState
MessageStartTag string
MessageEndTag string
HeaderEndTag string
constraintsAllowed bool
acc strings.Builder
lifetimeAcc strings.Builder
}
type HarmonyEvent interface {
@@ -89,6 +90,10 @@ func (s *HarmonyParser) AddImplicitStart() {
s.acc.WriteString("<|start|>assistant")
}
func (s *HarmonyParser) ConstraintsAllowed() bool {
return s.constraintsAllowed
}
func Prefill(lastMessage api.Message) string {
if lastMessage.Role != "assistant" {
return ""
@@ -341,6 +346,7 @@ func (h *HarmonyMessageHandler) AddContent(content string) (string, string, stri
}
case "final":
h.state = harmonyMessageState_Normal
h.HarmonyParser.constraintsAllowed = true
}
case HarmonyEventContentEmitted:
logutil.Trace("harmony event content", "content", event.Content, "state", h.state)
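
The new `constraintsAllowed` flag flips when the assistant's `final` channel begins, so thinking output stays unconstrained. A reduced sketch; the `analysis` channel name is illustrative, not taken from this diff:

```go
package main

import "fmt"

// channelTracker reduces HarmonyMessageHandler to the one piece added
// above: a flag that flips when the "final" channel starts.
type channelTracker struct {
	constraintsAllowed bool
}

func (c *channelTracker) onChannelStart(channel string) {
	if channel == "final" {
		// Mirrors the case "final" arm in AddContent: once the model is
		// writing its answer, grammar constraints may be applied.
		c.constraintsAllowed = true
	}
}

func main() {
	c := &channelTracker{}
	c.onChannelStart("analysis")
	fmt.Println(c.constraintsAllowed) // false: thinking stays unconstrained
	c.onChannelStart("final")
	fmt.Println(c.constraintsAllowed) // true
}
```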

View File

@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var kvct string
if useFlashAttention {
requested := strings.ToLower(envconfig.KvCacheType())
if f.SupportsKVCacheType(requested) {
if requested != "" && f.SupportsKVCacheType(requested) {
kvct = requested
}
}

View File

@@ -149,11 +149,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
var textProcessor model.TextProcessor
var err error
if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
if len(projectors) == 0 {
textProcessor, err = model.NewTextProcessor(modelPath)
} else {
err = errors.New("split vision models aren't supported")
}
textProcessor, err = model.NewTextProcessor(modelPath)
if err != nil {
// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
@@ -166,6 +162,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}
newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
if newEstimates {
slog.Info("enabling new memory estimates")
}
// Verify the requested context size is <= the model training size
trainCtx := f.KV().ContextLength()
if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@@ -220,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
if f.SupportsKVCacheType(kvct) {
if kvct != "" && f.SupportsKVCacheType(kvct) {
loadRequest.KvCacheType = kvct
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -433,7 +434,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}()
if textProcessor != nil {
if newEstimates {
return &ollamaServer{llmServer: s}, nil
} else {
return &llamaServer{llmServer: s, ggml: f}, nil
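
Which server flavor is returned now also depends on the estimates flag rather than on the text processor alone. A toy sketch of the decision, with stand-in types for the two implementations:

```go
package main

import "fmt"

// Stand-ins for the two server flavors returned by NewLlamaServer.
type llamaServer struct{}  // existing runner with the old estimates
type ollamaServer struct{} // Ollama engine with the new estimates

// pickServer condenses the branch-side logic: the new server type is used
// only when the model loaded with the new engine (textProcessor != nil in
// the real code) AND OLLAMA_NEW_ESTIMATES is enabled.
func pickServer(newEngineLoaded, newEstimatesFlag bool) any {
	if newEngineLoaded && newEstimatesFlag {
		return &ollamaServer{}
	}
	return &llamaServer{}
}

func main() {
	fmt.Printf("%T\n", pickServer(true, true))  // *main.ollamaServer
	fmt.Printf("%T\n", pickServer(true, false)) // *main.llamaServer
}
```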

View File

@@ -76,9 +76,8 @@ type JsonSchema struct {
}
type EmbedRequest struct {
Input any `json:"input"`
Model string `json:"model"`
Dimensions int `json:"dimensions,omitempty"`
Input any `json:"input"`
Model string `json:"model"`
}
type StreamOptions struct {
@@ -1006,7 +1005,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc {
}
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil {
if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}

View File

@@ -33,6 +33,7 @@ type MessageHandler interface {
type ParserInternals interface {
AddImplicitStartOrPrefill(prefillString string)
ConstraintsAllowed() bool
}
type ToolParser interface {
@@ -51,6 +52,10 @@ type defaultEngine struct{}
func (defaultEngine) AddImplicitStartOrPrefill(prefillString string) {}
func (defaultEngine) ConstraintsAllowed() bool {
return true
}
type defaultToolParser struct{}
func (defaultToolParser) Add(token string) {}
@@ -104,6 +109,10 @@ func (p *TokenParser) repeatLimitReached(token string) bool {
return p.tokenRepeat >= p.repeatLimit
}
func (p *TokenParser) ConstraintsAllowed() bool {
return p.parserEngine.ConstraintsAllowed()
}
// TODO: update to work with multiple toolcalls - unmarshalling should also happen on parser level
func (p *TokenParser) Drain() []api.ToolCall {
toolName, toolContent := p.toolParser.Drain()
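
`TokenParser` simply forwards the question to its engine, and the default engine answers true, so models without a stateful parser keep constraining from the first token. A minimal sketch of that dispatch, with the interfaces trimmed to the one method added here:

```go
package main

import "fmt"

// engine is ParserInternals trimmed to the method added in this hunk.
type engine interface {
	ConstraintsAllowed() bool
}

// defaultEngine matches the one above: constraints are always allowed.
type defaultEngine struct{}

func (defaultEngine) ConstraintsAllowed() bool { return true }

// harmonyEngine is a stand-in for the harmony parser: constraints wait
// until the final channel has started.
type harmonyEngine struct{ finalStarted bool }

func (h *harmonyEngine) ConstraintsAllowed() bool { return h.finalStarted }

// tokenParser forwards the question to its engine, as TokenParser does.
type tokenParser struct{ parserEngine engine }

func (p *tokenParser) ConstraintsAllowed() bool {
	return p.parserEngine.ConstraintsAllowed()
}

func main() {
	fmt.Println((&tokenParser{parserEngine: defaultEngine{}}).ConstraintsAllowed())  // true
	fmt.Println((&tokenParser{parserEngine: &harmonyEngine{}}).ConstraintsAllowed()) // false
}
```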

View File

@@ -18,6 +18,7 @@ import (
"reflect"
"regexp"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
@@ -61,6 +62,11 @@ type Sequence struct {
// tokens that have been generated but not returned yet (e.g. for stop sequences)
pendingResponses []string
// startGate
startGate *sync.Mutex
grammarReady bool
// input cache being used by this sequence
cache *InputCacheSlot
@@ -163,6 +169,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
// TODO(jessegross): Ingest cached history for grammar
startGate := &sync.Mutex{}
return &Sequence{
ctxs: ctxs,
mmStore: mmStore,
@@ -178,6 +185,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
embeddingOnly: params.embedding,
stop: params.stop,
numKeep: params.numKeep,
startGate: startGate,
grammarReady: false,
}, nil
}
@@ -706,11 +715,18 @@ func (s *Server) computeBatch(activeBatch batchState) {
// sample a token
vocabSize := len(outputs) / len(activeBatch.batch.Outputs)
logutil.Trace("computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(outputs), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
if !seq.grammarReady {
seq.startGate.Lock()
}
token, err := seq.sampler.Sample(outputs[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
if err != nil {
s.hardErrCh <- fmt.Errorf("failed to sample token: %w", err)
return
}
if !seq.grammarReady {
seq.startGate.Unlock()
}
nextBatchTokens[i].Token = token
@@ -781,8 +797,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
tokenParser := parser.NewTokenParser(req.ParserType, req.PrefillString)
if req.Options == nil {
opts := api.DefaultOptions()
req.Options = &opts
@@ -815,7 +829,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
req.Options.TopP,
req.Options.MinP,
req.Options.Seed,
grammar,
nil,
)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
@@ -830,6 +844,12 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
tokenParser := parser.NewTokenParser(req.ParserType, req.PrefillString)
// this accounts for the default case and also the case where there is a prefill which moves the state of the parser to allow for constraints
if tokenParser.ConstraintsAllowed() {
seq.grammarReady = true
}
// Ensure there is a place to put the sequence, released when removed from s.seqs
if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
if errors.Is(err, context.Canceled) {
@@ -873,6 +893,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
case content, ok := <-seq.responses:
if ok {
if !seq.grammarReady {
seq.startGate.Lock()
}
var thinking string
var err error
content, thinking, err = tokenParser.AddContent(content)
@@ -882,6 +905,13 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
// only apply the grammar once
if tokenParser.ConstraintsAllowed() && !seq.grammarReady {
seq.sampler.SetGrammar(grammar, &s.mu)
seq.grammarReady = true
seq.startGate.Unlock()
}
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
Content: content,
Thinking: thinking,
@@ -908,6 +938,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
if !seq.grammarReady {
seq.startGate.Unlock()
}
}
}
}
@@ -1100,13 +1133,9 @@ func (s *Server) allocModel(
// Convert memory allocation panics to errors
defer func() {
if r := recover(); r != nil {
debug.PrintStack()
if err, ok := r.(error); ok {
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
panicErr = noMem
} else {
panic(r)
}
panicErr = err
} else {
panic(r)
}
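
The `startGate` mutex added above is the synchronization from the first commit: sampling in `computeBatch` and grammar installation in `completion` briefly serialize on the same lock until the grammar decision is made. A stripped-down, runnable sketch of the hand-off (batching, channels, and error handling omitted; the unlocked read of the flag mirrors the diff's optimistic check):

```go
package main

import (
	"fmt"
	"sync"
)

// sequence reduces the runner's Sequence to the two fields added above:
// a gate mutex plus a flag recording whether the grammar decision is done.
type sequence struct {
	startGate    sync.Mutex
	grammarReady bool
	grammar      string // stand-in for the real *GrammarSampler
}

// sampleToken mirrors computeBatch: while the grammar is still pending,
// hold the gate so sampling cannot interleave with SetGrammar.
func (s *sequence) sampleToken() string {
	if !s.grammarReady {
		s.startGate.Lock()
		defer s.startGate.Unlock()
	}
	if s.grammar != "" {
		return "constrained-token"
	}
	return "free-token"
}

// enableGrammar mirrors the completion side: once the token parser reports
// ConstraintsAllowed, install the grammar exactly once under the gate.
func (s *sequence) enableGrammar(g string) {
	s.startGate.Lock()
	defer s.startGate.Unlock()
	if !s.grammarReady {
		s.grammar = g
		s.grammarReady = true
	}
}

func main() {
	seq := &sequence{}
	fmt.Println(seq.sampleToken()) // free-token: grammar not yet applied
	seq.enableGrammar(`root ::= "yes" | "no"`)
	fmt.Println(seq.sampleToken()) // constrained-token
}
```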

View File

@@ -5,6 +5,7 @@ import (
"math"
"math/rand/v2"
"slices"
"sync"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/model"
@@ -25,6 +26,12 @@ type Sampler struct {
grammar *GrammarSampler
}
func (s *Sampler) SetGrammar(grammar *GrammarSampler, mutex *sync.Mutex) {
mutex.Lock()
defer mutex.Unlock()
s.grammar = grammar
}
func (s *Sampler) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("sample: no logits provided to sample")

View File

@@ -78,7 +78,7 @@ function checkEnv() {
}
function buildCPU() {
function buildOllama() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
@@ -90,72 +90,20 @@ function buildCPU() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component CPU --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
function buildCUDA11() {
# CUDA v11 claims to be compatible with MSVC 2022, but the latest updates are no longer compatible
# 19.40 is the last compiler version that works, but recent updates are 19.43
# So this pins to MSVC 2019 for best compatibility
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v11")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v11 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildCUDA12() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v12.8")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v12 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
if ("$script:CUDA_DIRS".Contains("v12")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
$env:CUDAToolkit_ROOT=$hashEnv[$v12]
write-host "Building CUDA v12 backend libraries"
& cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildCUDA13() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v13")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
$env:CUDAToolkit_ROOT=$cuda
write-host "Building CUDA v13 backend libraries $cuda"
& cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildROCm() {
mkdir -Force -path "${script:DIST_DIR}\"
if ($script:ARCH -ne "arm64") {
if ($env:HIP_PATH) {
write-host "Building ROCm backend libraries"
if (-Not (get-command -ErrorAction silent ninja)) {
@@ -181,10 +129,6 @@ function buildROCm() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function buildOllama() {
mkdir -Force -path "${script:DIST_DIR}\"
write-host "Building ollama CLI"
& go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -292,10 +236,6 @@ function distZip() {
checkEnv
try {
if ($($args.count) -eq 0) {
buildCPU
buildCUDA12
buildCUDA13
buildROCm
buildOllama
buildApp
gatherDependencies

View File

@@ -558,12 +558,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
if err != nil {
return err
}
// TODO: this first normalization should be done by the model
embedding = normalize(embedding)
if req.Dimensions > 0 && req.Dimensions < len(embedding) {
embedding = normalize(embedding[:req.Dimensions])
}
embeddings[i] = embedding
embeddings[i] = normalize(embedding)
return nil
})
}
@@ -589,7 +584,11 @@ func normalize(vec []float32) []float32 {
sum += v * v
}
norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
norm := float32(0.0)
if sum > 0 {
norm = float32(1.0 / math.Sqrt(float64(sum)))
}
for i := range vec {
vec[i] *= norm
}