Compare commits


3 Commits

Author SHA1 Message Date
Michael Yang
69f3dfdedf update tests 2025-08-14 15:04:26 -07:00
Michael Yang
7bd3f0269c drop float16 dependency
goos: darwin
goarch: arm64
pkg: github.com/ollama/ollama/convert/float16
cpu: Apple M3 Max
BenchmarkFloat16/x448/float16-16                     159           7398462 ns/op
BenchmarkFloat16/simple-16                           512           2327098 ns/op
PASS
ok      github.com/ollama/ollama/convert/float16        2.553s
2025-08-14 15:04:26 -07:00
Michael Yang
276c4df770 drop bfloat16 dependency
goos: darwin
goarch: arm64
pkg: github.com/ollama/ollama/convert/bfloat16
cpu: Apple M3 Max
BenchmarkBfloat16/d4l3k/go-bfloat16-16               516           2269453 ns/op
BenchmarkBfloat16/simple-16                          1759            626316 ns/op
PASS
ok      github.com/ollama/ollama/convert/bfloat16        2.502s
2025-08-14 15:04:26 -07:00
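(The benchmark output above is embedded in the commit messages; assuming the new packages live at convert/bfloat16 and convert/float16 as the pkg lines indicate, the numbers should be reproducible with `go test -bench=. ./convert/bfloat16 ./convert/float16`.)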
123 changed files with 2196 additions and 5672 deletions


@@ -65,36 +65,14 @@ jobs:
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
cuda-version: '12.8'
flags: ''
runner_dir: 'cuda_v12'
- os: windows
arch: amd64
preset: 'CUDA 13'
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
flags: ''
runner_dir: 'cuda_v13'
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: ''
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -118,7 +96,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
@@ -160,7 +138,7 @@ jobs:
run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
@@ -254,7 +232,7 @@ jobs:
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;


@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:13.0.0-devel-ubuntu22.04
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,17 +78,8 @@ jobs:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
cuda-components:
- '"cudart"'
- '"nvcc"'
- '"cublas"'
- '"cublas_dev"'
- '"crt"'
- '"nvvm"'
- '"nvptxcompiler"'
cuda-version: '13.0'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
@@ -111,8 +102,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
$subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path


@@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
endif()
set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
@@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA


@@ -18,42 +18,26 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
"CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 2"
}
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "72;87"
"CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual"
}
},
{
"name": "JetPack 6",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "87"
"CMAKE_CUDA_ARCHITECTURES": "87-virtual"
}
},
{
@@ -88,21 +72,11 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 12"
},
{
"name": "CUDA 13",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 13"
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],


@@ -39,35 +39,15 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8
FROM base AS cuda-11
ARG CUDA11VERSION=11.8
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
cmake --preset 'CUDA 12' \
&& cmake --build --parallel --preset 'CUDA 12' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-13
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
ENV PATH=/usr/local/cuda-13/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
&& cmake --build --parallel --preset 'CUDA 13' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS rocm-6
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
@@ -106,20 +86,14 @@ RUN go mod download
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
ARG CGO_CFLAGS
ARG CGO_CXXFLAGS
RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM --platform=linux/amd64 scratch AS amd64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama
FROM --platform=linux/arm64 scratch AS arm64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6


@@ -411,10 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
### Cloud
@@ -541,9 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
- [Neuro SAN](https://github.com/cognizant-ai-lab/neuro-san-studio) (Data-driven multi-agent orchestration framework) with [example](https://github.com/cognizant-ai-lab/neuro-san-studio/blob/main/docs/user_guide.md#ollama)
### Mobile
@@ -604,7 +597,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)
### Supported backends


@@ -90,10 +90,6 @@ type GenerateRequest struct {
// (request that thinking _not_ be used) and unset (use the old behavior
// before this option was introduced)
Think *ThinkValue `json:"think,omitempty"`
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
// for supported models.
Think *ThinkValue `json:"think,omitempty"`
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
type Tools []Tool
@@ -286,23 +278,16 @@ func mapToTypeScriptType(jsonType string) string {
}
}
type ToolFunctionParameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]ToolProperty `json:"properties"`
}
func (t *ToolFunctionParameters) String() string {
bts, _ := json.Marshal(t)
return string(bts)
}
type ToolFunction struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters ToolFunctionParameters `json:"parameters"`
Name string `json:"name"`
Description string `json:"description"`
Parameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]ToolProperty `json:"properties"`
} `json:"parameters"`
}
func (t *ToolFunction) String() string {
@@ -323,19 +308,6 @@ type ChatResponse struct {
Metrics
}
// DebugInfo contains debug information for template rendering
type DebugInfo struct {
RenderedTemplate string `json:"rendered_template"`
ImageCount int `json:"image_count,omitempty"`
}
// DebugTemplateResponse is returned when _debug_render_only is set to true
type DebugTemplateResponse struct {
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
DebugInfo DebugInfo `json:"_debug_info"`
}
type Metrics struct {
TotalDuration time.Duration `json:"total_duration,omitempty"`
LoadDuration time.Duration `json:"load_duration,omitempty"`
@@ -388,12 +360,8 @@ type EmbedRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Truncate truncates the input to fit the model's max sequence length.
Truncate *bool `json:"truncate,omitempty"`
// Dimensions truncates the output embedding to the specified dimension.
Dimensions int `json:"dimensions,omitempty"`
// Options lists model-specific options.
Options map[string]any `json:"options"`
}
@@ -892,7 +860,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
if t < 0 {
d.Duration = time.Duration(math.MaxInt64)
} else {
d.Duration = time.Duration(t * float64(time.Second))
d.Duration = time.Duration(int(t) * int(time.Second))
}
case string:
d.Duration, err = time.ParseDuration(t)


@@ -17,11 +17,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
req string
exp *Duration
}{
{
name: "Unset",
req: `{ }`,
exp: nil,
},
{
name: "Positive Integer",
req: `{ "keep_alive": 42 }`,
@@ -30,7 +25,7 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
{
name: "Positive Float",
req: `{ "keep_alive": 42.5 }`,
exp: &Duration{42500 * time.Millisecond},
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
@@ -441,50 +436,3 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
})
}
}
func TestToolFunctionParameters_String(t *testing.T) {
tests := []struct {
name string
params ToolFunctionParameters
expected string
}{
{
name: "simple object with string property",
params: ToolFunctionParameters{
Type: "object",
Required: []string{"name"},
Properties: map[string]ToolProperty{
"name": {
Type: PropertyType{"string"},
Description: "The name of the person",
},
},
},
expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
},
{
name: "marshal failure returns empty string",
params: ToolFunctionParameters{
Type: "object",
Defs: func() any {
// Create a cycle that will cause json.Marshal to fail
type selfRef struct {
Self *selfRef
}
s := &selfRef{}
s.Self = s
return s
}(),
Properties: map[string]ToolProperty{},
},
expected: "",
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
result := test.params.String()
assert.Equal(t, test.expected, result)
})
}
}


@@ -56,8 +56,10 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
if err != nil {
return
}
if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
return
for _, cap := range resp.Capabilities {
if cap == model.CapabilityThinking {
return
}
}
fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
}
@@ -1610,7 +1612,6 @@ func NewCLI() *cobra.Command {
appendEnvDocs(cmd, []envconfig.EnvVar{
envVars["OLLAMA_DEBUG"],
envVars["OLLAMA_HOST"],
envVars["OLLAMA_CONTEXT_LENGTH"],
envVars["OLLAMA_KEEP_ALIVE"],
envVars["OLLAMA_MAX_LOADED_MODELS"],
envVars["OLLAMA_MAX_QUEUE"],

View File

@@ -0,0 +1,21 @@
package bfloat16
import "math"
// FromFloat32s converts a slice of float32 values to a slice of bfloat16 values, represented as uint16s.
func FromFloat32s(f32s []float32) (u16s []uint16) {
u16s = make([]uint16, len(f32s))
for i := range f32s {
u16s[i] = uint16(math.Float32bits(f32s[i]) >> 16)
}
return u16s
}
// Float32s converts a slice of bfloat16 values, represented as uint16s, back to a slice of float32 values.
func Float32s(u16s []uint16) (f32s []float32) {
f32s = make([]float32, len(u16s))
for i := range u16s {
f32s[i] = math.Float32frombits(uint32(u16s[i]) << 16)
}
return f32s
}
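Since bfloat16 is simply the upper 16 bits of an IEEE-754 float32, the two helpers above reduce to a shift in each direction. A minimal round-trip sketch, assuming the import path shown in the commit's benchmark output; the bit patterns match the "one" and "pi approximation" cases in the accompanying tests:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/convert/bfloat16"
)

func main() {
	// 1.0 is 0x3F800000 as a float32; dropping the low 16 bits gives 0x3F80.
	// 3.140625 (0x40490000) survives the round trip exactly; values needing
	// more mantissa bits are silently truncated.
	u16s := bfloat16.FromFloat32s([]float32{1.0, 3.140625})
	fmt.Printf("%#04x %#04x\n", u16s[0], u16s[1]) // 0x3f80 0x4049
	fmt.Println(bfloat16.Float32s(u16s))          // [1 3.140625]
}
```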


@@ -0,0 +1,82 @@
package bfloat16
import (
"math"
"math/rand/v2"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestBfloat16(t *testing.T) {
cases := []struct {
name string
input uint16
want uint32
}{
// Zero cases
{"positive zero", 0x0000, 0x0},
{"negative zero", 0x8000, 0x80000000},
// Normal numbers
{"one", 0x3F80, 0x3F800000},
{"negative one", 0xBF80, 0xBF800000},
{"two", 0x4000, 0x40000000},
{"half", 0x3F00, 0x3F000000},
{"quarter", 0x3E80, 0x3E800000},
{"max finite", 0x7F7F, 0x7F7F0000},
{"min positive normal", 0x0080, 0x00800000},
// Infinity cases
{"positive infinity", 0x7F80, 0x7F800000},
{"negative infinity", 0xFF80, 0xFF800000},
// NaN cases
{"NaN", 0x7FC0, 0x7FC00000},
{"NaN with payload", 0x7FC1, 0x7FC10000},
// Subnormal cases
{"min positive subnormal", 0x0001, 0x00010000},
{"max subnormal", 0x007F, 0x007F0000},
// Powers of 2
{"2^10", 0x4480, 0x44800000},
{"2^-10", 0x3A80, 0x3A800000},
{"2^20", 0x4B80, 0x4B800000},
// Common approximations in BF16
{"pi approximation", 0x4049, 0x40490000},
{"e approximation", 0x402E, 0x402E0000},
{"sqrt(2) approximation", 0x3FB5, 0x3FB50000},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
t.Run("Float32s", func(t *testing.T) {
got := Float32s([]uint16{tt.input})[0]
if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
}
})
t.Run("FromFloat32s", func(t *testing.T) {
got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
}
})
})
}
}
func BenchmarkBfloat16(b *testing.B) {
f32s := make([]float32, 1_000_000)
for i := range f32s {
f32s[i] = rand.Float32()
}
for b.Loop() {
Float32s(FromFloat32s(f32s))
}
}


@@ -28,7 +28,6 @@ type bertModel struct {
LayerNormEPS float32 `json:"layer_norm_eps"`
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_epsilon"`
normalizeEmbeddings bool
PoolingType uint32
}
@@ -55,11 +54,9 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
var pooling string
for _, m := range modules {
switch m.Type {
case "sentence_transformers.models.Pooling":
if m.Type == "sentence_transformers.models.Pooling" {
pooling = m.Path
case "sentence_transformers.models.Normalize":
p.normalizeEmbeddings = true
break
}
}
@@ -93,7 +90,6 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false
kv["bert.pooling_type"] = p.PoolingType
kv["bert.normalize_embeddings"] = p.normalizeEmbeddings
kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)


@@ -15,24 +15,19 @@ import (
type gptossModel struct {
ModelParameters
HiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
LocalExperts uint32 `json:"num_local_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
RopeScaling struct {
Factor float32 `json:"factor"`
} `json:"rope_scaling"`
SlidingWindow uint32 `json:"sliding_window"`
HiddenLayers uint32 `json:"num_hidden_layers"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
SlidingWindow uint32 `json:"sliding_window"`
}
var _ ModelConverter = (*gptossModel)(nil)
@@ -41,11 +36,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)
kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
kv["gptoss.block_count"] = m.HiddenLayers
kv["gptoss.embedding_length"] = m.HiddenSize
kv["gptoss.feed_forward_length"] = m.IntermediateSize
kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
kv["gptoss.expert_count"] = m.Experts
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
kv["gptoss.attention.head_count"] = m.AttentionHeads
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -54,7 +49,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
kv["gptoss.rope.freq_base"] = m.RopeTheta
kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
kv["tokenizer.ggml.add_bos_token"] = false
@@ -97,11 +92,6 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
for name, mxfp4 := range mxfp4s {
dims := mxfp4.blocks.Shape()
if !strings.HasSuffix(name, ".weight") {
name += ".weight"
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: uint32(ggml.TensorTypeMXFP4),
@@ -114,47 +104,25 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
func (m *gptossModel) Replacements() []string {
var replacements []string
if m.MaxPositionEmbeddings > 0 {
// hf flavored model
replacements = []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_out",
"self_attn.sinks", "attn_sinks",
"post_attention_layernorm", "ffn_norm",
"mlp.router", "ffn_gate_inp",
"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
"mlp.experts.down_proj_", "ffn_down_exps.",
"model.norm", "output_norm",
}
} else {
replacements = []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
return []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
return replacements
}
type mxfp4 struct {
@@ -172,20 +140,7 @@ func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
blocksDims[i] = int(d)
}
bts := b.Bytes()
var tmp [16]byte
for i := 0; i < b.Len(); i += 16 {
for j := range 8 {
// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
a, b := bts[i+j], bts[i+j+8]
tmp[2*j+0] = (a & 0x0F) | (b << 4)
tmp[2*j+1] = (a >> 4) | (b & 0xF0)
}
copy(bts[i:i+16], tmp[:])
}
var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(bts))
var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
var s bytes.Buffer
if _, err := m.scales.WriteTo(&s); err != nil {
@@ -219,5 +174,5 @@ func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
return 0, err
}
return int64(len(u8s)), nil
return 0, nil
}


@@ -0,0 +1,97 @@
package float16
import (
"math"
)
func FromFloat32s(f32s []float32) (u16s []uint16) {
u16s = make([]uint16, len(f32s))
for i := range f32s {
bits := math.Float32bits(f32s[i])
sign := (bits >> 31) & 0x1
exponent := (bits >> 23) & 0xFF
mantissa := bits & 0x7FFFFF
if exponent == 0xFF {
if mantissa == 0 {
// Infinity
u16s[i] = uint16((sign << 15) | 0x7C00)
} else {
// NaN
u16s[i] = uint16((sign << 15) | 0x7C00 | (mantissa >> 13))
}
} else if exponent == 0 && mantissa == 0 {
// Zero
u16s[i] = uint16(sign << 15)
} else {
// Convert exponent from FP32 bias (127) to FP16 bias (15)
exponent := int(exponent) - 127 + 15
if exponent >= 31 {
// Overflow to infinity
u16s[i] = uint16((sign << 15) | 0x7C00)
} else if exponent <= 0 {
// Underflow - create subnormal or zero
if exponent < -10 {
u16s[i] = uint16(sign << 15) // Zero
} else {
// Subnormal number
mantissa = (mantissa | 0x800000) >> uint(-exponent+1)
u16s[i] = uint16((sign << 15) | (mantissa >> 13))
}
} else {
// Normal number - truncate mantissa from 23 to 10 bits
u16s[i] = uint16((sign << 15) | (uint32(exponent) << 10) | (mantissa >> 13))
}
}
}
return u16s
}
func Float32s(u16s []uint16) (f32s []float32) {
f32s = make([]float32, len(u16s))
for i := range u16s {
sign := (u16s[i] >> 15) & 0x1
exponent := (u16s[i] >> 10) & 0x1F
mantissa := u16s[i] & 0x3FF
var u32 uint32
switch exponent {
case 0:
if mantissa == 0 {
// Zero
u32 = uint32(sign) << 31
} else {
// Subnormal - convert to normal
// Find leading 1 bit
shift := 0
temp := mantissa
for temp&0x400 == 0 {
temp <<= 1
shift++
}
exponent := 127 - 15 + 1 - shift
mantissa := (uint32(temp&0x3FF) << 13)
u32 = (uint32(sign) << 31) | (uint32(exponent) << 23) | mantissa
}
case 0x1F:
if mantissa == 0 {
// Infinity
u32 = (uint32(sign) << 31) | 0x7F800000
} else {
// NaN
u32 = (uint32(sign) << 31) | 0x7F800000 | (uint32(mantissa) << 13)
}
default:
// Normal number
exponent := uint32(exponent) - 15 + 127
mantissa := uint32(mantissa) << 13
u32 = (uint32(sign) << 31) | (exponent << 23) | mantissa
}
f32s[i] = math.Float32frombits(u32)
}
return f32s
}
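As a concrete instance of the rebiasing above: 1.0 is 0x3F800000 as a float32 (sign 0, exponent 127, mantissa 0); moving from the float32 bias of 127 to the float16 bias of 15 gives a stored exponent of 15, and truncating the mantissa to 10 bits leaves 0, so the packed half-precision value is 0x3C00, the "one" case in the accompanying tests.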


@@ -0,0 +1,75 @@
package float16
import (
"math"
"math/rand/v2"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestFloat16(t *testing.T) {
cases := []struct {
name string
input uint16
want uint32
}{
// Zero cases
{"positive zero", 0x0000, 0x0},
{"negative zero", 0x8000, 0x80000000},
// Normal numbers
{"one", 0x3C00, 0x3F800000},
{"negative one", 0xBC00, 0xBF800000},
{"two", 0x4000, 0x40000000},
{"half", 0x3800, 0x3F000000},
{"max normal", 0x7BFF, 0x477fe000},
{"min positive normal", 0x0400, 0x38800000},
// Infinity cases
{"positive infinity", 0x7C00, 0x7F800000},
{"negative infinity", 0xFC00, 0xFF800000},
// NaN cases
{"NaN", 0x7C01, 0x7f802000},
{"NaN with payload", 0x7E00, 0x7FC00000},
// Subnormal cases
{"min positive subnormal", 0x0001, 0x33800000},
{"max subnormal", 0x03FF, 0x387fc000},
// Common values
{"pi approximation", 0x4248, 0x40490000},
{"e approximation", 0x416F, 0x402de000},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
t.Run("Float32s", func(t *testing.T) {
got := Float32s([]uint16{tt.input})[0]
if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
}
})
t.Run("FromFloat32s", func(t *testing.T) {
got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
}
})
})
}
}
func BenchmarkFloat16(b *testing.B) {
f32s := make([]float32, 1_000_000)
for i := range f32s {
f32s[i] = rand.Float32()
}
for b.Loop() {
Float32s(FromFloat32s(f32s))
}
}


@@ -33,8 +33,8 @@ func (t tensorBase) Shape() []uint64 {
const (
tensorKindFP32 uint32 = iota
tensorKindFP16
tensorKindMXFP4 = 4
tensorKindBF16 = 30
tensorKindMXFP4 = 39
)
func (t tensorBase) Kind() uint32 {


@@ -13,8 +13,8 @@ import (
"slices"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/x448/float16"
"github.com/ollama/ollama/convert/bfloat16"
"github.com/ollama/ollama/convert/float16"
)
type safetensorMetadata struct {
@@ -163,18 +163,16 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
return 0, err
}
f32s = make([]float32, len(u16s))
for i := range u16s {
f32s[i] = float16.Frombits(u16s[i]).Float32()
}
f32s = float16.Float32s(u16s)
case "BF16":
u8s := make([]uint8, st.size)
if err = binary.Read(br, binary.LittleEndian, u8s); err != nil {
u16s := make([]uint16, st.size/2)
if err = binary.Read(br, binary.LittleEndian, u16s); err != nil {
return 0, err
}
f32s = bfloat16.DecodeFloat32(u8s)
f32s = bfloat16.Float32s(u16s)
default:
return 0, fmt.Errorf("unknown data type: %s", st.dtype)
}
@@ -188,17 +186,11 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
switch st.Kind() {
case tensorKindFP32:
return int64(len(f32s) * 4), binary.Write(w, binary.LittleEndian, f32s)
return 0, binary.Write(w, binary.LittleEndian, f32s)
case tensorKindFP16:
f16s := make([]uint16, len(f32s))
for i := range f32s {
f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
}
return int64(len(f16s) * 2), binary.Write(w, binary.LittleEndian, f16s)
return 0, binary.Write(w, binary.LittleEndian, float16.FromFloat32s(f32s))
case tensorKindBF16:
u8s := bfloat16.EncodeFloat32(f32s)
return int64(len(u8s)), binary.Write(w, binary.LittleEndian, u8s)
return 0, binary.Write(w, binary.LittleEndian, bfloat16.FromFloat32s(f32s))
default:
return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
}


@@ -7,9 +7,9 @@ import (
"path/filepath"
"testing"
"github.com/d4l3k/go-bfloat16"
"github.com/google/go-cmp/cmp"
"github.com/x448/float16"
"github.com/ollama/ollama/convert/bfloat16"
"github.com/ollama/ollama/convert/float16"
)
func TestSafetensors(t *testing.T) {
@@ -21,6 +21,11 @@ func TestSafetensors(t *testing.T) {
}
defer root.Close()
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
cases := []struct {
name,
dtype string
@@ -36,11 +41,6 @@ func TestSafetensors(t *testing.T) {
size: 32 * 4, // 32 floats, each 4 bytes
shape: []uint64{32},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
t.Fatal(err)
}
@@ -62,11 +62,6 @@ func TestSafetensors(t *testing.T) {
size: 32 * 4, // 32 floats, each 4 bytes
shape: []uint64{16, 2},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
t.Fatal(err)
}
@@ -84,12 +79,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 floats, each 2 bytes
shape: []uint64{16, 2},
setup: func(t *testing.T, f *os.File) {
u16s := make([]uint16, 32)
for i := range u16s {
u16s[i] = float16.Fromfloat32(float32(i)).Bits()
}
if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},
@@ -106,12 +96,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 floats, each 2 bytes
shape: []uint64{32},
setup: func(t *testing.T, f *os.File) {
u16s := make([]uint16, 32)
for i := range u16s {
u16s[i] = float16.Fromfloat32(float32(i)).Bits()
}
if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},
@@ -132,12 +117,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 brain floats, each 2 bytes
shape: []uint64{16, 2},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},
@@ -154,12 +134,7 @@ func TestSafetensors(t *testing.T) {
size: 32 * 2, // 32 brain floats, each 2 bytes
shape: []uint64{32},
setup: func(t *testing.T, f *os.File) {
f32s := make([]float32, 32)
for i := range f32s {
f32s[i] = float32(i)
}
if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
t.Fatal(err)
}
},


@@ -97,7 +97,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
return a < b
})
gpuCount := 0
gpuOrdinalID := 0
for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match)
fp, err := os.Open(match)
@@ -188,6 +187,10 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue
}
// Keep track of numeric IDs based on valid GPUs
gpuID := gpuCount
gpuCount += 1
// Look up the memory for the current node
totalMemory := uint64(0)
usedMemory := uint64(0)
@@ -266,7 +269,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
if uniqueID != 0 {
ID = fmt.Sprintf("GPU-%016x", uniqueID)
} else {
ID = strconv.Itoa(gpuOrdinalID)
ID = strconv.Itoa(gpuID)
}
gpuInfo := RocmGPUInfo{
@@ -277,7 +280,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
FreeMemory: (totalMemory - usedMemory),
},
ID: ID,
filterID: gpuOrdinalID,
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
@@ -285,40 +287,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
DriverMinor: driverMinor,
},
usedFilepath: usedFile,
index: gpuCount,
index: gpuID,
}
// Keep track of numeric IDs based on valid GPUs
gpuCount += 1
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Ordinal IDs are based on the visible GPUs
gpuOrdinalID += 1
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit {
reason := "unsupported Radeon iGPU detected skipping"
slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
@@ -331,7 +306,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
}
if int(major) < minVer {
reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
slog.Warn(reason, "gpu", gpuInfo.ID)
slog.Warn(reason, "gpu", gpuID)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
@@ -340,8 +315,29 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue
}
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
include = true
break
}
}
if !include {
reason := "filtering out device per user request"
slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
GpuInfo: gpuInfo.GpuInfo,
Reason: reason,
})
continue
}
}
// Final validation is gfx compatibility - load the library if we haven't already loaded it
// even if the user overrides, we still need to validate the library
@@ -395,7 +391,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
// Check for env var workarounds
if name == "1002:687f" { // Vega RX 56
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, "HSA_ENABLE_SDMA=0")
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
}
// The GPU has passed all the verification steps and is supported
@@ -524,26 +520,19 @@ func verifyKFDDriverAccess() error {
return nil
}
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if _, err := strconv.Atoi(info.ID); err == nil {
ids = append(ids, fmt.Sprintf("%d", info.filterID))
} else {
ids = append(ids, info.ID)
}
ids = append(ids, info.ID)
}
if len(ids) == 0 {
return ""
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
// GPU_DEVICE_ORDINAL supports numeric IDs only
// HIP_VISIBLE_DEVICES supports numeric IDs only
return "ROCR_VISIBLE_DEVICES=" + strings.Join(ids, ",")
return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
}


@@ -111,7 +111,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
UnreliableFreeMemory: true,
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
filterID: i,
DependencyPath: []string{libDir},
MinimumMemory: rocmMinimumMemory,
Name: name,
@@ -201,26 +200,19 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
return nil
}
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if _, err := strconv.Atoi(info.ID); err == nil {
ids = append(ids, fmt.Sprintf("%d", info.filterID))
} else {
ids = append(ids, info.ID)
}
ids = append(ids, info.ID)
}
if len(ids) == 0 {
return ""
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
// HIP_VISIBLE_DEVICES supports numeric IDs only
// GPU_DEVICE_ORDINAL supports numeric IDs only
return "HIP_VISIBLE_DEVICES=" + strings.Join(ids, ",")
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
}


@@ -16,6 +16,19 @@ import (
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "cuda" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}
func cudaVariant(gpuInfo CudaGPUInfo) string {
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
if CudaTegra != "" {
@@ -43,15 +56,14 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
}
return "sbsa"
}
if gpuInfo.DriverMajor < 13 {
// The detected driver is older than 580 (Aug 2025)
// Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance
if gpuInfo.computeMajor > 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor >= 5) {
slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
}
return "v12"
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
// The detected driver is older than Feb 2023
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
return "v11"
}
return "v13"
return "v12"
}


@@ -371,15 +371,6 @@ func GetGPUInfo() GpuInfoList {
}
rocmGPUs, err = AMDGetGPUInfo()
// The ID field is used in context of the filtered set of GPUS
// so we have to replace any of these numeric IDs with their
// placement in this set of GPUs
for i := range rocmGPUs {
if _, err := strconv.Atoi(rocmGPUs[i].ID); err == nil {
rocmGPUs[i].ID = strconv.Itoa(i)
}
}
if err != nil {
bootstrapErrors = append(bootstrapErrors, err)
}
@@ -689,16 +680,23 @@ func getVerboseState() C.uint16_t {
// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variable
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
//
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
if len(l) == 0 {
return nil
return "", ""
}
vd := []string{}
// Only filter the AMD GPUs at this level, let all NVIDIA devices through
if tmp := rocmGetVisibleDevicesEnv(l); tmp != "" {
vd = append(vd, tmp)
switch l[0].Library {
case "cuda":
return cudaGetVisibleDevicesEnv(l)
case "rocm":
return rocmGetVisibleDevicesEnv(l)
case "oneapi":
return oneapiGetVisibleDevicesEnv(l)
default:
slog.Debug("no filter required for library " + l[0].Library)
return "", ""
}
return vd
}
func GetSystemInfo() SystemInfo {


@@ -62,9 +62,9 @@ func GetCPUMem() (memInfo, error) {
}, nil
}
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
// No-op on darwin
return nil
return "", ""
}
func GetSystemInfo() SystemInfo {

discover/gpu_oneapi.go (new file, 21 lines)

@@ -0,0 +1,21 @@
//go:build linux || windows
package discover
import (
"log/slog"
"strings"
)
func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "oneapi" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}
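The discovery helpers above return the visible-devices environment variable as a (name, value) pair rather than a preformatted NAME=VALUE string. A rough sketch of how a caller might apply such a pair when launching a runner process (the subprocess wiring is an assumption for illustration; only the env-var handling comes from the code above):

```go
package main

import (
	"os"
	"os/exec"

	"github.com/ollama/ollama/discover"
)

func main() {
	gpus := discover.GetGPUInfo()
	cmd := exec.Command("./runner") // placeholder binary and arguments
	cmd.Env = os.Environ()
	if key, value := gpus.GetVisibleDevicesEnv(); key != "" {
		// e.g. CUDA_VISIBLE_DEVICES=..., ROCR_VISIBLE_DEVICES=..., or
		// ONEAPI_DEVICE_SELECTOR=level_zero:...
		cmd.Env = append(cmd.Env, key+"="+value)
	}
	_ = cmd.Run()
}
```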


@@ -27,8 +27,8 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key=value]
EnvWorkarounds []string `json:"envs,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage
@@ -36,10 +36,9 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
UnreliableFreeMemory bool
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
filterID int //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`


@@ -1708,7 +1708,6 @@ Advanced parameters:
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `dimensions`: number of dimensions for the embedding
### Examples


@@ -11,10 +11,6 @@ Then build and run Ollama from the root directory of the repository:
go run . serve
```
> [!NOTE]
> Ollama includes native code compiled with CGO. From time to time these data structures can change and CGO can get out of sync resulting in unexpected crashes. You can force a full build of the native code by running `go clean -cache` first.
## macOS (Apple Silicon)
macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.


@@ -11,13 +11,12 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
> [!NOTE]
> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
Download and extract the package:
```shell
curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
sudo rm -rf /usr/lib/ollama
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
```


@@ -92,9 +92,6 @@ If none of those resolve the problem, gather additional information and file an
- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
You may get more details for initialization failures by enabling debug prints in the uvm driver. You should only use this temporarily while troubleshooting
- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1`
## AMD GPU Discovery


@@ -75,7 +75,7 @@ for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
import { Ollama } from 'ollama';
const ollama = new Ollama({
host: 'https://ollama.com',
host: 'https://ollama.com'
headers: {
Authorization: "Bearer <api key>"
}


@@ -7,11 +7,9 @@ import (
"fmt"
"io"
"log/slog"
"math"
"slices"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/util/bufioutil"
)
@@ -57,28 +55,10 @@ func (kv KV) EmbeddingLength() uint64 {
return uint64(kv.Uint("embedding_length"))
}
func (kv KV) HeadCount() []uint64 {
headCountDefault := uint32(1)
headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault)
if len(headCount) == 1 {
headCountDefault = headCount[0]
}
nLayers := int(kv.BlockCount())
if len(headCount) > nLayers {
slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(headCount) {
out[i] = uint64(headCountDefault)
} else {
out[i] = uint64(headCount[i])
}
}
return out
}
func (kv KV) HeadCountMax() uint64 {
// TODO(drifkin): using the max value can cause an overestimation. In the
// future if array values become more popular, we can adapt the more invasive
// <https://github.com/ollama/ollama/pull/10225>
return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
}
@@ -86,27 +66,6 @@ func (kv KV) HeadCountMin() uint64 {
return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
}
func (kv KV) HeadCountKV() []uint64 {
headCountKVDefault := uint32(1)
headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
if len(headCountKV) == 1 {
headCountKVDefault = headCountKV[0]
}
nLayers := int(kv.BlockCount())
if len(headCountKV) > nLayers {
slog.Warn("got more elements of attention.head_count than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(headCountKV) {
out[i] = uint64(headCountKVDefault)
} else {
out[i] = uint64(headCountKV[i])
}
}
return out
}
func (kv KV) HeadCountKVMax() uint64 {
return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}
@@ -139,26 +98,6 @@ func (kv KV) ChatTemplate() string {
return kv.String("tokenizer.chat_template")
}
// ssm architecture parameters
func (kv KV) SSMConvKernel() uint64 {
return uint64(kv.Uint("ssm.conv_kernel"))
}
func (kv KV) SSMInnerSize() uint64 {
return uint64(kv.Uint("ssm.inner_size"))
}
func (kv KV) SSMStateSize() uint64 {
return uint64(kv.Uint("ssm.state_size"))
}
func (kv KV) SSMGroupCount() uint64 {
return uint64(kv.Uint("ssm.group_count"))
}
// general types
func (kv KV) String(key string, defaultValue ...string) string {
val, _ := keyValue(kv, key, append(defaultValue, "")...)
return val
@@ -190,27 +129,22 @@ func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
}
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
return slices.Min(arrVal), slices.Max(arrVal)
}
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
if u32, ok := keyValue(kv, key, uint32(0)); ok {
return []uint32{u32}
return u32, u32
} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
return u32s.values
min := slices.Min(u32s.values)
max := slices.Max(u32s.values)
return min, max
} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
dst := make([]uint32, len(i32s.values))
for i, v := range i32s.values {
if v < 0 {
slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
}
dst[i] = uint32(v)
min := slices.Min(i32s.values)
max := slices.Max(i32s.values)
if min < 0 || max < 0 {
slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
}
return dst
return uint32(min), uint32(max)
}
return []uint32{defaultValue}
return defaultValue, defaultValue
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
@@ -341,7 +275,7 @@ type Tensor struct {
func (t Tensor) block() (n int) {
if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
return math.MaxInt
return -1
}
return
@@ -354,24 +288,24 @@ func (t Tensor) blockSize() uint64 {
func (t TensorType) BlockSize() uint64 {
switch t {
case
TensorTypeF32,
TensorTypeF16,
TensorTypeI8,
TensorTypeI16,
TensorTypeI32,
TensorTypeI64,
TensorTypeF64,
TensorTypeBF16:
0, // F32
1, // F16
24, // I8
25, // I16
26, // I32
27, // I64
28, // F64
30: // BF16
return 1
case
TensorTypeQ4_0,
TensorTypeQ4_1,
TensorTypeQ5_0,
TensorTypeQ5_1,
TensorTypeQ8_0,
TensorTypeQ8_1,
tensorTypeIQ4_NL,
4, TensorTypeMXFP4:
2, // Q4_0
3, // Q4_1
4, // MXFP4
6, // Q5_0
7, // Q5_1
8, // Q8_0
9, // Q8_1
20: // IQ4_NL
return 32
default:
return 256
@@ -394,6 +328,8 @@ func (t TensorType) TypeSize() uint64 {
return 2 + blockSize/2
case TensorTypeQ4_1:
return 2 + 2 + blockSize/2
case TensorTypeMXFP4, 39:
return 1 + blockSize/2
case TensorTypeQ5_0:
return 2 + 4 + blockSize/2
case TensorTypeQ5_1:
@@ -444,8 +380,6 @@ func (t TensorType) TypeSize() uint64 {
return blockSize/8 + blockSize/16 + blockSize/32
case TensorTypeBF16:
return 2
case 4, TensorTypeMXFP4:
return 1 + blockSize/2
default:
return 0
}
@@ -545,14 +479,10 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
}, nil
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
context *= uint64(numParallel)
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax()
headsArr := f.KV().HeadCount()
headsKV := f.KV().HeadCountKVMax()
headsKVArr := f.KV().HeadCountKV()
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
embeddingHeads := f.KV().EmbeddingHeadCountMax()
@@ -562,51 +492,12 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
layers := f.Tensors().GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
// Default for models unless special-cased below. These defaults mirror the
// cache usage in llama.cpp under the assumption that models without special
// cases below will use the llamarunner and caching will be handled by the
// llama.cpp layer.
//
// This also assumes that a layer without heads or headsKV set is recurrent
// which is usually the case. Some models (eg nemotronh) use "blocks" in
// place of layers where some are MLP blocks that don't have any cache.
// Models like this will need a special case below to be accurately
// estimated.
var kvTotal uint64
kv = make([]uint64, f.KV().BlockCount())
kvSizeAttn := uint64(0)
kvSizeRecurrent := uint64(0)
for i := range kv {
headsL := headsArr[i]
headsKVL := headsKVArr[i]
if headsL > 0 && headsKVL > 0 {
// full attention layer
// NOTE: Assumes uniform values for all attn layers
kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
kvSizeAttn += kv[i]
} else {
// recurrent layer
ssmDConv := f.KV().SSMConvKernel()
ssmDState := f.KV().SSMStateSize()
ssmDInner := f.KV().SSMInnerSize()
ssmNGroups := f.KV().SSMGroupCount()
nEmbdR := uint64(0)
if ssmDConv > 0 {
nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
}
nEmbdS := ssmDState * ssmDInner
// recurrent always uses F32 in llama.cpp backend
// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
bytesPerElementRecurrent := kvCacheBytesPerElement("f32")
kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
kvSizeRecurrent += kv[i]
}
kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
kvTotal += kv[i]
}
slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)
switch f.KV().Architecture() {
case "llama", "llama4":
@@ -784,12 +675,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
kv[i] *= context
}
}
partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
if useFlashAttention {
// rough estimate of graph size with flash attention on
partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
}
}
return
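To make the cache arithmetic above concrete, here is a hedged standalone sketch of the per-layer estimate: a full-attention layer costs roughly context * (headDimK + headDimV) * kvHeads * bytesPerElement, while a recurrent (SSM) layer stores its conv and state tensors in f32. All constants below are illustrative, not taken from a specific model:

package main

import "fmt"

// bytesPerElement mirrors the cache-type table above: q8_0 ~1 byte, q4_0 ~0.5, f32 4, f16 2.
func bytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q8_0":
		return 1
	case "q4_0":
		return 0.5
	case "f32":
		return 4
	default:
		return 2 // f16
	}
}

// attnLayerKV estimates the KV cache bytes for one full-attention layer.
func attnLayerKV(context, headDimK, headDimV, kvHeads uint64, cacheType string) uint64 {
	return uint64(float64(context*(headDimK+headDimV)*kvHeads) * bytesPerElement(cacheType))
}

// recurrentLayerKV estimates the state size for one recurrent layer; recurrent
// state stays in f32 regardless of the requested cache type.
func recurrentLayerKV(dConv, dInner, dState, nGroups uint64) uint64 {
	nEmbdR := uint64(0)
	if dConv > 0 {
		nEmbdR = (dConv - 1) * (dInner + 2*nGroups*dState)
	}
	nEmbdS := dState * dInner
	return (nEmbdR + nEmbdS) * uint64(bytesPerElement("f32"))
}

func main() {
	// illustrative numbers only: 8k context, 128-dim heads, 8 KV heads, f16 cache
	fmt.Println(attnLayerKV(8192, 128, 128, 8, "f16")) // 33554432 bytes (32 MiB) per layer
	fmt.Println(recurrentLayerKV(4, 8192, 128, 8))
}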
@@ -864,16 +750,7 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {
return true
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types", "model", arch)
return false
}
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
@@ -883,23 +760,12 @@ func (f GGML) SupportsFlashAttention() bool {
return false
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
return false
}
// Check head counts match and are non-zero
headCountK := f.KV().EmbeddingHeadCountK()
headCountV := f.KV().EmbeddingHeadCountV()
return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
return slices.Contains([]string{
"gptoss", "gpt-oss",
}, f.KV().String("general.architecture"))
}
// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
switch cacheType {
@@ -907,8 +773,6 @@ func kvCacheBytesPerElement(cacheType string) float64 {
return 1 // 1/2 of fp16
case "q4_0":
return 0.5 // 1/4 of fp16
case "f32":
return 4 // f32 (default for recurrent)
default:
return 2 // f16 (default)
}

View File

@@ -533,15 +533,12 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
}
}
slices.SortStableFunc(
ts,
func(a, b *Tensor) int {
return cmp.Or(
cmp.Compare(a.block(), b.block()),
cmp.Compare(a.Name, b.Name),
)
},
)
slices.SortStableFunc(ts, func(a, b *Tensor) int {
if i, j := a.block(), b.block(); i > 0 && j > 0 {
return cmp.Compare(i, j)
}
return cmp.Compare(a.Name, b.Name)
})
var s uint64
for i := range ts {

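As an aside, the ordering that comparator produces (block index first, then name) can be checked with a small sketch; it assumes the MaxInt-sentinel variant of block() so non-block tensors sort after every block:

package main

import (
	"cmp"
	"fmt"
	"math"
	"slices"
)

// block extracts the block index from names like "blk.3.ffn_up.weight";
// non-block tensors get a MaxInt sentinel so they sort last.
func block(name string) int {
	var n int
	if _, err := fmt.Sscanf(name, "blk.%d.", &n); err != nil {
		return math.MaxInt
	}
	return n
}

func main() {
	names := []string{"output.weight", "blk.1.ffn_up.weight", "token_embd.weight", "blk.0.attn_k.weight"}
	slices.SortStableFunc(names, func(a, b string) int {
		return cmp.Or(cmp.Compare(block(a), block(b)), cmp.Compare(a, b))
	})
	fmt.Println(names) // [blk.0.attn_k.weight blk.1.ffn_up.weight output.weight token_embd.weight]
}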
View File

@@ -11,24 +11,24 @@ import (
)
func TestWriteGGUF(t *testing.T) {
b := bytes.NewBuffer(make([]byte, 2*3))
r := rand.New(rand.NewPCG(0, 0))
for range 8 {
t.Run("shuffle", func(t *testing.T) {
t.Parallel()
ts := []*Tensor{
{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "blk.1.ffn_up.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "blk.2.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "blk.1.ffn_down.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "blk.0.attn_k.weight", Shape: []uint64{2, 3}, WriterTo: b},
{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: b},
{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: b},
{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
}
rand.Shuffle(len(ts), func(i, j int) {
r.Shuffle(len(ts), func(i, j int) {
ts[i], ts[j] = ts[j], ts[i]
})
@@ -63,14 +63,14 @@ func TestWriteGGUF(t *testing.T) {
}
if diff := cmp.Diff(Tensors{
Offset: 592,
Offset: 608,
items: []*Tensor{
{Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
{Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
{Name: "blk.0.ffn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
{Name: "blk.1.ffn_down.weight", Offset: 96, Shape: []uint64{2, 3}},
{Name: "blk.1.ffn_up.weight", Offset: 128, Shape: []uint64{2, 3}},
{Name: "blk.2.ffn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
{Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}},
{Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
{Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
{Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}},
{Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}},
{Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
{Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}},
{Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}},
{Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}},

View File

@@ -146,6 +146,8 @@ func (ftype FileType) ToTensorType() TensorType {
return TensorTypeQ4_0
case fileTypeQ4_1:
return TensorTypeQ4_1
case fileTypeMXFP4:
return TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
case FileTypeQ8_0:
return TensorTypeQ8_0
case fileTypeQ5_0:
@@ -174,8 +176,6 @@ func (ftype FileType) ToTensorType() TensorType {
return TensorTypeQ2_K
case FileTypeBF16:
return TensorTypeBF16
case fileTypeMXFP4:
return TensorTypeMXFP4
default:
slog.Warn("unsupported file type", "type", ftype)
return 0 // F32
@@ -191,8 +191,8 @@ const (
TensorTypeF16
TensorTypeQ4_0
TensorTypeQ4_1
tensorTypeQ4_2
tensorTypeQ4_3 // unused by GGML
TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
tensorTypeQ4_3 // unused by GGML
TensorTypeQ5_0
TensorTypeQ5_1
TensorTypeQ8_0
@@ -226,7 +226,6 @@ const (
tensorTypeIQ4_NL_4_4 // unused by GGML
tensorTypeIQ4_NL_4_8 // unused by GGML
tensorTypeIQ4_NL_8_8 // unused by GGML
TensorTypeMXFP4
)
// ParseFileType parses the provided GGUF file type
@@ -319,7 +318,7 @@ func (t TensorType) String() string {
return "F64"
case TensorTypeBF16:
return "BF16"
case 4, TensorTypeMXFP4:
case TensorTypeMXFP4:
return "MXFP4"
default:
return "unknown"

2
go.mod
View File

@@ -10,13 +10,11 @@ require (
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.12.0
)
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/google/go-cmp v0.7.0

4
go.sum
View File

@@ -35,8 +35,6 @@ github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARu
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -197,8 +195,6 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

View File

@@ -2,13 +2,10 @@
This directory contains integration tests to exercise Ollama end-to-end to verify behavior
By default, these tests are disabled so `go test ./...` will exercise only unit tests. To run integration tests you must pass the integration tag: `go test -tags=integration ./...`. Some tests require additional tags to enable scoped testing and keep the duration reasonable. For example, testing a broad set of models requires `-tags=integration,models` and a longer timeout (~60m or more depending on the speed of your GPU). To view the current set of tag combinations use `find integration -type f | xargs grep "go:build"`
By default, these tests are disabled so `go test ./...` will exercise only unit tests. To run integration tests you must pass the integration tag. `go test -tags=integration ./...`
The integration tests have 2 modes of operating.
1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote based on your `OLLAMA_HOST` environment variable
> [!IMPORTANT]
> Before running the tests locally without the "test existing" setting, compile ollama from the top of the source tree with `go build .`, plus GPU support with cmake if applicable on your platform. The integration tests expect to find an ollama binary at the top of the tree.
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote

View File

@@ -390,7 +390,7 @@ func TestAPIEmbeddings(t *testing.T) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbeddingRequest{
Model: libraryEmbedModels[0],
Model: "orca-mini",
Prompt: "why is the sky blue?",
Options: map[string]interface{}{
"temperature": 0,
@@ -410,99 +410,3 @@ func TestAPIEmbeddings(t *testing.T) {
t.Errorf("zero length embedding response")
}
}
func TestAPIToolCalling(t *testing.T) {
initialTimeout := 60 * time.Second
streamTimeout := 30 * time.Second
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
modelName := "qwen3:0.6b"
if err := PullIfMissing(ctx, client, modelName); err != nil {
t.Fatalf("pull failed %s", err)
}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get the current weather in a given location",
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"location"},
Properties: map[string]api.ToolProperty{
"location": {
Type: api.PropertyType{"string"},
Description: "The city and state, e.g. San Francisco, CA",
},
},
},
},
},
}
req := api.ChatRequest{
Model: modelName,
Messages: []api.Message{
{
Role: "user",
Content: "Call get_weather with location set to San Francisco.",
},
},
Tools: tools,
Options: map[string]any{
"temperature": 0,
},
}
stallTimer := time.NewTimer(initialTimeout)
var gotToolCall bool
var lastToolCall api.ToolCall
fn := func(response api.ChatResponse) error {
if len(response.Message.ToolCalls) > 0 {
gotToolCall = true
lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1]
}
if !stallTimer.Reset(streamTimeout) {
return fmt.Errorf("stall was detected while streaming response, aborting")
}
return nil
}
stream := true
req.Stream = &stream
done := make(chan int)
var genErr error
go func() {
genErr = client.Chat(ctx, &req, fn)
done <- 0
}()
select {
case <-stallTimer.C:
t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String())
case <-done:
if genErr != nil {
t.Fatalf("chat failed: %v", genErr)
}
if !gotToolCall {
t.Fatalf("expected at least one tool call, got none")
}
if lastToolCall.Function.Name != "get_weather" {
t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather")
}
if _, ok := lastToolCall.Function.Arguments["location"]; !ok {
t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String())
}
case <-ctx.Done():
t.Error("outer test context done while waiting for tool-calling chat")
}
}

View File

@@ -11,6 +11,7 @@ import (
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestBlueSky(t *testing.T) {
@@ -36,8 +37,8 @@ func TestUnicode(t *testing.T) {
// Set up the test data
req := api.GenerateRequest{
// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K", // TODO is there an ollama-engine model we can switch to and keep the coverage?
Prompt: "天空为什么是蓝色的?", // Why is the sky blue?
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K",
Prompt: "天空为什么是蓝色的?",
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -49,20 +50,8 @@ func TestUnicode(t *testing.T) {
}
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
slog.Info("loading", "model", req.Model)
err := client.Generate(ctx, &api.GenerateRequest{Model: req.Model}, func(response api.GenerateResponse) error { return nil })
if err != nil {
t.Fatalf("failed to load model %s: %s", req.Model, err)
}
skipIfNotGPULoaded(ctx, t, client, req.Model, 100)
DoGenerate(ctx, t, client, req, []string{
"散射", // scattering
"频率", // frequency
}, 120*time.Second, 120*time.Second)
require.NoError(t, PullIfMissing(ctx, client, req.Model))
DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
}
func TestExtendedUnicodeOutput(t *testing.T) {
@@ -80,9 +69,7 @@ func TestExtendedUnicodeOutput(t *testing.T) {
}
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
require.NoError(t, PullIfMissing(ctx, client, req.Model))
DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
}
@@ -97,9 +84,7 @@ func TestUnicodeModelDir(t *testing.T) {
}
modelDir, err := os.MkdirTemp("", "ollama_埃")
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
defer os.RemoveAll(modelDir)
slog.Info("unicode", "OLLAMA_MODELS", modelDir)

View File

@@ -7,175 +7,254 @@ import (
"fmt"
"log/slog"
"math"
"math/rand"
"os"
"strconv"
"sync"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
// Send multiple requests in parallel (concurrently) to a single model and ensure responses are expected
func TestConcurrentGenerate(t *testing.T) {
// Assumes all requests have the same model
req, resp := GenerateRequests()
numParallel := int(envconfig.NumParallel() + 1)
iterLimit := 3
func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
Model: smol,
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "qwen3:0.6b",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
},
}
resp = [2][]string{
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for i := 0; i < len(req); i++ {
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
}
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
// Note: CPU based inference can crawl so don't give up too quickly
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredict(t *testing.T) {
req, resp := GenerateRequests()
reqLimit := len(req)
iterLimit := 5
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err := strconv.ParseUint(s, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if maxVram < 4*format.GibiByte {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
}
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Get the server running (if applicable) warm the model up with a single initial request
slog.Info("loading", "model", req[0].Model)
err := client.Generate(ctx,
&api.GenerateRequest{Model: req[0].Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
func(response api.GenerateResponse) error { return nil },
)
if err != nil {
t.Fatalf("failed to load model %s: %s", req[0].Model, err)
}
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
r := rand.New(rand.NewSource(0))
wg.Add(numParallel)
for i := range numParallel {
wg.Add(reqLimit)
for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
for j := 0; j < iterLimit; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
k := r.Int() % len(req)
slog.Info("Starting", "thread", i, "iter", j)
slog.Info("Starting", "req", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
wg.Wait()
}
// Stress the scheduler and attempt to load more models than will fit to cause thrashing
// This test will always load at least 2 models even on CPU based systems
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
s := os.Getenv("OLLAMA_MAX_VRAM")
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if s == "" {
s = "0"
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
}
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
// All models compatible with ollama-engine
smallModels := []string{
"llama3.2:1b",
"qwen3:0.6b",
"gemma2:2b",
"deepseek-r1:1.5b", // qwen2 arch
"gemma3:270m",
}
mediumModels := []string{
"llama3.2:3b", // ~3.4G
"qwen3:8b", // ~6.6G
"gpt-oss:20b", // ~15G
"deepseek-r1:7b", // ~5.6G
"gemma3:4b", // ~5.8G
"gemma2:9b", // ~8.1G
if maxVram < 2*format.GibiByte {
t.Skip("VRAM less than 2G, skipping model stress tests")
}
var chosenModels []string
type model struct {
name string
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
}
smallModels := []model{
{
name: "llama3.2:1b",
size: 2876 * format.MebiByte,
},
{
name: "qwen3:0.6b",
size: 1600 * format.MebiByte,
},
{
name: "gemma:2b",
size: 2364 * format.MebiByte,
},
{
name: "deepseek-r1:1.5b",
size: 2048 * format.MebiByte,
},
{
name: "starcoder2:3b",
size: 2166 * format.MebiByte,
},
}
mediumModels := []model{
{
name: "qwen3:8b",
size: 6600 * format.MebiByte,
},
{
name: "llama2",
size: 5118 * format.MebiByte,
},
{
name: "deepseek-r1:7b",
size: 5600 * format.MebiByte,
},
{
name: "mistral",
size: 4620 * format.MebiByte,
},
{
name: "dolphin-mistral",
size: 4620 * format.MebiByte,
},
{
name: "gemma:7b",
size: 5000 * format.MebiByte,
},
{
name: "codellama:7b",
size: 5118 * format.MebiByte,
},
}
// These seem to be too slow to be useful...
// largeModels := []model{
// {
// name: "llama2:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "codellama:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "orca-mini:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "gemma:7b",
// size: 5000 * format.MebiByte,
// },
// {
// name: "starcoder2:15b",
// size: 9100 * format.MebiByte,
// },
// }
var chosenModels []model
switch {
case maxVram < 10000*format.MebiByte:
slog.Info("selecting small models")
chosenModels = smallModels
// case maxVram < 30000*format.MebiByte:
default:
slog.Info("selecting medium models")
chosenModels = mediumModels
// default:
// slog.Info("selecting large models")
// chosenModels = largeModels
}
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
req, resp := GenerateRequests()
for i := range req {
if i > len(chosenModels) {
break
}
req[i].Model = chosenModels[i].name
}
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Make sure all the models are pulled before we get started
for _, model := range chosenModels {
if err := PullIfMissing(ctx, client, model); err != nil {
t.Fatal(err)
}
for _, r := range req {
require.NoError(t, PullIfMissing(ctx, client, r.Model))
}
// Determine how many models we can load in parallel before we exceed VRAM
// The intent is to go 1 over what can fit so we force the scheduler to thrash
targetLoadCount := 0
slog.Info("Loading models to find how many can fit in VRAM before overflowing")
chooseModels:
for i, model := range chosenModels {
req := &api.GenerateRequest{Model: model}
slog.Info("loading", "model", model)
err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
if err != nil {
t.Fatalf("failed to load model %s: %s", model, err)
}
targetLoadCount++
if i > 0 {
models, err := client.ListRunning(ctx)
if err != nil {
t.Fatalf("failed to list running models: %s", err)
}
if len(models.Models) < targetLoadCount {
loaded := []string{}
for _, m := range models.Models {
loaded = append(loaded, m.Name)
}
slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
break
}
// Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts
for _, m := range models.Models {
if m.SizeVRAM == 0 {
slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
break chooseModels
}
}
}
}
if targetLoadCount == len(chosenModels) {
// TODO consider retrying the medium models
slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
}
r := rand.New(rand.NewSource(0))
var wg sync.WaitGroup
for i := range targetLoadCount {
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
if i > 1 && consumed > maxVram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {
defer wg.Done()
reqs, resps := GenerateRequests()
for j := 0; j < 3; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
k := r.Int() % len(reqs)
reqs[k].Model = chosenModels[i]
slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Prompt)
DoGenerate(ctx, t, client, reqs[k], resps[k],
120*time.Second, // Be extra patient for the model to load initially
10*time.Second, // Once results start streaming, fail if they stall
)
slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
}
}(i)
}

View File

@@ -4,8 +4,6 @@ package integration
import (
"context"
"log/slog"
"sync"
"testing"
"time"
@@ -22,7 +20,7 @@ func TestLongInputContext(t *testing.T) {
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: smol,
Model: "llama2",
Prompt: "Oh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
Stream: &stream,
Options: map[string]any{
@@ -36,7 +34,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {
@@ -49,8 +47,8 @@ func TestContextExhaustion(t *testing.T) {
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: smol,
Prompt: "Write me a story in english with a lot of emojis",
Model: "llama2",
Prompt: "Write me a story with a ton of emojis?",
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -63,104 +61,5 @@ func TestContextExhaustion(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water"}, 120*time.Second, 10*time.Second)
}
// Send multiple generate requests with prior context and ensure the response is coherent and expected
func TestGenerateWithHistory(t *testing.T) {
modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
req, resp := GenerateRequests()
numParallel := 2
iterLimit := 2
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Get the server running (if applicable) warm the model up with a single initial request
slog.Info("loading", "model", modelOverride)
err := client.Generate(ctx,
&api.GenerateRequest{Model: modelOverride, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
func(response api.GenerateResponse) error { return nil },
)
if err != nil {
t.Fatalf("failed to load model %s: %s", modelOverride, err)
}
var wg sync.WaitGroup
wg.Add(numParallel)
for i := range numParallel {
go func(i int) {
defer wg.Done()
k := i % len(req)
req[k].Model = modelOverride
for j := 0; j < iterLimit; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
slog.Info("Starting", "thread", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
c := DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
req[k].Context = c
req[k].Prompt = "tell me more!"
}
}(i)
}
wg.Wait()
}
// Send multiple chat requests with prior context and ensure the response is coherent and expected
func TestChatWithHistory(t *testing.T) {
modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
req, resp := ChatRequests()
numParallel := 2
iterLimit := 2
softTimeout, hardTimeout := getTimeouts(t)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Get the server running (if applicable) warm the model up with a single initial empty request
slog.Info("loading", "model", modelOverride)
err := client.Generate(ctx,
&api.GenerateRequest{Model: modelOverride, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
func(response api.GenerateResponse) error { return nil },
)
if err != nil {
t.Fatalf("failed to load model %s: %s", modelOverride, err)
}
var wg sync.WaitGroup
wg.Add(numParallel)
for i := range numParallel {
go func(i int) {
defer wg.Done()
k := i % len(req)
req[k].Model = modelOverride
for j := 0; j < iterLimit; j++ {
if time.Now().Sub(started) > softTimeout {
slog.Info("exceeded soft timeout, winding down test")
return
}
slog.Info("Starting", "thread", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
assistant := DoChat(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
if assistant == nil {
t.Fatalf("didn't get an assistant response for context")
}
req[k].Messages = append(req[k].Messages,
*assistant,
api.Message{Role: "user", Content: "tell me more!"},
)
}
}(i)
}
wg.Wait()
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
}

View File

@@ -38,9 +38,8 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Model: "all-minilm",
Prompt: "why is the sky blue?",
}
res, err := embeddingTestHelper(ctx, client, t, req)

View File

@@ -9,6 +9,7 @@ import (
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestVisionModels(t *testing.T) {
@@ -31,9 +32,7 @@ func TestVisionModels(t *testing.T) {
for _, v := range testCases {
t.Run(v.model, func(t *testing.T) {
image, err := base64.StdEncoding.DecodeString(imageEncoding)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
req := api.GenerateRequest{
Model: v.model,
Prompt: "what does the text in this image say?",
@@ -53,9 +52,7 @@ func TestVisionModels(t *testing.T) {
// Note: sometimes it returns "the ollamas" sometimes "the ollams"
resp := "the ollam"
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// llava models on CPU can be quite slow to start
DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
})
@@ -65,9 +62,7 @@ func TestVisionModels(t *testing.T) {
func TestIntegrationSplitBatch(t *testing.T) {
skipUnderMinVRAM(t, 6)
image, err := base64.StdEncoding.DecodeString(imageEncoding)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
req := api.GenerateRequest{
Model: "gemma3:4b",
// Fill up a chunk of the batch so the image will partially spill over into the next one
@@ -89,9 +84,7 @@ func TestIntegrationSplitBatch(t *testing.T) {
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// llava models on CPU can be quite slow to start,
DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
}

47
integration/llm_test.go Normal file
View File

@@ -0,0 +1,47 @@
//go:build integration
package integration
import (
"context"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
// package to avoid circular dependencies
var (
stream = false
req = [2]api.GenerateRequest{
{
Model: smol,
Prompt: "why is the ocean blue?",
Stream: &stream,
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
},
}
resp = [2][]string{
{"sunlight", "scattering", "interact"},
{"england", "english", "massachusetts", "pilgrims"},
}
)
func TestIntegrationSimple(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
defer cancel()
GenerateTestHelper(ctx, t, req[0], resp[0])
}

View File

@@ -13,12 +13,12 @@ import (
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
)
func TestMaxQueue(t *testing.T) {
t.Skip("this test needs to be re-evaluated to use a proper embedding model")
if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
t.Skip("Max Queue test requires spawning a local server so we can adjust the queue size")
return
@@ -45,9 +45,7 @@ func TestMaxQueue(t *testing.T) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// Context for the worker threads so we can shut them down
// embedCtx, embedCancel := context.WithCancel(ctx)
@@ -91,9 +89,7 @@ func TestMaxQueue(t *testing.T) {
switch {
case genErr == nil:
successCount++
if len(resp.Embedding) < 5 { // somewhat arbitrary, but sufficient to be reasonable
t.Fatalf("embeddings shorter than expected: %d", len(resp.Embedding))
}
require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
case errors.Is(genErr, context.Canceled):
canceledCount++
case strings.Contains(genErr.Error(), "busy"):
@@ -101,9 +97,7 @@ func TestMaxQueue(t *testing.T) {
case strings.Contains(genErr.Error(), "connection reset by peer"):
resetByPeerCount++
default:
if genErr != nil {
t.Fatalf("%d request failed", i)
}
require.NoError(t, genErr, "%d request failed", i)
}
slog.Info("embed finished", "id", i)
@@ -114,13 +108,8 @@ func TestMaxQueue(t *testing.T) {
embedwg.Wait()
slog.Info("embeds completed", "success", successCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
if resetByPeerCount != 0 {
t.Fatalf("Connections reset by peer, have you updated your fd and socket limits? %d", resetByPeerCount)
}
if busyCount == 0 {
t.Fatalf("no requests hit busy error but some should have")
}
if canceledCount > 0 {
t.Fatalf("no requests should have been canceled due to timeout %d", canceledCount)
}
require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
require.True(t, busyCount > 0, "no requests hit busy error but some should have")
require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
}

File diff suppressed because one or more lines are too long

View File

@@ -9,7 +9,6 @@ import (
"fmt"
"io"
"log/slog"
"math"
"math/rand"
"net"
"net/http"
@@ -26,11 +25,11 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/format"
"github.com/stretchr/testify/require"
)
var (
smol = "llama3.2:1b"
stream = false
smol = "llama3.2:1b"
)
var (
@@ -436,9 +435,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
}
lifecycle.ServerLogFile = fp.Name()
fp.Close()
if err := startServer(t, ctx, testEndpoint); err != nil {
t.Fatal(err)
}
require.NoError(t, startServer(t, ctx, testEndpoint))
}
return client, testEndpoint, func() {
@@ -471,25 +468,19 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRequest, anyResp []string) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, genReq.Model); err != nil {
t.Fatal(err)
}
require.NoError(t, PullIfMissing(ctx, client, genReq.Model))
DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
}
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) []int {
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) {
stallTimer := time.NewTimer(initialTimeout)
var buf bytes.Buffer
var context []int
fn := func(response api.GenerateResponse) error {
// fmt.Print(".")
buf.Write([]byte(response.Response))
if !stallTimer.Reset(streamTimeout) {
return errors.New("stall was detected while streaming response, aborting")
}
if len(response.Context) > 0 {
context = response.Context
}
return nil
}
@@ -502,22 +493,6 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
done <- 0
}()
var response string
verify := func() {
// Verify the response contains the expected data
response = buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response)
}
}
select {
case <-stallTimer.C:
if buf.Len() == 0 {
@@ -528,21 +503,23 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
case <-done:
if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
return context
return
}
if genErr != nil {
t.Fatalf("%s failed with %s request prompt %s", genErr, genReq.Model, genReq.Prompt)
require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
// Verify the response contains the expected data
response := buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
verify()
require.True(t, atLeastOne, "%s: none of %v found in %s", genReq.Model, anyResp, response)
slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
case <-ctx.Done():
// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
// if they are still generating valid responses
slog.Warn("outer test context done while waiting for generate")
verify()
t.Error("outer test context done while waiting for generate")
}
return context
}
// Generate a set of requests
@@ -551,132 +528,65 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
return []api.GenerateRequest{
{
Model: smol,
Prompt: "why is the ocean blue? Be brief but factual in your reply",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "why is the color of dirt brown? Be brief but factual in your reply",
Prompt: "why is the color of dirt brown?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "how do rainbows form? Be brief but factual in your reply",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "what is the origin of independence day? Be brief but factual in your reply",
Prompt: "what is the origin of independence day?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: smol,
Prompt: "what is the composition of air? Be brief but factual in your reply",
Prompt: "what is the composition of air?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
},
},
[][]string{
{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
{"water", "droplet", "refracted", "reflect", "color", "spectrum"},
{"sunlight", "scattering", "interact"},
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"},
{"england", "english", "massachusetts", "pilgrims", "british"},
{"fourth", "july", "declaration", "independence"},
{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor"},
{"nitrogen", "oxygen", "carbon", "dioxide"},
}
}
func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) *api.Message {
stallTimer := time.NewTimer(initialTimeout)
var buf bytes.Buffer
role := "assistant"
fn := func(response api.ChatResponse) error {
// fmt.Print(".")
role = response.Message.Role
buf.Write([]byte(response.Message.Content))
if !stallTimer.Reset(streamTimeout) {
return errors.New("stall was detected while streaming response, aborting")
}
return nil
}
stream := true
req.Stream = &stream
done := make(chan int)
var genErr error
go func() {
genErr = client.Chat(ctx, &req, fn)
done <- 0
}()
var response string
verify := func() {
// Verify the response contains the expected data
response = buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
}
}
select {
case <-stallTimer.C:
if buf.Len() == 0 {
t.Errorf("generate never started. Timed out after :%s", initialTimeout.String())
} else {
t.Errorf("generate stalled. Response so far:%s", buf.String())
}
case <-done:
if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
slog.Warn("model is too large for the target test system", "model", req.Model, "error", genErr)
return nil
}
if genErr != nil {
t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages)
}
verify()
slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response)
case <-ctx.Done():
// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
// if they are still generating valid responses
slog.Warn("outer test context done while waiting for chat")
verify()
}
return &api.Message{Role: role, Content: buf.String()}
}
func ChatRequests() ([]api.ChatRequest, [][]string) {
genReqs, results := GenerateRequests()
reqs := make([]api.ChatRequest, len(genReqs))
// think := api.ThinkValue{Value: "low"}
for i := range reqs {
reqs[i].Model = genReqs[i].Model
reqs[i].Stream = genReqs[i].Stream
reqs[i].KeepAlive = genReqs[i].KeepAlive
// reqs[i].Think = &think
reqs[i].Messages = []api.Message{
{
Role: "user",
Content: genReqs[i].Prompt,
},
}
}
return reqs, results
}
func skipUnderMinVRAM(t *testing.T, gb uint64) {
// TODO use info API in the future
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if maxVram < gb*format.GibiByte {
t.Skip("skipping with small VRAM to avoid timeouts")
@@ -684,39 +594,6 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
}
}
// Skip if the target model isn't X% GPU loaded to avoid excessive runtime
func skipIfNotGPULoaded(ctx context.Context, t *testing.T, client *api.Client, model string, minPercent int) {
models, err := client.ListRunning(ctx)
if err != nil {
t.Fatalf("failed to list running models: %s", err)
}
loaded := []string{}
for _, m := range models.Models {
loaded = append(loaded, m.Name)
if m.Name != model {
continue
}
gpuPercent := 0
switch {
case m.SizeVRAM == 0:
gpuPercent = 0
case m.SizeVRAM == m.Size:
gpuPercent = 100
case m.SizeVRAM > m.Size || m.Size == 0:
t.Logf("unexpected size detected: %d", m.SizeVRAM)
default:
sizeCPU := m.Size - m.SizeVRAM
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
gpuPercent = int(100 - cpuPercent)
}
if gpuPercent < minPercent {
t.Skip(fmt.Sprintf("test requires minimum %d%% GPU load, but model %s only has %d%%", minPercent, model, gpuPercent))
}
return
}
t.Skip(fmt.Sprintf("model %s not loaded - actually loaded: %v", model, loaded))
}
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
deadline, hasDeadline := t.Deadline()
if !hasDeadline {

View File

@@ -378,7 +378,9 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
if c.config.MaskDType != ml.DTypeF32 {
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
ctx.Forward(maskTensor.Copy(ctx, out))
maskTensor = out
}
return maskTensor

View File

@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
const bool output_all = false;
// when computing embeddings, all tokens are output
const bool output_all = cparams.embeddings;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -62,22 +62,6 @@ func BackendInit() {
C.llama_backend_init()
}
func EnumerateGPUs() []string {
var ids []string
for i := range C.ggml_backend_dev_count() {
device := C.ggml_backend_dev_get(i)
if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(device, &props)
ids = append(ids, C.GoString(props.id))
}
}
return ids
}
func GetModelArch(modelPath string) (string, error) {
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
@@ -515,34 +499,33 @@ func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
}
nChunks := C.mtmd_input_chunks_size(ic)
numEmbed := llamaContext.Model().NEmbd()
embed := make([][]float32, 0)
lastChunkSize := 0
for i := range int(nChunks) {
chunk := C.mtmd_input_chunks_get(ic, C.size_t(i))
numTokens := int(C.mtmd_input_chunk_get_n_tokens(chunk))
slog.Debug("chunk tokens", "index", i, "numTokens", numTokens)
lastChunkSize = numTokens
// Encode the chunk
if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) {
return nil, errors.New("unable to encode mtmd image chunk")
}
// Get the embeddings for this chunk
chunkEmbed := make([][]float32, numTokens)
chunkEmbd := C.mtmd_get_output_embd(c.c)
if nil == chunkEmbd {
continue
}
// Extend the embedding array for each token
s := unsafe.Slice((*float32)(chunkEmbd), numTokens*numEmbed)
rows := make([]float32, len(s))
copy(rows, s)
for i := range numTokens {
chunkEmbed[i] = rows[i*numEmbed : (i+1)*numEmbed]
}
embed = append(embed, chunkEmbed...)
}
slog.Debug("image embeddings", "totalEmbeddings", len(embed))
// Get the embeddings
embed := make([][]float32, lastChunkSize)
embd := C.mtmd_get_output_embd(c.c)
if nil == embd {
return nil, errors.New("failed to get image embedding")
}
// Extend the embedding array for each token
s := unsafe.Slice((*float32)(embd), numEmbed*lastChunkSize)
rows := make([]float32, len(s))
copy(rows, s)
for i := range lastChunkSize {
embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
}
return embed, nil
}

View File

@@ -0,0 +1,32 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 22 Jun 2025 09:22:05 -0700
Subject: [PATCH] temporary prevent rocm+cuda mixed loading
---
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 3040b2aa..f1e9c180 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
+
+ // Avoid mixed hip+cuda configurations
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+ if (!hip_devices && !rocr_devices) {
+ ggml_backend_load_best("cuda", silent, dir_path);
+ } else {
+ ggml_backend_load_best("hip", silent, dir_path);
+ }
+
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
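The gate in this patch is just an environment-variable check; sketched in Go for illustration (the patch itself is C++, and the function name here is hypothetical):

package main

import (
	"fmt"
	"os"
)

// preferHIP mirrors the patch above: only load the HIP backend when the user
// has explicitly scoped AMD devices via HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES.
func preferHIP() bool {
	_, hip := os.LookupEnv("HIP_VISIBLE_DEVICES")
	_, rocr := os.LookupEnv("ROCR_VISIBLE_DEVICES")
	return hip || rocr
}

func main() {
	if preferHIP() {
		fmt.Println("load hip backend")
	} else {
		fmt.Println("load cuda backend")
	}
}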

View File

@@ -13,7 +13,7 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 57eae461..c7f9dc3a 100644
index 57eae461..9db0c8b5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

View File

@@ -1,23 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Mon, 18 Aug 2025 16:58:39 -0700
Subject: [PATCH] decode: disable output_all
---
src/llama-context.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -1,130 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 27 Aug 2025 14:39:48 -0700
Subject: [PATCH] ggml: Enable resetting backend devices
Touching a CUDA device causes the allocation of a primary context
with CUDA data structures (~300 MB of VRAM). If a device is
unused then it can be reset to free these data structures.
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 4 ++++
ggml/src/ggml-backend.cpp | 8 ++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 17 +++++++++++++++--
ggml/src/ggml-cuda/vendors/hip.h | 1 +
5 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+ GGML_API void ggml_backend_dev_reset(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+
+ // (optional) reset device, clearing existing allocations and context
+ // the caller must ensure that there are no outstanding buffers, as these will become invalid
+ void (*reset)(ggml_backend_dev_t dev);
};
struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params);
}
+void ggml_backend_dev_reset(ggml_backend_dev_t device) {
+ if (device->iface.reset == NULL) {
+ return;
+ }
+
+ device->iface.reset(device);
+}
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
return device->iface.get_buffer_type(device);
}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
return id;
}
+void ggml_cuda_reset_device(int device) {
+ ggml_cuda_set_device(device);
+ CUDA_CHECK(cudaDeviceReset());
+}
+
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+ // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+ props->memory_total = props->memory_free = 0;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
+static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ ggml_cuda_reset_device(ctx->device);
+}
+
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
+ /* .reset = */ ggml_backend_cuda_device_reset,
};
// backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
- ggml_cuda_set_device(i);
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled

View File

@@ -1,28 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Fri, 29 Aug 2025 16:53:08 -0700
Subject: [PATCH] harden uncaught exception registration
---
ggml/src/ggml.cpp | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp
index 0d388d45..f5bcb446 100644
--- a/ggml/src/ggml.cpp
+++ b/ggml/src/ggml.cpp
@@ -19,8 +19,12 @@ static bool ggml_uncaught_exception_init = []{
return false;
}
const auto prev{std::get_terminate()};
- GGML_ASSERT(prev != ggml_uncaught_exception);
- previous_terminate_handler = prev;
+ // GGML_ASSERT(prev != ggml_uncaught_exception);
+ if (prev != ggml_uncaught_exception) {
+ previous_terminate_handler = prev;
+ } else {
+ GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
+ }
std::set_terminate(ggml_uncaught_exception);
return true;
}();

View File

@@ -4,7 +4,7 @@ import (
"fmt"
"log/slog"
"os"
"sort"
"strconv"
"strings"
"github.com/ollama/ollama/api"
@@ -14,79 +14,13 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
return gpuSubset
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
return sgl
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
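Taken together, these helpers implement a two-stage placement policy: first look for a same-library set of GPUs that can hold the entire model, and only then fall back to the library that loads the most layers. A hedged sketch of how a caller might chain them (the surrounding scheduler plumbing is assumed, not shown in this diff):

    // sketch only: prefer a full fit, otherwise take the best partial fit
    gpuList := pickBestFullFitByLibrary(f, modelPath, projectors, adapters, opts, allGpus, numParallel)
    if gpuList == nil {
        gpuList = pickBestPartialFitByLibrary(f, projectors, adapters, opts, allGpus, numParallel)
    }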
// This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
@@ -97,10 +31,6 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM
}
}
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
}
return false, estimatedVRAM
}
@@ -119,7 +49,7 @@ type MemoryEstimate struct {
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit []int
TensorSplit string
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
@@ -141,7 +71,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
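A minimal usage sketch of the now-exported estimator, mirroring how PredictServerFit consumes it above; gpus, f, projectors, opts and numParallel are assumed to come from the caller:

    estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
    if estimate.Layers > 0 && estimate.Layers >= int(f.KV().BlockCount()+1) {
        // every repeating layer plus the output layer fits in VRAM
        slog.Info("full offload possible", "layers", estimate.Layers, "size", format.HumanBytes2(estimate.VRAMSize))
    }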
@@ -182,9 +112,13 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
}
layers := f.Tensors().GroupLayers()
@@ -195,19 +129,17 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Warn("model missing blk.0 layer size")
}
useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
discover.GetGPUInfo().FlashAttentionSupported() &&
f.SupportsFlashAttention()
var kvct string
if useFlashAttention {
if envconfig.FlashAttention() &&
discover.GetGPUInfo().FlashAttentionSupported() &&
f.SupportsFlashAttention() {
requested := strings.ToLower(envconfig.KvCacheType())
if f.SupportsKVCacheType(requested) {
if requested != "" && f.SupportsKVCacheType(requested) {
kvct = requested
}
}
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention)
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct)
if len(kv) > 0 {
layerSize += kv[0]
@@ -252,7 +184,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
tensorSplit := make([]int, len(gpus))
layerCounts := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
@@ -316,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+layerSize {
gpuAllocations[g.i] += layerSize
tensorSplit[g.i]++
layerCounts[g.i]++
layerCount++
break
} else {
@@ -341,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
tensorSplit[g.i]++
layerCounts[g.i]++
layerCount++
break
}
@@ -356,7 +288,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Add the applicable (full or partial) graph allocations
for i := range gpus {
if tensorSplit[i] <= 0 {
if layerCounts[i] <= 0 {
continue
}
if fullyLoaded {
@@ -378,6 +310,14 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
memoryRequiredTotal = memoryRequiredPartial + overflow
tensorSplit := ""
if len(gpus) > 1 {
splits := make([]string, len(gpus))
for i, count := range layerCounts {
splits[i] = strconv.Itoa(count)
}
tensorSplit = strings.Join(splits, ",")
}
allocationsList := []string{}
for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a))
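With TensorSplit reverted to a string, the per-GPU layer counts are joined with commas before being handed on; a single GPU leaves the field empty. For example, with illustrative counts:

    layerCounts := []int{20, 13}
    splits := make([]string, len(layerCounts))
    for i, count := range layerCounts {
        splits[i] = strconv.Itoa(count)
    }
    tensorSplit := strings.Join(splits, ",") // "20,13"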

View File

@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
@@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) {
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
for i, s := range []struct {
layer0, layer1 uint64
expect0, expect1 int
expect0, expect1 uint64
}{
{1, 1, 1, 1},
{2, 1, 2, 1},
@@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) {
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64
for _, b := range estimate.GPUSizes {
layerSums += b

File diff suppressed because it is too large


View File

@@ -8,178 +8,9 @@ import (
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore"
)
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
library string
free int
}
tests := []struct {
name string
gpus []gpu
layers []int
numGPU int
requireFull bool
expected ml.GPULayersList
expectedErr error
}{
{
name: "No GPU",
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{},
},
{
name: "Full single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
expectedErr: ErrLoadRequiredFull,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var systemInfo discover.SystemInfo
systemInfo.System.TotalMemory = format.GibiByte
systemInfo.System.FreeMemory = 512 * format.MebiByte
systemInfo.System.FreeSwap = 256 * format.MebiByte
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].ID = fmt.Sprintf("gpu%d", i)
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
s := &ollamaServer{
llmServer: llmServer{
totalLayers: uint64(len(tt.layers)),
options: api.Options{
Runner: api.Runner{
NumGPU: tt.numGPU,
},
},
},
}
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]ml.Memory, s.totalLayers),
Cache: make([]ml.Memory, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range tt.layers {
s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
}
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
if err != tt.expectedErr {
t.Fatalf("fitGPU returned error: %v", err)
}
if gpuLayers.Hash() != tt.expected.Hash() {
t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
}
})
}
}
func TestLLMServerCompletionFormat(t *testing.T) {
// This test was written to fix an already deployed issue. It is a bit
// of a mess, but it's good enough until we can refactor the

View File

@@ -1,7 +1,6 @@
package logutil
import (
"context"
"io"
"log/slog"
"path/filepath"
@@ -28,11 +27,3 @@ func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
},
}))
}
func Trace(msg string, args ...any) {
slog.Log(context.TODO(), LevelTrace, msg, args...)
}
func TraceContext(ctx context.Context, msg string, args ...any) {
slog.Log(ctx, LevelTrace, msg, args...)
}

View File

@@ -5,14 +5,12 @@ import (
"context"
"encoding/binary"
"fmt"
"hash/maphash"
"log/slog"
"math"
"slices"
"strconv"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs"
)
@@ -60,89 +58,19 @@ type CacheConfig struct {
MaskBatchPadding int
}
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
// ID is the identifier of the GPU, as reported in DeviceMemory
ID string
// Layers is a set of layer indices to load
Layers []int
}
func (g GPULayers) String() string {
if len(g.Layers) == 0 {
return ""
}
slices.Sort(g.Layers)
contiguous := true
base := g.Layers[0]
for i := range g.Layers {
if g.Layers[i] != base+i {
contiguous = false
break
}
}
if contiguous {
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
} else {
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
}
}
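For reference, the removed String method renders, for example (values are illustrative):

    // contiguous:     GPULayers{ID: "gpu0", Layers: []int{0, 1, 2, 3}}.String() == "ID:gpu0 Layers:4(0..3)"
    // non-contiguous: GPULayers{ID: "gpu0", Layers: []int{0, 2, 5}}.String()   == "ID:gpu0 Layers:3[0 2 5]"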
// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers
func (l GPULayersList) String() string {
if l.Sum() > 0 {
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
} else {
return fmt.Sprintf("%v", []GPULayers(l))
}
}
// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
var sum int
for _, g := range l {
sum += len(g.Layers)
}
return sum
}
var h maphash.Hash
// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
h.Reset()
for _, g := range l {
if len(g.Layers) > 0 {
h.WriteString(g.ID)
for _, l := range g.Layers {
binary.Write(&h, binary.NativeEndian, int64(l))
}
}
}
return h.Sum64()
}
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
// false, this is only being used for discovering the required amount of
// memory and cannot load the model for running.
AllocMemory bool
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
// GPULayers is the set of layers to offload to GPUs
GPULayers GPULayersList
// MainGPU is the index of the primary GPU to use
MainGPU int
// NumGPULayers is the number of layers to offload to GPUs
NumGPULayers int
// TensorSplit is the fraction of the model to offload to each GPU
TensorSplit []float32
// FlashAttention indicates that we should use a fused flash attention kernel
FlashAttention bool
@@ -213,28 +141,6 @@ type DeviceMemory struct {
Graph Memory
}
// Allocated returns the total size of the memory that has been successfully
// allocated on this device
func (m DeviceMemory) Allocated() uint64 {
var mem uint64
for _, w := range m.Weights {
if w.Status == Allocated {
mem += w.Size
}
}
for _, c := range m.Cache {
if c.Status == Allocated {
mem += c.Size
}
}
if m.Graph.Status == Allocated {
mem += m.Graph.Size
}
return mem
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
@@ -266,7 +172,7 @@ func (m DeviceMemory) LogValue() slog.Value {
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
// InputsWeights are always located on the CPU and cannot be moved
InputWeights Memory
// CPU model components are located in system memory. This does not
@@ -291,58 +197,6 @@ func (m BackendMemory) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}
func sumMemory(mem []Memory) uint64 {
var sum uint64
for _, m := range mem {
sum += m.Size
}
return sum
}
// Log prints a high level summary of the memory (allocated or not)
func (m BackendMemory) Log(level slog.Level) {
var total uint64
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := sumMemory(m.CPU.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := gpu.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.CPU.Graph.Size; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
if total > 0 {
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
}
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
@@ -372,7 +226,6 @@ type Context interface {
Forward(...Tensor) Context
Compute(...Tensor)
ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun
// Reserve is analogous to Compute but rather than executing a
// graph, simply preallocates memory. Typically called with a
@@ -397,13 +250,10 @@ type Tensor interface {
Shape() []int
DType() DType
Cast(ctx Context, dtype DType) Tensor
Bytes() []byte
Floats() []float32
SetValueFromIntSlice(s []int32)
Neg(ctx Context) Tensor
Add(ctx Context, t2 Tensor) Tensor
Sub(ctx Context, t2 Tensor) Tensor
@@ -416,7 +266,6 @@ type Tensor interface {
AddID(ctx Context, t2, ids Tensor) Tensor
Softmax(ctx Context) Tensor
L2Norm(ctx Context, eps float32) Tensor
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
Scale(ctx Context, s float64) Tensor

View File

@@ -10,7 +10,6 @@ import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
@@ -63,40 +62,27 @@ var initDevices = sync.OnceFunc(func() {
}
})
type layerDevice struct {
d C.ggml_backend_dev_t
bt C.ggml_backend_buffer_type_t
}
type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
// allocMemory means that memory should be allocated for tensors and not
// just a dry run
allocMemory bool
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
schedMu sync.Mutex // Only one Compute can run at a time
sched C.ggml_backend_sched_t
schedBackends []C.ggml_backend_t
schedBufts []C.ggml_backend_buffer_type_t
tensors map[string]*C.struct_ggml_tensor
// input is the backend buffer type used for inputs
// input is the backend used for inputs
input C.ggml_backend_buffer_type_t
// output is the backend device used for outputs
output C.ggml_backend_dev_t
// layers is the backend used for repeating layers
layers map[int]layerDevice
layers map[int]C.ggml_backend_buffer_type_t
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
@@ -113,8 +99,6 @@ type Backend struct {
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
}
var once sync.Once
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
@@ -127,17 +111,15 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
return nil, err
}
once.Do(func() {
slog.Info(
"",
"architecture", meta.KV().Architecture(),
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_key_values", len(meta.KV()),
)
})
slog.Info(
"",
"architecture", meta.KV().Architecture(),
"file_type", meta.KV().FileType(),
"name", meta.KV().String("general.name"),
"description", meta.KV().String("general.description"),
"num_tensors", len(meta.Tensors().Items()),
"num_key_values", len(meta.KV()),
)
initDevices()
@@ -157,10 +139,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
switch C.ggml_backend_dev_type(d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
bt := C.ggml_backend_dev_buffer_type(d)
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
}
}
@@ -181,8 +160,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
d: d,
bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
})
C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
@@ -192,25 +169,56 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
}
useDefaultSplit := true
for _, s := range params.TensorSplit {
if s != 0 {
useDefaultSplit = false
break
}
}
// calculate splits
splits := make([]float32, len(gpus))
if useDefaultSplit {
// default: split on free memory
for i := range splits {
var free, total C.size_t
C.ggml_backend_dev_memory(gpus[i], &free, &total)
splits[i] = float32(free)
}
} else {
splits = params.TensorSplit
}
var sum float32
// cumulative sum of all splits
for i := range splits {
sum += splits[i]
splits[i] = sum
}
// normalize splits
for i := range splits {
splits[i] /= sum
}
// inputs always use cpu
input := cpuDeviceBufferType
assignLayer := func(layer int) deviceBufferType {
for _, p := range params.GPULayers {
for _, l := range p.Layers {
if l == layer {
for i := range requiredMemory.GPUs {
if requiredMemory.GPUs[i].ID == p.ID {
return gpuDeviceBufferTypes[i]
}
}
return cpuDeviceBufferType
}
}
// define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
assignLayer := func(i int) deviceBufferType {
if i < gpuRangeStart || i >= gpuRangeStop {
return cpuDeviceBufferType
}
return cpuDeviceBufferType
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
if index < 0 || index >= len(gpuDeviceBufferTypes) {
return cpuDeviceBufferType
}
return gpuDeviceBufferTypes[index]
}
// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
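The added split logic turns per-GPU weights (the user-supplied tensor split, or each device's free memory by default) into cumulative fractions, and a layer's fractional position within the GPU range is then matched against them. A small standalone sketch of that arithmetic with made-up device sizes:

    package main

    import "fmt"

    func main() {
        // e.g. two GPUs with 8 GiB and 24 GiB free -> splits {8, 24}
        splits := []float32{8, 24}

        // cumulative sum: {8, 32}
        var sum float32
        for i := range splits {
            sum += splits[i]
            splits[i] = sum
        }

        // normalize: {0.25, 1.0}
        for i := range splits {
            splits[i] /= sum
        }

        // a layer whose fractional position is below 0.25 lands on GPU 0,
        // anything up to 1.0 lands on GPU 1
        for _, frac := range []float32{0.1, 0.5, 0.9} {
            for gpu, s := range splits {
                if frac < s {
                    fmt.Printf("fraction %.2f -> gpu %d\n", frac, gpu)
                    break
                }
            }
        }
    }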
@@ -271,14 +279,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
tt := C.ggml_new_tensor(ctxs[bt], kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
C.ggml_set_name(tt, cname)
logutil.Trace("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
if params.AllocMemory {
requiredMemory.InputWeights.Status = ml.Allocated
}
requiredMemory.InputWeights.Status = ml.Allocated
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
@@ -349,14 +355,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
if params.AllocMemory {
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
@@ -377,9 +381,28 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
bbs[c] = b
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for _, layer := range layers {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(output.d) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
for bs := range maps.Values(bbs) {
logutil.Trace("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
}
// map tensor names to tensors for easy lookup later
@@ -400,13 +423,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
b := backends[d]
bt := C.ggml_backend_get_default_buffer_type(b)
// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
if !slices.Contains(cpuDeviceBufferType.bts, bt) {
if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
continue
}
}
deviceBufferTypes[d] = bt
schedBackends = append(schedBackends, b)
@@ -421,7 +437,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
allocMemory: params.AllocMemory,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
@@ -437,14 +452,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
schedBackends: schedBackends,
schedBufts: schedBufts,
input: deviceBufferTypes[input.d],
output: output.d,
layers: func() map[int]layerDevice {
m := make(map[int]layerDevice)
layers: func() map[int]C.ggml_backend_buffer_type_t {
m := make(map[int]C.ggml_backend_buffer_type_t)
for i, layer := range layers {
m[i] = layerDevice{
d: layer.d,
bt: deviceBufferTypes[layer.d],
}
m[i] = deviceBufferTypes[layer.d]
}
return m
}(),
@@ -473,30 +484,6 @@ func (b *Backend) Close() {
}
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
if !b.allocMemory {
return errors.New("cannot load model without memory allocation")
}
// Mimic llama runner logs summarizing layers and memory
gpuLayers := 0
for layer := range maps.Values(b.layers) {
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
switch C.ggml_backend_dev_type(b.output) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
slog.Info("offloading output layer to CPU")
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
slog.Info("offloading output layer to GPU")
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
@@ -536,7 +523,6 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
const BS = 17 // MXFP4 block size
bts := make([]byte, 8*BS*format.KibiByte) // ~128k block aligned
var s uint64
var tmp [16]byte
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
@@ -548,13 +534,37 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
return err
}
for j := range n / BS {
for i := 1; i < 9; i++ {
// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
a, b := bts[j*BS+i], bts[j*BS+i+8]
tmp[2*(i-1)] = (a & 0x0F) | (b << 4)
tmp[2*(i-1)+1] = (a >> 4) | (b & 0xF0)
for i := 1; i < BS; i++ {
// swap nibbles
t_lo := bts[j*BS+i] & 0x0F
t_hi := bts[j*BS+i] & 0xF0
bts[j*BS+i] = (t_lo << 4) | (t_hi >> 4)
}
// transform aaaa...bbbb... to abababab...
oi := 0
tmp := [16]byte{}
for i := 1; i < 9; i++ {
blk_a0 := bts[j*BS+i] & 0xF0
blk_a1 := bts[j*BS+i] << 4
blk_b0 := bts[j*BS+i+8] >> 4
blk_b1 := bts[j*BS+i+8] & 0x0F
// swap once more
out0 := blk_a0 | blk_b0
out1 := blk_a1 | blk_b1
out_h0 := out0 & 0xF0
out_l0 := out0 & 0x0F
out_h1 := out1 & 0xF0
out_l1 := out1 & 0x0F
out0 = (out_h0 >> 4) | (out_l0 << 4)
out1 = (out_h1 >> 4) | (out_l1 << 4)
tmp[oi] = out0
oi++
tmp[oi] = out1
oi++
}
for i := range tmp {
bts[j*BS+i+1] = tmp[i]
}
copy(bts[j*BS+1:j*BS+17], tmp[:])
}
for _, tt := range tts {
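The repacking loop above is mostly nibble arithmetic; a tiny standalone example of the low/high nibble swap it builds on (the sample byte is arbitrary):

    package main

    import "fmt"

    func main() {
        b := byte(0xA3)
        lo := b & 0x0F // 0x03
        hi := b & 0xF0 // 0xA0
        swapped := (lo << 4) | (hi >> 4)
        fmt.Printf("%#02x -> %#02x\n", b, swapped) // 0xa3 -> 0x3a
    }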
@@ -630,18 +640,6 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
})
}
// Cleanup any backend state from devices that we didn't end up using
nextDevice:
for _, d := range append(gpus, append(accels, cpus...)...) {
for _, backend := range b.schedBackends {
if d == C.ggml_backend_get_device(backend) {
continue nextDevice
}
}
C.ggml_backend_dev_reset(d)
}
if err := g.Wait(); err != nil {
return err
}
@@ -732,11 +730,11 @@ func (c *Context) Input() ml.Context {
}
func (c *Context) Layer(i int) ml.Context {
if layer, ok := c.b.layers[i]; ok {
if buft, ok := c.b.layers[i]; ok {
return &Context{
b: c.b,
ctx: c.ctx,
buft: layer.bt,
buft: buft,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: i,
@@ -759,15 +757,6 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
}
func (c *Context) Compute(tensors ...ml.Tensor) {
c.ComputeWithNotify(nil, tensors...)
}
func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
c.b.schedMu.Lock()
defer c.b.schedMu.Unlock()
if cb != nil {
go cb()
}
if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
panic(fmt.Errorf("error computing ggml graph: %v", status))
}
@@ -803,16 +792,14 @@ func (c *Context) Reserve() {
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size)
if c.b.allocMemory {
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(bufferStatus.size)))
}
if !reserved {
@@ -842,7 +829,23 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
panic("set Input or Layer before creating tensors")
}
cdtype := ggmlDType(dtype)
var cdtype uint32
switch dtype {
case ml.DTypeF32:
cdtype = C.GGML_TYPE_F32
case ml.DTypeF16:
cdtype = C.GGML_TYPE_F16
case ml.DTypeQ80:
cdtype = C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
cdtype = C.GGML_TYPE_Q4_0
case ml.DTypeI32:
cdtype = C.GGML_TYPE_I32
case ml.DTypeMXFP4:
cdtype = C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
@@ -865,12 +868,10 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
cache.Size += uint64(size)
if c.b.allocMemory {
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
}
@@ -889,9 +890,7 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t := c.newTensor(dtype, shape)
if c.b.allocMemory {
C.ggml_set_zero(t.(*Tensor).t)
}
C.ggml_set_zero(t.(*Tensor).t)
return t
}
@@ -916,7 +915,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeF32, shape)
if c.b.allocMemory && len(s) > 0 {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
@@ -928,7 +927,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
t := c.newTensor(ml.DTypeI32, shape)
if c.b.allocMemory && len(s) > 0 {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
@@ -1020,12 +1019,6 @@ func (t *Tensor) Floats() (data []float32) {
return
}
func (t *Tensor) SetValueFromIntSlice(s []int32) {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.t))
}
}
func (t *Tensor) DType() ml.DType {
switch t.t._type {
case C.GGML_TYPE_F32:
@@ -1045,32 +1038,6 @@ func (t *Tensor) DType() ml.DType {
}
}
func ggmlDType(dtype ml.DType) uint32 {
switch dtype {
case ml.DTypeF32:
return C.GGML_TYPE_F32
case ml.DTypeF16:
return C.GGML_TYPE_F16
case ml.DTypeQ80:
return C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
return C.GGML_TYPE_Q4_0
case ml.DTypeI32:
return C.GGML_TYPE_I32
case ml.DTypeMXFP4:
return C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
}
func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
}
}
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
@@ -1205,13 +1172,6 @@ func (t *Tensor) AddID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) L2Norm(ctx ml.Context, eps float32) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_l2_norm(ctx.(*Context).ctx, t.t, C.float(eps)),
}
}
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
if w != nil {
@@ -1590,7 +1550,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
// Unchecked to handle quantized types
t := c.newTensor(dtype, shape)
if c.b.allocMemory && len(s) > 0 {
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}

View File

@@ -167,7 +167,6 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API void ggml_backend_dev_reset(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);

View File

@@ -178,10 +178,6 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
// (optional) reset device, clearing existing allocations and context
// the caller must ensure that there are no outstanding buffers, as these will become invalid
void (*reset)(ggml_backend_dev_t dev);
};
struct ggml_backend_device {

View File

@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
// Avoid mixed hip+cuda configurations
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
if (!hip_devices && !rocr_devices) {
ggml_backend_load_best("cuda", silent, dir_path);
} else {
ggml_backend_load_best("hip", silent, dir_path);
}
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
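The added selection rule is driven purely by environment variables: load the HIP backend only when the user has scoped visible ROCm devices, otherwise prefer CUDA so the two stacks are never mixed. A rough Go-side restatement of the same check, for illustration only (this helper is not part of the patch):

    package main

    import (
        "fmt"
        "os"
    )

    func main() {
        _, hipSet := os.LookupEnv("HIP_VISIBLE_DEVICES")
        _, rocrSet := os.LookupEnv("ROCR_VISIBLE_DEVICES")
        if hipSet || rocrSet {
            fmt.Println("load hip backend")
        } else {
            fmt.Println("load cuda backend")
        }
    }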

View File

@@ -477,14 +477,6 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params);
}
void ggml_backend_dev_reset(ggml_backend_dev_t device) {
if (device->iface.reset == NULL) {
return;
}
device->iface.reset(device);
}
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
return device->iface.get_buffer_type(device);
}

View File

@@ -1,5 +1,5 @@
package arm
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include -DHWCAP2_SVE2="2"
// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include
import "C"

View File

@@ -103,11 +103,6 @@ int ggml_cuda_get_device() {
return id;
}
void ggml_cuda_reset_device(int device) {
ggml_cuda_set_device(device);
CUDA_CHECK(cudaDeviceReset());
}
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3248,10 +3243,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
// Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -3708,11 +3700,6 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_reset_device(ctx->device);
}
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3729,7 +3716,6 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
/* .reset = */ ggml_backend_cuda_device_reset,
};
// backend reg
@@ -3849,6 +3835,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
ggml_cuda_set_device(i);
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;

View File

@@ -40,7 +40,6 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled

View File

@@ -19,12 +19,8 @@ static bool ggml_uncaught_exception_init = []{
return false;
}
const auto prev{std::get_terminate()};
// GGML_ASSERT(prev != ggml_uncaught_exception);
if (prev != ggml_uncaught_exception) {
previous_terminate_handler = prev;
} else {
GGML_LOG_WARN("%s double registration of ggml_uncaught_exception\n", __func__);
}
GGML_ASSERT(prev != ggml_uncaught_exception);
previous_terminate_handler = prev;
std::set_terminate(ggml_uncaught_exception);
return true;
}();

View File

@@ -1,36 +0,0 @@
package pooling
import (
"github.com/ollama/ollama/ml"
)
type Type uint32
const (
TypeNone Type = iota
TypeMean
TypeCLS
TypeLast
TypeRank
TypeUnknown = 0xFFFFFFFE
TypeUnspecified = 0xFFFFFFFF
)
func Pooling(ctx ml.Context, hiddenStates ml.Tensor, poolingType Type) ml.Tensor {
switch poolingType {
case TypeNone:
return hiddenStates
case TypeMean:
hiddenStates = hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Mean(ctx)
return hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
case TypeCLS:
return hiddenStates.View(ctx, 0, hiddenStates.Dim(0))
case TypeLast:
panic("not implemented")
case TypeRank:
panic("not implemented")
default:
panic("not implemented")
}
}
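The removed pooling package reduces per-token hidden states to a single vector; TypeMean is simply a per-dimension average across tokens. A plain-slice sketch of that computation, with illustrative values and explicit loops standing in for the tensor permute/Mean calls above:

    package main

    import "fmt"

    func main() {
        // hiddenStates: 3 tokens x 4 dimensions
        hiddenStates := [][]float32{
            {1, 2, 3, 4},
            {3, 2, 1, 0},
            {2, 2, 2, 2},
        }

        pooled := make([]float32, len(hiddenStates[0]))
        for _, token := range hiddenStates {
            for d, v := range token {
                pooled[d] += v
            }
        }
        for d := range pooled {
            pooled[d] /= float32(len(hiddenStates))
        }

        fmt.Println(pooled) // [2 2 2 2]
    }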

View File

@@ -2,6 +2,7 @@ package model
import (
"cmp"
"context"
"fmt"
"iter"
"log/slog"
@@ -108,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
r = 0x0143
case r <= 0x0020:
r = r + 0x0100
case r >= 0x007f && r <= 0x00a0:
case r >= 0x007e && r <= 0x00a0:
r = r + 0x00a2
}
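This switch is the byte-to-rune remapping used by byte-level BPE: control and whitespace bytes are shifted upward so each byte maps to a distinct printable rune, while bytes not matched by one of the shown arms presumably pass through unchanged. For example, under the r <= 0x0020 arm the space byte becomes rune 0x0120, which renders as "Ġ"; the hunk itself only moves the lower bound of the next arm between 0x007f and 0x007e.

    r := rune(0x20) // the space byte
    r += 0x0100     // 0x0120, rendered as "Ġ"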
@@ -201,11 +202,12 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
if addSpecial && len(ids) > 0 {
ids = bpe.vocab.addSpecials(ids)
}
logutil.Trace("encoded", "string", s, "ids", ids)
return ids, nil
}
@@ -241,6 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
}
}
logutil.Trace("decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
return sb.String(), nil
}

View File

@@ -207,36 +207,6 @@ func TestLlama(t *testing.T) {
}
}
})
t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
t.Parallel()
for b := 0x00; b <= 0xFF; b++ {
input := string(rune(b))
ids, err := tokenizer.Encode(input, false)
if err != nil {
t.Errorf("failed to encode rune 0x%02X: %v", b, err)
continue
}
decoded, err := tokenizer.Decode(ids)
if err != nil {
t.Errorf("failed to decode rune 0x%02X: %v", b, err)
continue
}
if b == 0x00 {
if len(decoded) != 0 {
t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
}
continue
}
if decoded != input {
t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
}
}
})
}
func BenchmarkBytePairEncoding(b *testing.B) {

View File

@@ -54,9 +54,10 @@ type Batch struct {
// Inputs is the input tokens, including placeholders for multimodal inputs.
Inputs ml.Tensor
// Outputs are the set of indices into Inputs for which output data should
// be returned.
Outputs ml.Tensor
// Multimodal is a set of multimodal embeddings previously created by
// EncodeMultimodal, along with an index into Inputs. Unused for text-only
// models or for batches without multimodal elements.
Multimodal []MultimodalIndex
// Positions is the position for each Input, relative to its sequence. Equal
// in length to Inputs.
@@ -65,8 +66,7 @@ type Batch struct {
// Sequences is the sequence for each Input. Equal in length to Inputs.
Sequences []int
// Multimodal is a set of multimodal embeddings previously created by
// EncodeMultimodal, along with an index into Inputs. Unused for text-only
// models or for batches without multimodal elements.
Multimodal []MultimodalIndex
// Outputs are the set of indices into Inputs for which output data should
// be returned.
Outputs []int32
}

View File

@@ -1,11 +1,12 @@
package model
import (
"context"
"errors"
"fmt"
_ "image/jpeg"
_ "image/png"
"math"
"log/slog"
"os"
"reflect"
"strconv"
@@ -24,11 +25,7 @@ import (
"github.com/ollama/ollama/model/input"
)
var (
ErrNoVisionModel = errors.New("this model is missing data required for image input")
ErrUnsupportedModel = errors.New("model not supported")
ErrUnsupportedTokenizer = errors.New("tokenizer not supported")
)
var ErrNoVisionModel = errors.New("this model is missing data required for image input")
// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
type Model interface {
@@ -67,7 +64,7 @@ type MultimodalProcessor interface {
// This function is also responsible for updating MultimodalHash for any Multimodal
// that is modified to ensure that there is a unique hash value that accurately
// represents the contents.
PostTokenize([]*input.Input) ([]*input.Input, error)
PostTokenize([]input.Input) ([]input.Input, error)
}
// Base implements the common fields and methods for all models
@@ -108,10 +105,6 @@ func New(modelPath string, params ml.BackendParams) (Model, error) {
}
arch := b.Config().Architecture()
if b.Config().Uint("pooling_type", math.MaxUint32) != math.MaxUint32 {
arch = arch + "_embed"
}
f, ok := models[arch]
if !ok {
return nil, fmt.Errorf("unsupported model architecture %q", arch)
@@ -205,7 +198,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
names := fn(tagsCopy)
for _, name := range names {
if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
logutil.Trace("found tensor", "", tensor)
slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor)
vv.Set(reflect.ValueOf(tensor))
break
}
@@ -246,7 +239,7 @@ func setPointer(base Base, v reflect.Value, tags []Tag) {
vv = vv.Elem()
}
vv = reflect.Indirect(vv)
vv = vv.Elem()
if v.IsNil() {
vv = reflect.New(v.Type().Elem()).Elem()
}
@@ -285,7 +278,7 @@ func canNil(t reflect.Type) bool {
t.Kind() == reflect.Slice
}
func Forward(ctx ml.Context, m Model, batch input.Batch) (ml.Tensor, error) {
func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Tensor, error) {
if len(batch.Positions) != len(batch.Sequences) {
return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(batch.Positions), len(batch.Sequences))
}
@@ -294,6 +287,8 @@ func Forward(ctx ml.Context, m Model, batch input.Batch) (ml.Tensor, error) {
return nil, errors.New("batch size cannot be less than 1")
}
batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
cache := m.Config().Cache
if cache != nil {
err := cache.StartForward(ctx, batch, false)
@@ -307,7 +302,7 @@ func Forward(ctx ml.Context, m Model, batch input.Batch) (ml.Tensor, error) {
return nil, err
}
ctx.Forward(t)
ctx.Forward(t).Compute(t)
return t, nil
}

View File

@@ -1,181 +0,0 @@
package bert
import (
"cmp"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/pooling"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Model struct {
model.Base
model.TextProcessor
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
TypeEmbedding *nn.Embedding `gguf:"token_types"`
PositionEmbedding *nn.Embedding `gguf:"position_embd"`
TokenEmbeddingNorm *nn.LayerNorm `gguf:"token_embd_norm"`
Layers []EncoderLayer `gguf:"blk"`
Options
}
// Forward implements model.Model.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenStates = hiddenStates.Add(ctx, m.TypeEmbedding.Weight.View(ctx, 0, m.hiddenSize))
hiddenStates = hiddenStates.Add(ctx, m.PositionEmbedding.Forward(ctx, ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))))
hiddenStates = m.TokenEmbeddingNorm.Forward(ctx, hiddenStates, m.eps)
for _, layer := range m.Layers {
hiddenStates = layer.Forward(ctx, hiddenStates, &m.Options)
}
hiddenStates = pooling.Pooling(ctx, hiddenStates, m.poolingType)
if m.normalize {
hiddenStates = hiddenStates.L2Norm(ctx, 1e-12)
}
return hiddenStates, nil
}
type EncoderLayer struct {
*Attention
AttentionNorm *nn.LayerNorm `gguf:"attn_output_norm"`
*MLP
MLPNorm *nn.LayerNorm `gguf:"layer_output_norm"`
}
func (e *EncoderLayer) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
// Attention
residual := hiddenStates
hiddenStates = e.Attention.Forward(ctx, hiddenStates, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
hiddenStates = e.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
// MLP
residual = hiddenStates
hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
hiddenStates = e.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
return hiddenStates
}
type Attention struct {
Query *nn.Linear `gguf:"attn_q"`
QueryNorm *nn.LayerNorm `gguf:"attn_q_norm"`
Key *nn.Linear `gguf:"attn_k"`
KeyNorm *nn.LayerNorm `gguf:"attn_k_norm"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (a *Attention) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
batchSize := hiddenStates.Dim(1)
query := a.Query.Forward(ctx, hiddenStates)
if a.QueryNorm != nil {
query = a.QueryNorm.Forward(ctx, query, opts.eps)
}
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
key := a.Key.Forward(ctx, hiddenStates)
if a.KeyNorm != nil {
key = a.KeyNorm.Forward(ctx, key, opts.eps)
}
key = key.Reshape(ctx, opts.headDim(), cmp.Or(opts.numKVHeads, opts.numHeads), batchSize)
value := a.Value.Forward(ctx, hiddenStates)
value = value.Reshape(ctx, opts.headDim(), cmp.Or(opts.numKVHeads, opts.numHeads), batchSize)
attention := nn.Attention(ctx, query, key, value, 1/math.Sqrt(float64(opts.headDim())), nil)
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
return a.Output.Forward(ctx, attention)
}
type MLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (m *MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
return m.Down.Forward(ctx, m.Up.Forward(ctx, hiddenStates).GELU(ctx))
}
type Options struct {
hiddenSize,
numHeads,
numKVHeads,
keyLength,
valueLength int
poolingType pooling.Type
eps float32
normalize bool
}
func (o Options) headDim() int {
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
}
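In the removed headDim helper, cmp.Or returns its first non-zero argument, so the head dimension falls back to hiddenSize/numHeads only when neither keyLength nor valueLength is present in the metadata. Illustrative values:

    package main

    import (
        "cmp"
        "fmt"
    )

    func main() {
        keyLength, valueLength := 0, 0
        hiddenSize, numHeads := 768, 12
        fmt.Println(cmp.Or(keyLength, valueLength, hiddenSize/numHeads)) // 64
    }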
func New(c fs.Config) (model.Model, error) {
var processor model.TextProcessor
switch c.String("tokenizer.ggml.model", "bert") {
case "bert":
processor = model.NewWordPiece(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{
int32(cmp.Or(
c.Uint("tokenizer.ggml.cls_token_id"),
c.Uint("tokenizer.ggml.bos_token_id"),
)),
},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", true),
EOS: []int32{
int32(cmp.Or(
c.Uint("tokenizer.ggml.separator_token_id"),
//nolint:misspell
// NOTE: "seperator_token_id" is a typo in model metadata but we need to
// support it for compatibility.
c.Uint("tokenizer.ggml.seperator_token_id"),
c.Uint("tokenizer.ggml.eos_token_id"),
)),
},
},
)
default:
return nil, model.ErrUnsupportedTokenizer
}
return &Model{
TextProcessor: processor,
Layers: make([]EncoderLayer, c.Uint("block_count")),
Options: Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_epsilon"),
poolingType: pooling.Type(c.Uint("pooling_type")),
normalize: c.Bool("normalize_embeddings", true),
},
}, nil
}
func init() {
model.Register("bert", New)
model.Register("bert_embed", New)
}
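
Both Options.headDim and the BOS/EOS selection above lean on cmp.Or from the Go standard library, which returns its first non-zero argument; a side effect is that a genuinely zero token id reads as "unset". A minimal, self-contained sketch with illustrative values (not taken from any real model config):

package main

import (
	"cmp"
	"fmt"
)

func main() {
	// headDim-style fallback: prefer an explicit key length, then value length,
	// then derive the dimension from hiddenSize / numHeads.
	keyLength, valueLength := 0, 0
	hiddenSize, numHeads := 768, 12
	fmt.Println(cmp.Or(keyLength, valueLength, hiddenSize/numHeads)) // 64

	// BOS-style fallback: a zero cls_token_id falls through to bos_token_id.
	clsTokenID, bosTokenID := uint32(0), uint32(2)
	fmt.Println(cmp.Or(clsTokenID, bosTokenID)) // 2
}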

View File

@@ -24,7 +24,7 @@ type Options struct {
type Model struct {
model.Base
model.SentencePiece
model.SentencePieceModel
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"`
@@ -40,7 +40,7 @@ const (
func New(c fs.Config) (model.Model, error) {
m := Model{
SentencePiece: model.NewSentencePiece(
SentencePieceModel: model.NewSentencePieceModel(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
@@ -176,6 +176,7 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
@@ -192,7 +193,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var lastLayerOutputs ml.Tensor
if i == len(m.Layers)-1 {
lastLayerOutputs = batch.Outputs
lastLayerOutputs = outputs
}
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
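
batch.Outputs identifies only the positions whose logits are actually needed, which is why an outputs tensor is handed to the final layer alone while earlier layers keep the full sequence. A plain-slice sketch of that final gather, using hypothetical values rather than the real tensor API:

package main

import "fmt"

func main() {
	// One hidden vector per input position (hypothetical values).
	hidden := [][]float32{{0.1}, {0.2}, {0.3}, {0.4}}
	outputs := []int32{1, 3} // positions requested in batch.Outputs

	// Keep only the rows whose logits were requested before the output projection.
	selected := make([][]float32, 0, len(outputs))
	for _, idx := range outputs {
		selected = append(selected, hidden[idx])
	}
	fmt.Println(selected) // [[0.2] [0.4]]
}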

View File

@@ -1,62 +0,0 @@
package gemma3
import (
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/pooling"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type embedModel struct {
model.Base
model.SentencePiece
*TextModel
poolingType pooling.Type
Dense [2]*nn.Linear `gguf:"dense"`
}
func (m *embedModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
hiddenStates := m.TextModel.Forward(ctx, batch, m.Cache)
hiddenStates = pooling.Pooling(ctx, hiddenStates, m.poolingType)
for _, dense := range m.Dense {
hiddenStates = dense.Forward(ctx, hiddenStates)
}
hiddenStates = hiddenStates.L2Norm(ctx, 1e-12)
return hiddenStates, nil
}
func newEmbedModel(c fs.Config) (model.Model, error) {
m := &embedModel{
SentencePiece: model.NewSentencePiece(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{
int32(c.Uint("tokenizer.ggml.eos_token_id")),
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: newTextModel(c),
poolingType: pooling.Type(c.Uint("pooling_type", 0)),
}
m.Cache = kvcache.NewWrapperCache(
kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),
kvcache.NewCausalCache(m.Shift),
)
return m, nil
}
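
The L2Norm(ctx, 1e-12) call in this embedding path corresponds to dividing each embedding vector by its Euclidean norm, with a tiny epsilon guarding against division by zero; the exact epsilon handling inside the tensor backend may differ. A plain-Go sketch of the operation:

package main

import (
	"fmt"
	"math"
)

func l2Normalize(v []float32, eps float64) []float32 {
	var sum float64
	for _, x := range v {
		sum += float64(x) * float64(x)
	}
	norm := math.Max(math.Sqrt(sum), eps)
	out := make([]float32, len(v))
	for i, x := range v {
		out[i] = float32(float64(x) / norm)
	}
	return out
}

func main() {
	fmt.Println(l2Normalize([]float32{3, 4}, 1e-12)) // [0.6 0.8]
}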

View File

@@ -16,9 +16,9 @@ import (
type Model struct {
model.Base
model.SentencePiece
model.SentencePieceModel
*VisionModel `gguf:"v"`
*VisionModel `gguf:"v,vision"`
*TextModel
*MultiModalProjector `gguf:"mm"`
@@ -55,7 +55,7 @@ func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, i
func New(c fs.Config) (model.Model, error) {
m := Model{
SentencePiece: model.NewSentencePiece(
SentencePieceModel: model.NewSentencePieceModel(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
@@ -112,8 +112,8 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
@@ -122,17 +122,17 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
inputMultimodal := inp.Multimodal[0].Tensor
result = append(result,
&input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
&input.Input{Token: 255999}, // "<start_of_image>"
&input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>"
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
)
// add image token placeholders
result = append(result, slices.Repeat([]*input.Input{{Token: 0}}, inputMultimodal.Dim(1)-1)...)
result = append(result, slices.Repeat([]input.Input{{Token: 0}}, inputMultimodal.Dim(1)-1)...)
result = append(result,
&input.Input{Token: 256000}, // <end_of_image>
&input.Input{Token: 108}, // "\n\n"
input.Input{Token: 256000}, // <end_of_image>
input.Input{Token: 108}, // "\n\n"
)
}
}
@@ -141,11 +141,12 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
hiddenStates := m.TextModel.Forward(ctx, batch, m.Cache)
return m.Output.Forward(ctx, hiddenStates), nil
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
func init() {
model.Register("gemma3", New)
model.Register("gemma3_embed", newEmbedModel)
}

View File

@@ -159,10 +159,8 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,
return hiddenState.Add(ctx, residual)
}
func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) ml.Tensor {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextConfig.hiddenSize)))
// set image embeddings
@@ -193,12 +191,12 @@ func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cac
var lastLayerOutputs ml.Tensor
if i == len(m.Layers)-1 {
lastLayerOutputs = batch.Outputs
lastLayerOutputs = outputs
}
hiddenState = layer.Forward(ctx, i, hiddenState, positions, lastLayerOutputs, cache, m.TextConfig)
}
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return hiddenState
return m.Output.Forward(ctx, hiddenState)
}

View File

@@ -10,7 +10,7 @@ import (
type Model struct {
model.Base
model.SentencePiece
model.SentencePieceModel
*TextModel
}
@@ -23,7 +23,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
func New(c fs.Config) (model.Model, error) {
m := Model{
TextModel: newTextModel(c),
SentencePiece: model.NewSentencePiece(
SentencePieceModel: model.NewSentencePieceModel(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),

View File

@@ -83,7 +83,7 @@ func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cac
hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx).Mean(ctx)
hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
hiddenStates = hiddenStates.Rows(ctx, batch.Outputs)
hiddenStates = hiddenStates.Rows(ctx, ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)))
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil

View File

@@ -41,8 +41,8 @@ func (m *Transformer) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, err
}
var outputs ml.Tensor
if i == len(m.TransformerBlocks)-1 {
outputs = batch.Outputs
if len(batch.Outputs) > 0 && i == len(m.TransformerBlocks)-1 {
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = block.Forward(ctx, hiddenStates, positions, outputs, one, m.Cache, &m.Options)

View File

@@ -160,7 +160,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)

View File

@@ -18,7 +18,7 @@ type Model struct {
model.BytePairEncoding
ImageProcessor
*VisionModel `gguf:"v"`
*VisionModel `gguf:"v,vision"`
*Projector `gguf:"mm"`
*TextModel
}
@@ -134,16 +134,16 @@ type separator struct {
y bool
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
continue
}
var imageInputs []*input.Input
imageInputs = append(imageInputs, &input.Input{Token: 200080}) // <|image_start|>
var imageInputs []input.Input
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
for i, mm := range inp.Multimodal {
patchesPerChunk := mm.Tensor.Dim(1)
@@ -151,20 +151,20 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
if i < len(inp.Multimodal)-1 {
separator := mm.Data.(*separator)
imageInputs = append(imageInputs, &input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]*input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if separator.x {
imageInputs = append(imageInputs, &input.Input{Token: 200084}) // <|tile_x_separator|>
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
if separator.y {
imageInputs = append(imageInputs, &input.Input{Token: 200085}) // <|tile_y_separator|>
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
} else {
imageInputs = append(imageInputs, &input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, &input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]*input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, &input.Input{Token: 200080}) // <|image_end|>
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
}
}
@@ -176,7 +176,9 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache), nil
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
func init() {

View File

@@ -18,7 +18,7 @@ type Model struct {
model.BytePairEncoding
*TextModel
*VisionModel `gguf:"v"`
*VisionModel `gguf:"v,vision"`
*MultiModalProjector `gguf:"mm"`
ImageProcessor
@@ -133,22 +133,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
// Each sequence of [IMG]...[IMG] is a set of patches of vision embeddings
// that can be processed together.
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
} else {
for i, row := range inp.Multimodal {
// [IMG]
result = append(result, &input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
result = append(result, slices.Repeat([]*input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
if i == len(inp.Multimodal)-1 {
// [IMG_END]
result = append(result, &input.Input{Token: 13})
result = append(result, input.Input{Token: 13})
} else {
// [IMG_BREAK]
result = append(result, &input.Input{Token: 12})
result = append(result, input.Input{Token: 12})
}
}
}
@@ -159,8 +159,9 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache), nil
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
func init() {
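
The PostTokenize comment earlier in this file spells out the [IMG]/[IMG_BREAK]/[IMG_END] layout; a small sketch of the sequence it produces for a hypothetical image split into three rows of four patches, using the token ids visible in the hunk (10 = [IMG], 12 = [IMG_BREAK], 13 = [IMG_END]):

package main

import "fmt"

func main() {
	const imgTok, breakTok, endTok = 10, 12, 13
	rows := []int{4, 4, 4} // hypothetical: three rows of four patches each

	var seq []int32
	for i, patches := range rows {
		for p := 0; p < patches; p++ {
			seq = append(seq, imgTok)
		}
		if i == len(rows)-1 {
			seq = append(seq, endTok) // [IMG_END] closes the final row
		} else {
			seq = append(seq, breakTok) // [IMG_BREAK] separates rows
		}
	}
	fmt.Println(seq) // [10 10 10 10 12 10 10 10 10 12 10 10 10 10 13]
}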

View File

@@ -17,7 +17,7 @@ type Model struct {
model.Base
model.BytePairEncoding
*VisionModel `gguf:"v"`
*VisionModel `gguf:"v,vision"`
*TextModel
Projector *nn.Linear `gguf:"mm.0"`
@@ -90,7 +90,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
for i := range inputs {
if inputs[i].Multimodal != nil {
inputs[i].Token = 128256 // <|image|>
@@ -107,9 +107,10 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
// TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
}
func init() {

View File

@@ -1,7 +1,6 @@
package models
import (
_ "github.com/ollama/ollama/model/models/bert"
_ "github.com/ollama/ollama/model/models/gemma2"
_ "github.com/ollama/ollama/model/models/gemma3"
_ "github.com/ollama/ollama/model/models/gemma3n"

View File

@@ -111,7 +111,7 @@ func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)

View File

@@ -18,7 +18,7 @@ type Model struct {
model.BytePairEncoding
*TextModel
*VisionModel `gguf:"v"`
*VisionModel `gguf:"v,vision"`
ImageProcessor
}
@@ -89,8 +89,8 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
}
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
var (
imageToken int32 = 151655
@@ -112,16 +112,16 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
}
for i := range pre {
result = append(result, &input.Input{Token: pre[i]})
result = append(result, input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// First add the vision start token
result = append(result, &input.Input{Token: visionStartToken})
result = append(result, input.Input{Token: visionStartToken})
// Add the image token with the multimodal tensor data at the first position
result = append(result, &input.Input{
result = append(result, input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
@@ -129,9 +129,9 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
})
// Add the placeholder tokens for the remaining positions (patchesPerChunk-1)
result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
result = append(result, &input.Input{Token: visionEndToken})
result = append(result, input.Input{Token: visionEndToken})
}
}
@@ -140,8 +140,9 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache)
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
}
func init() {

View File

@@ -165,7 +165,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)

View File

@@ -2,6 +2,7 @@ package model
import (
"container/heap"
"context"
"fmt"
"log/slog"
"strconv"
@@ -12,19 +13,19 @@ import (
const spmWhitespaceSep = "▁"
type SentencePiece struct {
type SentencePieceModel struct {
maxTokenLen int
vocab *Vocabulary
}
var _ TextProcessor = (*SentencePiece)(nil)
var _ TextProcessor = (*SentencePieceModel)(nil)
func (spm SentencePiece) Vocabulary() *Vocabulary {
func (spm SentencePieceModel) Vocabulary() *Vocabulary {
return spm.vocab
}
func NewSentencePiece(vocab *Vocabulary) SentencePiece {
logutil.Trace("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
counter := map[int]int{}
var maxTokenLen int
@@ -38,21 +39,21 @@ func NewSentencePiece(vocab *Vocabulary) SentencePiece {
}
}
logutil.Trace("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
"max token len", maxTokenLen)
return SentencePiece{
return SentencePieceModel{
maxTokenLen: maxTokenLen,
vocab: vocab,
}
}
func (spm SentencePiece) Is(id int32, special Special) bool {
func (spm SentencePieceModel) Is(id int32, special Special) bool {
return spm.vocab.Is(id, special)
}
func (spm SentencePiece) Encode(s string, addSpecial bool) ([]int32, error) {
func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error) {
fragments := []fragment{{value: s}}
for _, special := range spm.vocab.SpecialVocabulary() {
id := spm.vocab.Encode(special)
@@ -181,11 +182,12 @@ func (spm SentencePiece) Encode(s string, addSpecial bool) ([]int32, error) {
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
if addSpecial && len(ids) > 0 {
ids = spm.vocab.addSpecials(ids)
}
logutil.Trace("encoded", "string", s, "ids", ids)
return ids, nil
}
@@ -218,7 +220,7 @@ func (q *queue) Pop() interface{} {
return item
}
func (spm SentencePiece) Decode(ids []int32) (string, error) {
func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
data := spm.vocab.Decode(id)
@@ -244,6 +246,6 @@ func (spm SentencePiece) Decode(ids []int32) (string, error) {
}
}
logutil.Trace("decoded", "ids", ids, "string", sb.String())
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
return sb.String(), nil
}
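
spmWhitespaceSep reflects the SentencePiece convention of marking word boundaries with U+2581 (▁) instead of spaces: spaces are swapped for the marker before token matching and swapped back on decode. A rough sketch of just that convention (not the tokenizer itself):

package main

import (
	"fmt"
	"strings"
)

const spmWhitespaceSep = "▁"

func main() {
	// Encode side: spaces become the visible marker before token matching.
	marked := strings.ReplaceAll("hello world", " ", spmWhitespaceSep)
	fmt.Println(marked) // hello▁world

	// Decode side: the marker turns back into a plain space.
	fmt.Println(strings.ReplaceAll(marked, spmWhitespaceSep, " ")) // hello world
}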

View File

@@ -12,7 +12,7 @@ import (
"github.com/ollama/ollama/convert/sentencepiece"
)
func loadSentencePieceVocab(t *testing.T) SentencePiece {
func loadSentencePieceVocab(t *testing.T) SentencePieceModel {
t.Helper()
bts, err := os.ReadFile(filepath.Join("testdata", "gemma2", "tokenizer.model"))
@@ -45,7 +45,7 @@ func loadSentencePieceVocab(t *testing.T) SentencePiece {
}
}
return NewSentencePiece(&v)
return NewSentencePieceModel(&v)
}
func TestSentencePieceEncode(t *testing.T) {
@@ -115,7 +115,7 @@ func TestSentencePieceEncode(t *testing.T) {
})
}
func TestSentencePieceDecodeByteTokens(t *testing.T) {
func TestSentencePieceModelDecodeByteTokens(t *testing.T) {
vocab := &Vocabulary{
Values: []string{
"normal",
@@ -134,7 +134,7 @@ func TestSentencePieceDecodeByteTokens(t *testing.T) {
Scores: []float32{0, 0, 0, 0, 0},
}
spm := NewSentencePiece(vocab)
spm := NewSentencePieceModel(vocab)
tests := []struct {
name string

View File

@@ -49,7 +49,7 @@ func (v *Vocabulary) addSpecials(ids []int32) []int32 {
slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
}
slog.Debug("adding bos token to prompt", "id", v.BOS[0])
slog.Debug("adding bos token to prompt", "id", v.BOS)
ids = append([]int32{v.BOS[0]}, ids...)
}
@@ -58,7 +58,7 @@ func (v *Vocabulary) addSpecials(ids []int32) []int32 {
slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
}
slog.Debug("adding eos token to prompt", "id", v.EOS[0])
slog.Debug("adding eos token to prompt", "id", v.EOS)
ids = append(ids, v.EOS[0])
}
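
addSpecials prepends BOS[0] and appends EOS[0] to an already-encoded prompt, warning when the ids already start or end with them. A standalone sketch of the slice handling with hypothetical ids:

package main

import "fmt"

func main() {
	bos, eos := int32(1), int32(2)
	ids := []int32{37, 14, 89} // hypothetical encoded prompt

	// Prepend BOS by building a fresh slice with BOS first.
	ids = append([]int32{bos}, ids...)
	// Append EOS at the end.
	ids = append(ids, eos)

	fmt.Println(ids) // [1 37 14 89 2]
}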

Some files were not shown because too many files have changed in this diff.