Merge pull request #11765 from ollama/drifkin/thinking-without-content

openai: always provide reasoning
2025-08-06 19:02:23 -07:00 · 2025-08-06 18:54:20 -07:00 · 2025-08-06 17:53:25 -07:00 · 2025-08-06 17:00:24 -07:00 · 2025-08-06 16:55:57 -07:00 · 2025-08-06 16:11:31 -07:00
290 changed files with 149441 additions and 146107 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -23,7 +23,7 @@ jobs:
          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT

  darwin-build:
-    runs-on: macos-13
+    runs-on: macos-13-xlarge
    environment: release
    needs: setup-environment
    strategy:
@@ -54,48 +54,6 @@ jobs:
          name: build-${{ matrix.os }}-${{ matrix.arch }}
          path: dist/*

-  darwin-sign:
-    runs-on: macos-13
-    environment: release
-    needs: darwin-build
-    steps:
-      - uses: actions/checkout@v4
-      - run: |
-          echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
-          security create-keychain -p password build.keychain
-          security default-keychain -s build.keychain
-          security unlock-keychain -p password build.keychain
-          security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
-          security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
-          security set-keychain-settings -lut 3600 build.keychain
-        env:
-          MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
-          MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: build-darwin-amd64
-          path: dist/darwin-amd64
-      - uses: actions/download-artifact@v4
-        with:
-          name: build-darwin-arm64
-          path: dist/darwin-arm64
-      - run: |
-          export VERSION=${GITHUB_REF_NAME#v}
-          ./scripts/build_darwin.sh sign macapp
-        env:
-          APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
-          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
-          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
-          APPLE_ID: ${{ vars.APPLE_ID }}
-          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
-      - uses: actions/upload-artifact@v4
-        with:
-          name: dist-darwin
-          path: |
-            dist/Ollama-darwin.zip
-            dist/ollama-darwin.tgz
-
  windows-depends:
    strategy:
      matrix:
@@ -103,21 +61,18 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
-          - os: windows
-            arch: amd64
-            preset: 'CUDA 11'
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
            cuda-version: '12.8'
+            flags: ''
          - os: windows
            arch: amd64
            preset: 'ROCm 6'
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
            rocm-version: '6.2'
+            flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:
@@ -160,6 +115,9 @@ jobs:
          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: matrix.preset == 'CPU'
        run: |
          echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
@@ -178,9 +136,9 @@ jobs:
          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
      - name: Build target "${{ matrix.preset }}"
        run: |
-          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
-          cmake --preset "${{ matrix.preset }}"
+          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
          cmake --build --parallel --preset "${{ matrix.preset }}"
          cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
        env:
@@ -230,61 +188,11 @@ jobs:
          go-version-file: go.mod
      - run: |
          go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
-      - if: matrix.arch == 'arm64'
-        run: |
-          Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vc_redist.arm64.exe" -OutFile "dist\windows-arm64\vc_redist.arm64.exe"
-      - run: |
-          $env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1'
-          & .\scripts\build_windows.ps1 buildApp
-        env:
-          VCToolsRedistDir: stub
      - uses: actions/upload-artifact@v4
        with:
          name: build-${{ matrix.os }}-${{ matrix.arch }}
          path: |
            dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe
-            dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
-
-  windows-sign:
-    runs-on: windows-2022
-    environment: release
-    needs: [windows-depends, windows-build]
-    steps:
-      - uses: actions/checkout@v4
-      - uses: google-github-actions/auth@v2
-        with:
-          project_id: ollama
-          credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}
-      - run: |
-          $ErrorActionPreference = "Stop"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe"
-          Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip"
-          Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\"
-          & "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet
-
-          echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: build-windows-*
-          path: dist\
-          merge-multiple: true
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: depends-windows-amd64-*
-          path: dist\windows-amd64\
-          merge-multiple: true
-      - run: |
-          & .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip
-        env:
-          KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
-      - uses: actions/upload-artifact@v4
-        with:
-          name: dist-windows
-          path: |
-            dist\OllamaSetup.exe
-            dist\ollama-windows-*.zip

  linux-build:
    strategy:
@@ -317,21 +225,26 @@ jobs:
            CGO_CFLAGS=${{ env.CGO_CFLAGS }}
            CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
-          cache-from: type=registry,ref=ollama/ollama:latest
+          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
      - run: |
          for COMPONENT in bin/* lib/ollama/*; do
            case "$COMPONENT" in
-              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
-              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
-              lib/ollama/rocm)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
+              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_sbsa)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
+              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
+              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
            esac
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
+      - run: |
+          echo "Manifests"
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
+            echo $ARCHIVE
+            cat $ARCHIVE
+          done
      - run: |
          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
@@ -385,8 +298,8 @@ jobs:
          context: .
          platforms: ${{ matrix.os }}/${{ matrix.arch }}
          build-args: ${{ matrix.build-args }}
-          outputs: type=image,name=ollama/ollama,push-by-digest=true,name-canonical=true,push=true
-          cache-from: type=registry,ref=ollama/ollama:latest
+          outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
      - run: |
          mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
@@ -418,7 +331,7 @@ jobs:
            latest=false
            suffix=${{ matrix.suffix }}
          images: |
-            ollama/ollama
+            ${{ vars.DOCKER_REPO }}
          tags: |
            type=ref,enable=true,priority=600,prefix=pr-,event=pr
            type=semver,pattern={{version}}
@@ -428,56 +341,24 @@ jobs:
          path: ${{ runner.temp }}
          merge-multiple: true
      - run: |
-          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf 'ollama/ollama@%s ')
-          docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
+          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf '${{ vars.DOCKER_REPO }}@%s ')
+          docker buildx imagetools inspect ${{ vars.DOCKER_REPO }}:${{ steps.metadata.outputs.version }}
        working-directory: ${{ runner.temp }}

  # Trigger downstream release process
  trigger:
    runs-on: ubuntu-latest
    environment: release
-    needs: [darwin-build, windows-build, windows-depends]
-    steps:
-      - name: Trigger downstream release process
-        run: |
-          curl -L \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"
-
-  # Aggregate all the assets and ship a release
-  release:
-    needs: [darwin-sign, windows-sign, linux-build]
-    runs-on: linux
-    environment: release
+    needs: [darwin-build, windows-build, windows-depends, linux-build]
    permissions:
      contents: write
    env:
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@v4
-      - uses: actions/download-artifact@v4
-        with:
-          name: dist-darwin
-          path: dist
-      - uses: actions/download-artifact@v4
-        with:
-          name: dist-windows
-          path: dist
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: dist-linux-*
-          path: dist
-          merge-multiple: true
-      - run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt
-        working-directory: dist
-      - name: Create or update Release
+      - name: Create or update Release for tag
        run: |
          RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
-
          echo "Looking for existing release for ${RELEASE_VERSION}"
          OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName")
          if [ -n "$OLD_TAG" ]; then
@@ -491,5 +372,12 @@ jobs:
              --generate-notes \
              --prerelease
          fi
-          echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
-          gh release upload ${GITHUB_REF_NAME} dist/* --clobber
+      - name: Trigger downstream release process
+        run: |
+          curl -L \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
+            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"origin\": \"${GITHUB_REPOSITORY}\", \"publish\": \"1\"}}"
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -36,7 +36,7 @@ jobs:
              | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
          }

-          echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT
+          echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT

  linux:
    needs: [changes]
@@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,11 +78,11 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
-            flags: '-DAMDGPU_TARGETS=gfx1010'
+            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
    runs-on: windows
    steps:
      - run: |
@@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
          }

          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -120,6 +120,9 @@ jobs:
          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
        uses: actions/cache/save@v4
        with:
@@ -133,8 +136,8 @@ jobs:
          path: ${{ github.workspace }}\.ccache
          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
      - run: |
-          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
          cmake --build --parallel --preset "${{ matrix.preset }}"
        env:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,14 +78,13 @@ if(CMAKE_CUDA_COMPILER)

    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
-    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
-        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
    )
 endif()

@@ -116,7 +115,11 @@ if(CMAKE_HIP_COMPILER)

        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
-            RUNTIME_DEPENDENCIES
+            RUNTIME_DEPENDENCY_SET rocm
+            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+        )
+        install(RUNTIME_DEPENDENCY_SET rocm
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -6,7 +6,8 @@
      "binaryDir": "${sourceDir}/build",
      "installDir": "${sourceDir}/dist",
      "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release"
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_MSVC_RUNTIME_LIBRARY": "MultiThreaded"
      }
    },
    {
@@ -17,20 +18,12 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
-    {
-      "name": "CUDA 11",
-      "inherits": [ "CUDA" ],
-      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
-      }
-    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
@@ -58,6 +51,7 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
+        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
    }
@@ -78,11 +72,6 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
-    {
-      "name": "CUDA 11",
-      "inherits": [ "CUDA" ],
-      "configurePreset": "CUDA 11"
-    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,7 +65,7 @@ continuation of the sentence:
 Examples:

      llm/backend/mlx: support the llama architecture
-      CONTRIBUTING: provide clairity on good commit messages, and bad
+      CONTRIBUTING: provide clarity on good commit messages, and bad

 Bad Examples:

--- a/Dockerfile
+++ b/Dockerfile
@@ -7,12 +7,13 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

-# CUDA v11 requires gcc v10.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
+# We require gcc v10 minimum.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
+    && dnf install -y ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

@@ -38,15 +39,6 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8

-FROM base AS cuda-11
-ARG CUDA11VERSION=11.3
-RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
-ENV PATH=/usr/local/cuda-11/bin:$PATH
-RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 11' \
-        && cmake --build --parallel --preset 'CUDA 11' \
-        && cmake --install build --component CUDA --strip --parallel 8
-
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -98,23 +90,21 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
-COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
-COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
-COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
+COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6

 FROM scratch AS rocm
-COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
+COPY --from=rocm-6 dist/lib/ollama /lib/ollama

 FROM ${FLAVOR} AS archive
 COPY --from=cpu dist/lib/ollama /lib/ollama
 COPY --from=build /bin/ollama /bin/ollama

-FROM ubuntu:20.04
+FROM ubuntu:24.04
 RUN apt-get update \
    && apt-get install -y ca-certificates \
    && apt-get clean \
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=1caae7fc6c77551cb1066515e0f414713eebb367
+FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618

 .PHONY: help
 help:
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <div align="center">
   <a href="https://ollama.com">
-    <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
+    <img alt="ollama" width="240" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
  </a>
 </div>

@@ -10,7 +10,7 @@ Get up and running with large language models.

 ### macOS

-[Download](https://ollama.com/download/Ollama-darwin.zip)
+[Download](https://ollama.com/download/Ollama.dmg)

 ### Windows

@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla

 ## Quickstart

-To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
+To run and chat with [Gemma 3](https://ollama.com/library/gemma3):

 ```shell
-ollama run llama3.2
+ollama run gemma3
 ```

 ## Model library
@@ -360,7 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
+- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG and deep research on Mac/Windows/Linux)
 - [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
@@ -407,6 +407,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
+- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
+- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
+- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
+- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)

 ### Cloud

@@ -451,6 +455,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
+- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
+- [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama.

 ### Apple Vision Pro

@@ -589,10 +595,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
 - [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
+- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
+- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)

 ### Supported backends

- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
+- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.

 ### Observability
 - [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
--- a/api/client.go
+++ b/api/client.go
@@ -222,10 +222,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			return fmt.Errorf("unmarshal: %w", err)
 		}

-		if errorResponse.Error != "" {
-			return errors.New(errorResponse.Error)
-		}
-
 		if response.StatusCode >= http.StatusBadRequest {
 			return StatusError{
 				StatusCode:   response.StatusCode,
@@ -234,6 +230,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			}
 		}

+		if errorResponse.Error != "" {
+			return errors.New(errorResponse.Error)
+		}
+
 		if err := fn(bts); err != nil {
 			return err
 		}
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -89,6 +89,16 @@ func TestClientStream(t *testing.T) {
 			},
 			wantErr: "mid-stream error",
 		},
+		{
+			name: "http status error takes precedence over general error",
+			responses: []any{
+				testError{
+					message:    "custom error message",
+					statusCode: http.StatusInternalServerError,
+				},
+			},
+			wantErr: "500",
+		},
 		{
 			name: "successful stream completion",
 			responses: []any{
--- a/api/types.go
+++ b/api/types.go
@@ -85,10 +85,11 @@ type GenerateRequest struct {
 	Options map[string]any `json:"options"`

 	// Think controls whether thinking/reasoning models will think before
-	// responding. Needs to be a pointer so we can distinguish between false
+	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
+	// for supported models. Needs to be a pointer so we can distinguish between false
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
-	Think *bool `json:"think,omitempty"`
+	Think *ThinkValue `json:"think,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -116,8 +117,9 @@ type ChatRequest struct {
 	Options map[string]any `json:"options"`

 	// Think controls whether thinking/reasoning models will think before
-	// responding
-	Think *bool `json:"think,omitempty"`
+	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
+	// for supported models.
+	Think *ThinkValue `json:"think,omitempty"`
 }

 type Tools []Tool
@@ -143,6 +145,7 @@ type Message struct {
 	Thinking  string      `json:"thinking,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
+	ToolName  string      `json:"tool_name,omitempty"`
 }

 func (m *Message) UnmarshalJSON(b []byte) error {
@@ -222,20 +225,68 @@ func (pt PropertyType) String() string {
 	return fmt.Sprintf("%v", []string(pt))
 }

+// ToolProperty describes one property of a tool's parameters. It is recursive
+// through AnyOf, and ToTypeScriptType renders it as a TypeScript type string.
+type ToolProperty struct {
+	// AnyOf lists alternative shapes for the property. When it is non-empty,
+	// ToTypeScriptType renders the union of the alternatives and ignores Type.
+	AnyOf       []ToolProperty `json:"anyOf,omitempty"`
+	// Type holds one or more type names; several names render as a union.
+	Type        PropertyType   `json:"type"`
+	// Items is kept as an untyped value — presumably the element schema for
+	// array-typed properties; TODO confirm against callers.
+	Items       any            `json:"items,omitempty"`
+	Description string         `json:"description"`
+	// Enum is kept as untyped values — presumably the allowed values for the
+	// property; TODO confirm against callers.
+	Enum        []any          `json:"enum,omitempty"`
+}
+
+// ToTypeScriptType renders the property as a TypeScript type expression.
+// AnyOf alternatives take precedence over Type. Several alternatives, or
+// several type names, are joined into a " | " union; a property with neither
+// renders as "any".
+func (tp ToolProperty) ToTypeScriptType() string {
+	// anyOf wins over type: render every alternative recursively.
+	if len(tp.AnyOf) > 0 {
+		parts := make([]string, 0, len(tp.AnyOf))
+		for _, alt := range tp.AnyOf {
+			parts = append(parts, alt.ToTypeScriptType())
+		}
+		return strings.Join(parts, " | ")
+	}
+
+	switch len(tp.Type) {
+	case 0:
+		return "any"
+	case 1:
+		return mapToTypeScriptType(tp.Type[0])
+	}
+
+	// Two or more type names: translate each and build the union.
+	parts := make([]string, len(tp.Type))
+	for i, jsonType := range tp.Type {
+		parts[i] = mapToTypeScriptType(jsonType)
+	}
+	return strings.Join(parts, " | ")
+}
+
+// mapToTypeScriptType translates a JSON Schema type name into the TypeScript
+// type used to render it. Names without a known translation become "any".
+func mapToTypeScriptType(jsonType string) string {
+	tsTypes := map[string]string{
+		"string":  "string",
+		"number":  "number",
+		"integer": "number",
+		"boolean": "boolean",
+		"array":   "any[]",
+		"object":  "Record<string, any>",
+		"null":    "null",
+	}
+	if tsType, known := tsTypes[jsonType]; known {
+		return tsType
+	}
+	return "any"
+}
+
 type ToolFunction struct {
 	Name        string `json:"name"`
 	Description string `json:"description"`
 	Parameters  struct {
-		Type       string   `json:"type"`
-		Defs       any      `json:"$defs,omitempty"`
-		Items      any      `json:"items,omitempty"`
-		Required   []string `json:"required"`
-		Properties map[string]struct {
-			Type        PropertyType `json:"type"`
-			Items       any          `json:"items,omitempty"`
-			Description string       `json:"description"`
-			Enum        []any        `json:"enum,omitempty"`
-		} `json:"properties"`
+		Type       string                  `json:"type"`
+		Defs       any                     `json:"$defs,omitempty"`
+		Items      any                     `json:"items,omitempty"`
+		Required   []string                `json:"required"`
+		Properties map[string]ToolProperty `json:"properties"`
 	} `json:"parameters"`
 }

@@ -457,24 +508,24 @@ type ProcessResponse struct {

 // ListModelResponse is a single model description in [ListResponse].
 type ListModelResponse struct {
-	Name         string             `json:"name"`
-	Model        string             `json:"model"`
-	ModifiedAt   time.Time          `json:"modified_at"`
-	Size         int64              `json:"size"`
-	Digest       string             `json:"digest"`
-	Capabilities []model.Capability `json:"capabilities,omitempty"`
-	Details      ModelDetails       `json:"details,omitempty"`
+	Name       string       `json:"name"`
+	Model      string       `json:"model"`
+	ModifiedAt time.Time    `json:"modified_at"`
+	Size       int64        `json:"size"`
+	Digest     string       `json:"digest"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 // ProcessModelResponse is a single model description in [ProcessResponse].
 type ProcessModelResponse struct {
-	Name      string       `json:"name"`
-	Model     string       `json:"model"`
-	Size      int64        `json:"size"`
-	Digest    string       `json:"digest"`
-	Details   ModelDetails `json:"details,omitempty"`
-	ExpiresAt time.Time    `json:"expires_at"`
-	SizeVRAM  int64        `json:"size_vram"`
+	Name          string       `json:"name"`
+	Model         string       `json:"model"`
+	Size          int64        `json:"size"`
+	Digest        string       `json:"digest"`
+	Details       ModelDetails `json:"details,omitempty"`
+	ExpiresAt     time.Time    `json:"expires_at"`
+	SizeVRAM      int64        `json:"size_vram"`
+	ContextLength int          `json:"context_length"`
 }

 type TokenResponse struct {
@@ -507,6 +558,8 @@ type GenerateResponse struct {
 	Context []int `json:"context,omitempty"`

 	Metrics
+
+	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
 }

 // ModelDetails provides details about a model.
@@ -676,6 +729,113 @@ func DefaultOptions() Options {
 	}
 }

+// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low").
+// Its methods treat a nil *ThinkValue, or one whose Value is nil, as "not set".
+// MarshalJSON writes the wrapped bool or string directly rather than an object
+// with a "Value" field.
+type ThinkValue struct {
+	// Value can be a bool or string
+	Value interface{}
+}
+
+// IsValid reports whether the ThinkValue holds an acceptable setting: not set
+// (nil receiver or nil Value), any bool, or one of the strings "high",
+// "medium" or "low". Every other value is invalid.
+func (t *ThinkValue) IsValid() bool {
+	if t == nil {
+		return true // not set
+	}
+
+	switch level := t.Value.(type) {
+	case nil, bool:
+		// A nil Value means "not set"; both bool values are acceptable.
+		return true
+	case string:
+		switch level {
+		case "high", "medium", "low":
+			return true
+		}
+	}
+	return false
+}
+
+// IsBool reports whether the wrapped value is a bool. A nil receiver or a nil
+// Value yields false.
+func (t *ThinkValue) IsBool() bool {
+	if t == nil {
+		return false
+	}
+	// The comma-ok assertion is false for a nil Value too, so no separate nil
+	// check is needed.
+	_, isBool := t.Value.(bool)
+	return isBool
+}
+
+// IsString reports whether the wrapped value is a string. A nil receiver or a
+// nil Value yields false.
+func (t *ThinkValue) IsString() bool {
+	if t == nil {
+		return false
+	}
+	// The comma-ok assertion is false for a nil Value too, so no separate nil
+	// check is needed.
+	_, isString := t.Value.(string)
+	return isString
+}
+
+// AsBool collapses the value to an on/off answer: a bool is returned as is,
+// the strings "high", "medium" and "low" count as enabled, and anything else
+// (including a nil receiver or nil Value) counts as disabled.
+func (t *ThinkValue) AsBool() bool {
+	if t == nil {
+		return false
+	}
+
+	if enabled, ok := t.Value.(bool); ok {
+		return enabled
+	}
+	if level, ok := t.Value.(string); ok {
+		// Only the three recognised levels mean thinking is enabled.
+		switch level {
+		case "high", "medium", "low":
+			return true
+		}
+	}
+	return false
+}
+
+// AsString returns the value as a level string. A string value is returned
+// unchanged, true maps to the default level "medium", and everything else
+// (false, nil receiver, nil Value, other types) yields "".
+func (t *ThinkValue) AsString() string {
+	if t == nil {
+		return ""
+	}
+
+	if level, ok := t.Value.(string); ok {
+		return level
+	}
+	if enabled, ok := t.Value.(bool); ok && enabled {
+		return "medium" // Default level when just true
+	}
+	return ""
+}
+
+// UnmarshalJSON implements json.Unmarshaler. It accepts a JSON boolean or one
+// of the strings "high", "medium" or "low" and stores it in Value. A JSON null
+// leaves the receiver untouched. Any other input returns an error.
+func (t *ThinkValue) UnmarshalJSON(data []byte) error {
+	// By encoding/json convention UnmarshalJSON(null) is a no-op. Without this
+	// guard, null would decode "successfully" as a bool (null has no effect on
+	// a bool target) and be recorded as an explicit false instead of "not set".
+	if string(data) == "null" {
+		return nil
+	}
+
+	// Try to unmarshal as bool first
+	var b bool
+	if err := json.Unmarshal(data, &b); err == nil {
+		t.Value = b
+		return nil
+	}
+
+	// Try to unmarshal as string
+	var s string
+	if err := json.Unmarshal(data, &s); err == nil {
+		// Validate string values
+		if s != "high" && s != "medium" && s != "low" {
+			return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", true, or false)", s)
+		}
+		t.Value = s
+		return nil
+	}
+
+	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\")")
+}
+
+// MarshalJSON implements json.Marshaler. It writes the wrapped bool or string
+// directly, and writes JSON null when the receiver or its Value is nil.
+func (t *ThinkValue) MarshalJSON() ([]byte, error) {
+	if t != nil && t.Value != nil {
+		return json.Marshal(t.Value)
+	}
+	return []byte("null"), nil
+}
+
 type Duration struct {
 	time.Duration
 }
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -374,24 +374,21 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 }

 func TestThinking_UnmarshalJSON(t *testing.T) {
-	trueVal := true
-	falseVal := false
-
 	tests := []struct {
 		name             string
 		input            string
-		expectedThinking *bool
+		expectedThinking *ThinkValue
 		expectedError    bool
 	}{
 		{
 			name:             "true",
 			input:            `{ "think": true }`,
-			expectedThinking: &trueVal,
+			expectedThinking: &ThinkValue{Value: true},
 		},
 		{
 			name:             "false",
 			input:            `{ "think": false }`,
-			expectedThinking: &falseVal,
+			expectedThinking: &ThinkValue{Value: false},
 		},
 		{
 			name:             "unset",
@@ -399,8 +396,23 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 			expectedThinking: nil,
 		},
 		{
-			name:             "invalid",
-			input:            `{ "think": "true" }`,
+			name:             "string_high",
+			input:            `{ "think": "high" }`,
+			expectedThinking: &ThinkValue{Value: "high"},
+		},
+		{
+			name:             "string_medium",
+			input:            `{ "think": "medium" }`,
+			expectedThinking: &ThinkValue{Value: "medium"},
+		},
+		{
+			name:             "string_low",
+			input:            `{ "think": "low" }`,
+			expectedThinking: &ThinkValue{Value: "low"},
+		},
+		{
+			name:             "invalid_string",
+			input:            `{ "think": "invalid" }`,
 			expectedThinking: nil,
 			expectedError:    true,
 		},
@@ -414,7 +426,12 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 				require.Error(t, err)
 			} else {
 				require.NoError(t, err)
-				assert.Equal(t, test.expectedThinking, req.Think)
+				if test.expectedThinking == nil {
+					assert.Nil(t, req.Think)
+				} else {
+					require.NotNil(t, req.Think)
+					assert.Equal(t, test.expectedThinking.Value, req.Think.Value)
+				}
 			}
 		})
 	}
--- a/api/types_typescript_test.go
+++ b/api/types_typescript_test.go
@@ -0,0 +1,142 @@
+package api
+
+import (
+	"testing"
+)
+
+// TestToolParameterToTypeScriptType checks ToolProperty.ToTypeScriptType
+// against single types, type unions, anyOf (flat and nested) and the "any"
+// fallbacks for empty or unrecognised types.
+func TestToolParameterToTypeScriptType(t *testing.T) {
+	// prop builds a ToolProperty carrying only the given type names.
+	prop := func(types ...string) ToolProperty {
+		return ToolProperty{Type: PropertyType(types)}
+	}
+	// anyOf builds a ToolProperty carrying only the given alternatives.
+	anyOf := func(alts ...ToolProperty) ToolProperty {
+		return ToolProperty{AnyOf: alts}
+	}
+
+	cases := []struct {
+		name  string
+		param ToolProperty
+		want  string
+	}{
+		{"single string type", prop("string"), "string"},
+		{"single number type", prop("number"), "number"},
+		{"integer maps to number", prop("integer"), "number"},
+		{"boolean type", prop("boolean"), "boolean"},
+		{"array type", prop("array"), "any[]"},
+		{"object type", prop("object"), "Record<string, any>"},
+		{"null type", prop("null"), "null"},
+		{"multiple types as union", prop("string", "number"), "string | number"},
+		{"string or null union", prop("string", "null"), "string | null"},
+		{
+			"anyOf with single types",
+			anyOf(prop("string"), prop("number")),
+			"string | number",
+		},
+		{
+			"anyOf with multiple types in each branch",
+			anyOf(prop("string", "null"), prop("number")),
+			"string | null | number",
+		},
+		{
+			"nested anyOf",
+			anyOf(prop("boolean"), anyOf(prop("string"), prop("number"))),
+			"boolean | string | number",
+		},
+		{"empty type returns any", ToolProperty{Type: PropertyType{}}, "any"},
+		{"unknown type maps to any", prop("unknown_type"), "any"},
+		{
+			"multiple types including array",
+			prop("string", "array", "null"),
+			"string | any[] | null",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := tc.param.ToTypeScriptType(); got != tc.want {
+				t.Errorf("ToTypeScriptType() = %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
--- a/benchmark/server_benchmark_test.go
+++ b/benchmark/server_benchmark_test.go
@@ -1,178 +0,0 @@
-package benchmark
-
-import (
-	"context"
-	"flag"
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-// Command line flags
-var modelFlag string
-
-func init() {
-	flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
-	flag.Lookup("m").DefValue = "model"
-}
-
-// modelName returns the model name from flags, failing the test if not set
-func modelName(b *testing.B) string {
-	if modelFlag == "" {
-		b.Fatal("Error: -m flag is required for benchmark tests")
-	}
-	return modelFlag
-}
-
-type TestCase struct {
-	name      string
-	prompt    string
-	maxTokens int
-}
-
-// runGenerateBenchmark contains the common generate and metrics logic
-func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
-	start := time.Now()
-	var ttft time.Duration
-	var metrics api.Metrics
-
-	err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
-		if ttft == 0 && resp.Response != "" {
-			ttft = time.Since(start)
-		}
-		if resp.Done {
-			metrics = resp.Metrics
-		}
-		return nil
-	})
-
-	// Report custom metrics as part of the benchmark results
-	b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
-	b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
-
-	// Token throughput metrics
-	promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds()
-	genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds()
-	b.ReportMetric(promptThroughput, "prompt_tok/s")
-	b.ReportMetric(genThroughput, "gen_tok/s")
-
-	// Token counts
-	b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
-	b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
-	if err != nil {
-		b.Fatal(err)
-	}
-}
-
-// BenchmarkColdStart runs benchmarks with model loading from cold state
-func BenchmarkColdStart(b *testing.B) {
-	client := setup(b)
-	tests := []TestCase{
-		{"short_prompt", "Write a long story", 100},
-		{"medium_prompt", "Write a detailed economic analysis", 500},
-		{"long_prompt", "Write a comprehensive AI research paper", 1000},
-	}
-	m := modelName(b)
-
-	for _, tt := range tests {
-		b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
-			ctx := b.Context()
-
-			// Set number of tokens as our throughput metric
-			b.SetBytes(int64(tt.maxTokens))
-
-			for b.Loop() {
-				b.StopTimer()
-				// Ensure model is unloaded before each iteration
-				unload(client, m, b)
-				b.StartTimer()
-
-				req := &api.GenerateRequest{
-					Model:   m,
-					Prompt:  tt.prompt,
-					Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
-				}
-
-				runGenerateBenchmark(b, ctx, client, req)
-			}
-		})
-	}
-}
-
-// BenchmarkWarmStart runs benchmarks with pre-loaded model
-func BenchmarkWarmStart(b *testing.B) {
-	client := setup(b)
-	tests := []TestCase{
-		{"short_prompt", "Write a long story", 100},
-		{"medium_prompt", "Write a detailed economic analysis", 500},
-		{"long_prompt", "Write a comprehensive AI research paper", 1000},
-	}
-	m := modelName(b)
-
-	for _, tt := range tests {
-		b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
-			ctx := b.Context()
-
-			// Pre-warm the model
-			warmup(client, m, tt.prompt, b)
-
-			// Set number of tokens as our throughput metric
-			b.SetBytes(int64(tt.maxTokens))
-
-			for b.Loop() {
-				req := &api.GenerateRequest{
-					Model:   m,
-					Prompt:  tt.prompt,
-					Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
-				}
-
-				runGenerateBenchmark(b, ctx, client, req)
-			}
-		})
-	}
-}
-
-// setup verifies server and model availability
-func setup(b *testing.B) *api.Client {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		b.Fatal(err)
-	}
-	if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
-		b.Fatalf("Model unavailable: %v", err)
-	}
-
-	return client
-}
-
-// warmup ensures the model is loaded and warmed up
-func warmup(client *api.Client, model string, prompt string, b *testing.B) {
-	for range 3 {
-		err := client.Generate(
-			context.Background(),
-			&api.GenerateRequest{
-				Model:   model,
-				Prompt:  prompt,
-				Options: map[string]any{"num_predict": 50, "temperature": 0.1},
-			},
-			func(api.GenerateResponse) error { return nil },
-		)
-		if err != nil {
-			b.Logf("Error during model warm-up: %v", err)
-		}
-	}
-}
-
-// unload forces model unloading using KeepAlive: 0 parameter
-func unload(client *api.Client, model string, b *testing.B) {
-	req := &api.GenerateRequest{
-		Model:     model,
-		KeepAlive: &api.Duration{Duration: 0},
-	}
-	if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
-		b.Logf("Unload error: %v", err)
-	}
-	time.Sleep(1 * time.Second)
-}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -322,11 +322,23 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	thinkFlag := cmd.Flags().Lookup("think")
 	if thinkFlag.Changed {
-		think, err := cmd.Flags().GetBool("think")
+		thinkStr, err := cmd.Flags().GetString("think")
 		if err != nil {
 			return err
 		}
-		opts.Think = &think
+
+		// Handle different values for --think
+		switch thinkStr {
+		case "", "true":
+			// --think or --think=true
+			opts.Think = &api.ThinkValue{Value: true}
+		case "false":
+			opts.Think = &api.ThinkValue{Value: false}
+		case "high", "medium", "low":
+			opts.Think = &api.ThinkValue{Value: thinkStr}
+		default:
+			return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, or low)", thinkStr)
+		}
 	} else {
 		opts.Think = nil
 	}
@@ -583,12 +595,13 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 			} else {
 				until = format.HumanTime(m.ExpiresAt, "Never")
 			}
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
+			ctxStr := strconv.Itoa(m.ContextLength)
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, ctxStr, until})
 		}
 	}

 	table := tablewriter.NewWriter(os.Stdout)
-	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
+	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "CONTEXT", "UNTIL"})
 	table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
 	table.SetAlignment(tablewriter.ALIGN_LEFT)
 	table.SetHeaderLine(false)
@@ -976,7 +989,7 @@ type runOptions struct {
 	Options      map[string]any
 	MultiModal   bool
 	KeepAlive    *api.Duration
-	Think        *bool
+	Think        *api.ThinkValue
 	HideThinking bool
 }

@@ -1016,10 +1029,11 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 				}

 				switch ch {
-				case ' ':
+				case ' ', '\t':
 					state.wordBuffer = ""
-				case '\n':
+				case '\n', '\r':
 					state.lineLength = 0
+					state.wordBuffer = ""
 				default:
 					state.wordBuffer += string(ch)
 				}
@@ -1077,12 +1091,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	}()

 	var state *displayResponseState = &displayResponseState{}
+	var thinkingContent strings.Builder
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
-	var role string
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false

+	role := "assistant"
+
 	fn := func(response api.ChatResponse) error {
 		if response.Message.Content != "" || !opts.HideThinking {
 			p.StopAndClear()
@@ -1095,14 +1111,21 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(false))
 				thinkTagOpened = true
+				thinkTagClosed = false
 			}
+			thinkingContent.WriteString(response.Message.Thinking)
 			displayResponse(response.Message.Thinking, opts.WordWrap, state)
 		}

 		content := response.Message.Content
-		if thinkTagOpened && !thinkTagClosed && content != "" {
+		if thinkTagOpened && !thinkTagClosed && (content != "" || len(response.Message.ToolCalls) > 0) {
+			if !strings.HasSuffix(thinkingContent.String(), "\n") {
+				fmt.Println()
+			}
 			fmt.Print(thinkingOutputClosingText(false))
+			thinkTagOpened = false
 			thinkTagClosed = true
+			state = &displayResponseState{}
 		}
 		// purposefully not putting thinking blocks in the response, which would
 		// only be needed if we later added tool calling to the cli (they get
@@ -1110,6 +1133,13 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		// about to finish some tool calls)
 		fullResponse.WriteString(content)

+		if response.Message.ToolCalls != nil {
+			toolCalls := response.Message.ToolCalls
+			if len(toolCalls) > 0 {
+				fmt.Print(renderToolCalls(toolCalls, false))
+			}
+		}
+
 		displayResponse(content, opts.WordWrap, state)

 		return nil
@@ -1135,6 +1165,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		if errors.Is(err, context.Canceled) {
 			return nil, nil
 		}
+
+		// this error should ideally be wrapped properly by the client
+		if strings.Contains(err.Error(), "upstream error") {
+			p.StopAndClear()
+			fmt.Println("An error occurred while processing your message. Please try again.")
+			fmt.Println()
+			return nil, nil
+		}
 		return nil, err
 	}

@@ -1186,6 +1224,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}()

 	var state *displayResponseState = &displayResponseState{}
+	var thinkingContent strings.Builder
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false

@@ -1203,17 +1242,31 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(plainText))
 				thinkTagOpened = true
+				thinkTagClosed = false
 			}
+			thinkingContent.WriteString(response.Thinking)
 			displayResponse(response.Thinking, opts.WordWrap, state)
 		}

-		if thinkTagOpened && !thinkTagClosed && content != "" {
+		if thinkTagOpened && !thinkTagClosed && (content != "" || len(response.ToolCalls) > 0) {
+			if !strings.HasSuffix(thinkingContent.String(), "\n") {
+				fmt.Println()
+			}
 			fmt.Print(thinkingOutputClosingText(plainText))
+			thinkTagOpened = false
 			thinkTagClosed = true
+			state = &displayResponseState{}
 		}

 		displayResponse(content, opts.WordWrap, state)

+		if response.ToolCalls != nil {
+			toolCalls := response.ToolCalls
+			if len(toolCalls) > 0 {
+				fmt.Print(renderToolCalls(toolCalls, plainText))
+			}
+		}
+
 		return nil
 	}

@@ -1416,13 +1469,13 @@ func NewCLI() *cobra.Command {

 	createCmd := &cobra.Command{
 		Use:     "create MODEL",
-		Short:   "Create a model from a Modelfile",
+		Short:   "Create a model",
 		Args:    cobra.ExactArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE:    CreateHandler,
 	}

-	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
+	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\")")
 	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")

 	showCmd := &cobra.Command{
@@ -1453,7 +1506,8 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
-	runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
+	runCmd.Flags().String("think", "", "Enable thinking mode: true/false or high/medium/low for supported models")
+	runCmd.Flags().Lookup("think").NoOptDefVal = "true"
 	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")

 	stopCmd := &cobra.Command{
@@ -1603,7 +1657,7 @@ func NewCLI() *cobra.Command {
 // to false).
 //
 // If capabilities are not provided, we fetch them from the server.
-func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
+func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*api.ThinkValue, error) {
 	if explicitlySetByUser {
 		return runOpts.Think, nil
 	}
@@ -1630,9 +1684,34 @@ func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicit
 	}

 	if thinkingSupported {
-		thinking := true
-		return &thinking, nil
+		return &api.ThinkValue{Value: true}, nil
 	}

 	return nil, nil
 }
+
+// renderToolCalls formats the tool calls a model emitted into one message per
+// call, separated by newlines, for printing in the CLI. Unless plainText is
+// set, the explanatory text is wrapped in grey/bold escape sequences and the
+// function name and arguments are switched back to the default colour. If any
+// call's arguments cannot be marshalled to JSON, the result is "".
+func renderToolCalls(toolCalls []api.ToolCall, plainText bool) string {
+	// Both escape prefixes stay empty in plain-text mode, so one format string
+	// serves both modes.
+	var explainStyle, valueStyle string
+	if !plainText {
+		explainStyle = readline.ColorGrey + readline.ColorBold
+		valueStyle = readline.ColorDefault
+	}
+
+	var sb strings.Builder
+	sb.WriteString(explainStyle)
+	for i, call := range toolCalls {
+		args, err := json.Marshal(call.Function.Arguments)
+		if err != nil {
+			return ""
+		}
+		if i > 0 {
+			sb.WriteString("\n")
+		}
+		// all tool calls are unexpected since we don't currently support registering any in the CLI
+		fmt.Fprintf(&sb, "  Model called a non-existent function '%s()' with arguments: %s", valueStyle+call.Function.Name+explainStyle, valueStyle+string(args)+explainStyle)
+	}
+	if !plainText {
+		sb.WriteString(readline.ColorDefault)
+	}
+	return sb.String()
+}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -272,16 +272,29 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					}
 					fmt.Println("Set 'quiet' mode.")
 				case "think":
-					think := true
-					opts.Think = &think
+					thinkValue := api.ThinkValue{Value: true}
+					var maybeLevel string
+					if len(args) > 2 {
+						maybeLevel = args[2]
+					}
+					if maybeLevel != "" {
+						// TODO(drifkin): validate the level, could be model dependent
+						// though... It will also be validated on the server once a call is
+						// made.
+						thinkValue.Value = maybeLevel
+					}
+					opts.Think = &thinkValue
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
-					fmt.Println("Set 'think' mode.")
+					if maybeLevel != "" {
+						fmt.Printf("Set 'think' mode to '%s'.\n", maybeLevel)
+					} else {
+						fmt.Println("Set 'think' mode.")
+					}
 				case "nothink":
-					think := false
-					opts.Think = &think
+					opts.Think = &api.ThinkValue{Value: false}
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
@@ -385,18 +398,21 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				case "modelfile":
 					fmt.Println(resp.Modelfile)
 				case "parameters":
+					fmt.Println("Model defined parameters:")
 					if resp.Parameters == "" {
-						fmt.Println("No parameters were specified for this model.")
+						fmt.Println("  No additional parameters were specified for this model.")
 					} else {
-						if len(opts.Options) > 0 {
-							fmt.Println("User defined parameters:")
-							for k, v := range opts.Options {
-								fmt.Printf("%-*s %v\n", 30, k, v)
-							}
-							fmt.Println()
+						for _, l := range strings.Split(resp.Parameters, "\n") {
+							fmt.Printf("  %s\n", l)
 						}
-						fmt.Println("Model defined parameters:")
-						fmt.Println(resp.Parameters)
+					}
+					fmt.Println()
+					if len(opts.Options) > 0 {
+						fmt.Println("User defined parameters:")
+						for k, v := range opts.Options {
+							fmt.Printf("  %-*s %v\n", 30, k, v)
+						}
+						fmt.Println()
 					}
 				case "system":
 					switch {
@@ -475,7 +491,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 			assistant, err := chat(cmd, opts)
 			if err != nil {
-				if strings.Contains(err.Error(), "does not support thinking") {
+				if strings.Contains(err.Error(), "does not support thinking") ||
+					strings.Contains(err.Error(), "invalid think value") {
 					fmt.Printf("error: %v\n", err)
 					sb.Reset()
 					continue
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@@ -5,7 +5,7 @@ import (
 	"errors"
 	"os"
 	"os/exec"
-	"strings"
+	"regexp"

 	"github.com/ollama/ollama/api"
 )
@@ -19,11 +19,12 @@ func startApp(ctx context.Context, client *api.Client) error {
 	if err != nil {
 		return err
 	}
-	if !strings.Contains(link, "Ollama.app") {
+	// Match the app bundle path up to and including "Ollama.app", optionally
+	// with a space and/or digits after "Ollama". The dot is escaped so that
+	// only a literal ".app" suffix qualifies, not any character before "app".
+	r := regexp.MustCompile(`^.*/Ollama\s?\d*\.app`)
+	m := r.FindStringSubmatch(link)
+	if len(m) != 1 {
 		return errors.New("could not find ollama app")
 	}
-	path := strings.Split(link, "Ollama.app")
-	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
+	if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
 		return err
 	}
 	return waitForServer(ctx, client)
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -45,14 +45,11 @@ func startApp(ctx context.Context, client *api.Client) error {
 			}
 		}
 	}
-	// log.Printf("XXX attempting to start app %s", appExe)

 	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe)
-	// TODO - these hide flags aren't working - still pops up a command window for some reason
+	cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}

-	// TODO this didn't help either...
 	cmd.Stdin = strings.NewReader("")
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -74,7 +71,16 @@ func isProcRunning(procName string) []uint32 {
 		slog.Debug("failed to check for running installers", "error", err)
 		return nil
 	}
-	pids = pids[:ret]
+	if ret > uint32(len(pids)) {
+		pids = make([]uint32, ret+10)
+		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
+			slog.Debug("failed to check for running installers", "error", err)
+			return nil
+		}
+	}
+	if ret < uint32(len(pids)) {
+		pids = pids[:ret]
+	}
 	var matches []uint32
 	for _, pid := range pids {
 		if pid == 0 {
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -190,6 +190,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &gemma2Model{}
 	case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration":
 		conv = &gemma3Model{Architecture: p.Architectures[0]}
+	case "Gemma3nForConditionalGeneration":
+		conv = &gemma3nModel{}
 	case "Phi3ForCausalLM":
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
@@ -200,6 +202,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &bertModel{}
 	case "CohereForCausalLM":
 		conv = &commandrModel{}
+	case "GptOssForCausalLM":
+		conv = &gptossModel{}
 	default:
 		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}
--- a/convert/convert_gemma3n.go
+++ b/convert/convert_gemma3n.go
@@ -0,0 +1,165 @@
+package convert
+
+import (
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+	"gonum.org/v1/gonum/stat/distuv"
+)
+
+// gemma3nModel holds the configuration fields decoded (via the json tags
+// below) for checkpoints whose architecture is
+// "Gemma3nForConditionalGeneration".
+type gemma3nModel struct {
+	ModelParameters
+
+	// TextModel mirrors the nested "text_config" object; these are the only
+	// values KV and Tensors read.
+	TextModel struct {
+		ActivationSparsityPattern []float32 `json:"activation_sparsity_pattern"`
+		AltupActiveIdx            uint32    `json:"altup_active_idx"`
+		AltupCoefClip             float32   `json:"altup_coef_clip"`
+		AltupCorrectScale         bool      `json:"altup_correct_scale"`
+		AltupLRMultiplier         float32   `json:"altup_lr_multiplier"`
+		AltupNumInputs            uint32    `json:"altup_num_inputs"`
+		HeadDim                   uint32    `json:"head_dim"`
+		HiddenSize                uint32    `json:"hidden_size"`
+		HiddenSizePerLayerInput   uint32    `json:"hidden_size_per_layer_input"`
+		IntermediateSize          uint32    `json:"intermediate_size"`
+		MaxPositionEmbeddings     uint32    `json:"max_position_embeddings"`
+		NumAttentionHeads         uint32    `json:"num_attention_heads"`
+		NumHiddenLayers           uint32    `json:"num_hidden_layers"`
+		NumKeyValueHeads          uint32    `json:"num_key_value_heads"`
+		NumKVSharedLayers         uint32    `json:"num_kv_shared_layers"`
+		RMSNormEPS                float32   `json:"rms_norm_eps"`
+		RopeLocalBaseFreq         float32   `json:"rope_local_base_freq"`
+		RopeTheta                 float32   `json:"rope_theta"`
+		SlidingWindow             uint32    `json:"sliding_window"`
+		LayerTypes                []string  `json:"layer_types"`
+	} `json:"text_config"`
+	// VisionModel accepts the "vision_config" object but decodes none of its
+	// fields.
+	VisionModel struct{} `json:"vision_config"`
+}
+
+// KV returns the GGUF key/value metadata for a gemma3n model. It starts from
+// the metadata produced by the embedded ModelParameters and adds the
+// "gemma3n.*" keys derived from the text configuration.
+func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
+	text := &m.TextModel
+
+	// sparsityScales yields, for every entry of the activation sparsity
+	// pattern, the standard normal quantile of that entry as a float32.
+	sparsityScales := func(yield func(float32) bool) {
+		stdNormal := distuv.Normal{Mu: 0, Sigma: 1}
+		for _, p := range text.ActivationSparsityPattern {
+			if !yield(float32(stdNormal.Quantile(float64(p)))) {
+				return
+			}
+		}
+	}
+
+	// slidingLayers yields one bool per layer: true when the layer type is
+	// "sliding_attention".
+	slidingLayers := func(yield func(bool) bool) {
+		for _, layerType := range text.LayerTypes {
+			if !yield(layerType == "sliding_attention") {
+				return
+			}
+		}
+	}
+
+	kv := m.ModelParameters.KV(t)
+	kv["general.architecture"] = "gemma3n"
+	kv["gemma3n.activation_sparsity_scale"] = slices.Collect(sparsityScales)
+	kv["gemma3n.altup.active_idx"] = text.AltupActiveIdx
+	kv["gemma3n.altup.correct_scale"] = text.AltupCorrectScale
+	kv["gemma3n.altup.lr_multiplier"] = text.AltupLRMultiplier
+	kv["gemma3n.altup.num_inputs"] = text.AltupNumInputs
+	kv["gemma3n.attention.head_count_kv"] = text.NumKeyValueHeads
+	kv["gemma3n.attention.head_count"] = text.NumAttentionHeads
+	kv["gemma3n.attention.layer_norm_rms_epsilon"] = text.RMSNormEPS
+	kv["gemma3n.attention.sliding_window"] = text.SlidingWindow
+	kv["gemma3n.attention.sliding_window_pattern"] = slices.Collect(slidingLayers)
+	kv["gemma3n.attention.shared_kv_layers"] = text.NumKVSharedLayers
+	kv["gemma3n.block_count"] = text.NumHiddenLayers
+	kv["gemma3n.context_length"] = text.MaxPositionEmbeddings
+	kv["gemma3n.embedding_length_per_layer_input"] = text.HiddenSizePerLayerInput
+	kv["gemma3n.embedding_length"] = text.HiddenSize
+	kv["gemma3n.feed_forward_length"] = text.IntermediateSize
+	kv["gemma3n.head_dim"] = text.HeadDim
+	kv["gemma3n.rope.freq_base_local"] = text.RopeLocalBaseFreq
+	kv["gemma3n.rope.freq_base"] = text.RopeTheta
+	return kv
+}
+
+// Tensors turns the parsed checkpoint tensors into GGUF tensors.
+//
+// The per-index altup projection weights are first merged into the single
+// tensors "altup_proj.weight" and "altup_unembd_proj.weight". Of the
+// remaining tensors, audio and vision tensors are dropped, and the altup
+// prediction/correction coefficients are clamped to +/-AltupCoefClip when
+// that limit is positive. Everything else is passed through unchanged.
+func (m *gemma3nModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	out, ts := mergeTensors(ts,
+		merge{"altup_proj.*.weight", "altup_proj.weight"},
+		merge{"altup_unembd_proj.*.weight", "altup_unembd_proj.weight"},
+	)
+
+	// containsAny reports whether name contains at least one of subs.
+	containsAny := func(name string, subs ...string) bool {
+		return slices.ContainsFunc(subs, func(sub string) bool {
+			return strings.Contains(name, sub)
+		})
+	}
+
+	// clampCoefs is the repacker installed on the altup coefficient tensors:
+	// it clamps every element to [-AltupCoefClip, AltupCoefClip] and returns
+	// the result flattened to a vector.
+	clampCoefs := func(_ string, data []float32, shape []uint64) ([]float32, error) {
+		dims := make([]int, 0, len(shape))
+		for _, d := range shape {
+			dims = append(dims, int(d))
+		}
+
+		var coefs tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+		limit := m.TextModel.AltupCoefClip
+		coefs, err := tensor.Clamp(coefs, -limit, limit)
+		if err != nil {
+			return nil, err
+		}
+
+		if err := coefs.Reshape(coefs.Shape().TotalSize()); err != nil {
+			return nil, err
+		}
+
+		return native.VectorF32(coefs.(*tensor.Dense))
+	}
+
+	for _, t := range ts {
+		name := t.Name()
+		if containsAny(name, "audio_tower", "embed_audio", "vision_tower", "embed_vision") {
+			// TODO: handle audio and vision towers
+			continue
+		}
+
+		if containsAny(name, "altup_predict_coef", "altup_correct_coef") && m.TextModel.AltupCoefClip > 0 {
+			t.SetRepacker(clampCoefs)
+		}
+
+		out = append(out, &ggml.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+// Replacements returns the old/new substring pairs used to rename gemma3n
+// checkpoint tensors. The slice is a flat list of (old, new) pairs; pairs that
+// share a prefix are listed longest first.
+func (m *gemma3nModel) Replacements() []string {
+	return []string{
+		// top-level language model tensors
+		"model.language_model.embed_tokens_per_layer", "per_layer_token_embd",
+		"model.language_model.embed_tokens", "token_embd",
+		"model.language_model.per_layer_model_projection", "per_layer_model_proj",
+		"model.language_model.per_layer_projection_norm", "per_layer_proj_norm",
+		"model.language_model.altup_projections", "altup_proj",
+		"model.language_model.altup_unembed_projections", "altup_unembd_proj",
+		"model.language_model.norm", "output_norm",
+		"model.language_model.layers", "blk",
+
+		// per-layer tensors
+		"input_layernorm", "attn_norm",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.q_norm", "attn_q_norm",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.k_norm", "attn_k_norm",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"post_attention_layernorm", "post_attention_norm",
+		"pre_feedforward_layernorm", "ffn_norm",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"mlp.down_proj", "ffn_down",
+		"post_feedforward_layernorm", "post_ffw_norm",
+		"per_layer_input_gate", "inp_gate",
+		"per_layer_projection", "proj",
+		"post_per_layer_input_norm", "post_norm",
+		"altup.", "altup_",
+		"modality_router", "router",
+		"prediction_coefs", "predict_coef",
+		"correction_coefs", "correct_coef",
+		"correct_output_scale", "correct_scale.weight",
+		"laurel.", "laurel_",
+		"linear_left", "l",
+		"linear_right", "r",
+		"post_laurel_norm", "post_norm",
+	}
+}
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -0,0 +1,178 @@
+package convert
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/binary"
+	"io"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+// gptossModel holds the configuration fields decoded (via the json tags
+// below) for checkpoints whose architecture is "GptOssForCausalLM".
+type gptossModel struct {
+	ModelParameters
+	HiddenLayers         uint32  `json:"num_hidden_layers"`
+	HiddenSize           uint32  `json:"hidden_size"`
+	IntermediateSize     uint32  `json:"intermediate_size"`
+	AttentionHeads       uint32  `json:"num_attention_heads"`
+	KeyValueHeads        uint32  `json:"num_key_value_heads"`
+	HeadDim              uint32  `json:"head_dim"`
+	Experts              uint32  `json:"num_experts"`
+	ExpertsPerToken      uint32  `json:"experts_per_token"`
+	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
+	InitialContextLength uint32  `json:"initial_context_length"`
+	RopeTheta            float32 `json:"rope_theta"`
+	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
+	SlidingWindow        uint32  `json:"sliding_window"`
+}
+
+// Compile-time check that *gptossModel implements ModelConverter.
+var _ ModelConverter = (*gptossModel)(nil)
+
+// KV returns the GGUF key/value metadata for a gptoss model. It starts from
+// the metadata produced by the embedded ModelParameters, adds the "gptoss.*"
+// keys, and then overrides several tokenizer keys with fixed token ids.
+func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
+	kv := m.ModelParameters.KV(t)
+	kv["general.architecture"] = "gptoss"
+	// NOTE(review): the meaning of file type 4 is not visible here — confirm
+	// against the ggml file type table.
+	kv["general.file_type"] = uint32(4)
+	// Context length is the initial context length scaled by the rope scaling
+	// factor; the float product is truncated when converted to uint32.
+	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
+	kv["gptoss.block_count"] = m.HiddenLayers
+	kv["gptoss.embedding_length"] = m.HiddenSize
+	kv["gptoss.feed_forward_length"] = m.IntermediateSize
+	kv["gptoss.expert_count"] = m.Experts
+	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
+	kv["gptoss.attention.head_count"] = m.AttentionHeads
+	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
+	// Key and value lengths are both taken from head_dim.
+	kv["gptoss.attention.key_length"] = m.HeadDim
+	kv["gptoss.attention.value_length"] = m.HeadDim
+	// cmp.Or returns its first non-zero argument, so 1e-5 is used when the
+	// config leaves rms_norm_eps unset (zero).
+	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
+	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
+	kv["gptoss.rope.freq_base"] = m.RopeTheta
+	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
+	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
+	// Special token ids are hard-coded rather than read from the tokenizer.
+	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
+	kv["tokenizer.ggml.add_bos_token"] = false
+	kv["tokenizer.ggml.eos_token_id"] = uint32(199999) // <|endoftext|>
+	kv["tokenizer.ggml.eos_token_ids"] = []int32{
+		199999, /* <|endoftext|> */
+		200002, /* <|return|> */
+		200012, /* <|call|> */
+	}
+	kv["tokenizer.ggml.add_eos_token"] = false
+	return kv
+}
+
+// Tensors turns the parsed checkpoint tensors into GGUF tensors.
+//
+// Tensors whose names end in ".blocks" or ".scales" are paired by the name
+// that precedes the suffix and emitted as one MXFP4 tensor per pair; every
+// other tensor is passed through unchanged. The merged tensor takes its shape
+// from the blocks tensor: the first two dimensions are kept and the last two
+// are collapsed into dims[2]*dims[3]*2.
+//
+// Merged tensors are appended in sorted name order so that repeated
+// conversions of the same checkpoint produce the same tensor list (ranging
+// over the map directly would yield a different order on each run).
+//
+// A pair that has scales but no blocks, or a blocks tensor with fewer than
+// four dimensions, makes this function panic.
+func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+	pairs := make(map[string]*mxfp4)
+	for _, t := range ts {
+		if strings.HasSuffix(t.Name(), ".blocks") || strings.HasSuffix(t.Name(), ".scales") {
+			dot := strings.LastIndex(t.Name(), ".")
+			name, suffix := t.Name()[:dot], t.Name()[dot+1:]
+			if _, ok := pairs[name]; !ok {
+				pairs[name] = &mxfp4{}
+			}
+
+			switch suffix {
+			case "blocks":
+				pairs[name].blocks = t
+			case "scales":
+				pairs[name].scales = t
+			}
+		} else {
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		}
+	}
+
+	// Go randomizes map iteration order, so walk the names sorted instead.
+	names := make([]string, 0, len(pairs))
+	for name := range pairs {
+		names = append(names, name)
+	}
+	slices.Sort(names)
+
+	for _, name := range names {
+		pair := pairs[name]
+		dims := pair.blocks.Shape()
+		out = append(out, &ggml.Tensor{
+			Name:     name,
+			Kind:     uint32(ggml.TensorTypeMXFP4),
+			Shape:    []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
+			WriterTo: pair,
+		})
+	}
+
+	return out
+}
+
+// Replacements returns the old/new substring pairs used to rename gptoss
+// checkpoint tensors, as a flat list of (old, new) pairs.
+//
+// The order of the list matters: the ".blocks"/".scales" identity pairs come
+// first so those suffixes are not rewritten by the later "block" and "scale"
+// pairs, and the specific "attn.norm"/"mlp.norm" pairs precede the bare
+// "norm" pair.
+// NOTE(review): this relies on the list being fed to strings.NewReplacer,
+// which compares old strings in argument order — confirm against the caller.
+func (m *gptossModel) Replacements() []string {
+	return []string{
+		// noop replacements so other replacements will not be applied
+		".blocks", ".blocks",
+		".scales", ".scales",
+		// real replacements
+		"block", "blk",
+		"attn.norm", "attn_norm",
+		"attn.qkv", "attn_qkv",
+		"attn.sinks", "attn_sinks",
+		"attn.out", "attn_out",
+		"mlp.norm", "ffn_norm",
+		"mlp.gate", "ffn_gate_inp",
+		"mlp.mlp1_", "ffn_gate_up_exps.",
+		"mlp.mlp2_", "ffn_down_exps.",
+		"embedding", "token_embd",
+		"norm", "output_norm",
+		"unembedding", "output",
+		"scale", "weight",
+	}
+}
+
+// mxfp4 pairs the ".blocks" and ".scales" tensors that share a name so they
+// can be written out together as a single tensor by WriteTo.
+type mxfp4 struct {
+	blocks, scales Tensor
+}
+
+// WriteTo writes the combined scales and blocks data to w and returns the
+// number of bytes written.
+//
+// Both tensors are read fully into memory as raw bytes. The scales shape is
+// padded with trailing 1s up to the rank of the blocks tensor, then scales
+// and blocks are concatenated along axis 3 with the scales placed first. The
+// result is flattened and written as a byte vector.
+//
+// On any error the returned count is 0, matching the previous behaviour;
+// on success it is the size of the written payload, as the io.WriterTo
+// contract requires (it used to be 0 unconditionally).
+func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
+	var b bytes.Buffer
+	if _, err := m.blocks.WriteTo(&b); err != nil {
+		return 0, err
+	}
+
+	blocksDims := make([]int, len(m.blocks.Shape()))
+	for i, d := range m.blocks.Shape() {
+		blocksDims[i] = int(d)
+	}
+
+	var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
+
+	var s bytes.Buffer
+	if _, err := m.scales.WriteTo(&s); err != nil {
+		return 0, err
+	}
+
+	// Start from all-ones at the blocks rank so that scales, which may have
+	// fewer dimensions, can be concatenated with blocks.
+	scalesDims := slices.Repeat([]int{1}, len(m.blocks.Shape()))
+	for i, d := range m.scales.Shape() {
+		scalesDims[i] = int(d)
+	}
+
+	var scales tensor.Tensor = tensor.New(tensor.WithShape(scalesDims...), tensor.WithBacking(s.Bytes()))
+
+	out, err := tensor.Concat(3, scales, blocks)
+	if err != nil {
+		return 0, err
+	}
+
+	out = tensor.Materialize(out)
+
+	// flatten so the data can be extracted as a single vector
+	if err := out.Reshape(out.Shape().TotalSize()); err != nil {
+		return 0, err
+	}
+
+	u8s, err := native.VectorU8(out.(*tensor.Dense))
+	if err != nil {
+		return 0, err
+	}
+
+	if err := binary.Write(w, binary.LittleEndian, u8s); err != nil {
+		return 0, err
+	}
+
+	// binary.Write emits one byte per element of a byte slice.
+	return int64(len(u8s)), nil
+}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -2,9 +2,6 @@ package convert

 import (
 	"fmt"
-	"io"
-	"slices"
-	"strings"

 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 }

 func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	oldnew := []string{
-		"model.layers", "blk",
-		"w1", "ffn_gate_exps",
-		"w2", "ffn_down_exps",
-		"w3", "ffn_up_exps",
-	}
-
-	for i := range p.NumLocalExperts {
-		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
-	}
-
-	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
-	namer := strings.NewReplacer(oldnew...)
-	experts := make(map[string]experts)
-
-	// merge experts into a single tensor while removing them from ts
-	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
-		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
-			return false
-		}
-
-		name := namer.Replace(t.Name())
-		experts[name] = append(experts[name], t)
-		return true
-	})
-
-	var out []*ggml.Tensor
-	for n, e := range experts {
-		// TODO(mxyng): sanity check experts
-		out = append(out, &ggml.Tensor{
-			Name:     n,
-			Kind:     e[0].Kind(),
-			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
-			WriterTo: e,
+	merges := make([]merge, 0, p.NumHiddenLayers*6)
+	for i := range p.NumHiddenLayers {
+		merges = append(merges, merge{
+			fmt.Sprintf("blk.%d.*.w1.weight", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w1.bias", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w2.weight", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w2.bias", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w3.weight", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w3.bias", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
 		})
 	}

+	out, ts := mergeTensors(ts, merges...)
 	return append(out, p.llamaModel.Tensors(ts)...)
 }

+// Replacements returns the llama replacement pairs followed by the
+// mixtral-specific ones. The "block_sparse_moe.experts." -> "." pair strips
+// the expert container from tensor names, leaving names of the form that the
+// "blk.%d.*.w1.weight"-style merge patterns in Tensors are written against.
 func (p *mixtralModel) Replacements() []string {
 	return append(
 		p.llamaModel.Replacements(),
+		"model.layers", "blk",
 		"block_sparse_moe.gate", "ffn_gate_inp",
+		"block_sparse_moe.experts.", ".",
 	)
 }
-
-type experts []Tensor
-
-func (e experts) WriteTo(w io.Writer) (int64, error) {
-	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
-	for _, t := range e {
-		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
-		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
-		// this accomplishes the same thing by writing each expert tensor in sequence
-		if _, err := t.WriteTo(w); err != nil {
-			return 0, err
-		}
-	}
-
-	return 0, nil
-}
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -65,17 +65,17 @@ func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	for _, t := range ts {
 		if strings.Contains(t.Name(), "patch_embed.proj") {
 			for t := range splitDim(t, 2,
-				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
-				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
+				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
+				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
 			) {
 				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
 				out = append(out, t)
 			}
 		} else if strings.Contains(t.Name(), "attn.qkv") {
 			out = append(out, slices.Collect(splitDim(t, 0,
-				strings.NewReplacer("attn.qkv", "attn_q"),
-				strings.NewReplacer("attn.qkv", "attn_k"),
-				strings.NewReplacer("attn.qkv", "attn_v"),
+				split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
+				split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
+				split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
 			))...)
 		} else {
 			out = append(out, &ggml.Tensor{
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -11,14 +11,13 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"maps"
 	"os"
 	"path/filepath"
 	"slices"
 	"strings"
 	"testing"

-	"golang.org/x/exp/maps"
-
 	"github.com/ollama/ollama/fs/ggml"
 )

@@ -137,9 +136,7 @@ func TestConvertModel(t *testing.T) {
 				t.Fatal(err)
 			}

-			keys := maps.Keys(expect)
-			slices.Sort(keys)
-			for _, k := range keys {
+			for _, k := range slices.Sorted(maps.Keys(expect)) {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != expect[k] {
@@ -343,9 +340,7 @@ func TestConvertAdapter(t *testing.T) {

 			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())

-			keys := maps.Keys(c.Expected)
-			slices.Sort(keys)
-			for _, k := range keys {
+			for _, k := range slices.Sorted(maps.Keys(c.Expected)) {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != c.Expected[k] {
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -31,8 +31,10 @@ func (t tensorBase) Shape() []uint64 {
 }

+// Tensor storage kinds. FP32 and FP16 are typed uint32 and take the iota
+// values 0 and 1; MXFP4 and BF16 are untyped constants pinned to 4 and 30.
+// NOTE(review): presumably these mirror the ggml tensor type ids — confirm.
 const (
-	tensorKindF32 uint32 = iota
-	tensorKindF16
+	tensorKindFP32 uint32 = iota
+	tensorKindFP16
+	tensorKindMXFP4 = 4
+	tensorKindBF16  = 30
 )

 func (t tensorBase) Kind() uint32 {
@@ -43,16 +45,16 @@ func (t tensorBase) Kind() uint32 {
 		t.name == "v.pre_tile_position_embd.weight" ||
 		t.name == "v.post_tile_position_embd.weight" {
 		// these tensors are always F32
-		return 0
+		return tensorKindFP32
 	}

 	switch len(t.shape) {
 	case 0:
 		panic("invalid tensor shape")
 	case 1:
-		return tensorKindF32
+		return tensorKindFP32
 	default:
-		return tensorKindF16
+		return tensorKindFP16
 	}
 }

--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"io"
 	"io/fs"
+	"maps"
 	"slices"
 	"strings"

 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
-	"golang.org/x/exp/maps"
 )

 type safetensorMetadata struct {
@@ -46,8 +46,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 			return nil, err
 		}

-		keys := maps.Keys(headers)
-		slices.Sort(keys)
+		keys := slices.Sorted(maps.Keys(headers))

 		names := make(map[string]struct{}, len(keys))

@@ -94,6 +93,15 @@ type safetensor struct {
 	*tensorBase
 }

+// Kind returns the storage kind for this tensor. It defers to
+// tensorBase.Kind, except that a tensor stored as BF16 stays BF16 unless the
+// base kind is FP32.
+func (st safetensor) Kind() uint32 {
+	base := st.tensorBase.Kind()
+	if st.dtype != "BF16" || base == tensorKindFP32 {
+		return base
+	}
+
+	return tensorKindBF16
+}
+
 func (st safetensor) Clone() Tensor {
 	return &safetensor{
 		fs:     st.fs,
@@ -151,6 +159,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 		}

 		f32s = bfloat16.DecodeFloat32(u8s)
+	case "U8":
+		// U8 tensors do not support repacking or type conversion.
+		return io.CopyN(w, f, st.size)
 	default:
 		return 0, fmt.Errorf("unknown data type: %s", st.dtype)
 	}
@@ -163,15 +174,18 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	}

 	switch st.Kind() {
-	case tensorKindF32:
+	case tensorKindFP32:
 		return 0, binary.Write(w, binary.LittleEndian, f32s)
-	case tensorKindF16:
+	case tensorKindFP16:
 		f16s := make([]uint16, len(f32s))
 		for i := range f32s {
 			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
 		}

 		return 0, binary.Write(w, binary.LittleEndian, f16s)
+	case tensorKindBF16:
+		u8s := bfloat16.EncodeFloat32(f32s)
+		return 0, binary.Write(w, binary.LittleEndian, u8s)
 	default:
 		return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
 	}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -1,56 +1,129 @@
 package convert

 import (
+	"cmp"
+	"io"
 	"iter"
+	"path"
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/fs/ggml"
 )

+// split describes one output tensor produced by splitDim.
+type split struct {
+	// Replacer renames the source tensor to produce this split's name.
+	*strings.Replacer
+	// dim is the size of this split along the dimension being split (it is
+	// not an axis index). Zero means an even share: the source size along
+	// that dimension divided by the number of splits.
+	dim int
+
+	// fn is an optional function to apply to the tensor after slicing
+	fn func(tensor.Tensor) (tensor.Tensor, error)
+}
+
 // splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
-// is split evenly based on the number of replacers provided.
-func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
+// is split evenly based on the number of replacers provided unless a specific count is given.
+//
+// A split with a non-zero dim field takes exactly that many elements along
+// dim; a zero dim falls back to shape[dim]/len(splits). Slices are taken
+// consecutively starting at offset 0, and nothing here checks that the sizes
+// add up to shape[dim]. Each yielded tensor wraps its own clone of t whose
+// repacker slices the data, applies the optional split.fn, and flattens the
+// result to a vector.
+func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 	return func(yield func(*ggml.Tensor) bool) {
-		for i, replacer := range replacers {
+		var offset int
+		for _, split := range splits {
+			// work on a clone so each split gets its own repacker
+			t := t.Clone()
 			shape := slices.Clone(t.Shape())
-			shape[dim] = shape[dim] / uint64(len(replacers))
+			shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))

 			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
-			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
+			slice[dim] = tensor.S(offset, offset+int(shape[dim]))
+			offset += int(shape[dim])

-			tt := t.Clone()
-			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
 				dims := make([]int, len(shape))
 				for i := range shape {
 					dims[i] = int(shape[i])
 				}

-				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-				t, err := t.Slice(slice...)
+				var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+				tt, err := tt.Slice(slice...)
 				if err != nil {
 					return nil, err
 				}

-				t = tensor.Materialize(t)
+				tt = tensor.Materialize(tt)
+
+				if split.fn != nil {
+					tt, err = split.fn(tt)
+					if err != nil {
+						return nil, err
+					}
+				}
+
 				// flatten tensor so it can be written as a vector
-				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
+				if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
 					return nil, err
 				}

-				return native.VectorF32(t.(*tensor.Dense))
+				return native.VectorF32(tt.(*tensor.Dense))
 			})

 			if !yield(&ggml.Tensor{
-				Name:     replacer.Replace(t.Name()),
+				Name:     split.Replace(t.Name()),
 				Kind:     t.Kind(),
 				Shape:    shape,
-				WriterTo: tt,
+				WriterTo: t,
 			}) {
 				break
 			}
 		}
 	}
 }
+
+// merge describes one merge rule for mergeTensors: every tensor whose name
+// matches pattern (path.Match syntax) is combined into a single tensor
+// called name.
+type merge struct {
+	pattern, name string
+}
+
+// mergeTensors applies each merge rule in order. The tensors still unmatched
+// whose names match a rule's pattern are removed from the unmatched set and
+// emitted as one tensor under the rule's name, with a new leading dimension
+// equal to the number of tensors merged. Kind and the remaining shape come
+// from the first matching tensor. A rule that matches nothing emits nothing.
+// The merged tensors and the tensors that matched no rule are returned.
+func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
+	for _, rule := range merges {
+		var group []Tensor
+		group, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
+			// a malformed pattern is treated as "no match"
+			ok, _ := path.Match(rule.pattern, t.Name())
+			return ok
+		})
+
+		if len(group) == 0 {
+			continue
+		}
+
+		first := group[0]
+		out = append(out, &ggml.Tensor{
+			Name:     rule.name,
+			Kind:     first.Kind(),
+			Shape:    append([]uint64{uint64(len(group))}, first.Shape()...),
+			WriterTo: mergeGroup(group),
+		})
+	}
+
+	return out, unmatched
+}
+
+// slicesSplitFunc partitions s into the elements for which fn returns true
+// (matched) and the rest (unmatched), preserving the original order within
+// each result. Both results are nil when no element falls into them.
+//
+// Elements are never compared with each other, so any element type is
+// accepted.
+func slicesSplitFunc[S ~[]E, E any](s S, fn func(e E) bool) (matched, unmatched S) {
+	for _, e := range s {
+		if fn(e) {
+			matched = append(matched, e)
+		} else {
+			unmatched = append(unmatched, e)
+		}
+	}
+
+	return matched, unmatched
+}
+
+// mergeGroup is a set of tensors that are written back to back as one
+// merged tensor.
+type mergeGroup []Tensor
+
+// WriteTo writes every tensor of the group to w in slice order, which
+// stacks them along a new leading axis without building a temporary buffer.
+// It returns the sum of the byte counts reported by the member tensors and
+// stops at the first error, reporting the bytes counted up to that point.
+func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
+	var total int64
+	for _, t := range g {
+		n, err := t.WriteTo(w)
+		total += n
+		if err != nil {
+			return total, err
+		}
+	}
+
+	return total, nil
+}
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -0,0 +1,953 @@
+package convert
+
+import (
+	"bytes"
+	"encoding/binary"
+	"io"
+	"iter"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+)
+
+// fakeTensor is an in-memory tensor used by the tests in this file. It holds
+// float32 data and an optional repacker that is applied when writing.
+type fakeTensor struct {
+	name  string
+	shape []uint64
+	data  []float32
+
+	repacker Repacker
+}
+
+// Name returns the tensor name.
+func (f fakeTensor) Name() string { return f.name }
+
+// Shape returns the tensor shape.
+func (f fakeTensor) Shape() []uint64 { return f.shape }
+
+// Kind always reports kind 0.
+func (f fakeTensor) Kind() uint32 { return 0 }
+
+// SetRepacker installs the function applied to the data in WriteTo.
+func (f *fakeTensor) SetRepacker(fn Repacker) { f.repacker = fn }
+
+// Clone returns a copy with its own shape and data slices; the name and
+// repacker are carried over.
+func (f fakeTensor) Clone() Tensor {
+	// f is a by-value copy of the receiver, so it can be adjusted in place
+	dup := f
+	dup.shape = slices.Clone(f.shape)
+	dup.data = slices.Clone(f.data)
+	return &dup
+}
+
+// WriteTo runs the repacker, if any, over the data and writes the result to w
+// as little-endian float32 values. It returns the number of bytes written, or
+// 0 and the error if repacking or writing fails.
+func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
+	payload := f.data
+	if f.repacker != nil {
+		if payload, err = f.repacker(f.name, payload, f.shape); err != nil {
+			return 0, err
+		}
+	}
+
+	if err = binary.Write(w, binary.LittleEndian, payload); err != nil {
+		return 0, err
+	}
+
+	// four bytes per float32
+	return int64(4 * len(payload)), nil
+}
+
+// mul returns the product of all dimensions in shape, i.e. the number of
+// elements of a tensor with that shape. An empty shape yields 1.
+func mul(shape []uint64) int {
+	product := 1
+	for i := 0; i < len(shape); i++ {
+		product *= int(shape[i])
+	}
+	return product
+}
+
+func TestSplitDim(t *testing.T) {
+	t.Run("2d", func(t *testing.T) {
+		r := fakeTensor{
+			name:  "a.b",
+			shape: []uint64{3, 4},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+		}
+
+		t.Run("no split", func(t *testing.T) {
+			for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
+				if tt.Name != "x.b" {
+					t.Fatalf("expected name 'x', got '%s'", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("even split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x")},
+				split{Replacer: strings.NewReplacer("b", "y")},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{2, 3, 6, 7, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{2, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{4, 5, 6, 7}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			// Second split (replacer "b" -> "y", dim 1): must be named "a.y"
+			// with shape {3, 1} and hold the values 2, 6, 10.
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				// Report the name this block actually checks for.
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the split and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{2, 6, 10}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			// Third split (replacer "b" -> "z", dim 1): must be named "a.z"
+			// with shape {3, 1} and hold the values 3, 7, 11.
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				// Report the name this block actually checks for.
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'a.z', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the split and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{3, 7, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("split with transpose", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x")},
+				split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
+					return tensor.Transpose(tt, 1, 0)
+				}},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{2, 6, 10, 3, 7, 11}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+	})
+	t.Run("3d", func(t *testing.T) {
+		r := fakeTensor{
+			name:  "a.b",
+			shape: []uint64{3, 4, 2},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
+		}
+
+		// With a single split spec, every yielded tensor must be the whole
+		// input renamed by the "a" -> "x" replacer: name "x.b", the original
+		// {3, 4, 2} shape, and the original 24 values in order.
+		t.Run("no split", func(t *testing.T) {
+			for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
+				// Report the full name this block actually checks for.
+				if tt.Name != "x.b" {
+					t.Fatalf("expected name 'x.b', got '%s'", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the tensor and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("even split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x")},
+				split{Replacer: strings.NewReplacer("b", "y")},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{2, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{16, 17, 18, 19, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 0,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			// Second split (replacer "b" -> "y", dim 1): must be named "a.y"
+			// with shape {1, 4, 2} and hold the values 8..15.
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				// Report the name this block actually checks for.
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the split and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11, 12, 13, 14, 15}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			// Third split (replacer "b" -> "z", dim 1): must be named "a.z"
+			// with shape {1, 4, 2} and hold the values 16..23.
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				// Report the name this block actually checks for.
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'a.z', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{1, 4, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the split and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{16, 17, 18, 19, 20, 21, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+
+		t.Run("uneven three way split", func(t *testing.T) {
+			next, stop := iter.Pull(splitDim(&r, 1,
+				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
+			))
+			defer stop()
+
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				if tt.Name != "x.b" {
+					t.Fatal("expected name 'x.b', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 2, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			// Second split (replacer "b" -> "y", dim 1): must be named "a.y"
+			// with shape {3, 1, 2} and hold the values 4, 5, 12, 13, 20, 21.
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				// Report the name this block actually checks for.
+				if tt.Name != "a.y" {
+					t.Fatal("expected name 'a.y', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the split and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{4, 5, 12, 13, 20, 21}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+
+			// Third split (replacer "b" -> "z", dim 1): must be named "a.z"
+			// with shape {3, 1, 2} and hold the values 6, 7, 14, 15, 22, 23.
+			{
+				tt, ok := next()
+				if !ok {
+					t.Fatal("expected at least one split")
+				}
+
+				// Report the name this block actually checks for.
+				if tt.Name != "a.z" {
+					t.Fatal("expected name 'a.z', got", tt.Name)
+				}
+
+				if diff := cmp.Diff(tt.Shape, []uint64{3, 1, 2}); diff != "" {
+					t.Errorf("unexpected shape (-want +got):\n%s", diff)
+				}
+
+				// Serialize the split and decode it back to float32s so the
+				// payload can be compared element by element.
+				var b bytes.Buffer
+				if _, err := tt.WriteTo(&b); err != nil {
+					t.Fatal(err)
+				}
+
+				f32s := make([]float32, mul(tt.Shape))
+				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(f32s, []float32{6, 7, 14, 15, 22, 23}); diff != "" {
+					t.Errorf("unexpected data (-want +got):\n%s", diff)
+				}
+			}
+		})
+	})
+}
+
+// TestMerge exercises mergeTensors with five {5, 2} input tensors
+// ("a.0.b", "a.1.b", "c.0.d", "c.1.d", "e.0.f") and checks, for each set of
+// merge patterns, how many tensors come back merged and how many are left
+// over, plus the shape and contents of every merged tensor.
+func TestMerge(t *testing.T) {
+	unmatched := []Tensor{
+		&fakeTensor{
+			name:  "a.0.b",
+			shape: []uint64{5, 2},
+			data:  []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+		},
+		&fakeTensor{
+			name:  "a.1.b",
+			shape: []uint64{5, 2},
+			data:  []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
+		},
+		&fakeTensor{
+			name:  "c.0.d",
+			shape: []uint64{5, 2},
+			data:  []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
+		},
+		&fakeTensor{
+			name:  "c.1.d",
+			shape: []uint64{5, 2},
+			data:  []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
+		},
+		&fakeTensor{
+			name:  "e.0.f",
+			shape: []uint64{5, 2},
+			data:  []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
+		},
+	}
+
+	// checkMatched asserts that each of the first n merged tensors has shape
+	// {2, 5, 2} and serializes to the 20 consecutive values starting at
+	// 10 + 20*i, i.e. the data of its two source tensors back to back.
+	checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
+		t.Helper()
+
+		// The callers only report a wrong count with t.Error, so guard the
+		// indexing below: fail cleanly instead of panicking with an index
+		// out of range when fewer tensors were merged than expected.
+		if len(matched) < n {
+			t.Fatalf("expected at least %d merged tensors, got %d", n, len(matched))
+		}
+
+		for i := range n {
+			got := matched[i]
+			if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
+				t.Errorf("unexpected (-want +got):\n%s", diff)
+			}
+
+			// Serialize the merged tensor and decode it back to float32s so
+			// the payload can be compared element by element.
+			var b bytes.Buffer
+			if _, err := got.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, 20)
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			// Source data is 10..19, 20..29, ... so merged tensor i holds
+			// the 20 values starting at 10 + 20*i.
+			offset := 10 + (i * 20)
+			want := make([]float32, 20)
+			for j := range 20 {
+				want[j] = float32(offset + j)
+			}
+
+			if diff := cmp.Diff(want, f32s); diff != "" {
+				t.Errorf("unexpected data (-want +got):\n%s", diff)
+			}
+		}
+	}
+
+	t.Run("single merge", func(t *testing.T) {
+		matched, remaining := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
+		if len(remaining) != 3 {
+			t.Error("expected 3 remaining tensors, got", len(remaining))
+		}
+
+		if len(matched) != 1 {
+			t.Error("expected 1 merged tensor, got", len(matched))
+		}
+
+		checkMatched(t, 1, matched)
+	})
+
+	t.Run("multiple merges", func(t *testing.T) {
+		matched, remaining := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
+		if len(remaining) != 1 {
+			t.Error("expected 1 remaining tensors, got", len(remaining))
+		}
+
+		if len(matched) != 2 {
+			t.Error("expected 2 merged tensor, got", len(matched))
+		}
+
+		checkMatched(t, 2, matched)
+	})
+
+	t.Run("no match", func(t *testing.T) {
+		matched, remaining := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
+		if len(remaining) != 5 {
+			t.Error("expected 5 remaining tensors, got", len(remaining))
+		}
+
+		if len(matched) != 0 {
+			t.Error("expected no merged tensors, got", len(matched))
+		}
+	})
+}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -8,11 +8,10 @@ import (
 	"fmt"
 	"io/fs"
 	"log/slog"
+	"maps"
 	"os"
 	"slices"
 	"strings"
-
-	"golang.org/x/exp/maps"
 )

 const (
@@ -260,11 +259,8 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		tokens[token.ID] = token
 	}

-	keys := maps.Keys(tokens)
-	slices.Sort(keys)
-
 	v := Vocabulary{Model: "gpt2"}
-	for _, k := range keys {
+	for _, k := range slices.Sorted(maps.Keys(tokens)) {
 		token := tokens[k]
 		v.Tokens = append(v.Tokens, token.Content)
 		v.Scores = append(v.Scores, float32(token.ID))
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -58,7 +58,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	driverMajor, driverMinor, err := AMDDriverVersion()
 	if err != nil {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
-		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
+		slog.Warn("ollama recommends running the https://www.amd.com/en/support/download/linux-drivers.html", "error", err)
 	}

 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@@ -3,6 +3,7 @@
 package discover

 import (
+	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@@ -55,10 +56,13 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 				}
 			}
 		}
+		return "sbsa"
 	}

 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+		// The detected driver is older than Feb 2023
+		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/path.go
+++ b/discover/path.go
@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,6 +4,7 @@
 * [Quickstart](../README.md#quickstart)
 * [Examples](./examples.md)
 * [Importing models](./import.md)
+* [MacOS Documentation](./macos.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](./docker.md)
--- a/docs/api.md
+++ b/docs/api.md
@@ -500,21 +500,30 @@ The `message` object has the following fields:
 - `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use
+- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result

 Advanced parameters (optional):

- `format`: the format to return a response in. Format can be `json` or a JSON schema. 
+- `format`: the format to return a response in. Format can be `json` or a JSON schema.
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

+### Tool calling
+
+Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
+
+Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
+
+[See models with tool calling capabilities](https://ollama.com/search?c=tool).
+
 ### Structured outputs

 Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.

 ### Examples

-#### Chat Request (Streaming)
+#### Chat request (Streaming)

 ##### Request

@@ -569,6 +578,88 @@ Final response:
 }
 ```

+#### Chat request (Streaming with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in tokyo?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ],
+  "stream": true
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+```json
+{
+    "model": "llama3.2",
+    "created_at": "2025-07-07T20:22:19.184789Z",
+    "message": {
+        "role": "assistant",
+        "content": "",
+        "tool_calls": [
+            {
+                "function": {
+                    "name": "get_weather",
+                    "arguments": {
+                        "city": "Tokyo"
+                    }
+                }
+            }
+        ]
+    },
+    "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model":"llama3.2",
+  "created_at":"2025-07-07T20:22:19.19314Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 182242375,
+  "load_duration": 41295167,
+  "prompt_eval_count": 169,
+  "prompt_eval_duration": 24573166,
+  "eval_count": 15,
+  "eval_duration": 115959084
+}
+```
+
 #### Chat request (No streaming)

 ##### Request
@@ -606,6 +697,74 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```

+#### Chat request (No streaming, with tools)
+
+##### Request
+
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in tokyo?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ],
+  "stream": false 
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:32:53.844124Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_weather",
+          "arguments": {
+            "city": "Tokyo"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 3244883583,
+  "load_duration": 2969184542,
+  "prompt_eval_count": 169,
+  "prompt_eval_duration": 141656333,
+  "eval_count": 18,
+  "eval_duration": 133293625
+}
+```
+
 #### Chat request (Structured outputs)

 ##### Request
@@ -712,6 +871,87 @@ Final response:
 }
 ```

+
+#### Chat request (With history, with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in Toronto?"
+    },
+    // the message from the model appended to history
+    {
+      "role": "assistant",
+      "content": "",
+      "tool_calls": [
+        {
+          "function": {
+            "name": "get_temperature",
+            "arguments": {
+              "city": "Toronto"
+            }
+          }
+        }
+      ]
+    },
+    // the tool call result appended to history
+    {
+      "role": "tool",
+      "content": "11 degrees celsius",
+      "tool_name": "get_temperature"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:43:37.688511Z",
+  "message": {
+    "role": "assistant",
+    "content": "The current temperature in Toronto is 11°C."
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 890771750,
+  "load_duration": 707634750,
+  "prompt_eval_count": 94,
+  "prompt_eval_duration": 91703208,
+  "eval_count": 11,
+  "eval_duration": 90282125
+}
+
+```
+
+
 #### Chat request (with images)

 ##### Request
@@ -1157,15 +1397,11 @@ A single JSON object will be returned.
 {
  "models": [
    {
-
-      "model": "codellama:13b",
-      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
-      "size": 7365960935,
-      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
-      "capabilities": [
-        "completion"
-      ],
-
+      "name": "deepseek-r1:latest",
+      "model": "deepseek-r1:latest",
+      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
+      "size": 4683075271,
+      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
      "details": {
        "parent_model": "",
        "format": "gguf",
@@ -1178,16 +1414,11 @@ A single JSON object will be returned.
      }
    },
    {
-
-      "model": "llama4:latest",
-      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
-      "size": 3825819519,
-      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
-      "capabilities": [
-        "completion",
-        "vision"
-      ],
-
+      "name": "llama3.2:latest",
+      "model": "llama3.2:latest",
+      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
+      "size": 2019393189,
+      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
      "details": {
        "parent_model": "",
        "format": "gguf",
@@ -1362,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co

 ```json
 {
-  "status": "downloading digestname",
+  "status": "pulling digestname",
  "digest": "digestname",
  "total": 2142590208,
  "completed": 241970
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -1,59 +0,0 @@
-# Benchmark
-
-Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
-
-## When to use
-
-Run these benchmarks when:
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups
-
-## Prerequisites
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
-## Usage and Examples
-
->[!NOTE]
->All commands must be run from the root directory of the Ollama project.
-
-Basic syntax:
-```bash
-go test -bench=. ./benchmark/... -m $MODEL_NAME
-```
-
-Required flags:
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark
-
-Optional flags:
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
-
-Common usage patterns:
-
-Single benchmark run with a model specified:
-```bash
-go test -bench=. ./benchmark/... -m llama3.3
-```
-
-## Output metrics
-
-The benchmark reports several key metrics:
-
- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed
-
-Each benchmark runs two scenarios:
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory
-
-Three prompt lengths are tested for each scenario:
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```

-> NOTE: In rare cirumstances, you may nedd to change a package using the new
+> NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens. 
+By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.

-This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: 
+This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve
@@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

+Setting the context length higher may prevent the model from fitting entirely on the GPU, which makes the model run more slowly.
+
 ## How can I tell if my model was loaded onto the GPU?

 Use the `ollama ps` command to see what models are currently loaded into memory.
@@ -57,8 +59,8 @@ ollama ps
 > **Output**:
 >
 > ```
-> NAME      	ID          	SIZE 	PROCESSOR	UNTIL
-> llama3:70b	bcfb190ca3a7	42 GB	100% GPU 	4 minutes from now
+> NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
+> gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
 > ```

 The `Processor` column will show which memory the model was loaded in to:
@@ -148,9 +150,11 @@ docker build -t ollama-with-ca .
 docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
 ```

-## Does Ollama send my prompts and answers back to ollama.com?
+## Does Ollama send my prompts and responses back to ollama.com?

-No. Ollama runs locally, and conversation data does not leave your machine.
+If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
+
+If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.

 ## How can I expose Ollama on my network?

@@ -292,7 +296,7 @@ If too many requests are sent to the server, it will respond with a 503 error in

 ## How does Ollama handle concurrent requests?

-Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
+Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.

 If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded.  As prior models become idle, one or more will be unloaded to make room for the new model.  Queued requests will be processed in order.  When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.

@@ -301,7 +305,7 @@ Parallel request processing for a given model results in increasing the context
 The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:

 - `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory.  The default is 3 * the number of GPUs or 3 for CPU inference.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
+- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default is 1, meaning each model handles one request at a time.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512

 Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
@@ -333,3 +337,16 @@ The currently available K/V cache quantization types are:
 How much the cache quantization impacts the model's response quality will depend on the model and the task.  Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.

 You may need to experiment with different quantization types to find the best balance between memory usage and quality.
+
+## How can I stop Ollama from starting when I log in to my computer?
+
+Ollama for Windows and macOS register as a login item during installation.  You can disable this if you prefer not to have Ollama automatically start.  Ollama will respect this setting across upgrades, unless you uninstall the application.
+
+**Windows**
+- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
+
+**MacOS Monterey (v12)**
+- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
+
+**MacOS Ventura (v13) and later**
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,12 +1,14 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

 | Compute Capability | Family              | Cards                                                                                                       |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
+| 12.0               | GeForce RTX 50xx    | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090`                                     |
+|                    | NVIDIA Professional | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell`         |
 | 9.0                | NVIDIA              | `H200` `H100`                                                                                               |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
--- a/docs/import.md
+++ b/docs/import.md
@@ -53,6 +53,8 @@ FROM /path/to/safetensors/directory

 If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.

+If you do not create the Modelfile, ollama will act as if there was a Modelfile with the command `FROM .`.
+
 Now run the `ollama create` command from the directory where you created the `Modelfile`:

 ```shell
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -16,7 +16,7 @@ curl -fsSL https://ollama.com/install.sh | sh
 Download and extract the package:

 ```shell
-curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
 sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```

@@ -112,8 +112,8 @@ sudo systemctl status ollama
 > While AMD has contributed the `amdgpu` driver upstream to the official linux
 > kernel source, the version is older and may not support all ROCm features. We
 > recommend you install the latest driver from
-> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-> GPU.
+> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
+> of your Radeon GPU.

 ## Customizing

--- a/docs/macos.md
+++ b/docs/macos.md
@@ -0,0 +1,42 @@
+# Ollama for macOS
+
+## System Requirements
+
+* MacOS Monterey (v12) or newer
+* Apple M series (CPU and GPU support) or x86 (CPU only)
+
+
+## Filesystem Requirements
+
+The preferred method of installation is to mount the `ollama.dmg` and drag-and-drop the Ollama application to the system-wide `Applications` folder.  Upon startup, the Ollama app will verify the `ollama` CLI is present in your PATH, and if not detected, will prompt for permission to create a link in `/usr/local/bin`.
+
+Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+
+### Changing Install Location
+
+To install the Ollama application somewhere other than `Applications`, place the Ollama application in the desired location, and ensure the CLI `Ollama.app/Contents/Resources/ollama` or a sym-link to the CLI can be found in your path.  Upon first start decline the "Move to Applications?" request.
+
+
+## Troubleshooting
+
+Ollama on MacOS stores files in a few different locations.
+- `~/.ollama` contains models and configuration
+- `~/.ollama/logs` contains logs
+    - *app.log* contains most recent logs from the GUI application
+    - *server.log* contains the most recent server logs
+- `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
+
+## Uninstall
+
+To fully remove Ollama from your system, remove the following files and folders:
+
+```
+sudo rm -rf /Applications/Ollama.app
+sudo rm /usr/local/bin/ollama
+rm -rf ~/Library/Application\ Support/Ollama
+rm -rf ~/Library/Saved\ Application\ State/com.electron.ollama.savedState
+rm -rf ~/Library/Caches/com.electron.ollama/
+rm -rf ~/Library/Caches/ollama
+rm -rf ~/Library/WebKit/com.electron.ollama
+rm -rf ~/.ollama
+```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -150,7 +150,7 @@ PARAMETER <parameter> <parametervalue>

 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
+| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 4096)                                                                                                                                                                    | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -72,7 +72,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
 # Define the schema for the response
 class FriendInfo(BaseModel):
    name: str
-    age: int 
+    age: int
    is_available: bool

 class FriendList(BaseModel):
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log
 On **Linux** systems with systemd, the logs can be found with this command:

 ```shell
-journalctl -u ollama --no-pager --follow --pager-end 
+journalctl -u ollama --no-pager --follow --pager-end
 ```

 When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
@@ -23,7 +23,7 @@ docker logs <container-name>
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.

 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log` 
+- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored

@@ -38,12 +38,12 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.

 ## LLM libraries

-Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. 
+Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can work around this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library.

 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```

 **Experimental LLM Library Override**
@@ -97,7 +97,7 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` 
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
--- a/docs/turbo.md
+++ b/docs/turbo.md
@@ -0,0 +1,107 @@
+# Turbo
+
+> ⚠️ Turbo is in preview
+
+Ollama’s [Turbo](https://ollama.com/turbo) is a new way to run open-source models with acceleration from datacenter-grade hardware.
+
+Currently, the following models are available in Turbo:
+
+- `gpt-oss:20b`
+- `gpt-oss:120b`
+
+## Get started
+
+### Ollama for macOS & Windows
+
+Download Ollama
+
+- Select a model such as `gpt-oss:20b` or `gpt-oss:120b`
+- Click on **Turbo**. You’ll be prompted to create an account or sign in
+
+### Ollama’s CLI
+
+- [Sign up](https://ollama.com/signup) for an Ollama account
+- Add your Ollama key [to ollama.com](https://ollama.com/settings/keys).
+
+  On macOS and Linux:
+
+  ```shell
+  cat ~/.ollama/id_ed25519.pub
+  ```
+
+  On Windows:
+
+  ```
+  type "%USERPROFILE%\.ollama\id_ed25519.pub"
+  ```
+
+- Then run a model setting `OLLAMA_HOST` to `ollama.com`:
+  ```shell
+  OLLAMA_HOST=ollama.com ollama run gpt-oss:120b
+  ```
+
+### Ollama’s Python library
+
+- Download Ollama's [Python library](https://github.com/ollama/ollama-python)
+- [Sign up](https://ollama.com/signup) for an Ollama account
+- Create an API key by visiting https://ollama.com/settings/keys
+
+```python
+from ollama import Client
+
+client = Client(
+    host="https://ollama.com",
+    headers={'Authorization': '<api key>'}
+)
+
+messages = [
+  {
+    'role': 'user',
+    'content': 'Why is the sky blue?',
+  },
+]
+
+for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
+  print(part['message']['content'], end='', flush=True)
+```
+
+### Ollama’s JavaScript library
+
+- Download Ollama's [JavaScript library](https://github.com/ollama/ollama-js)
+- [Sign up](https://ollama.com/signup) for an Ollama account
+- Create an API key by visiting https://ollama.com/settings/keys
+
+```typescript
+import { Ollama } from 'ollama';
+
+const ollama = new Ollama({
+  host: 'https://ollama.com',
+  headers: {
+    Authorization: "Bearer <api key>"
+  }
+});
+
+const response = await ollama.chat({
+  model: 'gpt-oss:120b',
+  messages: [{ role: 'user', content: 'Explain quantum computing' }],
+  stream: true
+});
+
+for await (const part of response) {
+    process.stdout.write(part.message.content)
+}
+```
+
+### Community integrations
+
+Turbo mode is also compatible with several community integrations.
+
+#### Open WebUI
+
+- Go to **settings** → **Admin settings** → **Connections**
+- Under **Ollama API,** click **+**
+- For the **URL** put `https://ollama.com`
+- For the **API key,** create an API key on https://ollama.com/settings/keys and add it.
+- Click **Save**
+
+Now, if you navigate to the model selector, Turbo models should be available under **External**.
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -30,20 +30,6 @@ To install the Ollama application in a location different than your home directo
 OllamaSetup.exe /DIR="d:\some\location"
 ```

-### Changing Model Location
-
-To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
-
-1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
-
-2. Click on _Edit environment variables for your account_.
-
-3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
-
-4. Click OK/Apply to save.
-
-If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
-
 ## API Access

 Here's a quick example showing API access from `powershell`
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -219,7 +219,7 @@ func Uint(key string, defaultValue uint) func() uint {

 var (
 	// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
-	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
+	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 1)
 	// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
 	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
 	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
--- a/fs/config.go
+++ b/fs/config.go
@@ -10,4 +10,5 @@ type Config interface {
 	Strings(string, ...[]string) []string
 	Ints(string, ...[]int32) []int32
 	Floats(string, ...[]float32) []float32
+	Bools(string, ...[]bool) []bool
 }
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -1,6 +1,7 @@
 package ggml

 import (
+	"cmp"
 	"encoding/binary"
 	"errors"
 	"fmt"
@@ -34,7 +35,8 @@ func (kv KV) Kind() string {
 }

 func (kv KV) ParameterCount() uint64 {
-	return keyValue(kv, "general.parameter_count", uint64(0))
+	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
+	return val
 }

 func (kv KV) FileType() FileType {
@@ -53,16 +55,27 @@ func (kv KV) EmbeddingLength() uint64 {
 	return uint64(kv.Uint("embedding_length"))
 }

-func (kv KV) HeadCount() uint64 {
-	return uint64(kv.Uint("attention.head_count"))
+func (kv KV) HeadCountMax() uint64 {
+	// TODO(drifkin): using the max value can cause an overestimation. In the
+	// future if array values become more popular, we can adapt the more invasive
+	// <https://github.com/ollama/ollama/pull/10225>
+	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
 }

-func (kv KV) HeadCountKV() uint64 {
-	return uint64(kv.Uint("attention.head_count_kv", 1))
+func (kv KV) HeadCountMin() uint64 {
+	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
 }

-func (kv KV) EmbeddingHeadCount() uint64 {
-	if heads := kv.HeadCount(); heads > 0 {
+func (kv KV) HeadCountKVMax() uint64 {
+	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
+}
+
+func (kv KV) HeadCountKVMin() uint64 {
+	return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
+}
+
+func (kv KV) EmbeddingHeadCountMax() uint64 {
+	if heads := kv.HeadCountMin(); heads > 0 {
 		return kv.EmbeddingLength() / heads
 	}

@@ -70,15 +83,11 @@ func (kv KV) EmbeddingHeadCount() uint64 {
 }

 func (kv KV) EmbeddingHeadCountK() uint64 {
-	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
 }

 func (kv KV) EmbeddingHeadCountV() uint64 {
-	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
-}
-
-func (kv KV) GQA() uint64 {
-	return kv.HeadCount() / kv.HeadCountKV()
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
 }

 func (kv KV) ContextLength() uint64 {
@@ -90,44 +99,88 @@ func (kv KV) ChatTemplate() string {
 }

 func (kv KV) String(key string, defaultValue ...string) string {
-	return keyValue(kv, key, append(defaultValue, "")...)
+	val, _ := keyValue(kv, key, append(defaultValue, "")...)
+	return val
 }

 func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
-	return keyValue(kv, key, append(defaultValue, 0)...)
+	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
+	return val
 }

 func (kv KV) Float(key string, defaultValue ...float32) float32 {
-	return keyValue(kv, key, append(defaultValue, 0)...)
+	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
+	return val
 }

 func (kv KV) Bool(key string, defaultValue ...bool) bool {
-	return keyValue(kv, key, append(defaultValue, false)...)
+	val, _ := keyValue(kv, key, append(defaultValue, false)...)
+	return val
+}
+
+func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
+	_, max := kv.UintOrArrayValue(key, defaultValue)
+	return max
+}
+
+func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
+	min, _ := kv.UintOrArrayValue(key, defaultValue)
+	return min
+}
+
+func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
+	if u32, ok := keyValue(kv, key, uint32(0)); ok {
+		return u32, u32
+	} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
+		min := slices.Min(u32s.values)
+		max := slices.Max(u32s.values)
+		return min, max
+	} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
+		min := slices.Min(i32s.values)
+		max := slices.Max(i32s.values)
+		if min < 0 || max < 0 {
+			slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
+		}
+		return uint32(min), uint32(max)
+	}
+
+	return defaultValue, defaultValue
 }

 func (kv KV) Strings(key string, defaultValue ...[]string) []string {
-	return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
+	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
+	return val.values
 }

 func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
-	return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
+	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
+	return val.values
 }

 func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
-	return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
+	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
+	return val.values
 }

 func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
-	return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
+	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
+	return val.values
+}
+
+func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
+	val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]})
+	return val.values
 }

 func (kv KV) OllamaEngineRequired() bool {
 	return slices.Contains([]string{
 		"gemma3",
+		"gemma3n",
 		"mistral3",
 		"llama4",
 		"mllama",
 		"qwen25vl",
+		"gptoss",
 	}, kv.Architecture())
 }

@@ -143,17 +196,17 @@ type arrayValueTypes interface {
 		*array[string] | *array[float32] | *array[float64] | *array[bool]
 }

-func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
+func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
 	}

-	if val, ok := kv[key]; ok {
-		return val.(T)
+	if val, ok := kv[key].(T); ok {
+		return val, true
 	}

-	slog.Debug("key not found", "key", key, "default", defaultValue[0])
-	return defaultValue[0]
+	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
+	return defaultValue[0], false
 }

 type Tensors struct {
@@ -229,7 +282,7 @@ func (t Tensor) block() (n int) {
 }

 func (t Tensor) blockSize() uint64 {
-	return (TensorType)(t.Kind).BlockSize()
+	return TensorType(t.Kind).BlockSize()
 }

 func (t TensorType) BlockSize() uint64 {
@@ -247,6 +300,7 @@ func (t TensorType) BlockSize() uint64 {
 	case
 		2,  // Q4_0
 		3,  // Q4_1
+		4,  // MXFP4
 		6,  // Q5_0
 		7,  // Q5_1
 		8,  // Q8_0
@@ -274,6 +328,8 @@ func (t TensorType) TypeSize() uint64 {
 		return 2 + blockSize/2
 	case TensorTypeQ4_1:
 		return 2 + 2 + blockSize/2
+	case TensorTypeMXFP4:
+		return 1 + blockSize/2
 	case TensorTypeQ5_0:
 		return 2 + 4 + blockSize/2
 	case TensorTypeQ5_1:
@@ -425,20 +481,22 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
 	embedding := f.KV().EmbeddingLength()
-	heads := f.KV().HeadCount()
-	headsKV := f.KV().HeadCountKV()
+	heads := f.KV().HeadCountMax()
+	headsKV := f.KV().HeadCountKVMax()
 	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

-	embeddingHeads := f.KV().EmbeddingHeadCount()
+	embeddingHeads := f.KV().EmbeddingHeadCountMax()
 	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
 	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

 	layers := f.Tensors().GroupLayers()

 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
+	var kvTotal uint64
 	kv = make([]uint64, f.KV().BlockCount())
 	for i := range kv {
 		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+		kvTotal += kv[i]
 	}

 	switch f.KV().Architecture() {
@@ -504,7 +562,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			// vocab graph
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
-	case "gemma", "gemma2", "gemma3":
+	case "gemma", "gemma2", "gemma3", "gemma3n":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
@@ -517,6 +575,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 				embedding*embeddingHeadsK*heads*9/16,
 		)

+		if f.KV().Architecture() == "gemma3n" {
+			fullOffload *= 4
+			partialOffload *= 4
+		}
+
 		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
 		// engine. Gemma3 always uses the Ollama engine.
 		if f.KV().Architecture() == "gemma3" {
@@ -602,6 +665,18 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 					4*qkvBias.Shape[0],
 			)
 		}
+	case "gptoss":
+		kv = make([]uint64, f.KV().BlockCount())
+		for i := range kv {
+			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+			if i%2 == 0 {
+				kv[i] *= (uint64(numParallel)*4096 + batch)
+			} else {
+				kv[i] *= context
+			}
+		}
+		fullOffload = 4 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
+		partialOffload = fullOffload
 	}

 	return
@@ -686,6 +761,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}

+	if f.KV().Architecture() == "gptoss" {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
--- a/fs/ggml/ggml_test.go
+++ b/fs/ggml/ggml_test.go
@@ -269,3 +269,33 @@ func TestKeyValue(t *testing.T) {
 		t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
 	}
 }
+
+func TestHeadCount(t *testing.T) {
+	valuesArray := []int32{1, 5, 3, 4}
+	cases := []struct {
+		kv   KV
+		want uint64
+	}{
+		{
+			kv: KV{
+				"general.architecture":     "abc",
+				"abc.attention.head_count": &array[int32]{values: valuesArray, size: len(valuesArray)},
+			},
+			want: uint64(5),
+		},
+		{
+			kv: KV{
+				"general.architecture":     "abc",
+				"abc.attention.head_count": uint32(3),
+			},
+			want: uint64(3),
+		},
+	}
+
+	for _, tt := range cases {
+		got := tt.kv.HeadCountMax()
+		if got != tt.want {
+			t.Errorf("unexpected max value: got=%d want=%d", got, tt.want)
+		}
+	}
+}
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -527,23 +527,17 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 		return err
 	}

-	keys := slices.Collect(maps.Keys(kv))
-	slices.Sort(keys)
-
-	for _, key := range keys {
+	for _, key := range slices.Sorted(maps.Keys(kv)) {
 		if err := ggufWriteKV(f, key, kv[key]); err != nil {
 			return err
 		}
 	}

 	slices.SortStableFunc(ts, func(a, b *Tensor) int {
-		if i, j := a.block(), b.block(); i < 0 && j > 0 {
-			return 1
-		} else if i > 0 && j < 0 {
-			return -1
-		} else {
+		if i, j := a.block(), b.block(); i > 0 && j > 0 {
 			return cmp.Compare(i, j)
 		}
+		return cmp.Compare(a.Name, b.Name)
 	})

 	var s uint64
@@ -615,6 +609,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 		err = writeGGUFArray(ws, ggufTypeString, v)
 	case *array[string]:
 		err = writeGGUFArray(ws, ggufTypeString, v.values)
+	case []bool:
+		err = writeGGUFArray(ws, ggufTypeBool, v)
+	case *array[bool]:
+		err = writeGGUFArray(ws, ggufTypeBool, v.values)
 	default:
 		return fmt.Errorf("improper type for '%s'", k)
 	}
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -2,62 +2,82 @@ package ggml

 import (
 	"bytes"
+	"math/rand/v2"
 	"os"
-	"slices"
+	"strings"
 	"testing"

 	"github.com/google/go-cmp/cmp"
 )

 func TestWriteGGUF(t *testing.T) {
-	w, err := os.CreateTemp(t.TempDir(), "*.bin")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer w.Close()
+	r := rand.New(rand.NewPCG(0, 0))
+	for range 8 {
+		t.Run("shuffle", func(t *testing.T) {
+			t.Parallel()

-	if err := WriteGGUF(w, KV{
-		"general.alignment": uint32(16),
-	}, []*Tensor{
-		{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-	}); err != nil {
-		t.Fatal(err)
-	}
+			ts := []*Tensor{
+				{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
+				{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
+				{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
+			}

-	r, err := os.Open(w.Name())
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer r.Close()
+			r.Shuffle(len(ts), func(i, j int) {
+				ts[i], ts[j] = ts[j], ts[i]
+			})

-	ff, err := Decode(r, 0)
-	if err != nil {
-		t.Fatal(err)
-	}
+			w, err := os.CreateTemp(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "_")+"*.bin")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer w.Close()

-	if diff := cmp.Diff(ff.KV(), KV{
-		"general.alignment":       uint32(16),
-		"general.parameter_count": uint64(36),
-	}); diff != "" {
-		t.Errorf("Mismatch (-want +got):\n%s", diff)
-	}
+			if err := WriteGGUF(w, KV{
+				"general.alignment": uint32(16),
+			}, ts); err != nil {
+				t.Fatal(err)
+			}

-	if diff := cmp.Diff(ff.Tensors(), Tensors{
-		Offset: 336,
-		items: []*Tensor{
-			{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
-			{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
-			{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
-			{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
-			{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
-			{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
-		},
-	}, cmp.AllowUnexported(Tensors{})); diff != "" {
-		t.Errorf("Mismatch (-want +got):\n%s", diff)
+			r, err := os.Open(w.Name())
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+
+			ff, err := Decode(r, 0)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if diff := cmp.Diff(KV{
+				"general.alignment":       uint32(16),
+				"general.parameter_count": uint64(54),
+			}, ff.KV()); diff != "" {
+				t.Errorf("Mismatch (-want +got):\n%s", diff)
+			}
+
+			if diff := cmp.Diff(Tensors{
+				Offset: 608,
+				items: []*Tensor{
+					{Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}},
+					{Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
+					{Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
+					{Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}},
+					{Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}},
+					{Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
+					{Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}},
+					{Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}},
+					{Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}},
+				},
+			}, ff.Tensors(), cmp.AllowUnexported(Tensors{})); diff != "" {
+				t.Errorf("Mismatch (-want +got):\n%s", diff)
+			}
+		})
 	}
 }
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -14,9 +14,9 @@ const (
 	FileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
-	fileTypeQ4_1_F16 // unused by GGML
-	fileTypeQ4_2     // unused by GGML
-	fileTypeQ4_3     // unused by GGML
+	fileTypeMXFP4 // originally fileTypeQ4_1_F16 // unused by GGML
+	fileTypeQ4_2  // unused by GGML
+	fileTypeQ4_3  // unused by GGML
 	FileTypeQ8_0
 	fileTypeQ5_0
 	fileTypeQ5_1
@@ -97,6 +97,8 @@ func (t FileType) String() string {
 		return "Q4_0"
 	case fileTypeQ4_1:
 		return "Q4_1"
+	case fileTypeMXFP4:
+		return "MXFP4"
 	case FileTypeQ8_0:
 		return "Q8_0"
 	case fileTypeQ5_0:
@@ -144,6 +146,8 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeQ4_0
 	case fileTypeQ4_1:
 		return TensorTypeQ4_1
+	case fileTypeMXFP4:
+		return TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
 	case FileTypeQ8_0:
 		return TensorTypeQ8_0
 	case fileTypeQ5_0:
@@ -187,8 +191,8 @@ const (
 	TensorTypeF16
 	TensorTypeQ4_0
 	TensorTypeQ4_1
-	tensorTypeQ4_2 // unused by GGML
-	tensorTypeQ4_3 // unused by GGML
+	TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
+	tensorTypeQ4_3  // unused by GGML
 	TensorTypeQ5_0
 	TensorTypeQ5_1
 	TensorTypeQ8_0
@@ -260,6 +264,8 @@ func ParseTensorType(s string) (TensorType, error) {
 		return TensorTypeF64, nil
 	case "BF16":
 		return TensorTypeBF16, nil
+	case "MXFP4":
+		return TensorTypeMXFP4, nil
 	default:
 		return 0, fmt.Errorf("unsupported quantization type %s", s)
 	}
@@ -312,6 +318,8 @@ func (t TensorType) String() string {
 		return "F64"
 	case TensorTypeBF16:
 		return "BF16"
+	case TensorTypeMXFP4:
+		return "MXFP4"
 	default:
 		return "unknown"
 	}
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -0,0 +1,347 @@
+package gguf
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"iter"
+	"os"
+	"slices"
+	"strings"
+)
+
+// Metadata value type identifiers. readKeyValue and readArray read one of these
+// as a uint32 and use it to choose how the following payload is decoded. The
+// numeric values come from the declaration order (iota), so entries must not be
+// reordered.
+const (
+	typeUint8 uint32 = iota
+	typeInt8
+	typeUint16
+	typeInt16
+	typeUint32
+	typeInt32
+	typeFloat32
+	typeBool
+	typeString
+	typeArray
+	typeUint64
+	typeInt64
+	typeFloat64
+)
+
+// ErrUnsupported is wrapped by the errors returned for a file magic, file
+// version or metadata value type that this package does not handle. Test for
+// it with errors.Is.
+var ErrUnsupported = errors.New("unsupported")
+
+// File is a GGUF file opened for reading with Open.
+//
+// Magic and Version hold the first two header fields. keyValues and tensors
+// decode their entries on demand from reader and cache what they have read.
+// offset is the aligned position at which tensor data starts; it is assigned
+// by the tensors success callback installed in Open. bts is a scratch buffer
+// reused by readString.
+type File struct {
+	Magic   [4]byte
+	Version uint32
+
+	keyValues *lazy[KeyValue]
+	tensors   *lazy[TensorInfo]
+	offset    int64
+
+	file   *os.File
+	reader *bufferedReader
+	bts    []byte
+}
+
+// Open opens the GGUF file at path and reads its fixed header: magic, version,
+// tensor count and key-value count. Key-values and tensor infos are decoded
+// lazily as they are requested.
+//
+// The returned error wraps ErrUnsupported when the magic is not "GGUF" or the
+// version is lower than 2. The underlying file is closed whenever Open fails.
+func Open(path string) (f *File, err error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+
+	// every failure path below returns a non-nil err; release the descriptor
+	// instead of leaking it
+	defer func() {
+		if err != nil {
+			file.Close()
+		}
+	}()
+
+	f = &File{file: file, bts: make([]byte, 4096)}
+	f.reader = newBufferedReader(f.file, 32<<10)
+
+	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
+		return nil, err
+	}
+
+	// the magic is the four ASCII bytes "GGUF"; anything else is not a file
+	// this reader understands
+	if !bytes.Equal(f.Magic[:], []byte("GGUF")) {
+		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
+	}
+
+	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
+		return nil, err
+	}
+
+	if f.Version < 2 {
+		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
+	}
+
+	f.tensors, err = newLazy(f, f.readTensor)
+	if err != nil {
+		return nil, err
+	}
+
+	f.tensors.successFunc = func() error {
+		offset := f.reader.offset
+
+		// general.alignment is stored as an unsigned integer, for which Int()
+		// reports 0; consult Uint() first so a non-default alignment is
+		// honoured, and fall back to 32 when the key is absent
+		kv := f.KeyValue("general.alignment")
+		alignment := cmp.Or(int64(kv.Uint()), kv.Int(), 32)
+
+		// round offset up to the next multiple of alignment
+		f.offset = offset + (alignment-offset%alignment)%alignment
+		return nil
+	}
+
+	f.keyValues, err = newLazy(f, f.readKeyValue)
+	if err != nil {
+		return nil, err
+	}
+
+	return f, nil
+}
+
+// readTensor decodes the next tensor info record from the file. The fields are
+// read in this order: name, dimension count (uint32), one uint64 per
+// dimension, type (uint32) and data offset (uint64).
+func (f *File) readTensor() (TensorInfo, error) {
+	var (
+		ti  TensorInfo
+		err error
+	)
+
+	if ti.Name, err = readString(f); err != nil {
+		return TensorInfo{}, err
+	}
+
+	ndims, err := read[uint32](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	ti.Shape = make([]uint64, ndims)
+	for i := range ti.Shape {
+		if ti.Shape[i], err = read[uint64](f); err != nil {
+			return TensorInfo{}, err
+		}
+	}
+
+	kind, err := read[uint32](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+	ti.Type = TensorType(kind)
+
+	if ti.Offset, err = read[uint64](f); err != nil {
+		return TensorInfo{}, err
+	}
+
+	return ti, nil
+}
+
+// readKeyValue decodes the next metadata record from the file: a key string, a
+// uint32 value type and a payload whose layout depends on that type. An error
+// wrapping ErrUnsupported is returned for an unknown value type.
+func (f *File) readKeyValue() (KeyValue, error) {
+	key, err := readString(f)
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	valueType, err := read[uint32](f)
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	var payload any
+	switch valueType {
+	case typeUint8:
+		payload, err = read[uint8](f)
+	case typeInt8:
+		payload, err = read[int8](f)
+	case typeUint16:
+		payload, err = read[uint16](f)
+	case typeInt16:
+		payload, err = read[int16](f)
+	case typeUint32:
+		payload, err = read[uint32](f)
+	case typeInt32:
+		payload, err = read[int32](f)
+	case typeUint64:
+		payload, err = read[uint64](f)
+	case typeInt64:
+		payload, err = read[int64](f)
+	case typeFloat32:
+		payload, err = read[float32](f)
+	case typeFloat64:
+		payload, err = read[float64](f)
+	case typeBool:
+		payload, err = read[bool](f)
+	case typeString:
+		payload, err = readString(f)
+	case typeArray:
+		payload, err = readArray(f)
+	default:
+		err = fmt.Errorf("%w type %d", ErrUnsupported, valueType)
+	}
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	return KeyValue{Key: key, Value: Value{payload}}, nil
+}
+
+// read decodes one fixed-size little-endian value of type T from the file's
+// buffered reader.
+func read[T any](f *File) (T, error) {
+	var v T
+	err := binary.Read(f.reader, binary.LittleEndian, &v)
+	return v, err
+}
+
+// readString decodes a length-prefixed string: a uint64 byte count followed by
+// that many bytes. The bytes are staged in f.bts, which is replaced by a larger
+// buffer when the string does not fit; the staged bytes are zeroed again once
+// the string has been copied out.
+func readString(f *File) (string, error) {
+	size, err := read[uint64](f)
+	if err != nil {
+		return "", err
+	}
+
+	if len(f.bts) < int(size) {
+		f.bts = make([]byte, size)
+	}
+
+	scratch := f.bts[:size]
+	_, err = io.ReadFull(f.reader, scratch)
+	if err != nil {
+		return "", err
+	}
+
+	// string() copies, so the scratch space can be wiped afterwards
+	s := string(scratch)
+	clear(scratch)
+	return s, nil
+}
+
+// readArray decodes an array payload: a uint32 element type, a uint64 element
+// count and then that many elements. The result is a slice of the Go type
+// matching the element type. An error wrapping ErrUnsupported is returned for
+// an element type that has no case below (including nested arrays).
+func readArray(f *File) (any, error) {
+	elemType, err := read[uint32](f)
+	if err != nil {
+		return nil, err
+	}
+
+	count, err := read[uint64](f)
+	if err != nil {
+		return nil, err
+	}
+
+	switch elemType {
+	case typeString:
+		return readArrayString(f, count)
+	case typeBool:
+		return readArrayData[bool](f, count)
+	case typeFloat32:
+		return readArrayData[float32](f, count)
+	case typeFloat64:
+		return readArrayData[float64](f, count)
+	case typeInt8:
+		return readArrayData[int8](f, count)
+	case typeInt16:
+		return readArrayData[int16](f, count)
+	case typeInt32:
+		return readArrayData[int32](f, count)
+	case typeInt64:
+		return readArrayData[int64](f, count)
+	case typeUint8:
+		return readArrayData[uint8](f, count)
+	case typeUint16:
+		return readArrayData[uint16](f, count)
+	case typeUint32:
+		return readArrayData[uint32](f, count)
+	case typeUint64:
+		return readArrayData[uint64](f, count)
+	}
+
+	return nil, fmt.Errorf("%w type %d", ErrUnsupported, elemType)
+}
+
+// readArrayData decodes n consecutive fixed-size values of type T. It returns
+// nil and the error as soon as one element cannot be read.
+func readArrayData[T any](f *File, n uint64) ([]T, error) {
+	var err error
+
+	out := make([]T, n)
+	for i := range out {
+		if out[i], err = read[T](f); err != nil {
+			return nil, err
+		}
+	}
+
+	return out, nil
+}
+
+// readArrayString decodes n consecutive length-prefixed strings. It returns nil
+// and the error as soon as one element cannot be read.
+func readArrayString(f *File, n uint64) ([]string, error) {
+	var err error
+
+	out := make([]string, n)
+	for i := range out {
+		if out[i], err = readString(f); err != nil {
+			return nil, err
+		}
+	}
+
+	return out, nil
+}
+
+// Close stops the key-value and tensor iterators, in that order, and then
+// closes the underlying file, returning the file's close error.
+func (f *File) Close() error {
+	for _, stop := range []func(){f.keyValues.stop, f.tensors.stop} {
+		stop()
+	}
+
+	return f.file.Close()
+}
+
+// KeyValue returns the metadata entry stored under key. A key that starts with
+// neither "general." nor "tokenizer." is first prefixed with the value of
+// "general.architecture" and a dot. Entries already decoded are searched
+// first; after that, further entries are decoded until the key is found or the
+// key-values are exhausted. The zero KeyValue is returned when nothing matches.
+func (f *File) KeyValue(key string) KeyValue {
+	unscoped := strings.HasPrefix(key, "general.") || strings.HasPrefix(key, "tokenizer.")
+	if !unscoped {
+		key = f.KeyValue("general.architecture").String() + "." + key
+	}
+
+	for _, kv := range f.keyValues.values {
+		if kv.Key == key {
+			return kv
+		}
+	}
+
+	for {
+		kv, ok := f.keyValues.next()
+		switch {
+		case !ok:
+			return KeyValue{}
+		case kv.Key == key:
+			return kv
+		}
+	}
+}
+
+// NumKeyValues returns the key-value count read from the file header. It does
+// not depend on how many entries have been decoded so far.
+func (f *File) NumKeyValues() int {
+	return int(f.keyValues.count)
+}
+
+// KeyValues returns an iterator over the file's key-values paired with their
+// zero-based index.
+func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
+	return f.keyValues.All()
+}
+
+// TensorInfo returns the info record of the tensor called name. Records
+// already decoded are searched first; after that, further records are decoded
+// until the name is found or the tensor infos are exhausted. The zero
+// TensorInfo is returned when nothing matches.
+func (f *File) TensorInfo(name string) TensorInfo {
+	for _, ti := range f.tensors.values {
+		if ti.Name == name {
+			return ti
+		}
+	}
+
+	// fast-forward through key values if we haven't already
+	f.keyValues.rest()
+
+	for {
+		ti, ok := f.tensors.next()
+		switch {
+		case !ok:
+			return TensorInfo{}
+		case ti.Name == name:
+			return ti
+		}
+	}
+}
+
+// NumTensors returns the tensor count read from the file header. It does not
+// depend on how many tensor infos have been decoded so far.
+func (f *File) NumTensors() int {
+	return int(f.tensors.count)
+}
+
+// TensorInfos returns an iterator over the file's tensor infos paired with
+// their zero-based index. Any key-values not yet decoded are consumed first.
+func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
+	// fast forward through key values if we haven't already
+	f.keyValues.rest()
+	return f.tensors.All()
+}
+
+// TensorReader returns the info record of the tensor called name together
+// with a reader over exactly that tensor's bytes. An error is returned when no
+// usable tensor with that name exists.
+func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
+	t := f.TensorInfo(name)
+
+	// A missing tensor comes back as the zero TensorInfo. Its NumValues is 1
+	// (empty shape) and its Type is the zero TensorType, so NumBytes alone
+	// cannot be relied on to detect it; Valid also requires a non-empty name.
+	if !t.Valid() {
+		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
+	}
+
+	// fast forward through tensor info if we haven't already; this is what
+	// triggers the computation of f.offset
+	_ = f.tensors.rest()
+	return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
+}
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@@ -0,0 +1,249 @@
+package gguf_test
+
+import (
+	"bytes"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs/gguf"
+)
+
+// createBinFile writes a small llama-style GGUF file into the test's temporary
+// directory and returns its path. The file holds 14 key-values and 34 F32
+// tensors: token_embd.weight, output.weight and four attention weights for
+// each of 8 blocks.
+func createBinFile(tb testing.TB) string {
+	tb.Helper()
+
+	out, err := os.CreateTemp(tb.TempDir(), "")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer out.Close()
+
+	// zeros returns a writer over n zeroed float32 values (4 bytes each)
+	zeros := func(n int) *bytes.Buffer {
+		return bytes.NewBuffer(make([]byte, 4*n))
+	}
+
+	tensors := []*ggml.Tensor{
+		{Name: "token_embd.weight", Kind: 0, Shape: []uint64{2, 3}, WriterTo: zeros(2 * 3)},
+		{Name: "output.weight", Kind: 0, Shape: []uint64{3, 2}, WriterTo: zeros(3 * 2)},
+	}
+
+	for i := range 8 {
+		for _, proj := range []string{"attn_q", "attn_k", "attn_v", "attn_output"} {
+			tensors = append(tensors, &ggml.Tensor{
+				Name:     "blk." + strconv.Itoa(i) + "." + proj + ".weight",
+				Kind:     0,
+				Shape:    []uint64{3, 3},
+				WriterTo: zeros(3 * 3),
+			})
+		}
+	}
+
+	metadata := ggml.KV{
+		"general.architecture":                   "llama",
+		"llama.block_count":                      uint32(8),
+		"llama.embedding_length":                 uint32(3),
+		"llama.attention.head_count":             uint32(2),
+		"llama.attention.head_count_kv":          uint32(2),
+		"llama.attention.key_length":             uint32(3),
+		"llama.rope.dimension_count":             uint32(4),
+		"llama.rope.freq_base":                   float32(10000.0),
+		"llama.rope.freq_scale":                  float32(1.0),
+		"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
+		"tokenizer.ggml.eos_token_id":            uint32(0),
+		"tokenizer.ggml.eos_token_ids":           []int32{1, 2, 3},
+		"tokenizer.ggml.tokens":                  []string{"hello", "world"},
+		"tokenizer.ggml.scores":                  []float32{0, 1},
+	}
+
+	if err := ggml.WriteGGUF(out, metadata, tensors); err != nil {
+		tb.Fatal(err)
+	}
+
+	return out.Name()
+}
+
+// TestRead opens the file produced by createBinFile and checks single-key and
+// single-tensor lookups, full iteration over key-values and tensor infos, and
+// reading one tensor's data.
+func TestRead(t *testing.T) {
+	f, err := gguf.Open(createBinFile(t))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	if got := f.KeyValue("does.not.exist").Valid(); got {
+		t.Errorf(`KeyValue("does.not.exist").Valid() = %v, want false`, got)
+	}
+
+	if got := f.KeyValue("general.architecture").String(); got != "llama" {
+		t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
+	}
+
+	if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
+		t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
+	} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
+		t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
+	} else if got.Type != gguf.TensorTypeF32 {
+		t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
+	}
+
+	// an unprefixed key is resolved under the architecture ("llama.")
+	if got := f.KeyValue("block_count").Uint(); got != 8 {
+		t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
+	}
+
+	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
+		t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
+	}
+
+	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
+		t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Floats() mismatch (-got +want):\n%s", diff)
+	}
+
+	var kvs []string
+	for _, kv := range f.KeyValues() {
+		if !kv.Valid() {
+			t.Error("found invalid key-value pair:", kv)
+		}
+
+		kvs = append(kvs, kv.Key)
+	}
+
+	if len(kvs) != f.NumKeyValues() {
+		t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
+	}
+
+	if diff := cmp.Diff(kvs, []string{
+		"general.architecture",
+		"llama.block_count",
+		"llama.embedding_length",
+		"llama.attention.head_count",
+		"llama.attention.head_count_kv",
+		"llama.attention.key_length",
+		"llama.rope.dimension_count",
+		"llama.rope.freq_base",
+		"llama.rope.freq_scale",
+		"llama.attention.layer_norm_rms_epsilon",
+		"tokenizer.ggml.eos_token_id",
+		"tokenizer.ggml.eos_token_ids",
+		"tokenizer.ggml.tokens",
+		"tokenizer.ggml.scores",
+	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
+		t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
+	}
+
+	var tis []string
+	for _, ti := range f.TensorInfos() {
+		if !ti.Valid() {
+			t.Error("found invalid tensor info:", ti)
+		}
+
+		tis = append(tis, ti.Name)
+	}
+
+	if len(tis) != f.NumTensors() {
+		t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
+	}
+
+	// same names createBinFile writes; order is irrelevant because both
+	// sides are sorted before comparison
+	wantTensors := []string{"token_embd.weight", "output.weight"}
+	for i := range 8 {
+		for _, proj := range []string{"attn_q", "attn_k", "attn_v", "attn_output"} {
+			wantTensors = append(wantTensors, "blk."+strconv.Itoa(i)+"."+proj+".weight")
+		}
+	}
+
+	if diff := cmp.Diff(tis, wantTensors, cmpopts.SortSlices(strings.Compare)); diff != "" {
+		t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
+	}
+
+	ti, r, err := f.TensorReader("output.weight")
+	if err != nil {
+		t.Fatalf(`TensorReader("output.weight") error: %v`, err)
+	}
+
+	if ti.Name != "output.weight" {
+		t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
+	} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
+		t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
+	} else if ti.Type != gguf.TensorTypeF32 {
+		t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
+	}
+
+	var b bytes.Buffer
+	if _, err := b.ReadFrom(r); err != nil {
+		t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
+	}
+
+	if b.Len() != int(ti.NumBytes()) {
+		t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
+	}
+}
+
+// BenchmarkRead measures opening the file produced by createBinFile, looking
+// up one key-value and iterating over every tensor info.
+func BenchmarkRead(b *testing.B) {
+	b.ReportAllocs()
+
+	path := createBinFile(b)
+	for b.Loop() {
+		f, err := gguf.Open(path)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		arch := f.KeyValue("general.architecture").String()
+		if arch != "llama" {
+			b.Errorf("got = %q, want %q", arch, "llama")
+		}
+
+		// drain the tensor info iterator; the values themselves are unused
+		for range f.TensorInfos() {
+		}
+
+		f.Close()
+	}
+}
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@@ -0,0 +1,90 @@
+package gguf
+
+import (
+	"reflect"
+	"slices"
+)
+
+// KeyValue is one metadata entry: a key and its decoded value. Value is
+// embedded, so the typed accessors (Int, Uint, String, ...) can be called on a
+// KeyValue directly.
+type KeyValue struct {
+	Key string
+	Value
+}
+
+// Valid reports whether kv has a non-empty key and a non-nil value. The zero
+// KeyValue is not valid.
+func (kv KeyValue) Valid() bool {
+	return kv.Key != "" && kv.Value.value != nil
+}
+
+// Value wraps a decoded metadata value of any type. Use the typed accessors to
+// read it; each returns its zero value when the wrapped value is of a
+// different kind.
+type Value struct {
+	value any
+}
+
+// value converts the value wrapped by v to T when its reflect kind is one of
+// kinds. For any other kind, including a nil wrapped value, it returns the
+// zero T.
+func value[T any](v Value, kinds ...reflect.Kind) (t T) {
+	rv := reflect.ValueOf(v.value)
+	if !slices.Contains(kinds, rv.Kind()) {
+		return t
+	}
+
+	return rv.Convert(reflect.TypeOf(t)).Interface().(T)
+}
+
+// values converts the slice wrapped by v to []T, element by element, when the
+// slice's element kind is one of kinds. It returns nil when v does not wrap a
+// slice or the element kind is not listed.
+func values[T any](v Value, kinds ...reflect.Kind) []T {
+	rv := reflect.ValueOf(v.value)
+	if rv.Kind() != reflect.Slice {
+		return nil
+	}
+
+	if !slices.Contains(kinds, rv.Type().Elem().Kind()) {
+		return nil
+	}
+
+	var zero T
+	target := reflect.TypeOf(zero)
+
+	out := make([]T, rv.Len())
+	for i := range out {
+		out[i] = rv.Index(i).Convert(target).Interface().(T)
+	}
+
+	return out
+}
+
+// Int returns Value as an int64. It returns 0 if the underlying value is not a
+// signed integer (int, int8, int16, int32 or int64); unsigned integers are not
+// converted.
+func (v Value) Int() int64 {
+	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
+}
+
+// Ints returns Value as an int64 slice. It returns nil if the underlying value
+// is not a slice of signed integers.
+func (v Value) Ints() (i64s []int64) {
+	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
+}
+
+// Uint returns Value as a uint64. It returns 0 if the underlying value is not
+// an unsigned integer (uint, uint8, uint16, uint32 or uint64); signed integers
+// are not converted.
+func (v Value) Uint() uint64 {
+	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
+}
+
+// Uints returns Value as a uint64 slice. It returns nil if the underlying
+// value is not a slice of unsigned integers.
+func (v Value) Uints() (u64s []uint64) {
+	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
+}
+
+// Float returns Value as a float64. It returns 0 if the underlying value is
+// not a float32 or float64.
+func (v Value) Float() float64 {
+	return value[float64](v, reflect.Float32, reflect.Float64)
+}
+
+// Floats returns Value as a float64 slice. It returns nil if the underlying
+// value is not a slice of float32 or float64.
+func (v Value) Floats() (f64s []float64) {
+	return values[float64](v, reflect.Float32, reflect.Float64)
+}
+
+// Bool returns Value as a boolean. It returns false if the underlying value is
+// not a boolean.
+func (v Value) Bool() bool {
+	return value[bool](v, reflect.Bool)
+}
+
+// Bools returns Value as a boolean slice. It returns nil if the underlying
+// value is not a boolean slice.
+func (v Value) Bools() (bools []bool) {
+	return values[bool](v, reflect.Bool)
+}
+
+// String returns Value as a string. It returns the empty string if the
+// underlying value is not a string; other kinds are not formatted.
+func (v Value) String() string {
+	return value[string](v, reflect.String)
+}
+
+// Strings returns Value as a string slice. It returns nil if the underlying
+// value is not a string slice.
+func (v Value) Strings() (strings []string) {
+	return values[string](v, reflect.String)
+}
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@@ -0,0 +1,208 @@
+package gguf
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+// split partitions a fixture map for one accessor test: matched is the slice
+// stored under name (nil when name is absent), unmatched is the concatenation
+// of every other slice. The order of unmatched follows map iteration and is
+// therefore unspecified.
+func split(name string, values map[string][]any) (matched []any, unmatched []any) {
+	matched = values[name]
+
+	for family, samples := range values {
+		if family == name {
+			continue
+		}
+
+		unmatched = append(unmatched, samples...)
+	}
+
+	return matched, unmatched
+}
+
+// TestValue checks every scalar accessor twice: with inputs of its own family
+// it must return the stored value, and with inputs of any other family it must
+// return its zero value. Failure messages report the actual expectation and
+// the offending input.
+func TestValue(t *testing.T) {
+	values := map[string][]any{
+		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
+		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
+		"float64": {float32(42), float64(42)},
+		"string":  {"42", "hello"},
+		"bool":    {true, false},
+	}
+
+	// same returns the input itself; used where the accessor is expected to
+	// hand the stored value back unchanged
+	same := func(v any) any { return v }
+
+	cases := []struct {
+		name string
+		get  func(KeyValue) any
+		want func(v any) any
+		zero any
+	}{
+		{"int64", func(kv KeyValue) any { return kv.Int() }, func(any) any { return int64(42) }, int64(0)},
+		{"uint64", func(kv KeyValue) any { return kv.Uint() }, func(any) any { return uint64(42) }, uint64(0)},
+		{"float64", func(kv KeyValue) any { return kv.Float() }, func(any) any { return float64(42) }, float64(0)},
+		{"string", func(kv KeyValue) any { return kv.String() }, same, ""},
+		{"bool", func(kv KeyValue) any { return kv.Bool() }, same, false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			matched, unmatched := split(tc.name, values)
+			for _, v := range matched {
+				kv := KeyValue{"key", Value{v}}
+				if got, want := tc.get(kv), tc.want(v); got != want {
+					t.Errorf("%T(%v): expected %v, got %v", v, v, want, got)
+				}
+			}
+
+			for _, v := range unmatched {
+				kv := KeyValue{"key", Value{v}}
+				if got := tc.get(kv); got != tc.zero {
+					t.Errorf("%T(%v): expected %v, got %v", v, v, tc.zero, got)
+				}
+			}
+		})
+	}
+}
+
+// checkSliceAccessor runs one subtest of TestValues. For every fixture stored
+// under name, get must return a slice equal to want(fixture); for every other
+// fixture it must return nil.
+func checkSliceAccessor[T any](t *testing.T, name string, fixtures map[string][]any, get func(KeyValue) []T, want func(v any) any) {
+	t.Run(name, func(t *testing.T) {
+		matched, unmatched := split(name, fixtures)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(get(kv), want(v)); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if got := get(kv); got != nil {
+				t.Errorf("expected nil, got %v", got)
+			}
+		}
+	})
+}
+
+// TestValues checks every slice accessor twice: with slices of its own family
+// it must return the converted slice, and with slices of any other family it
+// must return nil.
+func TestValues(t *testing.T) {
+	values := map[string][]any{
+		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
+		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
+		"float64s": {[]float32{42}, []float64{42}},
+		"strings":  {[]string{"42"}, []string{"hello"}},
+		"bools":    {[]bool{true}, []bool{false}},
+	}
+
+	// same returns the fixture itself; strings and bools need no widening
+	same := func(v any) any { return v }
+
+	checkSliceAccessor(t, "int64s", values, KeyValue.Ints, func(any) any { return []int64{42} })
+	checkSliceAccessor(t, "uint64s", values, KeyValue.Uints, func(any) any { return []uint64{42} })
+	checkSliceAccessor(t, "float64s", values, KeyValue.Floats, func(any) any { return []float64{42} })
+	checkSliceAccessor(t, "strings", values, KeyValue.Strings, same)
+	checkSliceAccessor(t, "bools", values, KeyValue.Bools, same)
+}
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@@ -0,0 +1,89 @@
+package gguf
+
+import (
+	"encoding/binary"
+	"iter"
+	"log/slog"
+)
+
+// lazy is a sequence of count values of type T that are decoded on demand.
+// next decodes and returns the following value (false once the sequence has
+// ended), stop ends the sequence early, and values caches everything decoded
+// so far in decode order.
+type lazy[T any] struct {
+	count  uint64
+	next   func() (T, bool)
+	stop   func()
+	values []T
+
+	// successFunc is called when all values have been successfully read.
+	successFunc func() error
+}
+
+// newLazy reads a uint64 element count from f and returns a lazy sequence that
+// decodes up to that many elements by calling fn, one per call to next. Every
+// decoded element is appended to values so it can be revisited without
+// touching the file again.
+//
+// A decode error is logged and ends the sequence early. successFunc, if set,
+// runs only once all count elements have been decoded; it is skipped after a
+// decode error and when the sequence is stopped while elements are still
+// outstanding. An error returned by successFunc is logged.
+func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
+	it := lazy[T]{}
+	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
+		return nil, err
+	}
+
+	it.values = make([]T, 0)
+	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
+		for i := range it.count {
+			t, err := fn()
+			if err != nil {
+				// newLazy serves both key-values and tensor infos, so the
+				// message must not name either one
+				slog.Error("error reading value", "index", i, "error", err)
+				return
+			}
+
+			it.values = append(it.values, t)
+			if !yield(t) {
+				break
+			}
+		}
+
+		// stopped before the end: the reader is mid-sequence, so anything
+		// successFunc derives from its position would be wrong
+		if uint64(len(it.values)) < it.count {
+			return
+		}
+
+		if it.successFunc != nil {
+			if err := it.successFunc(); err != nil {
+				slog.Error("error finalizing values", "error", err)
+			}
+		}
+	})
+
+	return &it, nil
+}
+
+// Values returns an iterator over the sequence's values without their index.
+// It is All with the index dropped.
+func (g *lazy[T]) Values() iter.Seq[T] {
+	return func(yield func(T) bool) {
+		g.All()(func(_ int, v T) bool {
+			return yield(v)
+		})
+	}
+}
+
+// All returns an iterator over the sequence's values paired with their
+// zero-based index. Values that were already decoded are served from the
+// cache; the remainder is decoded through next. Iteration ends early when a
+// value cannot be decoded.
+func (g *lazy[T]) All() iter.Seq2[int, T] {
+	return func(yield func(int, T) bool) {
+		for i := range int(g.count) {
+			var v T
+			if i < len(g.values) {
+				v = g.values[i]
+			} else {
+				var ok bool
+				if v, ok = g.next(); !ok {
+					return
+				}
+			}
+
+			if !yield(i, v) {
+				return
+			}
+		}
+	}
+}
+
+// rest pulls from next until the sequence ends and reports whether at least
+// one value was pulled by this call.
+func (g *lazy[T]) rest() (collected bool) {
+	for _, ok := g.next(); ok; _, ok = g.next() {
+		collected = true
+	}
+
+	return collected
+}
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@@ -0,0 +1,23 @@
+package gguf
+
+import (
+	"bufio"
+	"io"
+)
+
+// bufferedReader is a bufio.Reader that also counts the bytes handed out
+// through its Read method. Bytes obtained through other promoted bufio.Reader
+// methods are not counted.
+type bufferedReader struct {
+	offset int64
+	*bufio.Reader
+}
+
+// newBufferedReader wraps rs in a buffer of the given size with the byte
+// counter at zero.
+func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
+	br := new(bufferedReader)
+	br.Reader = bufio.NewReaderSize(rs, size)
+	return br
+}
+
+// Read reads into p through the buffer and adds the number of bytes read to
+// offset, also when the read returns an error alongside some data.
+func (rs *bufferedReader) Read(p []byte) (int, error) {
+	n, err := rs.Reader.Read(p)
+	rs.offset += int64(n)
+	return n, err
+}
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@@ -0,0 +1,288 @@
+package gguf
+
+import (
+	"log/slog"
+	"strings"
+)
+
+// TensorInfo describes a single tensor: its name, offset, shape and element
+// type.
+type TensorInfo struct {
+	Name   string
+	// Offset is the tensor's data offset — presumably relative to the start
+	// of the tensor data section; confirm against the reader.
+	Offset uint64
+	// Shape lists the size of each dimension; NumValues multiplies them.
+	Shape  []uint64
+	// Type determines the per-value size used by NumBytes.
+	Type   TensorType
+}
+
+// Valid reports whether the tensor has a non-empty name and a positive size in
+// bytes.
+func (ti TensorInfo) Valid() bool {
+	if ti.Name == "" {
+		return false
+	}
+	return ti.NumBytes() > 0
+}
+
+func (ti TensorInfo) NumValues() int64 {
+	var numItems int64 = 1
+	for _, dim := range ti.Shape {
+		numItems *= int64(dim)
+	}
+	return numItems
+}
+
+// NumBytes returns the size of the tensor in bytes: the number of values
+// multiplied by the per-value size of the tensor's type, truncated to an
+// integer.
+func (ti TensorInfo) NumBytes() int64 {
+	count := float64(ti.NumValues())
+	perValue := ti.Type.NumBytes()
+	return int64(count * perValue)
+}
+
+// LogValue implements slog.LogValuer, rendering the tensor as a group holding
+// its name, offset, shape, value count, byte size and type.
+func (ti TensorInfo) LogValue() slog.Value {
+	attrs := []slog.Attr{
+		slog.String("name", ti.Name),
+		slog.Int64("offset", int64(ti.Offset)),
+		slog.Any("shape", ti.Shape),
+		slog.Int64("num_values", ti.NumValues()),
+		slog.Int64("num_bytes", ti.NumBytes()),
+		slog.Any("type", ti.Type),
+	}
+	return slog.GroupValue(attrs...)
+}
+
+// TensorType identifies the element/quantization format of a tensor.
+//
+// The numeric values are assigned by iota in declaration order, so the order
+// of the constants below — including the unexported placeholders — must not
+// change. NOTE(review): the values presumably mirror the tensor type ids
+// stored in GGUF files; confirm against the GGUF/ggml type enum.
+type TensorType uint32
+
+const (
+	TensorTypeF32 TensorType = iota
+	TensorTypeF16
+	TensorTypeQ4_0
+	TensorTypeQ4_1
+
+	// unexported // unused in gguf
+	tensorTypeQ4_2
+	tensorTypeQ4_3
+
+	TensorTypeQ5_0
+	TensorTypeQ5_1
+	TensorTypeQ8_0
+	TensorTypeQ8_1
+	TensorTypeQ2_K
+	TensorTypeQ3_K
+	TensorTypeQ4_K
+	TensorTypeQ5_K
+	TensorTypeQ6_K
+	TensorTypeQ8_K
+
+	// unexported // unquantizable by ollama
+	tensorTypeIQ2_XXS
+	tensorTypeIQ2_XS
+	tensorTypeIQ3_XXS
+	tensorTypeIQ1_S
+	tensorTypeIQ4_NL
+	tensorTypeIQ3_S
+	tensorTypeIQ2_S
+	tensorTypeIQ4_XS
+
+	TensorTypeI8
+	TensorTypeI16
+	TensorTypeI32
+	TensorTypeI64
+	TensorTypeF64
+
+	// unexported // unquantizable by ollama
+	tensorTypeIQ1_M
+
+	TensorTypeBF16
+
+	// unexported // unused in gguf
+	tensorTypeQ4_0_4_4
+	tensorTypeQ4_0_4_8
+	tensorTypeQ4_0_8_8
+
+	// unexported // unquantizable by ollama
+	tensorTypeTQ1_0
+	tensorTypeTQ2_0
+
+	// unexported // unused in gguf
+	tensorTypeIQ4_NL_4_4
+	tensorTypeIQ4_NL_4_8
+	tensorTypeIQ4_NL_8_8
+)
+
+// NumBytes returns the average number of bytes used per value: the size of one
+// block divided by the number of values in a block. The result is fractional
+// for types that pack several values into a byte.
+func (tt TensorType) NumBytes() float64 {
+	size, block := tt.typeSize(), tt.blockSize()
+	return float64(size) / float64(block)
+}
+
+// typeSize returns the size in bytes of one block of tt, where a block holds
+// blockSize() values. It returns 0 for types whose layout is not known here
+// (the placeholder types marked "unused in gguf" and any unrecognized value).
+func (tt TensorType) typeSize() int64 {
+	bs := tt.blockSize()
+
+	switch tt {
+	case TensorTypeF32:
+		return 4
+	case TensorTypeF16:
+		return 2
+	case TensorTypeQ4_0:
+		return 2 + bs/2
+	case TensorTypeQ4_1:
+		return 2 + 2 + bs/2
+	case TensorTypeQ5_0:
+		return 2 + 4 + bs/2
+	case TensorTypeQ5_1:
+		return 2 + 2 + 4 + bs/2
+	case TensorTypeQ8_0:
+		return 2 + bs
+	case TensorTypeQ8_1:
+		return 2 + 2 + bs
+	case TensorTypeQ2_K:
+		return bs/16 + bs/4 + 2 + 2
+	case TensorTypeQ3_K:
+		return bs/8 + bs/4 + 12 + 2
+	case TensorTypeQ4_K:
+		return 2 + 2 + 12 + bs/2
+	case TensorTypeQ5_K:
+		return 2 + 2 + 12 + bs/8 + bs/2
+	case TensorTypeQ6_K:
+		return bs/2 + bs/4 + bs/16 + 2
+	case TensorTypeQ8_K:
+		return 4 + bs + 2*bs/16
+	case tensorTypeIQ2_XXS:
+		return 2 + 2*bs/8
+	case tensorTypeIQ2_XS:
+		return 2 + 2*bs/8 + bs/32
+	case tensorTypeIQ3_XXS:
+		return 2 + bs/4 + bs/8
+	case tensorTypeIQ1_S:
+		return 2 + bs/8 + bs/16
+	case tensorTypeIQ4_NL:
+		return 2 + bs/2
+	case tensorTypeIQ3_S:
+		return 2 + bs/4 + bs/8 + bs/32 + 4
+	case tensorTypeIQ2_S:
+		return 2 + bs/4 + bs/16
+	case tensorTypeIQ4_XS:
+		return 2 + 2 + bs/2 + bs/64
+	case TensorTypeI8:
+		return 1
+	case TensorTypeI16:
+		return 2
+	case TensorTypeI32:
+		return 4
+	case TensorTypeI64:
+		return 8
+	case TensorTypeF64:
+		return 8
+	case tensorTypeIQ1_M:
+		return bs/8 + bs/16 + bs/32
+	case TensorTypeBF16:
+		return 2
+	case tensorTypeTQ1_0:
+		// Per ggml's block_tq1_0: 2-byte scale, qh[bs/64] and
+		// qs[(bs-4*bs/64)/5], i.e. 54 bytes for a 256-value block.
+		return 2 + bs/64 + (bs-4*bs/64)/5
+	case tensorTypeTQ2_0:
+		// Per ggml's block_tq2_0: 2-byte scale plus 2 bits per value,
+		// i.e. 66 bytes for a 256-value block.
+		return 2 + bs/4
+	default:
+		return 0
+	}
+}
+
+// blockSize returns the number of values stored in one block of tt: 1 for the
+// plain float and integer types, 32 for the quantized types listed below, and
+// 256 for everything else (including unrecognized values).
+func (tt TensorType) blockSize() int64 {
+	switch tt {
+	case TensorTypeQ4_0, TensorTypeQ4_1,
+		TensorTypeQ5_0, TensorTypeQ5_1,
+		TensorTypeQ8_0, TensorTypeQ8_1,
+		tensorTypeIQ4_NL:
+		return 32
+	case TensorTypeF32, TensorTypeF16, TensorTypeBF16, TensorTypeF64,
+		TensorTypeI8, TensorTypeI16, TensorTypeI32, TensorTypeI64:
+		return 1
+	}
+
+	return 256
+}
+
+// String returns the lowercase name of the tensor type, or "unknown" for a
+// value that has no declared constant.
+func (tt TensorType) String() string {
+	// Indexed by type value; every declared constant has an entry, so the
+	// table has no gaps.
+	names := [...]string{
+		TensorTypeF32:        "f32",
+		TensorTypeF16:        "f16",
+		TensorTypeQ4_0:       "q4_0",
+		TensorTypeQ4_1:       "q4_1",
+		tensorTypeQ4_2:       "q4_2",
+		tensorTypeQ4_3:       "q4_3",
+		TensorTypeQ5_0:       "q5_0",
+		TensorTypeQ5_1:       "q5_1",
+		TensorTypeQ8_0:       "q8_0",
+		TensorTypeQ8_1:       "q8_1",
+		TensorTypeQ2_K:       "q2_k",
+		TensorTypeQ3_K:       "q3_k",
+		TensorTypeQ4_K:       "q4_k",
+		TensorTypeQ5_K:       "q5_k",
+		TensorTypeQ6_K:       "q6_k",
+		TensorTypeQ8_K:       "q8_k",
+		tensorTypeIQ2_XXS:    "iq2_xxs",
+		tensorTypeIQ2_XS:     "iq2_xs",
+		tensorTypeIQ3_XXS:    "iq3_xxs",
+		tensorTypeIQ1_S:      "iq1_s",
+		tensorTypeIQ4_NL:     "iq4_nl",
+		tensorTypeIQ3_S:      "iq3_s",
+		tensorTypeIQ2_S:      "iq2_s",
+		tensorTypeIQ4_XS:     "iq4_xs",
+		TensorTypeI8:         "i8",
+		TensorTypeI16:        "i16",
+		TensorTypeI32:        "i32",
+		TensorTypeI64:        "i64",
+		TensorTypeF64:        "f64",
+		tensorTypeIQ1_M:      "iq1_m",
+		TensorTypeBF16:       "bf16",
+		tensorTypeQ4_0_4_4:   "q4_0_4_4",
+		tensorTypeQ4_0_4_8:   "q4_0_4_8",
+		tensorTypeQ4_0_8_8:   "q4_0_8_8",
+		tensorTypeTQ1_0:      "tq1_0",
+		tensorTypeTQ2_0:      "tq2_0",
+		tensorTypeIQ4_NL_4_4: "iq4_nl_4_4",
+		tensorTypeIQ4_NL_4_8: "iq4_nl_4_8",
+		tensorTypeIQ4_NL_8_8: "iq4_nl_8_8",
+	}
+
+	if tt < TensorType(len(names)) {
+		return names[tt]
+	}
+	return "unknown"
+}
+
+// LogValue implements slog.LogValuer, rendering the type as a group holding
+// its numeric value, upper-cased name, block byte size, values per block and
+// bytes per value.
+func (tt TensorType) LogValue() slog.Value {
+	attrs := []slog.Attr{
+		slog.Uint64("value", uint64(tt)),
+		slog.String("name", strings.ToUpper(tt.String())),
+		slog.Int64("size", tt.typeSize()),
+		slog.Int64("block_size", tt.blockSize()),
+		slog.Float64("num_bytes", tt.NumBytes()),
+	}
+	return slog.GroupValue(attrs...)
+}
--- a/go.mod
+++ b/go.mod
@@ -19,12 +19,13 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/google/go-cmp v0.6.0
+	github.com/google/go-cmp v0.7.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	golang.org/x/image v0.22.0
 	golang.org/x/tools v0.30.0
+	gonum.org/v1/gonum v0.15.0
 )

 require (
@@ -44,7 +45,6 @@ require (
 	github.com/xtgo/set v1.0.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
-	gonum.org/v1/gonum v0.15.0 // indirect
 	gorgonia.org/vecf32 v0.9.0 // indirect
 	gorgonia.org/vecf64 v0.9.0 // indirect
 )
@@ -71,7 +71,7 @@ require (
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
 	golang.org/x/crypto v0.36.0
-	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
+	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
 	golang.org/x/net v0.38.0 // indirect
 	golang.org/x/sys v0.31.0
 	golang.org/x/term v0.30.0
--- a/go.sum
+++ b/go.sum
@@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
--- a/integration/library_models_test.go
+++ b/integration/library_models_test.go
@@ -0,0 +1,57 @@
+//go:build integration && library
+
+package integration
+
+import (
+	"context"
+	"log/slog"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+// TestLibraryModelsGenerate sends one short generate request to every model in
+// libraryChatModels and checks the response for at least one expected keyword.
+//
+// First run of this scenario on a target system will take a long time to download
+// ~1.5TB of models.  Set a sufficiently large -timeout for your network speed
+func TestLibraryModelsGenerate(t *testing.T) {
+	softTimeout, hardTimeout := getTimeouts(t)
+	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryChatModels {
+		t.Run(model, func(t *testing.T) {
+			if time.Since(started) > softTimeout {
+				t.Skip("skipping remaining tests to avoid excessive runtime")
+			}
+			if err := PullIfMissing(ctx, client, model); err != nil {
+				t.Fatalf("pull failed %s", err)
+			}
+
+			// Defaults used for general-purpose chat models.
+			prompt := "why is the sky blue?"
+			expected := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
+
+			// Special cases
+			switch model {
+			case "duckdb-nsql":
+				expected = []string{"select", "from"}
+			case "granite3-guardian", "shieldgemma", "llama-guard3", "bespoke-minicheck":
+				expected = []string{"yes", "no", "safe", "unsafe"}
+			case "openthinker", "nexusraven":
+				expected = []string{"plugin", "im_sep", "components", "function call"}
+			case "starcoder", "starcoder2", "magicoder", "deepseek-coder":
+				prompt = "def fibonacci():"
+				expected = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
+			}
+
+			req := api.GenerateRequest{
+				Model:     model,
+				Prompt:    prompt,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"temperature": 0.1,
+					"seed":        123,
+				},
+			}
+			DoGenerate(ctx, t, client, req, expected, 120*time.Second, 30*time.Second)
+		})
+	}
+}
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@@ -19,35 +19,6 @@ import (
 	"github.com/ollama/ollama/format"
 )

-var (
-	started    = time.Now()
-	chatModels = []string{
-		"granite3-moe:latest",
-		"granite-code:latest",
-		"nemotron-mini:latest",
-		"command-r:latest",
-		"gemma2:latest",
-		"gemma:latest",
-		"internlm2:latest",
-		"phi3.5:latest",
-		"phi3:latest",
-		// "phi:latest", // flaky, sometimes generates no response on first query
-		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
-		"falcon:latest",
-		"falcon2:latest",
-		"minicpm-v:latest",
-		"mistral:latest",
-		"orca-mini:latest",
-		"llama2:latest",
-		"llama3.1:latest",
-		"llama3.2:latest",
-		"llama3.2-vision:latest",
-		"qwen2.5-coder:latest",
-		"qwen:latest",
-		"solar-pro:latest",
-	}
-)
-
 func TestModelsGenerate(t *testing.T) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
@@ -68,6 +39,13 @@ func TestModelsGenerate(t *testing.T) {
 		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
 	}

+	var chatModels []string
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		chatModels = ollamaEngineChatModels
+	} else {
+		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
+	}
+
 	for _, model := range chatModels {
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
--- a/integration/model_perf_test.go
+++ b/integration/model_perf_test.go
@@ -0,0 +1,266 @@
+//go:build integration && perf
+
+package integration
+
+import (
+	"context"
+	"fmt"
+	"io/ioutil"
+	"log/slog"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
+)
+
+var (
+	// Models that don't work reliably with the large context prompt in this test case
+	// TestModelsPerf still runs the short prompt for these models and only
+	// skips the long prompt. Entries are compared against the full model
+	// name, including the tag.
+	longContextFlakes = []string{
+		"granite-code:latest",
+		"nemotron-mini:latest",
+		"falcon:latest",  // 2k model
+		"falcon2:latest", // 2k model
+		"minicpm-v:latest",
+		"qwen:latest",
+		"solar-pro:latest",
+	}
+)
+
+// TestModelsPerf runs each chat model at several context sizes with a short
+// and a long prompt, checks the responses for expected keywords, and prints
+// MODEL_PERF_HEADER / MODEL_PERF_DATA CSV lines to stderr.
+//
+// Note: this test case can take a long time to run, particularly on models with
+// large contexts.  Run with -timeout set to a large value to get reasonable coverage
+// Example usage:
+//
+// go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf 2>&1 | tee int.log
+// cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
+// cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
+func TestModelsPerf(t *testing.T) {
+	softTimeout, hardTimeout := getTimeouts(t)
+	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	// TODO use info API eventually
+	var maxVram uint64
+	var err error
+	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
+		maxVram, err = strconv.ParseUint(s, 10, 64)
+		if err != nil {
+			t.Fatalf("invalid  OLLAMA_MAX_VRAM %v", err)
+		}
+	} else {
+		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
+	}
+
+	data, err := ioutil.ReadFile(filepath.Join("testdata", "shakespeare.txt"))
+	if err != nil {
+		t.Fatalf("failed to open test data file: %s", err)
+	}
+	longPrompt := "summarize the following: " + string(data)
+
+	var chatModels []string
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		chatModels = ollamaEngineChatModels
+	} else {
+		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
+	}
+
+	for _, model := range chatModels {
+		t.Run(model, func(t *testing.T) {
+			if time.Since(started) > softTimeout {
+				t.Skip("skipping remaining tests to avoid excessive runtime")
+			}
+			if err := PullIfMissing(ctx, client, model); err != nil {
+				t.Fatalf("pull failed %s", err)
+			}
+
+			resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
+			if err != nil {
+				t.Fatalf("show failed: %s", err)
+			}
+			// Use checked assertions so missing or oddly typed metadata fails
+			// this subtest with a message instead of panicking the whole run.
+			arch, ok := resp.ModelInfo["general.architecture"].(string)
+			if !ok {
+				t.Fatalf("model info has no string general.architecture: %v", resp.ModelInfo["general.architecture"])
+			}
+			ctxKey := fmt.Sprintf("%s.context_length", arch)
+			ctxLen, ok := resp.ModelInfo[ctxKey].(float64)
+			if !ok {
+				t.Fatalf("model info has no numeric %s: %v", ctxKey, resp.ModelInfo[ctxKey])
+			}
+			maxContext := int(ctxLen)
+
+			if maxVram > 0 {
+				resp, err := client.List(ctx)
+				if err != nil {
+					t.Fatalf("list models failed %v", err)
+				}
+				for _, m := range resp.Models {
+					// For these tests we want to exercise some amount of overflow on the CPU
+					if m.Name == model && float32(m.Size)*0.75 > float32(maxVram) {
+						t.Skipf("model %s is too large %s for available VRAM %s", model, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
+					}
+				}
+			}
+			slog.Info("scenario", "model", model, "max_context", maxContext)
+			loaded := false
+			defer func() {
+				// best effort unload once we're done with the model
+				if loaded {
+					client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+				}
+			}()
+
+			// Some models don't handle the long context data well so skip them to avoid flaky test results
+			longContextFlake := false
+			for _, flake := range longContextFlakes {
+				if model == flake {
+					longContextFlake = true
+					break
+				}
+			}
+
+			// iterate through a few context sizes for coverage without excessive runtime
+			var contexts []int
+			keepGoing := true
+			if maxContext > 16384 {
+				contexts = []int{4096, 8192, 16384, maxContext}
+			} else if maxContext > 8192 {
+				contexts = []int{4096, 8192, maxContext}
+			} else if maxContext > 4096 {
+				contexts = []int{4096, maxContext}
+			} else if maxContext > 0 {
+				contexts = []int{maxContext}
+			} else {
+				t.Fatal("unknown max context size")
+			}
+			for _, numCtx := range contexts {
+				if !keepGoing && numCtx > 8192 { // Always try up to 8k before bailing out
+					break
+				}
+				skipLongPrompt := false
+
+				// Workaround bug 11172 temporarily...
+				maxPrompt := longPrompt
+				// If we fill the context too full with the prompt, many models
+				// quickly hit context shifting and go bad.
+				if len(maxPrompt) > numCtx*2 { // typically yields ~1/2 full context
+					maxPrompt = maxPrompt[:numCtx*2]
+				}
+
+				testCases := []struct {
+					prompt  string
+					anyResp []string
+				}{
+					{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
+					{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
+				}
+				var gpuPercent int
+				for _, tc := range testCases {
+					if len(tc.prompt) > 100 && (longContextFlake || skipLongPrompt) {
+						slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
+						continue
+					}
+					req := api.GenerateRequest{
+						Model:     model,
+						Prompt:    tc.prompt,
+						KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
+						Options: map[string]interface{}{
+							"temperature": 0,
+							"seed":        123,
+							"num_ctx":     numCtx,
+						},
+					}
+					atLeastOne := false
+					var resp api.GenerateResponse
+
+					stream := false
+					req.Stream = &stream
+
+					// Avoid potentially getting stuck indefinitely
+					limit := 5 * time.Minute
+					genCtx, genCancel := context.WithDeadlineCause(
+						ctx,
+						time.Now().Add(limit),
+						fmt.Errorf("generate on model %s with ctx %d took longer than %v", model, numCtx, limit),
+					)
+
+					err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
+						resp = rsp
+						return nil
+					})
+
+					// The "took longer" message is attached as the context
+					// cause and is not guaranteed to appear in err, so read it
+					// from the context before releasing it.
+					cause := context.Cause(genCtx)
+					timedOut := cause != nil && strings.Contains(cause.Error(), "took longer")
+
+					// Release the per-request context now; a defer here would
+					// pile up one pending timer per iteration until the
+					// subtest returns.
+					genCancel()
+
+					if err != nil {
+						// Avoid excessive test runs, but don't consider a failure with massive context
+						if numCtx > 16384 && (timedOut || strings.Contains(err.Error(), "took longer")) {
+							slog.Warn("max context was taking too long, skipping", "error", err)
+							keepGoing = false
+							skipLongPrompt = true
+							continue
+						}
+						t.Fatalf("generate error: ctx:%d err:%s", numCtx, err)
+					}
+					loaded = true
+					for _, expResp := range tc.anyResp {
+						if strings.Contains(strings.ToLower(resp.Response), expResp) {
+							atLeastOne = true
+							break
+						}
+					}
+					if !atLeastOne {
+						t.Fatalf("response didn't contain expected values: ctx:%d  expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
+					}
+					models, err := client.ListRunning(ctx)
+					if err != nil {
+						slog.Warn("failed to list running models", "error", err)
+						continue
+					}
+					if len(models.Models) > 1 {
+						slog.Warn("multiple models loaded, may impact performance results", "loaded", models.Models)
+					}
+					for _, m := range models.Models {
+						if m.Name == model {
+							if m.SizeVRAM == 0 {
+								slog.Info("Model fully loaded into CPU")
+								gpuPercent = 0
+								keepGoing = false
+								skipLongPrompt = true
+							} else if m.SizeVRAM == m.Size {
+								slog.Info("Model fully loaded into GPU")
+								gpuPercent = 100
+							} else {
+								sizeCPU := m.Size - m.SizeVRAM
+								cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+								gpuPercent = int(100 - cpuPercent)
+								slog.Info("Model split between CPU/GPU", "CPU", cpuPercent, "GPU", gpuPercent)
+								keepGoing = false
+
+								// Heuristic to avoid excessive test run time
+								if gpuPercent < 90 {
+									skipLongPrompt = true
+								}
+							}
+						}
+					}
+					// The header is repeated with every data row; the usage
+					// example above keeps only the first one.
+					fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
+						"MODEL",
+						"CONTEXT",
+						"GPU PERCENT",
+						"PROMPT COUNT",
+						"LOAD TIME",
+						"PROMPT EVAL TPS",
+						"EVAL TPS",
+					)
+					fmt.Fprintf(os.Stderr, "MODEL_PERF_DATA:%s,%d,%d,%d,%0.2f,%0.2f,%0.2f\n",
+						model,
+						numCtx,
+						gpuPercent,
+						resp.PromptEvalCount,
+						float64(resp.LoadDuration)/1000000000.0,
+						float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
+						float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
+					)
+				}
+			}
+		})
+	}
+}
--- a/integration/testdata/shakespeare.txt
+++ b/integration/testdata/shakespeare.txt
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -32,6 +32,229 @@ const (
 	smol = "llama3.2:1b"
 )

+var (
+	// started is captured at package initialization; tests compare the time
+	// elapsed since then against their soft timeout to skip remaining
+	// subtests.
+	started = time.Now()
+
+	// ollamaEngineChatModels is always included in the model tests; it is the
+	// only list used when OLLAMA_NEW_ENGINE is set.
+	// Note: add newer models at the top of the list to test them first
+	ollamaEngineChatModels = []string{
+		"gemma3n:e2b",
+		"mistral-small3.2:latest",
+		"deepseek-r1:1.5b",
+		"llama3.2-vision:latest",
+		"qwen2.5-coder:latest",
+		"qwen2.5vl:3b",
+		"qwen3:0.6b", // dense
+		"qwen3:30b",  // MOE
+		"gemma3:1b",
+		"llama3.1:latest",
+		"llama3.2:latest",
+		"gemma2:latest",
+		"minicpm-v:latest",    // arch=qwen2
+		"granite-code:latest", // arch=llama
+	}
+	// llamaRunnerChatModels is appended to ollamaEngineChatModels by the model
+	// tests when OLLAMA_NEW_ENGINE is not set.
+	llamaRunnerChatModels = []string{
+		"mistral:latest",
+		"falcon3:latest",
+		"granite3-moe:latest",
+		"command-r:latest",
+		"nemotron-mini:latest",
+		"phi3.5:latest",
+		"solar-pro:latest",
+		"internlm2:latest",
+		"codellama:latest", // arch=llama
+		"phi3:latest",
+		"falcon2:latest",
+		"gemma:latest",
+		"llama2:latest",
+		"nous-hermes:latest",
+		"orca-mini:latest",
+		"qwen:latest",
+		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
+		"falcon:latest",
+	}
+
+	// libraryChatModels is the set iterated by TestLibraryModelsGenerate.
+	// Some library models are quite large - ensure large VRAM and sufficient disk space
+	// before running scenarios based on this set
+	libraryChatModels = []string{
+		"alfred",
+		"athene-v2",
+		"aya-expanse",
+		"aya",
+		"bakllava",
+		"bespoke-minicheck",
+		"codebooga",
+		"codegeex4",
+		"codegemma",
+		"codellama",
+		"codeqwen",
+		"codestral",
+		"codeup",
+		"cogito",
+		"command-a",
+		"command-r-plus",
+		"command-r",
+		"command-r7b-arabic",
+		"command-r7b",
+		"dbrx",
+		"deepcoder",
+		"deepscaler",
+		"deepseek-coder-v2",
+		"deepseek-coder",
+		"deepseek-llm",
+		"deepseek-r1",
+		// "deepseek-v2.5", // requires 155 GB VRAM
+		"deepseek-v2",
+		// "deepseek-v3", // requires 482 GB VRAM
+		"devstral",
+		"dolphin-llama3",
+		"dolphin-mistral",
+		"dolphin-mixtral",
+		"dolphin-phi",
+		"dolphin3",
+		"dolphincoder",
+		"duckdb-nsql",
+		"everythinglm",
+		"exaone-deep",
+		"exaone3.5",
+		"falcon",
+		"falcon2",
+		"falcon3",
+		"firefunction-v2",
+		"gemma",
+		"gemma2",
+		"gemma3",
+		"gemma3n",
+		"glm4",
+		"goliath",
+		"granite-code",
+		"granite3-dense",
+		"granite3-guardian",
+		"granite3-moe",
+		"granite3.1-dense",
+		"granite3.1-moe",
+		"granite3.2-vision",
+		"granite3.2",
+		"granite3.3",
+		"hermes3",
+		"internlm2",
+		"llama-guard3",
+		"llama-pro",
+		"llama2-chinese",
+		"llama2-uncensored",
+		"llama2",
+		"llama3-chatqa",
+		"llama3-gradient",
+		"llama3-groq-tool-use",
+		"llama3.1",
+		"llama3.2-vision",
+		"llama3.2",
+		"llama3.3",
+		"llama3",
+		"llama4",
+		"llava-llama3",
+		"llava-phi3",
+		"llava",
+		"magicoder",
+		"magistral",
+		"marco-o1",
+		"mathstral",
+		"meditron",
+		"medllama2",
+		"megadolphin",
+		"minicpm-v",
+		"mistral-large",
+		"mistral-nemo",
+		"mistral-openorca",
+		"mistral-small",
+		"mistral-small3.1",
+		"mistral-small3.2",
+		"mistral",
+		"mistrallite",
+		"mixtral",
+		"moondream",
+		"nemotron-mini",
+		"nemotron",
+		"neural-chat",
+		"nexusraven",
+		"notus",
+		"nous-hermes",
+		"nous-hermes2-mixtral",
+		"nous-hermes2",
+		"nuextract",
+		"olmo2",
+		"open-orca-platypus2",
+		"openchat",
+		"opencoder",
+		"openhermes",
+		"openthinker",
+		"orca-mini",
+		"orca2",
+		// "phi", // unreliable
+		"phi3.5",
+		"phi3",
+		"phi4-mini-reasoning",
+		"phi4-mini",
+		"phi4-reasoning",
+		"phi4",
+		"phind-codellama",
+		"qwen",
+		"qwen2-math",
+		"qwen2.5-coder",
+		"qwen2.5",
+		"qwen2.5vl",
+		"qwen2",
+		"qwen3:0.6b", // dense
+		"qwen3:30b",  // MOE
+		"qwq",
+		"r1-1776",
+		"reader-lm",
+		"reflection",
+		"sailor2",
+		"samantha-mistral",
+		"shieldgemma",
+		"smallthinker",
+		"smollm",
+		"smollm2",
+		"solar-pro",
+		"solar",
+		"sqlcoder",
+		"stable-beluga",
+		"stable-code",
+		"stablelm-zephyr",
+		"stablelm2",
+		"starcoder",
+		"starcoder2",
+		"starling-lm",
+		"tinydolphin",
+		"tinyllama",
+		"tulu3",
+		"vicuna",
+		"wizard-math",
+		"wizard-vicuna-uncensored",
+		"wizard-vicuna",
+		"wizardcoder",
+		"wizardlm-uncensored",
+		"wizardlm2",
+		"xwinlm",
+		"yarn-llama2",
+		"yarn-mistral",
+		"yi-coder",
+		"yi",
+		"zephyr",
+	}
+	// libraryEmbedModels lists embedding models by library name.
+	// NOTE(review): no use of this list is visible in this part of the
+	// change — confirm which scenario consumes it.
+	libraryEmbedModels = []string{
+		"all-minilm",
+		"bge-large",
+		"bge-m3",
+		"granite-embedding",
+		"mxbai-embed-large",
+		"nomic-embed-text",
+		"paraphrase-multilingual",
+		"snowflake-arctic-embed",
+		"snowflake-arctic-embed2",
+	}
+)
+
 func Init() {
 	lifecycle.InitLogging()
 }
@@ -271,6 +494,10 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 			t.Errorf("generate stalled.  Response so far:%s", buf.String())
 		}
 	case <-done:
+		if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
+			slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
+			return
+		}
 		require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
 		// Verify the response contains the expected data
 		response := buf.String()
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -19,12 +19,22 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
 // The tensors are of shape embed dim, kv heads, batch size
 // The mask is of shape history size, batch size
 type Causal struct {
-	DType      ml.DType
-	windowSize int32
-	chunkSize  int32
+	DType ml.DType
+
+	// swaWindowSize is the number of tokens that will be included in the mask
+	// during attention operations. swaMemorySize is the number of tokens that
+	// will be retained in memory for partial prefix caching. Set to math.MaxInt32
+	// for unlimited or if sliding window attention is not being used.
+	swaWindowSize int32
+	swaMemorySize int32
+
+	chunkSize int32

 	opts CausalOptions

+	// maxBatch is the largest batch that we might receive
+	maxBatch int
+
 	// config controls mostly backend-specific optimizations
 	config *ml.CacheConfig

@@ -85,32 +95,41 @@ type cellRange struct {

 func NewCausalCache(shift shiftFn) *Causal {
 	return &Causal{
-		windowSize: math.MaxInt32,
-		shiftFn:    shift,
-		ctxs:       make(map[int]ml.Context),
-		keys:       make(map[int]ml.Tensor),
-		values:     make(map[int]ml.Tensor),
+		shiftFn: shift,
+		ctxs:    make(map[int]ml.Context),
+		keys:    make(map[int]ml.Tensor),
+		values:  make(map[int]ml.Tensor),
 	}
 }

 func NewSWACache(windowSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		windowSize: windowSize,
-		shiftFn:    shift,
-		ctxs:       make(map[int]ml.Context),
-		keys:       make(map[int]ml.Tensor),
-		values:     make(map[int]ml.Tensor),
+		swaWindowSize: windowSize,
+		shiftFn:       shift,
+		ctxs:          make(map[int]ml.Context),
+		keys:          make(map[int]ml.Tensor),
+		values:        make(map[int]ml.Tensor),
+	}
+}
+
+// NewSWAMemCache returns a Causal cache that uses windowSize as the sliding
+// attention window (swaWindowSize) and memorySize as the number of tokens
+// retained in memory (swaMemorySize). Init panics if the memory size ends up
+// smaller than the window size.
+func NewSWAMemCache(windowSize int32, memorySize int32, shift shiftFn) *Causal {
+	return &Causal{
+		swaWindowSize: windowSize,
+		swaMemorySize: memorySize,
+		shiftFn:       shift,
+		ctxs:          make(map[int]ml.Context),
+		keys:          make(map[int]ml.Tensor),
+		values:        make(map[int]ml.Tensor),
+	}
+}

 func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		windowSize: math.MaxInt32,
-		chunkSize:  chunkSize,
-		shiftFn:    shift,
-		ctxs:       make(map[int]ml.Context),
-		keys:       make(map[int]ml.Tensor),
-		values:     make(map[int]ml.Tensor),
+		chunkSize: chunkSize,
+		shiftFn:   shift,
+		ctxs:      make(map[int]ml.Context),
+		keys:      make(map[int]ml.Tensor),
+		values:    make(map[int]ml.Tensor),
 	}
 }

@@ -135,11 +154,25 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 		c.config.MaskDType = ml.DTypeF32
 	}

+	if c.swaWindowSize == 0 {
+		c.swaWindowSize = math.MaxInt32
+	}
+	if c.swaMemorySize == 0 {
+		c.swaMemorySize = c.swaWindowSize
+	}
+	if int(c.swaMemorySize) > capacity {
+		c.swaMemorySize = math.MaxInt32
+	}
+
+	if c.swaMemorySize < c.swaWindowSize {
+		panic(fmt.Errorf("sliding window memory (%v) must be at least as large as the window (%v)", c.swaMemorySize, c.swaWindowSize))
+	}
+
 	var cacheSize int
-	if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) {
+	if c.swaMemorySize == math.MaxInt32 {
 		cacheSize = maxSequences * capacity
 	} else {
-		cacheSize = (maxSequences * int(c.windowSize)) + maxBatch
+		cacheSize = (maxSequences * int(c.swaMemorySize)) + maxBatch
 	}
 	cacheSize = roundUp(cacheSize, c.config.CachePadding)
 	c.cells = make([]cacheCell, cacheSize)
@@ -147,6 +180,7 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 	c.DType = dtype
 	c.cellRanges = make(map[int]cellRange)
 	c.backend = backend
+	c.maxBatch = maxBatch
 }

 func (c *Causal) SetConfig(config ml.CacheConfig) {
@@ -180,10 +214,10 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 			c.curLoc, err = c.findStartLoc()
 		}
 		if err != nil {
+			slog.Warn("unable to find a kv cache slot", "cache", c)
 			return err
 		}

-		c.curCellRange = newRange()
 		for i, pos := range batch.Positions {
 			seq := batch.Sequences[i]

@@ -194,19 +228,12 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 				seqRange = newRange()
 			}

-			if c.curLoc+i > seqRange.max {
-				seqRange.max = c.curLoc + i
-			}
-			if seqRange.max > c.curCellRange.max {
-				c.curCellRange.max = seqRange.max
-			}
+			seqRange.min = min(seqRange.min, c.curLoc+i)
+			c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
+
+			seqRange.max = max(seqRange.max, c.curLoc+i)
+			c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)

-			if c.curLoc+i < seqRange.min {
-				seqRange.min = c.curLoc + i
-			}
-			if seqRange.min < c.curCellRange.min {
-				c.curCellRange.min = seqRange.min
-			}
 			c.cellRanges[seq] = seqRange
 		}
 	} else {
@@ -248,7 +275,16 @@ func (c *Causal) findStartLoc() (int, error) {
 }

 func (c *Causal) updateSlidingWindow() {
-	if c.windowSize == math.MaxInt32 {
+	c.curCellRange = newRange()
+
+	if c.swaMemorySize == math.MaxInt32 {
+		for _, seq := range c.curSequences {
+			if seqRange, ok := c.cellRanges[seq]; ok {
+				c.curCellRange.min = min(c.curCellRange.min, seqRange.min)
+				c.curCellRange.max = max(c.curCellRange.max, seqRange.max)
+			}
+		}
+
 		return
 	}

@@ -278,12 +314,16 @@ func (c *Causal) updateSlidingWindow() {

 		for i := oldRange.min; i <= oldRange.max; i++ {
 			if slices.Contains(c.cells[i].sequences, seq) {
-				if c.cells[i].pos < pos-c.windowSize {
+				if c.cells[i].pos < pos-c.swaMemorySize {
 					c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
 				} else {
 					newRange.min = min(newRange.min, i)
 					newRange.max = max(newRange.max, i)
 				}
+				if c.cells[i].pos >= pos-c.swaWindowSize {
+					c.curCellRange.min = min(c.curCellRange.min, i)
+					c.curCellRange.max = max(c.curCellRange.max, i)
+				}
 			}
 		}

@@ -323,7 +363,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 			if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
 				(enabled && c.cells[j].pos > c.curPositions[i]) ||
 				c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
-				c.cells[j].pos < c.curPositions[i]-c.windowSize {
+				c.cells[j].pos < c.curPositions[i]-c.swaWindowSize {
 				mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
 			}
 		}
@@ -481,6 +521,8 @@ func (c *Causal) defrag() {

 		c.cellRanges[seq] = seqRange
 	}
+
+	c.updateSlidingWindow()
 }

 func (c *Causal) SetLayer(layer int) {
@@ -606,7 +648,7 @@ func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
 }

 func (c *Causal) CanResume(seq int, pos int32) bool {
-	if c.windowSize == math.MaxInt32 {
+	if c.swaMemorySize == math.MaxInt32 {
 		return true
 	}

@@ -628,8 +670,8 @@ func (c *Causal) CanResume(seq int, pos int32) bool {
 		return false
 	}

-	lastWindowStart := max(0, last-c.windowSize)
-	posWindowStart := max(0, pos-c.windowSize)
+	lastWindowStart := max(0, last-c.swaMemorySize)
+	posWindowStart := max(0, pos-c.swaWindowSize)

 	return posWindowStart >= lastWindowStart
 }
@@ -639,48 +681,64 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		return ErrNotSupported
 	}

-	ctx := c.backend.NewContext()
-	defer ctx.Close()
-
 	seqRange := c.cellRanges[seq]
-	size := seqRange.max - seqRange.min + 1

-	offsets := make([]int32, size)
-	for i := range offsets {
-		cell := c.cells[seqRange.min+i]
+	for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
+		size := min(seqRange.max-start+1, c.maxBatch)
+		offsets := make([]int32, size)

-		if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
-			offsets[i] = offset
+		var batchFirst, batchLast int
+
+		batchFirst = -1
+		for i := range offsets {
+			cell := c.cells[start+i]
+
+			if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
+				offsets[i] = offset
+				if batchFirst < 0 {
+					batchFirst = i
+				}
+				batchLast = i
+			}
 		}
-	}

-	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
-
-	for i, key := range c.keys {
-		if key == nil {
+		if batchFirst < 0 {
 			continue
 		}

-		kHeadDim := key.Dim(0)
-		numKVHeads := key.Dim(1)
-		rowSize := key.Stride(2)
+		offsets = offsets[batchFirst : batchLast+1]

-		key = key.View(ctx, rowSize*seqRange.min,
-			kHeadDim, key.Stride(1),
-			numKVHeads, key.Stride(2),
-			size,
-		)
+		ctx := c.backend.NewContext()
+		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))

-		roped, err := c.shiftFn(ctx, i, key, kShift)
-		if err != nil {
-			return err
+		for i, key := range c.keys {
+			if key == nil {
+				continue
+			}
+
+			kHeadDim := key.Dim(0)
+			numKVHeads := key.Dim(1)
+			rowSize := key.Stride(2)
+
+			key = key.View(ctx, rowSize*(start+batchFirst),
+				kHeadDim, key.Stride(1),
+				numKVHeads, key.Stride(2),
+				len(offsets),
+			)
+
+			roped, err := c.shiftFn(ctx, i, key, kShift)
+			if err != nil {
+				ctx.Close()
+				return err
+			}
+
+			ctx.Forward(roped.Copy(ctx, key))
 		}

-		ctx.Forward(roped.Copy(ctx, key))
+		ctx.Compute()
+		ctx.Close()
 	}

-	ctx.Compute()
-
 	return nil
 }

--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -60,6 +60,8 @@ func TestSWA(t *testing.T) {

 	cache.Init(backend, ml.DTypeF16, 1, 16, 16)

+	x := float32(math.Inf(-1))
+
 	tests := []testCase{
 		{
 			name:          "FirstBatch",
@@ -69,7 +71,12 @@ func TestSWA(t *testing.T) {
 			pos:           []int32{0, 1, 2, 3},
 			expected:      []float32{1, 2, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
+			expectedMask: []float32{
+				0, x, x, x,
+				0, 0, x, x,
+				x, 0, 0, x,
+				x, x, 0, 0,
+			},
 		},
 		{
 			name:          "SecondBatch",
@@ -79,7 +86,53 @@ func TestSWA(t *testing.T) {
 			pos:           []int32{4, 5},
 			expected:      []float32{5, 6, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))},
+			expectedMask: []float32{
+				0, x, x, 0,
+				0, 0, x, x,
+			},
+		},
+	}
+
+	testCache(t, backend, cache, tests)
+}
+
+func TestSWAMem(t *testing.T) {
+	backend := &testBackend{}
+	cache := NewSWAMemCache(1, 3, nil)
+	defer cache.Close()
+
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
+
+	x := float32(math.Inf(-1))
+
+	tests := []testCase{
+		{
+			name:          "FirstBatch",
+			in:            []float32{1, 2, 3, 4},
+			inShape:       []int{1, 1, 4},
+			seqs:          []int{0, 0, 0, 0},
+			pos:           []int32{0, 1, 2, 3},
+			expected:      []float32{1, 2, 3, 4},
+			expectedShape: []int{1, 1, 4},
+			expectedMask: []float32{
+				0, x, x, x,
+				0, 0, x, x,
+				x, 0, 0, x,
+				x, x, 0, 0,
+			},
+		},
+		{
+			name:          "SecondBatch",
+			in:            []float32{5, 6},
+			inShape:       []int{1, 1, 2},
+			seqs:          []int{0, 0},
+			pos:           []int32{4, 5},
+			expected:      []float32{4, 5, 6},
+			expectedShape: []int{1, 1, 3},
+			expectedMask: []float32{
+				0, 0, x,
+				x, 0, 0,
+			},
 		},
 	}

@@ -437,6 +490,70 @@ func TestCanResume(t *testing.T) {
 	}
 }

+func TestCanResumeSWAMem(t *testing.T) {
+	backend := &testBackend{}
+	windowSize := int32(4)
+	memSize := int32(5)
+	cache := NewSWAMemCache(windowSize, memSize, nil)
+	defer cache.Close()
+
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
+
+	context := backend.NewContext()
+	defer context.Close()
+
+	err := cache.StartForward(context, input.Batch{
+		Positions: []int32{0, 1, 2, 3, 4, 5},
+		Sequences: []int{0, 0, 0, 0, 0, 0},
+	}, false)
+	if err != nil {
+		t.Fatalf("StartForward failed: %v", err)
+	}
+
+	cache.SetLayer(0)
+	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4, 5, 6}, 1, 1, 6)
+	cache.Put(context, tensor, tensor)
+
+	// shift window by adding positions 6 and 7 in a second batch
+	err = cache.StartForward(context, input.Batch{
+		Positions: []int32{6, 7},
+		Sequences: []int{0, 0},
+	}, false)
+	if err != nil {
+		t.Fatalf("StartForward failed: %v", err)
+	}
+
+	cache.SetLayer(0)
+	tensor = context.FromFloatSlice([]float32{7, 8}, 1, 1, 2)
+	cache.Put(context, tensor, tensor)
+
+	// positions 0 through 5 must not be resumable; only positions 6 and 7 are expected to be
+	if cache.CanResume(0, 0) {
+		t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 1) {
+		t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 2) {
+		t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 3) {
+		t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 4) {
+		t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
+	}
+	if cache.CanResume(0, 5) {
+		t.Errorf("after shift: CanResume(0, 5) = true, want false (outside window)")
+	}
+	if !cache.CanResume(0, 6) {
+		t.Errorf("after shift: CanResume(0, 6) = false, want true (inside window)")
+	}
+	if !cache.CanResume(0, 7) {
+		t.Errorf("after shift: CanResume(0, 7) = false, want true (latest position)")
+	}
+}
+
 type testBackend struct {
 	ml.Backend
 }
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "1caae7fc6c77551cb1066515e0f414713eebb367";
+char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/.rsync-filter
+++ b/llama/llama.cpp/.rsync-filter
@@ -1,14 +1,9 @@
 protect **/*.go
 include common/
-include common/arg.*
-include common/chat.*
-include common/chat-parser.*
-include common/console.*
 include common/base64.*
 include common/common.*
 include common/json-schema-to-grammar.*
-include common/json-partial.*
-include common/regex-partial.*
+include common/json.*
 include common/log.*
 include common/sampling.*
 include common/stb_image.*
@@ -17,23 +12,12 @@ include include/llama.*
 include include/llama-*.*
 include tools/
 include tools/mtmd/
-include tools/mtmd/mtmd.*
-include tools/mtmd/mtmd-helper.*
-include tools/mtmd/mtmd-audio.*
 include tools/mtmd/clip.*
 include tools/mtmd/clip-impl.*
+include tools/mtmd/llava.*
 include src/
 include src/llama.*
 include src/llama-*.*
 include src/unicode-data.*
 include src/unicode.*
-include vendor/
-include vendor/nlohmann
-include vendor/nlohmann/*
-include vendor/miniaudio
-include vendor/miniaudio/*
-include vendor/stb
-include vendor/stb/stb_image.*
-include vendor/minja
-include vendor/minja/*
 exclude *
--- a/llama/llama.cpp/common/arg.cpp
+++ b/llama/llama.cpp/common/arg.cpp
--- a/llama/llama.cpp/common/arg.h
+++ b/llama/llama.cpp/common/arg.h
@@ -1,89 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#include <set>
-#include <string>
-#include <vector>
-
-//
-// CLI argument parsing
-//
-
-struct common_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::set<enum llama_example> excludes = {};
-    std::vector<const char *> args;
-    const char * value_hint   = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env          = nullptr;
-    std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)   (common_params & params) = nullptr;
-    void (*handler_string) (common_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (common_params & params, int) = nullptr;
-
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(common_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(common_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(common_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(common_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
-    common_arg & set_env(const char * env);
-    common_arg & set_sparam();
-    bool in_example(enum llama_example ex);
-    bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
-    std::string to_string();
-};
-
-struct common_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    common_params & params;
-    std::vector<common_arg> options;
-    void(*print_usage)(int, char **) = nullptr;
-    common_params_context(common_params & params) : params(params) {}
-};
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-// function to be used by test-arg-parser
-common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-bool common_has_curl();
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
--- a/llama/llama.cpp/common/chat-parser.cpp
+++ b/llama/llama.cpp/common/chat-parser.cpp
@@ -1,380 +0,0 @@
-#include "chat-parser.h"
-#include "common.h"
-#include "log.h"
-#include "regex-partial.h"
-
-#include <optional>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-using json = nlohmann::ordered_json;
-
-common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
-    : input_(input), is_partial_(is_partial), syntax_(syntax)
-{
-    result_.role = "assistant";
-
-    while (true) {
-        std::string id = std::to_string(std::rand());
-        if (input.find(id) == std::string::npos) {
-            healing_marker_ = id;
-            break;
-        }
-    }
-}
-
-std::string common_chat_msg_parser::str(const common_string_range & rng) const {
-    GGML_ASSERT(rng.begin <= rng.end);
-    return input_.substr(rng.begin, rng.end - rng.begin);
-}
-
-void common_chat_msg_parser::add_content(const std::string &content) {
-    result_.content += content;
-}
-
-void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
-    result_.reasoning_content += reasoning_content;
-}
-
-bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
-    if (name.empty()) {
-        return false;
-    }
-
-    common_chat_tool_call tool_call;
-    tool_call.name = name;
-    tool_call.arguments = arguments;
-    tool_call.id = id;
-
-    // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
-    result_.tool_calls.emplace_back(tool_call);
-    return true;
-}
-bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
-    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
-    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
-    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
-    return add_tool_call(name, id, arguments);
-}
-
-bool common_chat_msg_parser::add_tool_calls(const json & arr) {
-    for (const auto & item : arr) {
-        if (!add_tool_call(item)) {
-            return false;
-        }
-    }
-    return true;
-}
-void common_chat_msg_parser::finish() {
-    if (!is_partial_ && pos_ != input_.size()) {
-        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
-    }
-}
-
-bool common_chat_msg_parser::consume_spaces() {
-    const auto length = input_.size();
-    auto consumed = false;
-    while (pos_ < length && std::isspace(input_[pos_])) {
-        ++pos_;
-        consumed = true;
-    }
-    return consumed;
-}
-
-bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
-    auto pos = pos_;
-    for (auto i = 0u; i < literal.size(); ++i) {
-        if (pos >= input_.size()) {
-            return false;
-        }
-        if (input_[pos] != literal[i]) {
-            return false;
-        }
-        ++pos;
-    }
-    pos_ = pos;
-    return true;
-}
-
-std::optional<common_chat_msg_parser::find_regex_result>  common_chat_msg_parser::try_find_literal(const std::string & literal) {
-    auto idx = input_.find(literal, pos_);
-    if (idx != std::string::npos) {
-        find_regex_result res;
-        res.prelude = input_.substr(pos_, idx - pos_);
-        auto end = idx + literal.size();
-        res.groups.emplace_back(common_string_range{idx, end});
-        move_to(end);
-        return res;
-    }
-    if (is_partial_) {
-        idx = string_find_partial_stop(input_, literal);
-        if (idx != std::string::npos && idx >= pos_) {
-            find_regex_result res;
-            res.prelude = input_.substr(pos_, idx - pos_);
-            auto end = input_.size();
-            res.groups.emplace_back(common_string_range{idx, end});
-            move_to(end);
-            return res;
-        }
-    }
-    return std::nullopt;
-}
-
-void common_chat_msg_parser::consume_literal(const std::string & literal) {
-    if (!try_consume_literal(literal)) {
-        throw common_chat_msg_partial_exception(literal);
-    }
-}
-
-bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
-    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
-        auto stripped_reasoning = string_strip(reasoning);
-        if (stripped_reasoning.empty()) {
-            return;
-        }
-        if (syntax_.reasoning_in_content) {
-            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
-            add_content(stripped_reasoning);
-            if (closed) {
-                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
-            }
-        } else {
-            add_reasoning_content(stripped_reasoning);
-        }
-    };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
-            if (!rest.empty()) {
-                handle_reasoning(rest, /* closed */ !is_partial());
-            }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
-            return true;
-        }
-    }
-    return false;
-}
-
-std::string common_chat_msg_parser::consume_rest() {
-    auto rest = input_.substr(pos_);
-    pos_ = input_.size();
-    return rest;
-}
-
-// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
-std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
-    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
-    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
-        return std::nullopt;
-    }
-    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
-    pos_ = m.groups[0].end;
-
-    if (add_prelude_to_content) {
-        add_content(prelude);
-    }
-    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
-        if (is_partial()) {
-            throw common_chat_msg_partial_exception(regex.str());
-        }
-        return std::nullopt;
-    }
-    return find_regex_result{prelude, m.groups};
-}
-
-common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
-    if (auto result = try_consume_regex(regex)) {
-        return *result;
-    }
-    throw common_chat_msg_partial_exception(regex.str());
-}
-
-std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
-    auto m = regex.search(input_, pos_);
-    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
-        return std::nullopt;
-    }
-    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
-        if (is_partial()) {
-            throw common_chat_msg_partial_exception(regex.str());
-        }
-        return std::nullopt;
-    }
-    if (m.groups[0].begin != pos_) {
-        // Didn't match at the current position.
-        return std::nullopt;
-    }
-    pos_ = m.groups[0].end;
-
-    return find_regex_result {
-        /* .prelude = */ "",
-        m.groups,
-    };
-}
-
-std::optional<common_json> common_chat_msg_parser::try_consume_json() {
-    auto it = input_.cbegin() + pos_;
-    const auto end = input_.cend();
-    common_json result;
-    if (!common_json_parse(it, end, healing_marker_, result)) {
-        return std::nullopt;
-    }
-    pos_ = std::distance(input_.cbegin(), it);
-    if (result.healing_marker.marker.empty()) {
-        // No healing marker, just return the parsed json
-        return result;
-    }
-    if (!is_partial()) {
-        throw common_chat_msg_partial_exception("JSON");
-    }
-    return result;
-}
-
-common_json common_chat_msg_parser::consume_json() {
-    if (auto result = try_consume_json()) {
-        return *result;
-    }
-    throw common_chat_msg_partial_exception("JSON");
-}
-
-common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
-    const std::vector<std::vector<std::string>> & args_paths,
-    const std::vector<std::vector<std::string>> & content_paths
-) {
-    if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
-        return *result;
-    }
-    throw common_chat_msg_partial_exception("JSON");
-}
-
-std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
-    const std::vector<std::vector<std::string>> & args_paths,
-    const std::vector<std::vector<std::string>> & content_paths
-) {
-    auto partial = try_consume_json();
-    if (!partial) {
-        return std::nullopt;
-    }
-    auto is_arguments_path = [&](const std::vector<std::string> & path) {
-        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
-    };
-    auto is_content_path = [&](const std::vector<std::string> & path) {
-        return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
-    };
-
-    if (partial->healing_marker.marker.empty()) {
-        if (args_paths.empty()) {
-            // No arguments to dump, and JSON was parsed fully.
-            return consume_json_result {
-                partial->json,
-                /* .is_partial = */ false,
-            };
-        }
-        if (is_arguments_path({})) {
-            // Entire JSON is the arguments and was parsed fully.
-            return consume_json_result {
-                partial->json.dump(),
-                /* .is_partial = */ false,
-            };
-        }
-    }
-
-    LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
-
-    auto found_healing_marker = false;
-    std::vector<std::string> path;
-    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
-        if (is_arguments_path(path)) {
-            auto arguments = j.dump();
-            if (is_partial() && !partial->healing_marker.marker.empty()) {
-                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
-                if (idx != std::string::npos) {
-                    arguments.resize(idx);
-                    found_healing_marker = true;
-                }
-                if (arguments == "\"") {
-                    // This happens because of completing `:"$magic` after `"arguments"`
-                    arguments = "";
-                }
-            }
-            return arguments;
-        }
-        if (is_content_path(path)) {
-            if (!j.is_string()) {
-                throw std::runtime_error("Content path must be a string");
-            }
-            std::string str = j;
-            auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
-            if (idx != std::string::npos) {
-                str.resize(idx);
-                found_healing_marker = true;
-            }
-            return str;
-        }
-        if (j.is_object()) {
-            auto obj = json::object();
-            for (const auto & p : j.items()) {
-                const auto & key = p.key();
-                const auto & value = p.value();
-                const std::string key_str = key; // NOLINT
-                auto idx = key_str.find(healing_marker_);
-                if (idx != std::string::npos) {
-                    found_healing_marker = true;
-                    break;
-                }
-                path.push_back(key_str);
-                if (value.is_string()) {
-                    const std::string value_str = value;
-                    if (value_str.find(healing_marker_) != std::string::npos) {
-                        found_healing_marker = true;
-                        if (is_content_path(path)) {
-                            if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
-                                // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
-                                obj[key] = remove_unsupported_healings_and_dump_args(value);
-                            }
-                        }
-                        break;
-                    }
-                    obj[key] = value;
-                } else {
-                    obj[key] = remove_unsupported_healings_and_dump_args(value);
-                }
-                path.pop_back();
-            }
-            return obj;
-        }
-        if (j.is_array()) {
-            auto arr = json::array();
-            for (const auto & value : j) {
-                if (value.is_string()) {
-                    std::string str = value;
-                    auto idx = str.find(healing_marker_);
-                    if (idx != std::string::npos) {
-                        // Don't heal array values that aren't in the arguments.
-                        found_healing_marker = true;
-                        break;
-                    }
-                }
-                arr.push_back(remove_unsupported_healings_and_dump_args(value));
-            }
-            return arr;
-        }
-        return j;
-    };
-
-    auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
-    LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
-    return consume_json_result {
-        cleaned,
-        /* .is_partial = */ found_healing_marker,
-    };
-}
--- a/llama/llama.cpp/common/chat-parser.h
+++ b/llama/llama.cpp/common/chat-parser.h
@@ -1,118 +0,0 @@
-#pragma once
-
-#include "chat.h"
-#include "json-partial.h"
-#include "regex-partial.h"
-
-#include <nlohmann/json.hpp>
-
-#include <optional>
-#include <string>
-#include <vector>
-
-class common_chat_msg_partial_exception : public std::runtime_error {
-  public:
-    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
-};
-
-class common_chat_msg_parser {
-    std::string input_;
-    bool is_partial_;
-    common_chat_syntax syntax_;
-    std::string healing_marker_;
-
-    size_t pos_ = 0;
-    common_chat_msg result_;
-
-  public:
-    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
-    const std::string & input() const { return input_; }
-    size_t pos() const { return pos_; }
-    const std::string & healing_marker() const { return healing_marker_; }
-    const bool & is_partial() const { return is_partial_; }
-    const common_chat_msg & result() const { return result_; }
-    const common_chat_syntax & syntax() const { return syntax_; }
-
-    void move_to(size_t pos) {
-        if (pos > input_.size()) {
-            throw std::runtime_error("Invalid position!");
-        }
-        pos_ = pos;
-    }
-    void move_back(size_t n) {
-        if (pos_ < n) {
-            throw std::runtime_error("Can't move back that far!");
-        }
-        pos_ -= n;
-    }
-
-    // Get the substring of the input at the given range
-    std::string str(const common_string_range & rng) const;
-
-    // Appends to the result.content field
-    void add_content(const std::string & content);
-
-    // Appends to the result.reasoning_content field
-    void add_reasoning_content(const std::string & reasoning_content);
-
-    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
-    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
-
-    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
-    bool add_tool_call(const nlohmann::ordered_json & tool_call);
-
-    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
-    bool add_tool_calls(const nlohmann::ordered_json & arr);
-
-    void finish();
-
-    bool consume_spaces();
-
-    void consume_literal(const std::string & literal);
-
-    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
-
-    std::string consume_rest();
-
-    struct find_regex_result {
-        std::string prelude;
-        std::vector<common_string_range> groups;
-    };
-
-    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
-
-    bool try_consume_literal(const std::string & literal);
-
-    std::optional<find_regex_result> try_find_literal(const std::string & literal);
-
-    find_regex_result consume_regex(const common_regex & regex);
-
-    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
-
-    std::optional<common_json> try_consume_json();
-    common_json consume_json();
-
-    struct consume_json_result {
-        nlohmann::ordered_json value;
-        bool is_partial;
-    };
-
-    /*
-        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
-
-        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
-        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
-
-        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
-        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
-        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
-    */
-    consume_json_result consume_json_with_dumped_args(
-        const std::vector<std::vector<std::string>> & args_paths = {},
-        const std::vector<std::vector<std::string>> & content_paths = {}
-    );
-    std::optional<consume_json_result> try_consume_json_with_dumped_args(
-        const std::vector<std::vector<std::string>> & args_paths = {},
-        const std::vector<std::vector<std::string>> & content_paths = {}
-    );
-};
--- a/llama/llama.cpp/common/chat.cpp
+++ b/llama/llama.cpp/common/chat.cpp
--- a/llama/llama.cpp/common/chat.h
+++ b/llama/llama.cpp/common/chat.h
@@ -1,202 +0,0 @@
-// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
-
-#pragma once
-
-#include "common.h"
-#include <functional>
-#include <chrono>
-#include <string>
-#include <vector>
-
-struct common_chat_templates;
-
-struct common_chat_tool_call {
-    std::string name;
-    std::string arguments;
-    std::string id;
-
-    bool operator==(const common_chat_tool_call & other) const {
-        return name == other.name && arguments == other.arguments && id == other.id;
-    }
-};
-
-struct common_chat_msg_content_part {
-    std::string type;
-    std::string text;
-
-    bool operator==(const common_chat_msg_content_part & other) const {
-        return type == other.type && text == other.text;
-    }
-};
-
-struct common_chat_msg {
-    std::string role;
-    std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
-    std::string reasoning_content;
-    std::string tool_name;
-    std::string tool_call_id;
-
-    template <class T> T to_json_oaicompat() const;
-
-    bool empty() const {
-        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
-    }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
-        for (auto i = 0u; i < tool_calls.size(); i++) {
-            if (ids_cache.size() <= i) {
-                auto id = tool_calls[i].id;
-                if (id.empty()) {
-                    id = gen_tool_call_id();
-                }
-                ids_cache.push_back(id);
-            }
-            tool_calls[i].id = ids_cache[i];
-        }
-    }
-    bool operator==(const common_chat_msg & other) const {
-        return role == other.role
-            && content == other.content
-            && content_parts == other.content_parts
-            && tool_calls == other.tool_calls
-            && reasoning_content == other.reasoning_content
-            && tool_name == other.tool_name
-            && tool_call_id == other.tool_call_id;
-    }
-    bool operator!=(const common_chat_msg & other) const {
-        return !(*this == other);
-    }
-};
-
-struct common_chat_msg_diff {
-    std::string reasoning_content_delta;
-    std::string content_delta;
-    size_t tool_call_index = std::string::npos;
-    common_chat_tool_call tool_call_delta;
-
-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
-
-    bool operator==(const common_chat_msg_diff & other) const {
-        return content_delta == other.content_delta
-        && tool_call_index == other.tool_call_index
-        && tool_call_delta == other.tool_call_delta;
-    }
-};
-
-struct common_chat_tool {
-    std::string name;
-    std::string description;
-    std::string parameters;
-};
-
-enum common_chat_tool_choice {
-    COMMON_CHAT_TOOL_CHOICE_AUTO,
-    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
-    COMMON_CHAT_TOOL_CHOICE_NONE,
-};
-
-enum common_chat_format {
-    COMMON_CHAT_FORMAT_CONTENT_ONLY,
-    COMMON_CHAT_FORMAT_GENERIC,
-    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
-    COMMON_CHAT_FORMAT_LLAMA_3_X,
-    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_COMMAND_R7B,
-
-    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
-};
-
-struct common_chat_templates_inputs {
-    std::vector<common_chat_msg> messages;
-    std::string grammar;
-    std::string json_schema;
-    bool add_generation_prompt = true;
-    bool use_jinja = true;
-    // Parameters below only supported when use_jinja is true
-    std::vector<common_chat_tool> tools;
-    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
-    bool parallel_tool_calls = false;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
-    bool enable_thinking = true;
-    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
-};
-
-struct common_chat_params {
-    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
-    std::string                         prompt;
-    std::string                         grammar;
-    bool                                grammar_lazy = false;
-    bool                                thinking_forced_open = false;
-    std::vector<common_grammar_trigger> grammar_triggers;
-    std::vector<std::string>            preserved_tokens;
-    std::vector<std::string>            additional_stops;
-};
-
-struct common_chat_syntax {
-    common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
-    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE;
-    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
-    bool                     reasoning_in_content  = false;
-    bool                     thinking_forced_open  = false;
-    bool                     parse_tool_calls      = true;
-};
-
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
-
-void common_chat_templates_free(struct common_chat_templates * tmpls);
-
-struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
-
-typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
-
-common_chat_templates_ptr common_chat_templates_init(
-                                    const struct llama_model * model,
-                                           const std::string & chat_template_override,
-                                           const std::string & bos_token_override = "",
-                                           const std::string & eos_token_override = "");
-
-bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
-
-
-struct common_chat_params      common_chat_templates_apply(
-    const struct common_chat_templates * tmpls,
-    const struct common_chat_templates_inputs & inputs);
-
-// Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(
-        const struct common_chat_templates * tmpls,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass,
-        bool use_jinja);
-
-// Returns an example of formatted chat
-std::string common_chat_format_example(
-    const struct common_chat_templates * tmpls,
-    bool use_jinja);
-
-const char*               common_chat_format_name(common_chat_format format);
-const char*               common_reasoning_format_name(common_reasoning_format format);
-common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
-
-common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
-
-// Parses a JSON array of messages in OpenAI's chat completion API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
-template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
-
-// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
-template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
-
-template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@@ -203,7 +203,6 @@ bool set_process_priority(enum ggml_sched_priority prio) {

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
-        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
@@ -229,7 +228,6 @@ bool set_process_priority(enum ggml_sched_priority prio) {

    int p = 0;
    switch (prio) {
-        case GGML_SCHED_PRIO_LOW:      p =  5;  break;
        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
@@ -445,25 +443,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
    s = std::move(builder);
 }

-bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
-size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
-    if (!str.empty() && !stop.empty()) {
-        const char text_last_char = str.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
-            if (stop[char_index] == text_last_char) {
-                const auto current_partial = stop.substr(0, char_index + 1);
-                if (string_ends_with(str, current_partial)) {
-                    return str.size() - char_index - 1;
-                }
-            }
-        }
-    }
-
-    return std::string::npos;
-}
-
 std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$0");
@@ -851,7 +830,7 @@ std::string fs_get_cache_directory() {
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
-#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {
@@ -905,16 +884,13 @@ struct common_init_result common_init_from_params(common_params & params) {
            ok = false;
        }

-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+        }
+
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a  SEP token, reranking will not work\n", __func__);
            ok = false;
        }

@@ -1107,9 +1083,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

-    mparams.progress_callback           = params.load_progress_callback;
-    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
-
    return mparams;
 }

@@ -1141,7 +1114,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
    cparams.op_offload        = !params.no_op_offload;
-    cparams.swa_full          = params.swa_full;

    if (params.reranking) {
        cparams.embeddings    = true;
@@ -1334,6 +1306,81 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
    return text;
 }

+//
+// KV cache utils
+//
+
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        int seq_count = 0;
+        for (int j = 0; j < view.n_seq_max; j++) {
+            if (cs_curr[j] >= 0) { seq_count++; }
+        }
+        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
+    }
+
+    printf("\n=== Done dumping\n");
+}
+
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        for (int j = 0; j < view.n_seq_max; j++) {
+            if (cs_curr[j] < 0) { continue; }
+            if (seqs.find(cs_curr[j]) == seqs.end()) {
+                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+                const size_t sz = seqs.size();
+                seqs[cs_curr[j]] = sz;
+            }
+        }
+        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+    }
+
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+    printf("'+'=other sequence ids");
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_seq_max; j++) {
+            if (cs_curr[j] >= 0) {
+                const auto & it = seqs.find(cs_curr[j]);
+                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+
+    printf("\n=== Done dumping\n");
+}
+
 //
 // Embedding utils
 //
--- a/llama/llama.cpp/common/common.go
+++ b/llama/llama.cpp/common/common.go
@@ -1,7 +1,6 @@
 package common

-// #cgo CXXFLAGS: -std=c++17
+// #cgo CXXFLAGS: -std=c++11
 // #cgo CPPFLAGS: -I${SRCDIR}/../include
-// #cgo CPPFLAGS: -I${SRCDIR}/../vendor
 // #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include
 import "C"
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@@ -6,7 +6,6 @@

 #include <set>
 #include <string>
-#include <string_view>
 #include <vector>
 #include <sstream>

@@ -76,7 +75,7 @@ enum llama_example {
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_MTMD,
+    LLAMA_EXAMPLE_LLAVA,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
@@ -115,7 +114,7 @@ enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
 };

 struct common_grammar_trigger {
@@ -215,8 +214,7 @@ struct common_params_vocoder {

 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
-    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
 };

 struct common_params {
@@ -292,7 +290,6 @@ struct common_params {
    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
-    bool    offline                    = false;

    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -325,13 +322,13 @@ struct common_params {
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on inifinite text generation
-    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
+    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
@@ -370,8 +367,6 @@ struct common_params {
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
-    int reasoning_budget = -1;
-    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

@@ -431,11 +426,6 @@ struct common_params {

    // common params
    std::string out_file; // output filename for all example programs
-    // optional callback for model loading progress and cancellation:
-    // called with a progress value between 0.0 and 1.0.
-    // return false from callback to abort model loading or true to continue
-    llama_progress_callback load_progress_callback = NULL;
-    void *                  load_progress_callback_user_data = NULL;
 };

 // call once at the start of a program if it uses libcommon
@@ -513,9 +503,10 @@ static bool string_starts_with(const std::string & str,
    return str.rfind(prefix, 0) == 0;
 }

-// While we wait for C++20's std::string::ends_with...
-bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
-size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
+static bool string_ends_with(const std::string & str,
+                               const std::string & suffix) {  // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@@ -624,6 +615,16 @@ std::string common_detokenize(
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
 //
 // Embedding utils
 //
--- a/llama/llama.cpp/common/console.cpp
+++ b/llama/llama.cpp/common/console.cpp
@@ -1,504 +0,0 @@
-#include "console.h"
-#include <vector>
-#include <iostream>
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <fcntl.h>
-#include <io.h>
-#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
-#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
-#endif
-#else
-#include <climits>
-#include <sys/ioctl.h>
-#include <unistd.h>
-#include <wchar.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <termios.h>
-#endif
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
-
-namespace console {
-
-    //
-    // Console state
-    //
-
-    static bool      advanced_display = false;
-    static bool      simple_io        = true;
-    static display_t current_display  = reset;
-
-    static FILE*     out              = stdout;
-
-#if defined (_WIN32)
-    static void*     hConsole;
-#else
-    static FILE*     tty              = nullptr;
-    static termios   initial_state;
-#endif
-
-    //
-    // Init and cleanup
-    //
-
-    void init(bool use_simple_io, bool use_advanced_display) {
-        advanced_display = use_advanced_display;
-        simple_io = use_simple_io;
-#if defined(_WIN32)
-        // Windows-specific console initialization
-        DWORD dwMode = 0;
-        hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
-        if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
-            hConsole = GetStdHandle(STD_ERROR_HANDLE);
-            if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
-                hConsole = nullptr;
-                simple_io = true;
-            }
-        }
-        if (hConsole) {
-            // Check conditions combined to reduce nesting
-            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
-                !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-                advanced_display = false;
-            }
-            // Set console output codepage to UTF8
-            SetConsoleOutputCP(CP_UTF8);
-        }
-        HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
-        if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
-            // Set console input codepage to UTF16
-            _setmode(_fileno(stdin), _O_WTEXT);
-
-            // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
-            if (simple_io) {
-                dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
-            } else {
-                dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-            }
-            if (!SetConsoleMode(hConIn, dwMode)) {
-                simple_io = true;
-            }
-        }
-        if (simple_io) {
-            _setmode(_fileno(stdin), _O_U8TEXT);
-        }
-#else
-        // POSIX-specific console initialization
-        if (!simple_io) {
-            struct termios new_termios;
-            tcgetattr(STDIN_FILENO, &initial_state);
-            new_termios = initial_state;
-            new_termios.c_lflag &= ~(ICANON | ECHO);
-            new_termios.c_cc[VMIN] = 1;
-            new_termios.c_cc[VTIME] = 0;
-            tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
-
-            tty = fopen("/dev/tty", "w+");
-            if (tty != nullptr) {
-                out = tty;
-            }
-        }
-
-        setlocale(LC_ALL, "");
-#endif
-    }
-
-    void cleanup() {
-        // Reset console display
-        set_display(reset);
-
-#if !defined(_WIN32)
-        // Restore settings on POSIX systems
-        if (!simple_io) {
-            if (tty != nullptr) {
-                out = stdout;
-                fclose(tty);
-                tty = nullptr;
-            }
-            tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
-        }
-#endif
-    }
-
-    //
-    // Display and IO
-    //
-
-    // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_t display) {
-        if (advanced_display && current_display != display) {
-            fflush(stdout);
-            switch(display) {
-                case reset:
-                    fprintf(out, ANSI_COLOR_RESET);
-                    break;
-                case prompt:
-                    fprintf(out, ANSI_COLOR_YELLOW);
-                    break;
-                case user_input:
-                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
-                    break;
-                case error:
-                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
-            }
-            current_display = display;
-            fflush(out);
-        }
-    }
-
-    static char32_t getchar32() {
-#if defined(_WIN32)
-        HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
-        wchar_t high_surrogate = 0;
-
-        while (true) {
-            INPUT_RECORD record;
-            DWORD count;
-            if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
-                return WEOF;
-            }
-
-            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
-                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
-                if (wc == 0) {
-                    continue;
-                }
-
-                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-                    high_surrogate = wc;
-                    continue;
-                }
-                if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
-                    if (high_surrogate != 0) { // Check if we have a high surrogate
-                        return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
-                    }
-                }
-
-                high_surrogate = 0; // Reset the high surrogate
-                return static_cast<char32_t>(wc);
-            }
-        }
-#else
-        wchar_t wc = getwchar();
-        if (static_cast<wint_t>(wc) == WEOF) {
-            return WEOF;
-        }
-
-#if WCHAR_MAX == 0xFFFF
-        if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-            wchar_t low_surrogate = getwchar();
-            if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
-                return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
-            }
-        }
-        if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
-            return 0xFFFD; // Return the replacement character U+FFFD
-        }
-#endif
-
-        return static_cast<char32_t>(wc);
-#endif
-    }
-
-    static void pop_cursor() {
-#if defined(_WIN32)
-        if (hConsole != NULL) {
-            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
-
-            COORD newCursorPosition = bufferInfo.dwCursorPosition;
-            if (newCursorPosition.X == 0) {
-                newCursorPosition.X = bufferInfo.dwSize.X - 1;
-                newCursorPosition.Y -= 1;
-            } else {
-                newCursorPosition.X -= 1;
-            }
-
-            SetConsoleCursorPosition(hConsole, newCursorPosition);
-            return;
-        }
-#endif
-        putc('\b', out);
-    }
-
-    static int estimateWidth(char32_t codepoint) {
-#if defined(_WIN32)
-        (void)codepoint;
-        return 1;
-#else
-        return wcwidth(codepoint);
-#endif
-    }
-
-    static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
-#if defined(_WIN32)
-        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-        if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
-            // go with the default
-            return expectedWidth;
-        }
-        COORD initialPosition = bufferInfo.dwCursorPosition;
-        DWORD nNumberOfChars = length;
-        WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
-
-        CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
-        GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
-
-        // Figure out our real position if we're in the last column
-        if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
-            DWORD nNumberOfChars;
-            WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
-            GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
-        }
-
-        int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
-        if (width < 0) {
-            width += newBufferInfo.dwSize.X;
-        }
-        return width;
-#else
-        // We can trust expectedWidth if we've got one
-        if (expectedWidth >= 0 || tty == nullptr) {
-            fwrite(utf8_codepoint, length, 1, out);
-            return expectedWidth;
-        }
-
-        fputs("\033[6n", tty); // Query cursor position
-        int x1;
-        int y1;
-        int x2;
-        int y2;
-        int results = 0;
-        results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
-
-        fwrite(utf8_codepoint, length, 1, tty);
-
-        fputs("\033[6n", tty); // Query cursor position
-        results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
-
-        if (results != 4) {
-            return expectedWidth;
-        }
-
-        int width = x2 - x1;
-        if (width < 0) {
-            // Calculate the width considering text wrapping
-            struct winsize w;
-            ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
-            width += w.ws_col;
-        }
-        return width;
-#endif
-    }
-
-    static void replace_last(char ch) {
-#if defined(_WIN32)
-        pop_cursor();
-        put_codepoint(&ch, 1, 1);
-#else
-        fprintf(out, "\b%c", ch);
-#endif
-    }
-
-    static void append_utf8(char32_t ch, std::string & out) {
-        if (ch <= 0x7F) {
-            out.push_back(static_cast<unsigned char>(ch));
-        } else if (ch <= 0x7FF) {
-            out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
-            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-        } else if (ch <= 0xFFFF) {
-            out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
-            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-        } else if (ch <= 0x10FFFF) {
-            out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
-            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
-            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-        } else {
-            // Invalid Unicode code point
-        }
-    }
-
-    // Helper function to remove the last UTF-8 character from a string
-    static void pop_back_utf8_char(std::string & line) {
-        if (line.empty()) {
-            return;
-        }
-
-        size_t pos = line.length() - 1;
-
-        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-            if ((line[pos] & 0xC0) != 0x80) {
-                break; // Found the start of the character
-            }
-        }
-        line.erase(pos);
-    }
-
-    static bool readline_advanced(std::string & line, bool multiline_input) {
-        if (out != stdout) {
-            fflush(stdout);
-        }
-
-        line.clear();
-        std::vector<int> widths;
-        bool is_special_char = false;
-        bool end_of_stream = false;
-
-        char32_t input_char;
-        while (true) {
-            fflush(out); // Ensure all output is displayed before waiting for input
-            input_char = getchar32();
-
-            if (input_char == '\r' || input_char == '\n') {
-                break;
-            }
-
-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
-                end_of_stream = true;
-                break;
-            }
-
-            if (is_special_char) {
-                set_display(user_input);
-                replace_last(line.back());
-                is_special_char = false;
-            }
-
-            if (input_char == '\033') { // Escape sequence
-                char32_t code = getchar32();
-                if (code == '[' || code == 0x1B) {
-                    // Discard the rest of the escape sequence
-                    while ((code = getchar32()) != (char32_t) WEOF) {
-                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
-                            break;
-                        }
-                    }
-                }
-            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (!widths.empty()) {
-                    int count;
-                    do {
-                        count = widths.back();
-                        widths.pop_back();
-                        // Move cursor back, print space, and move cursor back again
-                        for (int i = 0; i < count; i++) {
-                            replace_last(' ');
-                            pop_cursor();
-                        }
-                        pop_back_utf8_char(line);
-                    } while (count == 0 && !widths.empty());
-                }
-            } else {
-                int offset = line.length();
-                append_utf8(input_char, line);
-                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-                if (width < 0) {
-                    width = 0;
-                }
-                widths.push_back(width);
-            }
-
-            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-                set_display(prompt);
-                replace_last(line.back());
-                is_special_char = true;
-            }
-        }
-
-        bool has_more = multiline_input;
-        if (is_special_char) {
-            replace_last(' ');
-            pop_cursor();
-
-            char last = line.back();
-            line.pop_back();
-            if (last == '\\') {
-                line += '\n';
-                fputc('\n', out);
-                has_more = !has_more;
-            } else {
-                // llama will just eat the single space, it won't act as a space
-                if (line.length() == 1 && line.back() == ' ') {
-                    line.clear();
-                    pop_cursor();
-                }
-                has_more = false;
-            }
-        } else {
-            if (end_of_stream) {
-                has_more = false;
-            } else {
-                line += '\n';
-                fputc('\n', out);
-            }
-        }
-
-        fflush(out);
-        return has_more;
-    }
-
-    static bool readline_simple(std::string & line, bool multiline_input) {
-#if defined(_WIN32)
-        std::wstring wline;
-        if (!std::getline(std::wcin, wline)) {
-            // Input stream is bad or EOF received
-            line.clear();
-            GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
-            return false;
-        }
-
-        int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
-        line.resize(size_needed);
-        WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
-#else
-        if (!std::getline(std::cin, line)) {
-            // Input stream is bad or EOF received
-            line.clear();
-            return false;
-        }
-#endif
-        if (!line.empty()) {
-            char last = line.back();
-            if (last == '/') { // Always return control on '/' symbol
-                line.pop_back();
-                return false;
-            }
-            if (last == '\\') { // '\\' changes the default action
-                line.pop_back();
-                multiline_input = !multiline_input;
-            }
-        }
-        line += '\n';
-
-        // By default, continue input if multiline_input is set
-        return multiline_input;
-    }
-
-    bool readline(std::string & line, bool multiline_input) {
-        set_display(user_input);
-
-        if (simple_io) {
-            return readline_simple(line, multiline_input);
-        }
-        return readline_advanced(line, multiline_input);
-    }
-
-}
--- a/llama/llama.cpp/common/console.h
+++ b/llama/llama.cpp/common/console.h
@@ -1,19 +0,0 @@
-// Console functions
-
-#pragma once
-
-#include <string>
-
-namespace console {
-    enum display_t {
-        reset = 0,
-        prompt,
-        user_input,
-        error
-    };
-
-    void init(bool use_simple_io, bool use_advanced_display);
-    void cleanup();
-    void set_display(display_t display);
-    bool readline(std::string & line, bool multiline_input);
-}
--- a/llama/llama.cpp/common/json-partial.cpp
+++ b/llama/llama.cpp/common/json-partial.cpp
@@ -1,256 +0,0 @@
-#include "json-partial.h"
-
-#include "log.h"
-
-#include <nlohmann/json.hpp>
-
-#include <string>
-
-using json = nlohmann::ordered_json;
-
-enum common_json_stack_element_type {
-    COMMON_JSON_STACK_ELEMENT_OBJECT,
-    COMMON_JSON_STACK_ELEMENT_KEY,
-    COMMON_JSON_STACK_ELEMENT_ARRAY,
-};
-
-struct common_json_stack_element {
-    common_json_stack_element_type type;
-    std::string key;
-};
-
-bool common_json_parse(
-    const std::string & input,
-    const std::string & healing_marker,
-    common_json & out)
-{
-    std::string::const_iterator it = input.begin();
-    const auto end = input.end();
-    return common_json_parse(it, end, healing_marker, out);
-}
-
-bool common_json_parse(
-    std::string::const_iterator & it,
-    const std::string::const_iterator & end,
-    const std::string & healing_marker,
-    common_json & out)
-{
-    // // https://json.nlohmann.me/features/parsing/sax_interface/
-    struct json_error_locator : public nlohmann::json_sax<json> {
-        std::size_t position;
-        bool found_error;
-        std::string last_token;
-        std::string exception_message;
-        std::vector<common_json_stack_element> stack;
-
-        json_error_locator() : position(0), found_error(false) {}
-
-        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
-            this->position = position - 1;
-            this->found_error = true;
-            this->last_token = last_token;
-            this->exception_message = ex.what();
-            return false;
-        }
-        void close_value() {
-            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
-                stack.pop_back();
-            }
-        }
-        bool null() override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool boolean(bool) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_integer(number_integer_t) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_unsigned(number_unsigned_t) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_float(number_float_t, const string_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool string(string_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool binary(binary_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool start_object(std::size_t) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
-            return true;
-        }
-        bool end_object() override {
-            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
-            stack.pop_back();
-            close_value();
-            return true;
-        }
-        bool key(string_t & key) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
-            return true;
-        }
-        bool start_array(std::size_t) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
-            return true;
-        }
-        bool end_array() override {
-            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
-            stack.pop_back();
-            close_value();
-            return true;
-        }
-    };
-    json_error_locator err_loc;
-    auto start = it;
-    json::sax_parse(it, end, &err_loc);
-
-    if (err_loc.found_error) {
-        it = start;
-        auto temptative_end = it + err_loc.position;
-        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
-
-        auto input = std::string(it, temptative_end);
-        try {
-            out.json = json::parse(input);
-            // out.json = json::parse(it, temptative_end);
-            it = temptative_end;
-            return true;
-        } catch (const std::exception & ex) {
-            // No, needs healing.
-            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
-        }
-        auto can_parse = [](const std::string & str) {
-            try {
-                auto _ = json::parse(str); // NOLINT
-                return true;
-            } catch (const std::exception &) {
-                return false;
-            }
-        };
-        if (!healing_marker.empty() && !err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
-            if (last_non_sp_pos == std::string::npos) {
-                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
-            }
-            auto last_non_sp_char = str[last_non_sp_pos];
-            // Used to detect stops on a number, which may not be complete.
-            auto was_maybe_number = [&]() {
-                if (!str.empty() && std::isspace(str.back())) {
-                    return false;
-                }
-                return std::isdigit(last_non_sp_char) ||
-                    last_non_sp_char == '.' ||
-                    last_non_sp_char == 'e' ||
-                    last_non_sp_char == 'E' ||
-                    last_non_sp_char == '-';
-            };
-
-            std::string closing;
-            for (size_t i = err_loc.stack.size(); i > 0; i--) {
-                auto & el = err_loc.stack[i - 1];
-                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
-                    closing += "}";
-                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
-                    closing += "]";
-                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
-                    throw std::runtime_error("Unexpected stack element type");
-                }
-            }
-
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
-
-            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
-                // We're inside an object value
-                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
-                    // Was about to create an object value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + ": 1" + closing)) {
-                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
-                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
-                    // Was about to create an object
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + "\"" + closing)) {
-                    // Was inside an object value string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
-                    // Was inside an object value string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
-                } else {
-                    // find last :
-                    auto last_pos = str.find_last_of(':');
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
-                    }
-                    // Cutting back to opening : for object value
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
-                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
-                    // Was about to create an array value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + "\"" + closing)) {
-                    // Was inside an array value string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
-                    // Was inside an array value string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
-                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
-                    // Had just finished a value
-                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
-                } else {
-                    auto last_pos = str.find_last_of("[,");
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
-                    }
-                    // Cutting back to last [ or , for array value
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
-                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
-                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
-                    // Was about to create an object key+value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
-                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
-                    // Was about to create an object key+value
-                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + "\": 1" + closing)) {
-                    // Was inside an object key string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
-                    // Was inside an object key string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
-                } else {
-                    auto last_pos = str.find_last_of(':');
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
-                    }
-                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else {
-                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
-            }
-            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
-        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-        // fprintf(stderr, "Closing: TODO\n");
-        return false;
-    }
-    out.json = json::parse(it, end);
-    it = end;
-    return true;
-}
--- a/llama/llama.cpp/common/json-partial.h
+++ b/llama/llama.cpp/common/json-partial.h
@@ -1,38 +0,0 @@
-#pragma once
-
-#include <nlohmann/json.hpp>
-
-// Healing marker (empty if the JSON was fully parsed / wasn't healed).
-struct common_healing_marker {
-    // Raw marker.
-    std::string marker;
-
-    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
-    std::string json_dump_marker;
-};
-
-// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
-struct common_json {
-    nlohmann::ordered_json json;
-
-    common_healing_marker healing_marker;
-};
-
-// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
-//
-// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
-// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
-// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
-//
-// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
-bool common_json_parse(
-    const std::string & input,
-    const std::string & healing_marker,
-    common_json & out);
-
-// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
-bool common_json_parse(
-    std::string::const_iterator & it,
-    const std::string::const_iterator & end,
-    const std::string & healing_marker,
-    common_json & out);
--- a/llama/llama.cpp/common/json-schema-to-grammar.cpp
+++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp
@@ -1,9 +1,8 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"

-#include <nlohmann/json.hpp>
-
 #include <algorithm>
+#include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>
--- a/llama/llama.cpp/common/json-schema-to-grammar.h
+++ b/llama/llama.cpp/common/json-schema-to-grammar.h
@@ -1,9 +1,9 @@
 #pragma once

-#include <nlohmann/json_fwd.hpp>
-
-#include <functional>
-#include <string>
+#include "ggml.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                   bool force_gbnf = false);
--- a/llama/llama.cpp/common/regex-partial.cpp
+++ b/llama/llama.cpp/common/regex-partial.cpp
@@ -1,204 +0,0 @@
-#include "regex-partial.h"
-#include "common.h"
-#include <functional>
-#include <optional>
-
-common_regex::common_regex(const std::string & pattern) :
-    pattern(pattern),
-    rx(pattern),
-    rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
-
-common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
-    std::smatch match;
-    if (pos > input.size()) {
-        throw std::runtime_error("Position out of bounds");
-    }
-    auto start = input.begin() + pos;
-    auto found = as_match
-        ? std::regex_match(start, input.end(), match, rx)
-        : std::regex_search(start, input.end(), match, rx);
-    if (found) {
-        common_regex_match res;
-        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
-        for (size_t i = 0; i < match.size(); ++i) {
-            auto begin = pos + match.position(i);
-            res.groups.emplace_back(begin, begin + match.length(i));
-        }
-        return res;
-    }
-    std::match_results<std::string::const_reverse_iterator> srmatch;
-    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
-        auto group = srmatch[1].str();
-        if (group.length() != 0) {
-            auto it = srmatch[1].second.base();
-            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
-            if ((!as_match) || it == input.begin()) {
-                common_regex_match res;
-                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
-                const size_t begin = std::distance(input.begin(), it);
-                const size_t end = input.size();
-                if (begin == std::string::npos || end == std::string::npos || begin > end) {
-                    throw std::runtime_error("Invalid range");
-                }
-                res.groups.push_back({begin, end});
-                return res;
-            }
-        }
-    }
-    return {};
-}
-
-/*
-  Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
-
-  Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
-  to see if a string ends with a partial regex match, but but it's not in std::regex yet.
-  Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.
-
-  - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
-  - /a|b/ -> (a|b).*
-  - /a*?/ -> error, could match ""
-  - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
-  - /.*?ab/ -> ((?:b)?a).* (merge .*)
-  - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
-  - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
-  - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
-  - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
-
-  The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
-  (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
-*/
-std::string regex_to_reversed_partial_regex(const std::string & pattern) {
-    auto it = pattern.begin();
-    const auto end = pattern.end();
-
-    std::function<std::string()> process = [&]() {
-        std::vector<std::vector<std::string>> alternatives(1);
-        std::vector<std::string> * sequence = &alternatives.back();
-
-        while (it != end) {
-            if (*it == '[') {
-                auto start = it;
-                ++it;
-                while (it != end) {
-                    if ((*it == '\\') && (++it != end)) {
-                        ++it;
-                    } else if ((it != end) && (*it == ']')) {
-                        break;
-                    } else {
-                        ++it;
-                    }
-                }
-                if (it == end) {
-                    throw std::runtime_error("Unmatched '[' in pattern");
-                }
-                ++it;
-                sequence->push_back(std::string(start, it));
-            } else if (*it == '*' || *it == '?' || *it == '+') {
-                if (sequence->empty()) {
-                    throw std::runtime_error("Quantifier without preceding element");
-                }
-                sequence->back() += *it;
-                auto is_star = *it == '*';
-                ++it;
-                if (is_star) {
-                    if (*it == '?') {
-                        ++it;
-                    }
-                }
-            } else if (*it == '{') {
-                if (sequence->empty()) {
-                    throw std::runtime_error("Repetition without preceding element");
-                }
-                ++it;
-                auto start = it;
-                while (it != end && *it != '}') {
-                    ++it;
-                }
-                if (it == end) {
-                    throw std::runtime_error("Unmatched '{' in pattern");
-                }
-                auto parts = string_split(std::string(start, it), ",");
-                ++it;
-                if (parts.size() > 2) {
-                    throw std::runtime_error("Invalid repetition range in pattern");
-                }
-
-                auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
-                    if (s.empty()) {
-                        return def;
-                    }
-                    return std::stoi(s);
-                };
-                auto min = parseOptInt(parts[0], 0);
-                auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
-                if (min && max && *max < *min) {
-                    throw std::runtime_error("Invalid repetition range in pattern");
-                }
-                // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
-                auto part = sequence->back();
-                sequence->pop_back();
-                for (int i = 0; i < *min; i++) {
-                    sequence->push_back(part);
-                }
-                if (max) {
-                    for (int i = *min; i < *max; i++) {
-                        sequence->push_back(part + "?");
-                    }
-                } else {
-                    sequence->push_back(part + "*");
-                }
-            } else if (*it == '(') {
-                ++it;
-                if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
-                    it += 2;
-                }
-                auto sub = process();
-                if (*it != ')') {
-                    throw std::runtime_error("Unmatched '(' in pattern");
-                }
-                ++it;
-                auto & part = sequence->emplace_back("(?:");
-                part += sub;
-                part += ")";
-            } else if (*it == ')') {
-                break;
-            } else if (*it == '|') {
-                ++it;
-                alternatives.emplace_back();
-                sequence = &alternatives.back();
-            } else if (*it == '\\' && (++it != end)) {
-                auto str = std::string("\\") + *it;
-                sequence->push_back(str);
-                ++it;
-            } else if (it != end) {
-                sequence->push_back(std::string(1, *it));
-                ++it;
-            }
-        }
-
-        // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
-        // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
-        // We'll do the outermost capturing group and final .* in the enclosing function.
-        std::vector<std::string> res_alts;
-        for (const auto & parts : alternatives) {
-            auto & res = res_alts.emplace_back();
-            for (size_t i = 0; i < parts.size() - 1; i++) {
-                res += "(?:";
-            }
-            for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
-                res += *it;
-                if (it != parts.rend() - 1) {
-                    res += ")?";
-                }
-            }
-        }
-        return string_join(res_alts, "|");
-    };
-    auto res = process();
-    if (it != end) {
-        throw std::runtime_error("Unmatched '(' in pattern");
-    }
-
-    return "(" + res + ")[\\s\\S]*";
-}
--- a/llama/llama.cpp/common/regex-partial.h
+++ b/llama/llama.cpp/common/regex-partial.h
@@ -1,56 +0,0 @@
-#pragma once
-
-#include <regex>
-#include <string>
-
-enum common_regex_match_type {
-    COMMON_REGEX_MATCH_TYPE_NONE,
-    COMMON_REGEX_MATCH_TYPE_PARTIAL,
-    COMMON_REGEX_MATCH_TYPE_FULL,
-};
-
-struct common_string_range {
-    size_t begin;
-    size_t end;
-    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
-        if (begin > end) {
-            throw std::runtime_error("Invalid range");
-        }
-    }
-    // prevent default ctor
-    common_string_range() = delete;
-    bool empty() const {
-        return begin == end;
-    }
-    bool operator==(const common_string_range & other) const {
-        return begin == other.begin && end == other.end;
-    }
-};
-
-struct common_regex_match {
-    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
-    std::vector<common_string_range> groups;
-
-    bool operator==(const common_regex_match & other) const {
-        return type == other.type && groups == other.groups;
-    }
-    bool operator!=(const common_regex_match & other) const {
-        return !(*this == other);
-    }
-};
-
-class common_regex {
-    std::string pattern;
-    std::regex rx;
-    std::regex rx_reversed_partial;
-
-  public:
-    explicit common_regex(const std::string & pattern);
-
-    common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
-
-    const std::string & str() const { return pattern; }
-};
-
-// For testing only (pretty print of failures).
-std::string regex_to_reversed_partial_regex(const std::string & pattern);
--- a/llama/llama.cpp/common/sampling.cpp
+++ b/llama/llama.cpp/common/sampling.cpp
@@ -161,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
    } else {
-        std::vector<std::string> trigger_patterns;
+        std::vector<std::string> patterns_at_start;
        std::vector<std::string> patterns_anywhere;
        std::vector<llama_token> trigger_tokens;
        for (const auto & trigger : params.grammar_triggers) {
@@ -173,13 +173,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
                {
-                    patterns_anywhere.push_back(trigger.value);
-                    break;
-                }
-                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
-                {
-                    trigger_patterns.push_back(trigger.value);
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -193,6 +190,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            }
        }

+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
        if (!patterns_anywhere.empty()) {
            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
        }
--- a/llama/llama.cpp/vendor/stb/stb_image.h
+++ b/llama/llama.cpp/vendor/stb/stb_image.h
--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@@ -61,10 +61,7 @@ extern "C" {
    struct llama_model;
    struct llama_context;
    struct llama_sampler;
-
-    typedef struct llama_memory_i * llama_memory_t;
-
-    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
+    struct llama_kv_cache;

    typedef int32_t llama_pos;
    typedef int32_t llama_token;
@@ -262,9 +259,9 @@ extern "C" {
        llama_token  *  token;
        float        *  embd;
        llama_pos    *  pos;
-        int32_t      *  n_seq_id; // TODO: remove, should belong to only 1 sequence
-        llama_seq_id ** seq_id;   // TODO: become llama_seq_id * seq_id;
-        int8_t       *  logits;   // TODO: rename this to "output"
+        int32_t      *  n_seq_id;
+        llama_seq_id ** seq_id;
+        int8_t       *  logits; // TODO: rename this to "output"
    } llama_batch;

    enum llama_model_kv_override_type {
@@ -348,7 +345,7 @@ extern "C" {
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)

        ggml_backend_sched_eval_callback cb_eval;
        void * cb_eval_user_data;
@@ -364,13 +361,10 @@ extern "C" {

        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
-        bool no_perf;     // measure performance timings
-        bool op_offload;  // offload host tensor operations to device
-        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
-                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
-                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
+        bool op_offload;  // whether to offload host tensor operations to device
    };

    // model quantization parameters
@@ -476,7 +470,6 @@ extern "C" {
    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API size_t llama_max_devices(void);
-    LLAMA_API size_t llama_max_parallel_sequences(void);

    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);
@@ -496,11 +489,9 @@ extern "C" {
    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API           llama_memory_t   llama_get_memory  (const struct llama_context * ctx);
+    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

-    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);

@@ -509,7 +500,6 @@ extern "C" {
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -614,92 +604,78 @@ extern "C" {
                         int32_t   il_end);

    //
-    // Memory
+    // KV cache
    //

-    // Clear the memory contents
-    LLAMA_API void llama_memory_clear(llama_memory_t mem);
+    // TODO: start using struct llama_kv_cache

-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    // seq_id < 0 : match any sequence
-    // p0 < 0     : [0,  p1]
-    // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_memory_seq_rm(
-            llama_memory_t mem,
-              llama_seq_id seq_id,
-                 llama_pos p0,
-                 llama_pos p1);
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };

-    // Copy all tokens that belong to the specified sequence to another sequence
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_memory_seq_cp(
-            llama_memory_t mem,
-              llama_seq_id seq_id_src,
-              llama_seq_id seq_id_dst,
-                 llama_pos p0,
-                 llama_pos p1);
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;

-    // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_memory_seq_keep(
-            llama_memory_t mem,
-              llama_seq_id seq_id);
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_seq_max;

-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_memory_seq_add(
-            llama_memory_t mem,
-              llama_seq_id seq_id,
-                 llama_pos p0,
-                 llama_pos p1,
-                 llama_pos delta);
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;

-    // Integer division of the positions by factor of `d > 1`
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_memory_seq_div(
-            llama_memory_t mem,
-              llama_seq_id seq_id,
-                 llama_pos p0,
-                 llama_pos p1,
-                       int d);
+        // Number of populated cache cells.
+        int32_t used_cells;

-    // Returns the smallest position present in the memory for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
-    // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_memory_seq_pos_min(
-            llama_memory_t mem,
-              llama_seq_id seq_id);
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;

-    // Returns the largest position present in the memory for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
-    // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_memory_seq_pos_max(
-            llama_memory_t mem,
-              llama_seq_id seq_id);
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;

-    // Check if the memory supports shifting
-    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;

-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
+        // The sequences for each cell. There will be n_seq_max items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    ///

    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-               "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_self_n_tokens instead");

    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-               "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_self_used_cells instead");

    // Clear the KV cache - both cell info is erased and KV data is zeroed
    LLAMA_API void llama_kv_self_clear(
-                struct llama_context * ctx);
+            struct llama_context * ctx);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -731,6 +707,7 @@ extern "C" {
    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_self_seq_add(
@@ -743,6 +720,7 @@ extern "C" {
    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_self_seq_div(
@@ -752,40 +730,84 @@ extern "C" {
                       llama_pos   p1,
                             int   d);

-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                     llama_seq_id   seq_id);

    // Defragment the KV cache
    // This will be applied:
    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
+    //   - explicitly with llama_kv_self_update()
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);

    // Check if the context supports KV cache shifting
    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);

    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-            "simply remove this call, updates are applied lazily on the next llama_decode()");
+    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx),
+            "use llama_kv_self_clear instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "use llama_kv_self_seq_rm instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "use llama_kv_self_seq_cp instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "use llama_kv_self_seq_keep instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta),
+            "use llama_kv_self_seq_add instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d),
+            "use llama_kv_self_seq_div instead");
+
+    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "use llama_kv_self_seq_pos_max instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+            "use llama_kv_self_defrag instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+            "use llama_kv_self_can_shift instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+            "use llama_kv_self_update instead");
+

    //
    // State / sessions
    //

    // Returns the *actual* size in bytes of the state
-    // (logits, embedding and memory)
+    // (logits, embedding and kv_cache)
    // Only use when saving the state, not when restoring it, otherwise the size may be too small.
    LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
    LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -841,12 +863,12 @@ extern "C" {
                          size_t   n_token_count),
        "use llama_state_save_file instead");

-    // Get the exact size needed to copy the state of a single sequence
+    // Get the exact size needed to copy the KV cache of a single sequence
    LLAMA_API size_t llama_state_seq_get_size(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);

-    // Copy the state of a single sequence into the specified buffer
+    // Copy the KV cache of a single sequence into the specified buffer
    LLAMA_API size_t llama_state_seq_get_data(
            struct llama_context * ctx,
                         uint8_t * dst,
@@ -912,21 +934,18 @@ extern "C" {
    // For encode-decoder contexts, processes the batch using the encoder.
    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
    //   0 - success
-    // < 0 - error. the memory state is restored to the state before this call
+    // < 0 - error. the KV cache state is restored to the state before this call
    LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
              struct llama_batch   batch);

    // Process a batch of tokens.
-    // Requires the context to have a memory.
+    // Requires KV cache.
    // For encode-decoder contexts, processes the batch using the decoder.
    // Positive return values does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the memory state is restored to the state before this call
-    //    0 - success
-    //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //    2 - aborted
-    //   -1 - invalid input batch
-    // < -1 - error
+    //   0 - success
+    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error. the KV cache state is restored to the state before this call
    LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@@ -176,8 +176,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
    { LLM_KV_CONVNEXT_BLOCK_COUNT,      "%s.convnext.block_count"      },

-    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
-
    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
    { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
@@ -452,7 +450,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
            { LLM_TENSOR_POS_EMBD,        "position_embd" },
            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
@@ -1486,9 +1483,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
-            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
-            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
-            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
        },
    },
    {
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@@ -215,8 +215,6 @@ enum llm_kv {
    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
    LLM_KV_CONVNEXT_BLOCK_COUNT,

-    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
-
    // deprecated:
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
--- a/llama/llama.cpp/src/llama-batch.cpp
+++ b/llama/llama.cpp/src/llama-batch.cpp
@@ -1,6 +1,5 @@
 #include "llama-batch.h"

-#include <cassert>
 #include <cstring>
 #include <algorithm>

@@ -15,31 +14,24 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
            break;
        }
    }
-
-    udatas.push_back({});
-
-    auto & udata = udatas.back();
-
-    udata.token.resize(!has_embd ? n_ubatch : 0);
-    udata.embd.resize(has_embd ? n_embd * n_ubatch : 0);
-    udata.pos.resize(n_ubatch);
-    udata.n_seq_id.resize(n_ubatch);
-    udata.seq_id.resize(n_ubatch);
-    udata.output.resize(n_ubatch);
-
+    ubatch_token.resize(!has_embd ? n_ubatch : 0);
+    ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
+    ubatch_pos.resize(n_ubatch);
+    ubatch_n_seq_id.resize(n_ubatch);
+    ubatch_seq_id.resize(n_ubatch);
+    ubatch_output.resize(n_ubatch);
    llama_ubatch ubatch = {
        /*equal_seqs   =*/ true,
        /*n_tokens     =*/ 0,
        /*n_seq_tokens =*/ 0,
        /*n_seqs       =*/ 0,
-        /*token        =*/ !has_embd ? udata.token.data() : nullptr,
-        /*embd         =*/ has_embd  ? udata.embd.data()  : nullptr,
-        /*pos          =*/ udata.pos.data(),
-        /*n_seq_id     =*/ udata.n_seq_id.data(),
-        /*seq_id       =*/ udata.seq_id.data(),
-        /*output       =*/ udata.output.data(),
+        /*token        =*/ !has_embd ? ubatch_token.data() : nullptr,
+        /*embd         =*/ has_embd  ? ubatch_embd.data()  : nullptr,
+        /*pos          =*/ ubatch_pos.data(),
+        /*n_seq_id     =*/ ubatch_n_seq_id.data(),
+        /*seq_id       =*/ ubatch_seq_id.data(),
+        /*output       =*/ ubatch_output.data(),
    };
-
    return ubatch;
 }

@@ -289,10 +281,9 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
    batch = in_batch;
    GGML_ASSERT(batch.n_tokens > 0);
    if (!batch.pos) {
-        assert(p0 >= 0);
        pos.resize(batch.n_tokens);
        for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = p0 + i;
+            pos[i] = i + p0;
        }
        batch.pos = pos.data();
    }
--- a/llama/llama.cpp/src/llama-batch.h
+++ b/llama/llama.cpp/src/llama-batch.h
@@ -11,15 +11,15 @@ struct llama_ubatch {
    bool equal_seqs;
    // TODO: whole_seqs for embeddings?

-    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
    uint32_t n_seq_tokens; // tokens per sequence
    uint32_t n_seqs;

    llama_token  *  token;    // [n_tokens]
    float        *  embd;     // [n_embd, n_tokens]
    llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
-    llama_seq_id ** seq_id;   // [n_seqs] // TODO: become llama_seq_id * seq_id;
+    int32_t      *  n_seq_id; // [n_seqs]
+    llama_seq_id ** seq_id;   // [n_seqs]
    int8_t       *  output;   // [n_tokens]
 };

@@ -49,18 +49,13 @@ struct llama_sbatch {

    const llama_batch * batch = nullptr;

-    // buffers for the ubatches
-    // TODO: very hacky, this needs a complete rework
-    struct ubatch_data {
-        std::vector<llama_token>    token;
-        std::vector<float>          embd;
-        std::vector<llama_pos>      pos;
-        std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
-        std::vector<int8_t>         output;
-    };
-
-    std::vector<ubatch_data> udatas;
+    // buffers for the ubatch
+    std::vector<llama_token>    ubatch_token;
+    std::vector<float>          ubatch_embd;
+    std::vector<llama_pos>      ubatch_pos;
+    std::vector<int32_t>        ubatch_n_seq_id;
+    std::vector<llama_seq_id *> ubatch_seq_id;
+    std::vector<int8_t>         ubatch_output;

    llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);

--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@@ -5,6 +5,7 @@
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"
+#include "llama-kv-cache.h"

 #include "ggml-cpp.h"
 #include "ggml-opt.h"
@@ -13,13 +14,11 @@
 #include <vector>

 struct llama_model;
+struct llama_kv_cache;

 class llama_io_read_i;
 class llama_io_write_i;

-struct llama_memory_i;
-struct llama_memory_state_i;
-
 struct llama_context {
    // init scheduler and compute buffers, reserve worst-case graphs
    llama_context(
@@ -46,12 +45,10 @@ struct llama_context {
    uint32_t n_threads()       const;
    uint32_t n_threads_batch() const;

-    llama_memory_t get_memory() const;
+          llama_kv_cache * get_kv_self();
+    const llama_kv_cache * get_kv_self() const;

-    // return true of the KV cache was updated
-    // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    void kv_self_update();

    enum llama_pooling_type pooling_type() const;

@@ -92,16 +89,6 @@ struct llama_context {
                int32_t   il_start,
                int32_t   il_end);

-    // process a single ubatch with a specific graph type
-    // if memory_state is provided, it will be applied first to the context's memory
-    // ret contains the status of the graph computation
-    // returns nullptr only if ret != GGML_STATUS_SUCCESS
-    llm_graph_result_ptr process_ubatch(
-              const llama_ubatch & ubatch,
-                  llm_graph_type   gtype,
-            llama_memory_state_i * mstate,
-                     ggml_status & ret);
-
    int encode(llama_batch & inp_batch);
    int decode(llama_batch & inp_batch);

@@ -194,18 +181,16 @@ public:
    ggml_cgraph * graph_init();

    // returns the result of ggml_backend_sched_graph_compute_async execution
-    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
-
-    // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate);
+    ggml_status graph_compute(
+            ggml_cgraph * gf,
+                   bool   batched);

 private:
    llm_graph_result_ptr graph_build(
-                    ggml_context * ctx,
-                     ggml_cgraph * gf,
-              const llama_ubatch & ubatch,
-                  llm_graph_type   gtype,
-      const llama_memory_state_i * mstate);
+            ggml_context * ctx,
+             ggml_cgraph * gf,
+      const llama_ubatch & ubatch,
+          llm_graph_type   gtype);

    llm_graph_cb graph_get_cb() const;

@@ -230,9 +215,6 @@ private:

    std::unique_ptr<llama_memory_i> memory;

-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
    float * logits      = nullptr;
--- a/llama/llama.cpp/src/llama-cparams.cpp
+++ b/llama/llama.cpp/src/llama-cparams.cpp
@@ -1,5 +1 @@
 #include "llama-cparams.h"
-
-size_t llama_max_parallel_sequences(void) {
-    return LLAMA_MAX_PARALLEL_SEQUENCES;
-}
--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@@ -4,8 +4,6 @@

 #include <cstdint>

-#define LLAMA_MAX_PARALLEL_SEQUENCES 64
-
 struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;
--- a/llama/llama.cpp/src/llama-grammar.cpp
+++ b/llama/llama.cpp/src/llama-grammar.cpp
@@ -1186,18 +1186,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
            for (const auto & trigger_pattern : grammar.trigger_patterns) {
                if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
                    grammar.awaiting_trigger = false;
-                    // get from the first matched capturing group to the end of the string
-                    size_t start = std::string::npos;
-                    for (auto i = 1u; i < match.size(); i++) {
-                        if (match.length(i) > 0) {
-                            start = match.position(i);
-                            break;
-                        }
-                    }
-                    if (start == std::string::npos) {
-                        start = match.position(0);
-                    }
-                    auto constrained_str = grammar.trigger_buffer.substr(start);
+                    // get from the first match to the end of the string
+                    auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                    grammar.trigger_buffer.clear();
                    llama_grammar_accept_str(grammar, constrained_str);
--- a/Show More
+++ b/Show More