use range

model: benchmark bpe text processing
fix linter
2025-01-31 14:55:09 -08:00 · 2025-01-31 14:44:20 -08:00 · 2025-01-29 15:08:37 -08:00 · 2025-01-29 15:05:24 -08:00
1002 changed files with 88605 additions and 908724 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,10 +15,6 @@ ml/backend/**/*.cu linguist-vendored
 ml/backend/**/*.cuh linguist-vendored
 ml/backend/**/*.m linguist-vendored
 ml/backend/**/*.metal linguist-vendored
 ml/backend/**/CMakeLists.txt linguist-vendored
 llama/build-info.cpp linguist-generated
 ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
 * text=auto
 *.go text eol=lf
--- a/.github/ISSUE_TEMPLATE/10_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/10_bug_report.yml
@@ -9,14 +9,6 @@ body:
      description: What happened? What did you expect to happen?
    validations:
      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
      render: shell
    validations:
      required: false
  - type: dropdown
    id: os
    attributes:
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -5,10 +5,6 @@ on:
    tags:
      - 'v*'
 env:
  CGO_CFLAGS: '-O3'
  CGO_CXXFLAGS: '-O3'
 jobs:
  setup-environment:
    runs-on: ubuntu-latest
@@ -23,7 +19,7 @@ jobs:
          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
  darwin-build:
-    runs-on: macos-13-xlarge
+    runs-on: macos-13
    environment: release
    needs: setup-environment
    strategy:
@@ -54,6 +50,48 @@ jobs:
          name: build-${{ matrix.os }}-${{ matrix.arch }}
          path: dist/*
  darwin-sign:
    runs-on: macos-13
    environment: release
    needs: darwin-build
    steps:
      - uses: actions/checkout@v4
      - run: |
          echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
          security create-keychain -p password build.keychain
          security default-keychain -s build.keychain
          security unlock-keychain -p password build.keychain
          security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
          security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
          security set-keychain-settings -lut 3600 build.keychain
        env:
          MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
          MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
      - uses: actions/download-artifact@v4
        with:
          name: build-darwin-amd64
          path: dist/darwin-amd64
      - uses: actions/download-artifact@v4
        with:
          name: build-darwin-arm64
          path: dist/darwin-arm64
      - run: |
          export VERSION=${GITHUB_REF_NAME#v}
          ./scripts/build_darwin.sh macapp sign
        env:
          APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
          APPLE_ID: ${{ vars.APPLE_ID }}
          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
      - uses: actions/upload-artifact@v4
        with:
          name: dist-darwin
          path: |
            dist/Ollama-darwin.zip
            dist/ollama-darwin.tgz
  windows-depends:
    strategy:
      matrix:
@@ -63,38 +101,19 @@ jobs:
        include:
          - os: windows
            arch: amd64
-            preset: 'CUDA 12'
+            preset: 'CUDA 11'
-            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-            cuda-components:
+            cuda-version: '11.3'
              - '"cudart"'
              - '"nvcc"'
              - '"cublas"'
              - '"cublas_dev"'
            cuda-version: '12.8'
            flags: ''
            runner_dir: 'cuda_v12'
          - os: windows
            arch: amd64
-            preset: 'CUDA 13'
+            preset: 'CUDA 12'
-            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
-            cuda-components:
+            cuda-version: '12.4'
              - '"cudart"'
              - '"nvcc"'
              - '"cublas"'
              - '"cublas_dev"'
              - '"crt"'
              - '"nvvm"'
              - '"nvptxcompiler"'
            cuda-version: '13.0'
            flags: ''
            runner_dir: 'cuda_v13'
          - os: windows
            arch: amd64
            preset: 'ROCm 6'
-            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
+            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
-            rocm-version: '6.2'
+            rocm-version: '6.1'
            flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
            runner_dir: 'rocm'
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:
@@ -118,7 +137,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+            $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
            Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
          }
@@ -137,13 +156,6 @@ jobs:
          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: matrix.preset == 'CPU'
        run: |
          echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
        uses: actions/cache/save@v4
        with:
@@ -158,12 +170,11 @@ jobs:
          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
      - name: Build target "${{ matrix.preset }}"
        run: |
-          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
-          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
+          cmake --preset "${{ matrix.preset }}"
          cmake --build --parallel --preset "${{ matrix.preset }}"
          cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
          Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
        env:
          CMAKE_GENERATOR: Ninja
      - uses: actions/upload-artifact@v4
@@ -176,60 +187,89 @@ jobs:
      matrix:
        os: [windows]
        arch: [amd64, arm64]
        include:
        - os: windows
          arch: amd64
          llvmarch: x86_64
        - os: windows
          arch: arm64
          llvmarch: aarch64
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: [setup-environment]
    env:
      GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
    steps:
-      - name: Install ARM64 system dependencies
+      - name: Install system dependencies
        if: matrix.arch == 'arm64'
        run: |
          $ErrorActionPreference = "Stop"
-          Set-ExecutionPolicy Bypass -Scope Process -Force
+          if ("${{ matrix.arch }}" -eq 'amd64') {
-          [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
+            Start-Process "C:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
-          iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
+            echo "C:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+            echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          } elseif ("${{ matrix.arch }}" -eq 'arm64') {
            Set-ExecutionPolicy Bypass -Scope Process -Force
            [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
            iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
            echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          choco install -y --no-progress git gzip
+            choco install -y --no-progress git gzip
-          echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+            echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install clang and gcc-compat
+
-        run: |
+            Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip"
-          $ErrorActionPreference = "Stop"
+            Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip -DestinationPath "C:\Program Files\"
-          Set-ExecutionPolicy Bypass -Scope Process -Force
+            $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt-aarch64").path
-          Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-${{ matrix.llvmarch }}.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt.zip"
+            echo $installPath\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt.zip -DestinationPath "C:\Program Files\"
+          }
          $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt*").path
          echo "$installPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
      - name: Verify gcc is actually clang
        run: |
          $ErrorActionPreference='Continue'
          $version=& gcc -v 2>&1
          $version=$version -join "`n"
          echo "gcc is $version"
          if ($version -notmatch 'clang') {
            echo "ERROR: GCC must be clang for proper utf16 handling"
            exit 1
          }
          $ErrorActionPreference='Stop'
      - run: |
          go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
      - run: |
          $env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1'
          & .\scripts\build_windows.ps1 buildApp
        env:
          VCToolsRedistDir: stub
      - uses: actions/upload-artifact@v4
        with:
          name: build-${{ matrix.os }}-${{ matrix.arch }}
          path: |
            dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe
            dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
  windows-sign:
    runs-on: windows
    environment: release
    needs: [windows-depends, windows-build]
    steps:
      - uses: actions/checkout@v4
      - uses: google-github-actions/auth@v2
        with:
          project_id: ollama
          credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}
      - run: |
          $ErrorActionPreference = "Stop"
          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe"
          Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip"
          Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\"
          & "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet
          echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt
      - uses: actions/download-artifact@v4
        with:
          name: build-windows-*
          path: dist\
          merge-multiple: true
      - uses: actions/download-artifact@v4
        with:
          name: depends-windows-amd64-*
          path: dist\windows-amd64\
          merge-multiple: true
      - run: |
          & .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip
      - uses: actions/upload-artifact@v4
        with:
          name: dist-windows
          path: |
            dist\OllamaSetup.exe
            dist\ollama-windows-*.zip
  linux-build:
    strategy:
@@ -237,13 +277,10 @@ jobs:
        include:
          - os: linux
            arch: amd64
-            target: archive_novulkan
+            targets: 'archive rocm'
          - os: linux
            arch: amd64
            target: rocm
          - os: linux
            arch: arm64
-            target: archive_novulkan
+            targets: archive
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: setup-environment
@@ -252,122 +289,38 @@ jobs:
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      - uses: docker/build-push-action@v6
        with:
          context: .
          platforms: ${{ matrix.os }}/${{ matrix.arch }}
          target: ${{ matrix.target }}
          build-args: |
            GOFLAGS=${{ env.GOFLAGS }}
            CGO_CFLAGS=${{ env.CGO_CFLAGS }}
            CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
      - run: |
-          for COMPONENT in bin/* lib/ollama/*; do
+          apt-get update && apt-get install pigz
-            case "$COMPONENT" in
+          for TARGET in ${{ matrix.targets }}; do docker buildx build --platform $PLATFORM --target $TARGET --output type=local,dest=dist/$PLATFORM .; done
-              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+          tar c -C dist/$PLATFORM . | pigz -9cv >dist/ollama-${PLATFORM//\//-}.tgz
-              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+        env:
-              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+          PLATFORM: ${{ matrix.os }}/${{ matrix.arch }}
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
            esac
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
          echo "Manifests"
          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
            echo $ARCHIVE
            cat $ARCHIVE
          done
      - run: |
          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
          done
      - uses: actions/upload-artifact@v4
        with:
-          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
+          name: dist-${{ matrix.os }}-${{ matrix.arch }}
          path: |
-            *.tgz
+            dist/ollama-${{ matrix.os }}-${{ matrix.arch }}.tgz
-  # Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
+  docker-build:
  docker-build-push:
    strategy:
      matrix:
        include:
-          - os: linux
+          - flavor: 'latest=false'
-            arch: arm64
+            platforms: linux/amd64,linux/arm64
            target: novulkan
            build-args: |
-              CGO_CFLAGS
+              GOFLAGS=${{ needs.setup-environment.outputs.GOFLAGS }}
-              CGO_CXXFLAGS
+          - flavor: 'latest=false,suffix=rocm'
-              GOFLAGS
+            platforms: linux/amd64
          - os: linux
            arch: amd64
            target: novulkan
            build-args: |
-              CGO_CFLAGS
+              GOFLAGS=${{ needs.setup-environment.outputs.GOFLAGS }}
              CGO_CXXFLAGS
              GOFLAGS
          - os: linux
            arch: amd64
            suffix: '-rocm'
            build-args: |
              CGO_CFLAGS
              CGO_CXXFLAGS
              GOFLAGS
              FLAVOR=rocm
          - os: linux
            arch: amd64
            suffix: '-vulkan'
            target: default
            build-args: |
              CGO_CFLAGS
              CGO_CXXFLAGS
              GOFLAGS
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: setup-environment
    env:
      GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      - uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - id: build-push
        uses: docker/build-push-action@v6
        with:
          context: .
          platforms: ${{ matrix.os }}/${{ matrix.arch }}
          target: ${{ matrix.target }}
          build-args: ${{ matrix.build-args }}
          outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
      - run: |
          mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
          echo "${{ steps.build-push.outputs.digest }}" >${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}.txt
        working-directory: ${{ runner.temp }}
      - uses: actions/upload-artifact@v4
        with:
          name: digest-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}
          path: |
            ${{ runner.temp }}/${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}.txt
  # Merge Docker images for the same flavor into a single multi-arch manifest
  docker-merge-push:
    strategy:
      matrix:
        suffix: ['', '-rocm']
    runs-on: linux
    environment: release
-    needs: [docker-build-push]
+    needs: setup-environment
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-qemu-action@v2
      - uses: docker/setup-buildx-action@v2
      - uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
@@ -375,38 +328,62 @@ jobs:
      - id: metadata
        uses: docker/metadata-action@v4
        with:
-          flavor: |
+          flavor: ${{ matrix.flavor }}
            latest=false
            suffix=${{ matrix.suffix }}
          images: |
-            ${{ vars.DOCKER_REPO }}
+            ollama/ollama
          tags: |
            type=ref,enable=true,priority=600,prefix=pr-,event=pr
            type=semver,pattern={{version}}
-      - uses: actions/download-artifact@v4
+      - uses: docker/build-push-action@v6
        with:
-          pattern: digest-*
+          context: .
-          path: ${{ runner.temp }}
+          push: true
-          merge-multiple: true
+          platforms: ${{ matrix.platforms }}
-      - run: |
+          build-args: ${{ matrix.build-args }}
-          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf '${{ vars.DOCKER_REPO }}@%s ')
+          tags: ${{ steps.metadata.outputs.tags }}
-          docker buildx imagetools inspect ${{ vars.DOCKER_REPO }}:${{ steps.metadata.outputs.version }}
+          labels: ${{ steps.metadata.outputs.labels }}
-        working-directory: ${{ runner.temp }}
+          cache-from: type=registry,ref=ollama/ollama:latest
          cache-to: type=inline
          provenance: false
-  # Trigger downstream release process
+  # Aggregate all the assets and ship a release
-  trigger:
+  release:
-    runs-on: ubuntu-latest
+    needs: [darwin-sign, windows-sign, linux-build]
    runs-on: linux
    environment: release
    needs: [darwin-build, windows-build, windows-depends, linux-build]
    permissions:
      contents: write
    env:
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@v4
-      - name: Create or update Release for tag
+      - name: Set Version
        shell: bash
        run: |
-          RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
+      - uses: actions/download-artifact@v4
        with:
          path: dist
          pattern: dist-darwin
      - uses: actions/download-artifact@v4
        with:
          path: dist
          pattern: dist-windows
      - uses: actions/download-artifact@v4
        with:
          path: dist
          pattern: dist-linux-*
      - uses: actions/download-artifact@v4
        with:
          path: dist
          pattern: dist-windows
      - run: |
          ls -lh dist/
          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
          mv sha256sum.txt dist/
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |
          RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
          echo "Looking for existing release for ${RELEASE_VERSION}"
          OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName")
          if [ -n "$OLD_TAG" ]; then
@@ -420,12 +397,5 @@ jobs:
              --generate-notes \
              --prerelease
          fi
-      - name: Trigger downstream release process
+          echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
-        run: |
+          gh release upload ${GITHUB_REF_NAME} dist/* --clobber
          curl -L \
            -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"origin\": \"${GITHUB_REPOSITORY}\", \"publish\": \"1\"}}"
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -36,7 +36,7 @@ jobs:
              | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
          }
-          echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
+          echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT
  linux:
    needs: [changes]
@@ -46,18 +46,12 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:13.0.0-devel-ubuntu22.04
+            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
            extra-packages: rocm-libs
            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm'
          - preset: Vulkan
            container: ubuntu:22.04
            extra-packages: >
              mesa-vulkan-drivers vulkan-tools
              libvulkan1 libvulkan-dev
              vulkan-sdk cmake ccache g++ make
    runs-on: linux
    container: ${{ matrix.container }}
    steps:
@@ -65,19 +59,7 @@ jobs:
      - run: |
          [ -n "${{ matrix.container }}" ] || sudo=sudo
          $sudo apt-get update
          # Add LunarG Vulkan SDK apt repo for Ubuntu 22.04
          if [ "${{ matrix.preset }}" = "Vulkan" ]; then
            $sudo apt-get install -y --no-install-recommends wget gnupg ca-certificates software-properties-common
            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | $sudo gpg --dearmor -o /usr/share/keyrings/lunarg-archive-keyring.gpg
            # Use signed-by to bind the repo to the installed keyring to avoid NO_PUBKEY
            echo "deb [signed-by=/usr/share/keyrings/lunarg-archive-keyring.gpg]  https://packages.lunarg.com/vulkan/1.4.313 jammy main" | $sudo tee /etc/apt/sources.list.d/lunarg-vulkan-1.4.313-jammy.list > /dev/null
            $sudo apt-get update
          fi
          $sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }}
          # Export VULKAN_SDK if provided by LunarG package (defensive)
          if [ -d "/usr/lib/x86_64-linux-gnu/vulkan" ] && [ "${{ matrix.preset }}" = "Vulkan" ]; then
            echo "VULKAN_SDK=/usr" >> $GITHUB_ENV
          fi
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/cache@v4
@@ -96,35 +78,23 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
-            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
            cuda-components:
              - '"cudart"'
              - '"nvcc"'
              - '"cublas"'
              - '"cublas_dev"'
              - '"crt"'
              - '"nvvm"'
              - '"nvptxcompiler"'
            cuda-version: '13.0'
          - preset: ROCm
-            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
+            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
-            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
+            flags: '-DAMDGPU_TARGETS=gfx1010'
          - preset: Vulkan
            install: https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe
    runs-on: windows
    steps:
      - run: |
          choco install -y --no-progress ccache ninja
          ccache -o cache_dir=${{ github.workspace }}\.ccache
-      - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm' || matrix.preset == 'Vulkan'
+      - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm'
        id: cache-install
        uses: actions/cache/restore@v4
        with:
          path: |
            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
            C:\Program Files\AMD\ROCm
            C:\VulkanSDK
          key: ${{ matrix.install }}
      - if: matrix.preset == 'CUDA'
        name: Install CUDA ${{ matrix.cuda-version }}
@@ -132,8 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.8", "nvcc_11.8", "cublas_11.8", "cublas_dev_11.8")) -NoNewWindow -Wait
            Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
          }
          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -151,21 +120,6 @@ jobs:
          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: matrix.preset == 'Vulkan'
        name: Install Vulkan ${{ matrix.rocm-version }}
        run: |
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
            Start-Process -FilePath .\install.exe -ArgumentList "-c","--am","--al","in" -NoNewWindow -Wait
          }
          $vulkanPath = (Resolve-Path "C:\VulkanSDK\*").path
          echo "$vulkanPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "VULKAN_SDK=$vulkanPath" >> $env:GITHUB_ENV
      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
        uses: actions/cache/save@v4
        with:
@@ -179,20 +133,13 @@ jobs:
          path: ${{ github.workspace }}\.ccache
          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
      - run: |
-          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
          cmake --build --parallel --preset "${{ matrix.preset }}"
        env:
          CMAKE_GENERATOR: Ninja
  go_mod_tidy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: check that 'go mod tidy' is clean
        run: go mod tidy --diff || (echo "Please run 'go mod tidy'." && exit 1)
  test:
    strategy:
      matrix:
@@ -200,82 +147,15 @@ jobs:
    runs-on: ${{ matrix.os }}
    env:
      CGO_ENABLED: '1'
      GOEXPERIMENT: 'synctest'
    steps:
-      - name: checkout
+      - uses: actions/checkout@v4
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
+      - uses: actions/setup-go@v5
      - name: cache restore
        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
        with:
          # Note: unlike the other setups, this is only grabbing the mod download
          # cache, rather than the whole mod directory, as the download cache
          # contains zips that can be unpacked in parallel faster than they can be
          # fetched and extracted by tar
          path: |
            ~/.cache/go-build
            ~/go/pkg/mod/cache
            ~\AppData\Local\go-build
          # NOTE: The -3- here should be incremented when the scheme of data to be
          # cached changes (e.g. path above changes).
          key: ${{ github.job }}-${{ runner.os }}-${{ matrix.goarch }}-${{ matrix.buildflags }}-go-3-${{ hashFiles('**/go.sum') }}-${{ github.run_id }}
          restore-keys: |
            ${{ github.job }}-${{ runner.os }}-${{ matrix.goarch }}-${{ matrix.buildflags }}-go-3-${{ hashFiles('**/go.sum') }}
            ${{ github.job }}-${{ runner.os }}-${{ matrix.goarch }}-${{ matrix.buildflags }}-go-3-
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          # The caching strategy of setup-go is less than ideal, and wastes
          # time by not saving artifacts due to small failures like the linter
          # complaining, etc. This means subsequent runs have to rebuild their world
          # again until all checks pass. For instance, if you misspell a word,
          # you're punished until you fix it. This is more hostile than
          # helpful.
          cache: false
          go-version-file: go.mod
      # It is tempting to run this in a platform independent way, but the past
      # shows this codebase will see introductions of platform specific code
      # generation, and so we need to check this per platform to ensure we
      # don't abuse go generate on specific platforms.
      - name: check that 'go generate' is clean
        if: always()
        run: |
          go generate ./...
          git diff --name-only --exit-code || (echo "Please run 'go generate ./...'." && exit 1)
      - name: go test
        if: always()
        run: go test -count=1 -benchtime=1x ./...
      # TODO(bmizerany): replace this heavy tool with just the
      # tools/checks/binaries we want and then make them all run in parallel
      # across jobs, not on a single tiny VM on GitHub Actions.
      - uses: golangci/golangci-lint-action@v6
        with:
          args: --timeout 10m0s -v
-
+      - run: go test ./...
      - name: cache save
        # Always save the cache, even if the job fails. The artifacts produced
        # during the building of test binaries are not all for naught. They can
        # be used to speed up subsequent runs.
        if: always()
        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
        with:
          # Note: unlike the other setups, this is only grabbing the mod download
          # cache, rather than the whole mod directory, as the download cache
          # contains zips that can be unpacked in parallel faster than they can be
          # fetched and extracted by tar
          path: |
            ~/.cache/go-build
            ~/go/pkg/mod/cache
            ~\AppData\Local\go-build
          # NOTE: The -3- here should be incremented when the scheme of data to be
          # cached changes (e.g. path above changes).
          key: ${{ github.job }}-${{ runner.os }}-${{ matrix.goarch }}-${{ matrix.buildflags }}-go-3-${{ hashFiles('**/go.sum') }}-${{ github.run_id }}
  patches:
    runs-on: ubuntu-latest
@@ -283,5 +163,5 @@ jobs:
      - uses: actions/checkout@v4
      - name: Verify patches apply cleanly and do not change files
        run: |
-          make -f Makefile.sync clean checkout apply-patches sync
+          make -f Makefile.sync clean checkout sync
-          git diff --compact-summary --exit-code
+          git diff --compact-summary --exit-code
--- a/.gitignore
+++ b/.gitignore
@@ -5,8 +5,8 @@
 .swp
 dist
 build
 ollama
 .cache
 .gocache
 *.exe
 .idea
 test_data
@@ -14,4 +14,3 @@ test_data
 __debug_bin*
 llama/build
 llama/vendor
 /ollama
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -6,6 +6,8 @@ linters:
    - bidichk
    - bodyclose
    - containedctx
    - contextcheck
    - errcheck
    - gocheckcompilerdirectives
    - gofmt
    - gofumpt
@@ -19,13 +21,12 @@ linters:
    - nolintlint
    - nosprintfhostport
    - staticcheck
    - tenv
    - unconvert
-    - usetesting
+    - unused
    - usestdlibvars
    - wastedassign
    - whitespace
  disable:
    - usestdlibvars
    - errcheck
 linters-settings:
  staticcheck:
    checks:
@@ -38,4 +39,5 @@ severity:
        - gofmt
        - goimports
        - intrange
        - usestdlibvars
      severity: info
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.21)
 project(Ollama C CXX)
 include(CheckLanguage)
 include(GNUInstallDirs)
 find_package(Threads REQUIRED)
@@ -24,21 +23,14 @@ set(GGML_SCHED_MAX_COPIES 4)
 set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)
 set(GGML_CUDA_FA ON)
 set(GGML_CUDA_COMPRESSION_MODE default)
-if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
    set(GGML_CPU_ALL_VARIANTS ON)
 endif()
 if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
    set(CMAKE_BUILD_RPATH "@loader_path")
    set(CMAKE_INSTALL_RPATH "@loader_path")
 endif()
 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
-set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR})
+set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY         ${OLLAMA_BUILD_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG   ${OLLAMA_BUILD_DIR})
@@ -52,8 +44,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
 add_compile_definitions(NDEBUG GGML_VERSION=0x0 GGML_COMMIT=0x0)
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
@@ -79,75 +69,44 @@ if(CMAKE_CUDA_COMPILER)
    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
-            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
+            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
+        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
-        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
    )
 endif()
 set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(908|90a|1200|1201):xnack[+-]$"
    CACHE STRING
    "Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(908|90a|1200|1201):xnack[+-]$\"."
 )
 check_language(HIP)
 if(CMAKE_HIP_COMPILER)
    set(HIP_PLATFORM "amd")
    find_package(hip REQUIRED)
    if(NOT AMDGPU_TARGETS)
-        find_package(hip REQUIRED)
+        list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$")
        list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(94[012]|101[02]|1030|110[012]|120[01])$")
    endif()
    if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
        list(FILTER AMDGPU_TARGETS EXCLUDE REGEX ${WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX})
    endif()
    if(AMDGPU_TARGETS)
        find_package(hip REQUIRED)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
-
+        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        if (WIN32)
            target_compile_definitions(ggml-hip PRIVATE GGML_CUDA_NO_PEER_COPY)
        endif()
        target_compile_definitions(ggml-hip PRIVATE GGML_HIP_NO_VMM)
        install(TARGETS ggml-hip
-            RUNTIME_DEPENDENCY_SET rocm
+            RUNTIME_DEPENDENCIES
            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
        )
        install(RUNTIME_DEPENDENCY_SET rocm
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
-                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
+                PRE_INCLUDE_REGEXES amdhip64 hipblas rocblas amd_comgr hsa_runtime64 rocprofiler-register drm_amdgpu drm numa
                PRE_EXCLUDE_REGEXES ".*"
                POST_EXCLUDE_REGEXES "system32"
-            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+            RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
-            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
        )
        foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
            if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
-                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP)
+                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
                break()
            endif()
        endforeach()
    endif()
 endif()
 find_package(Vulkan)
 if(Vulkan_FOUND)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
    install(TARGETS ggml-vulkan
        RUNTIME_DEPENDENCIES
            PRE_INCLUDE_REGEXES vulkan
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
    )
 endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -6,8 +6,7 @@
      "binaryDir": "${sourceDir}/build",
      "installDir": "${sourceDir}/dist",
      "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_BUILD_TYPE": "Release"
        "CMAKE_MSVC_RUNTIME_LIBRARY": "MultiThreaded"
      }
    },
    {
@@ -22,24 +21,14 @@
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual",
+        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;62;70;72;75;80;86"
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;52;60;61;70;75;80;86;89;90;90a;120",
+        "CMAKE_CUDA_ARCHITECTURES": "60;61;62;70;72;75;80;86;87;89;90;90a"
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
      "name": "CUDA 13",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
        "CMAKE_CUDA_FLAGS": "-t 2"
      }
    },
    {
@@ -67,13 +56,8 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
-        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
+        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
        "AMDGPU_TARGETS": "gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
    },
    {
      "name": "Vulkan",
      "inherits": [ "Default" ]
    }
  ],
  "buildPresets": [
@@ -102,11 +86,6 @@
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 12"
    },
    {
      "name": "CUDA 13",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 13"
    },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
@@ -126,11 +105,6 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "configurePreset": "ROCm 6"
    },
    {
      "name": "Vulkan",
      "targets": [ "ggml-vulkan" ],
      "configurePreset": "Vulkan"
    }
  ]
 }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,6 +6,8 @@ Thank you for your interest in contributing to Ollama! Here are a few guidelines
 See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally.
 ## Pull requests
 ### Ideal issues
 * [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error.
@@ -24,65 +26,11 @@ See the [development documentation](./docs/development.md) for instructions on h
 * Changes that add significant friction to the user experience
 * Changes that create a large future maintenance burden for maintainers and contributors
-## Proposing a (non-trivial) change
+### Best practices
-> By "non-trivial", we mean a change that is not a bug fix or small
+* Commit messages: please leave both a title and a description in your commit messages. The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`). In the description, leave 2-3 short sentences that explain more about the change and its impact.
-> documentation update. If you are unsure, please ask us on our [Discord
+* Tests: please add test coverage to changes where possible.
-> server](https://discord.gg/ollama).
+* Minimize dependencies: avoid adding new dependencies unless absolutely necessary.
 Before opening a non-trivial Pull Request, please open an issue to discuss the change and
 get feedback from the maintainers. This helps us understand the context of the
 change and how it fits into Ollama's roadmap and prevents us from duplicating
 work or you from spending time on a change that we may not be able to accept.
 Tips for proposals:
 * Explain the problem you are trying to solve, not what you are trying to do.
 * Explain why the change is important.
 * Explain how the change will be used.
 * Explain how the change will be tested.
 Additionally, for bonus points: Provide draft documentation you would expect to
 see if the change were accepted.
 ## Pull requests
 **Commit messages**
 The title should look like:
    <package>: <short description>
 The package is the most affected Go package. If the change does not affect Go
 code, then use the directory name instead. Changes to a single well-known
 file in the root directory may use the file name.
 The short description should start with a lowercase letter and be a
 continuation of the sentence:
      "This changes Ollama to..."
 Examples:
      llm/backend/mlx: support the llama architecture
      CONTRIBUTING: provide clarity on good commit messages, and bad
      docs: simplify manual installation with shorter curl commands
 Bad Examples:
      feat: add more emoji
      fix: was not using famous web framework
      chore: generify code
 **Tests**
 Please include tests. Strive to test behavior, not implementation.
 **New dependencies**
 Dependencies should be added sparingly. If you are adding a new dependency,
 please explain why it is necessary and what other ways you attempted that
 did not work without it.
 ## Need help?
--- a/165
+++ b/165
@@ -1,38 +1,23 @@
 # vim: filetype=dockerfile
 ARG FLAVOR=${TARGETARCH}
 ARG PARALLEL=8
-ARG ROCMVERSION=6.3.3
+ARG ROCMVERSION=6.1.2
 ARG JETPACK5VERSION=r35.4.1
-ARG JETPACK6VERSION=r36.4.0
+ARG JETPACK6VERSION=r36.2.0
 ARG CMAKEVERSION=3.31.2
 ARG VULKANVERSION=1.4.321.1
-# We require gcc v10 minimum.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCMVERSION}-complete AS base-amd64
-FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
+RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
-RUN yum install -y yum-utils \
+    && yum install -y yum-utils devtoolset-10-gcc devtoolset-10-gcc-c++ \
-    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo \
-    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
+    && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1
-    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/devtoolset-11/root/usr/bin:$PATH
    && dnf install -y ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 ARG VULKANVERSION
 RUN wget https://sdk.lunarg.com/sdk/download/${VULKANVERSION}/linux/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz -O /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
    && tar xvf /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
    && dnf -y install ninja-build \
    && ln -s /usr/bin/python3 /usr/bin/python \  
    && /${VULKANVERSION}/vulkansdk -j 8 vulkan-headers \
    && /${VULKANVERSION}/vulkansdk -j 8 shaderc
 RUN cp -r /${VULKANVERSION}/x86_64/include/* /usr/local/include/ \
    && cp -r /${VULKANVERSION}/x86_64/lib/* /usr/local/lib
 ENV PATH=/${VULKANVERSION}/x86_64/bin:$PATH
-FROM --platform=linux/arm64 almalinux:8 AS base-arm64
+FROM --platform=linux/arm64 rockylinux:8 AS base-arm64
 # install epel-release for ccache
 RUN yum install -y yum-utils epel-release \
-    && dnf install -y clang ccache \
+    && yum install -y clang ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
 ENV CC=clang CXX=clang++
@@ -44,54 +29,37 @@ COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ENV LDFLAGS=-s
 FROM base AS cpu
-RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
+# amd64 uses gcc which requires devtoolset-11 for AVX extensions while arm64 uses clang
-ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi
-ARG PARALLEL
+ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CPU' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CPU' \
+        && cmake --build --parallel --preset 'CPU' \
-        && cmake --install build --component CPU --strip --parallel ${PARALLEL}
+        && cmake --install build --component CPU --strip --parallel 8
 FROM base AS cuda-11
-ARG CUDA11VERSION=11.8
+ARG CUDA11VERSION=11.3
-RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
+RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
 ENV PATH=/usr/local/cuda-11/bin:$PATH
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \
+    cmake --preset 'CUDA 11' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
+        && cmake --build --parallel --preset 'CUDA 11' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --install build --component CUDA --strip --parallel 8
 FROM base AS cuda-12
-ARG CUDA12VERSION=12.8
+ARG CUDA12VERSION=12.4
-RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
+RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
 ENV PATH=/usr/local/cuda-12/bin:$PATH
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\
+    cmake --preset 'CUDA 12' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
+        && cmake --build --parallel --preset 'CUDA 12' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --install build --component CUDA --strip --parallel 8
 FROM base AS cuda-13
 ARG CUDA13VERSION=13.0
 RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
 ENV PATH=/usr/local/cuda-13/bin:$PATH
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \
        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
 FROM base AS rocm-6
 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'ROCm 6' -DOLLAMA_RUNNER_DIR="rocm" \
+    cmake --preset 'ROCm 6' \
-        && cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
+        && cmake --build --parallel --preset 'ROCm 6' \
-        && cmake --install build --component HIP --strip --parallel ${PARALLEL}
+        && cmake --install build --component HIP --strip --parallel 8
 RUN rm -f dist/lib/ollama/rocm/rocblas/library/*gfx90[06]*
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
 ARG CMAKEVERSION
@@ -99,11 +67,10 @@ RUN apt-get update && apt-get install -y curl ccache \
    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 5' -DOLLAMA_RUNNER_DIR="cuda_jetpack5" \
+    cmake --preset 'JetPack 5' \
-        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 5' \
+        && cmake --build --parallel --preset 'JetPack 5' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --install build --component CUDA --strip --parallel 8
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
 ARG CMAKEVERSION
@@ -111,84 +78,44 @@ RUN apt-get update && apt-get install -y curl ccache \
    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 6' -DOLLAMA_RUNNER_DIR="cuda_jetpack6" \
+    cmake --preset 'JetPack 6' \
-        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 6' \
+        && cmake --build --parallel --preset 'JetPack 6' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --install build --component CUDA --strip --parallel 8
 FROM base AS vulkan
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'Vulkan' -DOLLAMA_RUNNER_DIR="vulkan" \
        && cmake --build --parallel --preset 'Vulkan' \
        && cmake --install build --component Vulkan --strip --parallel 8 
 FROM base AS build
-WORKDIR /go/src/github.com/ollama/ollama
+ARG GOVERSION=1.23.4
-COPY go.mod go.sum .
+RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
 RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
 ENV PATH=/usr/local/go/bin:$PATH
-RUN go mod download
+WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
 ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
 RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .
 FROM --platform=linux/amd64 scratch AS amd64
-# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 COPY --from=vulkan  dist/lib/ollama  /lib/ollama/
 FROM --platform=linux/arm64 scratch AS arm64
-# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
-COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
+COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 lib/ollama/cuda_jetpack5
-COPY --from=jetpack-5 dist/lib/ollama/ /lib/ollama/
+COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 lib/ollama/cuda_jetpack6
 COPY --from=jetpack-6 dist/lib/ollama/ /lib/ollama/
-FROM scratch AS rocm
+FROM --platform=linux/arm64 scratch AS rocm
-COPY --from=rocm-6 dist/lib/ollama /lib/ollama
+COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
 FROM ${FLAVOR} AS archive
 ARG VULKANVERSION
 COPY --from=cpu dist/lib/ollama /lib/ollama
 COPY --from=build /bin/ollama /bin/ollama
-# Temporary opt-out stages for Vulkan
+FROM ubuntu:20.04
 FROM --platform=linux/amd64 scratch AS amd64_novulkan
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
 COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 FROM arm64 AS arm64_novulkan
 FROM ${FLAVOR}_novulkan AS archive_novulkan
 COPY --from=cpu dist/lib/ollama /lib/ollama
 COPY --from=build /bin/ollama /bin/ollama
 FROM ubuntu:24.04 AS novulkan
 RUN apt-get update \
    && apt-get install -y ca-certificates \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
 COPY --from=archive_novulkan /bin /usr/bin
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 COPY --from=archive_novulkan /lib/ollama /usr/lib/ollama
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV OLLAMA_HOST=0.0.0.0:11434
 EXPOSE 11434
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
 FROM ubuntu:24.04 AS default
 RUN apt-get update \
    && apt-get install -y ca-certificates libvulkan1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
 COPY --from=archive /bin /usr/bin
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 COPY --from=archive /lib/ollama /usr/lib/ollama
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
-UPSTREAM=https://github.com/ggml-org/llama.cpp.git
+UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=7049736b2dd9011bf819e298b844ebbc4b5afdc9
+FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
 .PHONY: help
 help:
@@ -12,42 +12,27 @@ help:
 	@echo "    clean                Clean local repository"
 	@echo
 	@echo "Example:"
-	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean apply-patches sync"
+	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 .PHONY: sync
-sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
 llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
 	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
 ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
 	go generate ./$(@D)
 .PHONY: llama/llama.cpp
-llama/llama.cpp: llama/vendor
+llama/llama.cpp: llama/vendor/ apply-patches
-	rsync -arvzc --delete -f "include LICENSE" -f "merge $@/.rsync-filter" $(addprefix $<,/LICENSE /) $@
+	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
-.PHONY: ml/backend/ggml/ggml
+.PHONY: ml/backend/ggml/ggml apply-patches
-ml/backend/ggml/ggml: llama/vendor
+ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
-	rsync -arvzc --delete -f "include LICENSE" -f "merge $@/.rsync-filter" $(addprefix $<,/LICENSE /ggml/) $@
+	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
 PATCHES=$(wildcard llama/patches/*.patch)
 PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(PATCHED)
+apply-patches: $(addsuffix ed, $(PATCHES))
-llama/patches/.%.patched: llama/patches/%.patch
+%.patched: %.patch
-	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then \
+	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
 		touch $@;                                                                           \
 	else                                                                                    \
 		echo "Patch failed. Resolve any conflicts then continue.";                          \
 		echo "1. Run 'git -C $(WORKDIR) am --continue'";                                    \
 		echo "2. Run 'make -f $(lastword $(MAKEFILE_LIST)) format-patches'";                \
 		echo "3. Run 'make -f $(lastword $(MAKEFILE_LIST)) clean apply-patches'";           \
 		exit 1;                                                                             \
 	fi
 .PHONY: checkout
 checkout: $(WORKDIR)
@@ -68,5 +53,4 @@ format-patches: llama/patches
 .PHONE: clean
 clean: checkout
-	@git -C $(WORKDIR) am --abort || true
+	$(RM) $(addsuffix ed, $(PATCHES))
 	$(RM) llama/patches/.*.patched
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <div align="center">
-  <a href="https://ollama.com">
+  <a href="https://ollama.com" />
-    <img alt="ollama" width="240" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
+    <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
  </a>
 </div>
@@ -10,7 +10,7 @@ Get up and running with large language models.
 ### macOS
-[Download](https://ollama.com/download/Ollama.dmg)
+[Download](https://ollama.com/download/Ollama-darwin.zip)
 ### Windows
@@ -18,7 +18,7 @@ Get up and running with large language models.
 ### Linux
-```shell
+```
 curl -fsSL https://ollama.com/install.sh | sh
 ```
@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 ## Quickstart
-To run and chat with [Gemma 3](https://ollama.com/library/gemma3):
+To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
-```shell
+```
-ollama run gemma3
+ollama run llama3.2
 ```
 ## Model library
@@ -54,15 +54,6 @@ Here are some example models that can be downloaded:
 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
 | Gemma 3            | 1B         | 815MB | `ollama run gemma3:1b`           |
 | Gemma 3            | 4B         | 3.3GB | `ollama run gemma3`              |
 | Gemma 3            | 12B        | 8.1GB | `ollama run gemma3:12b`          |
 | Gemma 3            | 27B        | 17GB  | `ollama run gemma3:27b`          |
 | QwQ                | 32B        | 20GB  | `ollama run qwq`                 |
 | DeepSeek-R1        | 7B         | 4.7GB | `ollama run deepseek-r1`         |
 | DeepSeek-R1        | 671B       | 404GB | `ollama run deepseek-r1:671b`    |
 | Llama 4            | 109B       | 67GB  | `ollama run llama4:scout`        |
 | Llama 4            | 400B       | 245GB | `ollama run llama4:maverick`     |
 | Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
@@ -71,7 +62,10 @@ Here are some example models that can be downloaded:
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 4              | 14B        | 9.1GB | `ollama run phi4`                |
-| Phi 4 Mini         | 3.8B       | 2.5GB | `ollama run phi4-mini`           |
+| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
 | Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`           |
 | Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`              |
 | Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`          |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`             |
 | Moondream 2        | 1.4B       | 829MB | `ollama run moondream`           |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`         |
@@ -79,7 +73,7 @@ Here are some example models that can be downloaded:
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
-| Granite-3.3         | 8B         | 4.9GB | `ollama run granite3.3`          |
+| Solar              | 10.7B      | 6.1GB | `ollama run solar`               |
 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -98,13 +92,13 @@ Ollama supports importing GGUF models in the Modelfile:
 2. Create the model in Ollama
-   ```shell
+   ```
   ollama create example -f Modelfile
   ```
 3. Run the model
-   ```shell
+   ```
   ollama run example
   ```
@@ -116,7 +110,7 @@ See the [guide](docs/import.md) on importing models for more information.
 Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.2` model:
-```shell
+```
 ollama pull llama3.2
 ```
@@ -151,13 +145,13 @@ For more information on working with a Modelfile, see the [Modelfile](docs/model
 `ollama create` is used to create a model from a Modelfile.
-```shell
+```
 ollama create mymodel -f ./Modelfile
 ```
 ### Pull a model
-```shell
+```
 ollama pull llama3.2
 ```
@@ -165,13 +159,13 @@ ollama pull llama3.2
 ### Remove a model
-```shell
+```
 ollama rm llama3.2
 ```
 ### Copy a model
-```shell
+```
 ollama cp llama3.2 my-model
 ```
@@ -190,39 +184,37 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 ```
 ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```
 > **Output**: The image features a yellow smiley face, which is likely the central focus of the picture.
 ### Pass the prompt as an argument
 ```shell
 ollama run llama3.2 "Summarize this file: $(cat README.md)"
 ```
-
+$ ollama run llama3.2 "Summarize this file: $(cat README.md)"
-> **Output**: Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
+ Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```
 ### Show model information
-```shell
+```
 ollama show llama3.2
 ```
 ### List models on your computer
-```shell
+```
 ollama list
 ```
 ### List which models are currently loaded
-```shell
+```
 ollama ps
 ```
 ### Stop a model which is currently running
-```shell
+```
 ollama stop llama3.2
 ```
@@ -238,13 +230,13 @@ See the [developer guide](https://github.com/ollama/ollama/blob/main/docs/develo
 Next, start the server:
-```shell
+```
 ./ollama serve
 ```
 Finally, in a separate shell, run a model:
-```shell
+```
 ./ollama run llama3.2
 ```
@@ -254,7 +246,7 @@ Ollama has a REST API for running and managing models.
 ### Generate a response
-```shell
+```
 curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt":"Why is the sky blue?"
@@ -263,7 +255,7 @@ curl http://localhost:11434/api/generate -d '{
 ### Chat with a model
-```shell
+```
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [
@@ -279,7 +271,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Web & Desktop
 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [SwiftChat (macOS with ReactNative)](https://github.com/aws-samples/swift-chat)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
 - [Hollama](https://github.com/fmaclen/hollama)
 - [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
@@ -287,13 +278,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Saddle](https://github.com/jikkuatwork/saddle)
 - [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
 - [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-AGI)
+- [big-AGI](https://github.com/enricoros/big-AGI/blob/main/docs/config-local-ollama.md)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
@@ -314,8 +304,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
 - [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
 - [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks)
 - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -329,14 +317,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
 - [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
+- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
 - [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
+- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
 - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
 - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -345,34 +332,33 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
+- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
+- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
 - [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
 - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
 - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
+- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
 - [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
+- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
 - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
 - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG and deep research on Mac/Windows/Linux)
+- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
 - [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
 - [chat-ollama](https://github.com/annilq/chat-ollama) (a React Native client for Ollama)
 - [SpaceLlama](https://github.com/tcsenpai/spacellama) (Firefox and Chrome extension to quickly summarize web pages with ollama in a sidebar)
 - [YouLama](https://github.com/tcsenpai/youlama) (Webapp to quickly summarize any YouTube video, supporting Invidious as well)
 - [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
 - [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
 - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
+- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
 - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
 - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
 - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -383,38 +369,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
 - [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
 - [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
 - [Ollama Chat WebUI for Docker](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
 - [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
 - [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
 - [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
 - [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
 - [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
 - [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
 - [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
 - [LangBot](https://github.com/RockChinQ/LangBot) (LLM-based instant messaging bots platform, with Agents, RAG features, supports multiple platforms)
 - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool)
 - [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
 - [Flufy](https://github.com/Aharon-Bensadoun/Flufy) (A beautiful chat interface for interacting with Ollama's API. Built with React, TypeScript, and Material-UI.)
 - [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance)
 - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
 - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
 - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
 - [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
 - [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
 - [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
 - [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
 ### Cloud
@@ -454,18 +409,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage)
 - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
 - [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
 - [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
 - [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
 - [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama.
 - [VT Code](https://github.com/vinhnx/vtcode) - VT Code is a Rust-based terminal coding agent with semantic code intelligence via Tree-sitter. Ollama integration for running local/cloud models with configurable endpoints.
 ### Apple Vision Pro
 - [SwiftChat](https://github.com/aws-samples/swift-chat) (Cross-platform AI chat app supporting Apple Vision Pro via "Designed for iPad")
 - [Enchanted](https://github.com/AugustDev/enchanted)
 ### Database
@@ -480,15 +427,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
 - [Homebrew](https://formulae.brew.sh/formula/ollama)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
 - [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
- [Nix package](https://search.nixos.org/packages?show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
+- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
 - [Flox](https://flox.dev/blog/ollama-part-one)
 ### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
+- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
 - [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration)
@@ -535,27 +481,15 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
+- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
 - [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
 - [Ollama for Zig](https://github.com/dravenk/ollama-zig)
 - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
 - [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
 - [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
 - [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
 - [Neuro SAN](https://github.com/cognizant-ai-lab/neuro-san-studio) (Data-driven multi-agent orchestration framework) with [example](https://github.com/cognizant-ai-lab/neuro-san-studio/blob/main/docs/user_guide.md#ollama)
 - [achatbot-go](https://github.com/ai-bot-pro/achatbot-go) a multimodal (text/audio/image) chatbot.
 ### Mobile
 - [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
 ### Extensions & Plugins
@@ -577,7 +511,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
+- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -587,8 +521,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
+- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
+- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
 - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
 - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
@@ -599,23 +533,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (Telegram bot, primarily for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration, etc.)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
 - [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
 - [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
 - [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
 - [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)
 ### Supported backends
- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
 ### Observability
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
+
 - [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
 - [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
 - [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
 - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
--- a/api/client.go
+++ b/api/client.go
@@ -10,7 +10,7 @@
 // repository].
 //
 // [the API documentation]: https://github.com/ollama/ollama/blob/main/docs/api.md
-// [in the GitHub repository]: https://github.com/ollama/ollama/tree/main/api/examples
+// [in the GitHub repository]: https://github.com/ollama/ollama/tree/main/examples
 package api
 import (
@@ -24,10 +24,7 @@ import (
 	"net/http"
 	"net/url"
 	"runtime"
 	"strconv"
 	"time"
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/version"
@@ -45,12 +42,6 @@ func checkError(resp *http.Response, body []byte) error {
 		return nil
 	}
 	if resp.StatusCode == http.StatusUnauthorized {
 		authError := AuthorizationError{StatusCode: resp.StatusCode}
 		json.Unmarshal(body, &authError)
 		return authError
 	}
 	apiError := StatusError{StatusCode: resp.StatusCode}
 	err := json.Unmarshal(body, &apiError)
@@ -85,14 +76,6 @@ func NewClient(base *url.URL, http *http.Client) *Client {
 	}
 }
 func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
 	token, err := auth.Sign(ctx, []byte(challenge))
 	if err != nil {
 		return "", err
 	}
 	return token, nil
 }
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
 	var reqBody io.Reader
 	var data []byte
@@ -114,21 +97,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	}
 	requestURL := c.base.JoinPath(path)
 	var token string
 	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
 		now := strconv.FormatInt(time.Now().Unix(), 10)
 		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
 		token, err = getAuthorizationToken(ctx, chal)
 		if err != nil {
 			return err
 		}
 		q := requestURL.Query()
 		q.Set("ts", now)
 		requestURL.RawQuery = q.Encode()
 	}
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
 	if err != nil {
 		return err
@@ -138,10 +106,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	request.Header.Set("Accept", "application/json")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 	if token != "" {
 		request.Header.Set("Authorization", token)
 	}
 	respObj, err := c.http.Do(request)
 	if err != nil {
 		return err
@@ -168,7 +132,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 const maxBufferSize = 512 * format.KiloByte
 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
-	var buf io.Reader
+	var buf *bytes.Buffer
 	if data != nil {
 		bts, err := json.Marshal(data)
 		if err != nil {
@@ -179,22 +143,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}
 	requestURL := c.base.JoinPath(path)
 	var token string
 	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
 		var err error
 		now := strconv.FormatInt(time.Now().Unix(), 10)
 		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
 		token, err = getAuthorizationToken(ctx, chal)
 		if err != nil {
 			return err
 		}
 		q := requestURL.Query()
 		q.Set("ts", now)
 		requestURL.RawQuery = q.Encode()
 	}
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
 	if err != nil {
 		return err
@@ -204,10 +152,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 	if token != "" {
 		request.Header.Set("Authorization", token)
 	}
 	response, err := c.http.Do(request)
 	if err != nil {
 		return err
@@ -220,8 +164,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	scanner.Buffer(scanBuf, maxBufferSize)
 	for scanner.Scan() {
 		var errorResponse struct {
-			Error     string `json:"error,omitempty"`
+			Error string `json:"error,omitempty"`
 			SigninURL string `json:"signin_url,omitempty"`
 		}
 		bts := scanner.Bytes()
@@ -229,13 +172,11 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			return fmt.Errorf("unmarshal: %w", err)
 		}
-		if response.StatusCode == http.StatusUnauthorized {
+		if errorResponse.Error != "" {
-			return AuthorizationError{
+			return errors.New(errorResponse.Error)
-				StatusCode: response.StatusCode,
+		}
-				Status:     response.Status,
+
-				SigninURL:  errorResponse.SigninURL,
+		if response.StatusCode >= http.StatusBadRequest {
 			}
 		} else if response.StatusCode >= http.StatusBadRequest {
 			return StatusError{
 				StatusCode:   response.StatusCode,
 				Status:       response.Status,
@@ -243,10 +184,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			}
 		}
 		if errorResponse.Error != "" {
 			return errors.New(errorResponse.Error)
 		}
 		if err := fn(bts); err != nil {
 			return err
 		}
@@ -441,21 +378,3 @@ func (c *Client) Version(ctx context.Context) (string, error) {
 	return version.Version, nil
 }
 // Signout will signout a client for a local ollama server.
 func (c *Client) Signout(ctx context.Context) error {
 	return c.do(ctx, http.MethodPost, "/api/signout", nil, nil)
 }
 // Disconnect will disconnect an ollama instance from ollama.com.
 func (c *Client) Disconnect(ctx context.Context, encodedKey string) error {
 	return c.do(ctx, http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey), nil, nil)
 }
 func (c *Client) Whoami(ctx context.Context) (*UserResponse, error) {
 	var resp UserResponse
 	if err := c.do(ctx, http.MethodPost, "/api/me", nil, &resp); err != nil {
 		return nil, err
 	}
 	return &resp, nil
 }
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -1,12 +1,6 @@
 package api
 import (
 	"encoding/json"
 	"fmt"
 	"net/http"
 	"net/http/httptest"
 	"net/url"
 	"strings"
 	"testing"
 )
@@ -49,216 +43,3 @@ func TestClientFromEnvironment(t *testing.T) {
 		})
 	}
 }
 // testError represents an internal error type with status code and message
 // this is used since the error response from the server is not a standard error struct
 type testError struct {
 	message    string
 	statusCode int
 }
 func (e testError) Error() string {
 	return e.message
 }
 func TestClientStream(t *testing.T) {
 	testCases := []struct {
 		name      string
 		responses []any
 		wantErr   string
 	}{
 		{
 			name: "immediate error response",
 			responses: []any{
 				testError{
 					message:    "test error message",
 					statusCode: http.StatusBadRequest,
 				},
 			},
 			wantErr: "test error message",
 		},
 		{
 			name: "error after successful chunks, ok response",
 			responses: []any{
 				ChatResponse{Message: Message{Content: "partial response 1"}},
 				ChatResponse{Message: Message{Content: "partial response 2"}},
 				testError{
 					message:    "mid-stream error",
 					statusCode: http.StatusOK,
 				},
 			},
 			wantErr: "mid-stream error",
 		},
 		{
 			name: "http status error takes precedence over general error",
 			responses: []any{
 				testError{
 					message:    "custom error message",
 					statusCode: http.StatusInternalServerError,
 				},
 			},
 			wantErr: "500",
 		},
 		{
 			name: "successful stream completion",
 			responses: []any{
 				ChatResponse{Message: Message{Content: "chunk 1"}},
 				ChatResponse{Message: Message{Content: "chunk 2"}},
 				ChatResponse{
 					Message:    Message{Content: "final chunk"},
 					Done:       true,
 					DoneReason: "stop",
 				},
 			},
 		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				flusher, ok := w.(http.Flusher)
 				if !ok {
 					t.Fatal("expected http.Flusher")
 				}
 				w.Header().Set("Content-Type", "application/x-ndjson")
 				for _, resp := range tc.responses {
 					if errResp, ok := resp.(testError); ok {
 						w.WriteHeader(errResp.statusCode)
 						err := json.NewEncoder(w).Encode(map[string]string{
 							"error": errResp.message,
 						})
 						if err != nil {
 							t.Fatal("failed to encode error response:", err)
 						}
 						return
 					}
 					if err := json.NewEncoder(w).Encode(resp); err != nil {
 						t.Fatalf("failed to encode response: %v", err)
 					}
 					flusher.Flush()
 				}
 			}))
 			defer ts.Close()
 			client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)
 			var receivedChunks []ChatResponse
 			err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
 				var resp ChatResponse
 				if err := json.Unmarshal(chunk, &resp); err != nil {
 					return fmt.Errorf("failed to unmarshal chunk: %w", err)
 				}
 				receivedChunks = append(receivedChunks, resp)
 				return nil
 			})
 			if tc.wantErr != "" {
 				if err == nil {
 					t.Fatal("expected error but got nil")
 				}
 				if !strings.Contains(err.Error(), tc.wantErr) {
 					t.Errorf("expected error containing %q, got %v", tc.wantErr, err)
 				}
 				return
 			}
 			if err != nil {
 				t.Errorf("unexpected error: %v", err)
 			}
 		})
 	}
 }
 func TestClientDo(t *testing.T) {
 	testCases := []struct {
 		name     string
 		response any
 		wantErr  string
 	}{
 		{
 			name: "immediate error response",
 			response: testError{
 				message:    "test error message",
 				statusCode: http.StatusBadRequest,
 			},
 			wantErr: "test error message",
 		},
 		{
 			name: "server error response",
 			response: testError{
 				message:    "internal error",
 				statusCode: http.StatusInternalServerError,
 			},
 			wantErr: "internal error",
 		},
 		{
 			name: "successful response",
 			response: struct {
 				ID      string `json:"id"`
 				Success bool   `json:"success"`
 			}{
 				ID:      "msg_123",
 				Success: true,
 			},
 		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				if errResp, ok := tc.response.(testError); ok {
 					w.WriteHeader(errResp.statusCode)
 					err := json.NewEncoder(w).Encode(map[string]string{
 						"error": errResp.message,
 					})
 					if err != nil {
 						t.Fatal("failed to encode error response:", err)
 					}
 					return
 				}
 				w.Header().Set("Content-Type", "application/json")
 				if err := json.NewEncoder(w).Encode(tc.response); err != nil {
 					t.Fatalf("failed to encode response: %v", err)
 				}
 			}))
 			defer ts.Close()
 			client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)
 			var resp struct {
 				ID      string `json:"id"`
 				Success bool   `json:"success"`
 			}
 			err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp)
 			if tc.wantErr != "" {
 				if err == nil {
 					t.Fatalf("got nil, want error %q", tc.wantErr)
 				}
 				if err.Error() != tc.wantErr {
 					t.Errorf("error message mismatch: got %q, want %q", err.Error(), tc.wantErr)
 				}
 				return
 			}
 			if err != nil {
 				t.Fatalf("got error %q, want nil", err)
 			}
 			if expectedResp, ok := tc.response.(struct {
 				ID      string `json:"id"`
 				Success bool   `json:"success"`
 			}); ok {
 				if resp.ID != expectedResp.ID {
 					t.Errorf("response ID mismatch: got %q, want %q", resp.ID, expectedResp.ID)
 				}
 				if resp.Success != expectedResp.Success {
 					t.Errorf("response Success mismatch: got %v, want %v", resp.Success, expectedResp.Success)
 				}
 			}
 		})
 	}
 }
--- a/api/examples/README.md
+++ b/api/examples/README.md
@@ -2,10 +2,9 @@
 Run the examples in this directory with:
-```shell
+```
 go run example_name/main.go
 ```
 ## Chat - Chat with a model
 - [chat/main.go](chat/main.go)
--- a/api/types.go
+++ b/api/types.go
@@ -10,11 +10,6 @@ import (
 	"strconv"
 	"strings"
 	"time"
 	"github.com/google/uuid"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )
 // StatusError is an error with an HTTP status code and message.
@@ -38,19 +33,6 @@ func (e StatusError) Error() string {
 	}
 }
 type AuthorizationError struct {
 	StatusCode int
 	Status     string
 	SigninURL  string `json:"signin_url"`
 }
 func (e AuthorizationError) Error() string {
 	if e.Status != "" {
 		return e.Status
 	}
 	return "something went wrong, please see the ollama server logs for details"
 }
 // ImageData represents the raw binary data of an image file.
 type ImageData []byte
@@ -91,32 +73,13 @@ type GenerateRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
-	// Images is an optional list of raw image bytes accompanying this
+	// Images is an optional list of base64-encoded images accompanying this
 	// request, for multimodal models.
 	Images []ImageData `json:"images,omitempty"`
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
-	Options map[string]any `json:"options"`
+	Options map[string]interface{} `json:"options"`
 	// Think controls whether thinking/reasoning models will think before
 	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
 	// for supported models. Needs to be a pointer so we can distinguish between false
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
 	Think *ThinkValue `json:"think,omitempty"`
 	// Truncate is a boolean that, when set to true, truncates the chat history messages
 	// if the rendered prompt exceeds the context length limit.
 	Truncate *bool `json:"truncate,omitempty"`
 	// Shift is a boolean that, when set to true, shifts the chat history
 	// when hitting the context length limit instead of erroring.
 	Shift *bool `json:"shift,omitempty"`
 	// DebugRenderOnly is a debug option that, when set to true, returns the rendered
 	// template instead of calling the model.
 	DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
 }
 // ChatRequest describes a request sent by [Client.Chat].
@@ -141,24 +104,7 @@ type ChatRequest struct {
 	Tools `json:"tools,omitempty"`
 	// Options lists model-specific options.
-	Options map[string]any `json:"options"`
+	Options map[string]interface{} `json:"options"`
 	// Think controls whether thinking/reasoning models will think before
 	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
 	// for supported models.
 	Think *ThinkValue `json:"think,omitempty"`
 	// Truncate is a boolean that, when set to true, truncates the chat history messages
 	// if the rendered prompt exceeds the context length limit.
 	Truncate *bool `json:"truncate,omitempty"`
 	// Shift is a boolean that, when set to true, shifts the chat history
 	// when hitting the context length limit instead of erroring.
 	Shift *bool `json:"shift,omitempty"`
 	// DebugRenderOnly is a debug option that, when set to true, returns the rendered
 	// template instead of calling the model.
 	DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
 }
 type Tools []Tool
@@ -177,14 +123,10 @@ func (t Tool) String() string {
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
 type Message struct {
-	Role    string `json:"role"`
+	Role      string      `json:"role"`
-	Content string `json:"content"`
+	Content   string      `json:"content"`
 	// Thinking contains the text that was inside thinking tags in the
 	// original model output when ChatRequest.Think is enabled.
 	Thinking  string      `json:"thinking,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
 	ToolName  string      `json:"tool_name,omitempty"`
 }
 func (m *Message) UnmarshalJSON(b []byte) error {
@@ -204,7 +146,7 @@ type ToolCall struct {
 }
 type ToolCallFunction struct {
-	Index     int                       `json:"index"`
+	Index     int                       `json:"index,omitempty"`
 	Name      string                    `json:"name"`
 	Arguments ToolCallFunctionArguments `json:"arguments"`
 }
@@ -218,122 +160,21 @@ func (t *ToolCallFunctionArguments) String() string {
 type Tool struct {
 	Type     string       `json:"type"`
 	Items    any          `json:"items,omitempty"`
 	Function ToolFunction `json:"function"`
 }
 // PropertyType can be either a string or an array of strings
 type PropertyType []string
 // UnmarshalJSON implements the json.Unmarshaler interface
 func (pt *PropertyType) UnmarshalJSON(data []byte) error {
 	// Try to unmarshal as a string first
 	var s string
 	if err := json.Unmarshal(data, &s); err == nil {
 		*pt = []string{s}
 		return nil
 	}
 	// If that fails, try to unmarshal as an array of strings
 	var a []string
 	if err := json.Unmarshal(data, &a); err != nil {
 		return err
 	}
 	*pt = a
 	return nil
 }
 // MarshalJSON implements the json.Marshaler interface
 func (pt PropertyType) MarshalJSON() ([]byte, error) {
 	if len(pt) == 1 {
 		// If there's only one type, marshal as a string
 		return json.Marshal(pt[0])
 	}
 	// Otherwise marshal as an array
 	return json.Marshal([]string(pt))
 }
 // String returns a string representation of the PropertyType
 func (pt PropertyType) String() string {
 	if len(pt) == 0 {
 		return ""
 	}
 	if len(pt) == 1 {
 		return pt[0]
 	}
 	return fmt.Sprintf("%v", []string(pt))
 }
 type ToolProperty struct {
 	AnyOf       []ToolProperty `json:"anyOf,omitempty"`
 	Type        PropertyType   `json:"type,omitempty"`
 	Items       any            `json:"items,omitempty"`
 	Description string         `json:"description,omitempty"`
 	Enum        []any          `json:"enum,omitempty"`
 }
 // ToTypeScriptType converts a ToolProperty to a TypeScript type string
 func (tp ToolProperty) ToTypeScriptType() string {
 	if len(tp.AnyOf) > 0 {
 		var types []string
 		for _, anyOf := range tp.AnyOf {
 			types = append(types, anyOf.ToTypeScriptType())
 		}
 		return strings.Join(types, " | ")
 	}
 	if len(tp.Type) == 0 {
 		return "any"
 	}
 	if len(tp.Type) == 1 {
 		return mapToTypeScriptType(tp.Type[0])
 	}
 	var types []string
 	for _, t := range tp.Type {
 		types = append(types, mapToTypeScriptType(t))
 	}
 	return strings.Join(types, " | ")
 }
 // mapToTypeScriptType maps JSON Schema types to TypeScript types
 func mapToTypeScriptType(jsonType string) string {
 	switch jsonType {
 	case "string":
 		return "string"
 	case "number", "integer":
 		return "number"
 	case "boolean":
 		return "boolean"
 	case "array":
 		return "any[]"
 	case "object":
 		return "Record<string, any>"
 	case "null":
 		return "null"
 	default:
 		return "any"
 	}
 }
 type ToolFunctionParameters struct {
 	Type       string                  `json:"type"`
 	Defs       any                     `json:"$defs,omitempty"`
 	Items      any                     `json:"items,omitempty"`
 	Required   []string                `json:"required"`
 	Properties map[string]ToolProperty `json:"properties"`
 }
 func (t *ToolFunctionParameters) String() string {
 	bts, _ := json.Marshal(t)
 	return string(bts)
 }
 type ToolFunction struct {
-	Name        string                 `json:"name"`
+	Name        string `json:"name"`
-	Description string                 `json:"description,omitempty"`
+	Description string `json:"description"`
-	Parameters  ToolFunctionParameters `json:"parameters"`
+	Parameters  struct {
 		Type       string   `json:"type"`
 		Required   []string `json:"required"`
 		Properties map[string]struct {
 			Type        string   `json:"type"`
 			Description string   `json:"description"`
 			Enum        []string `json:"enum,omitempty"`
 		} `json:"properties"`
 	} `json:"parameters"`
 }
 func (t *ToolFunction) String() string {
@@ -344,38 +185,16 @@ func (t *ToolFunction) String() string {
 // ChatResponse is the response returned by [Client.Chat]. Its fields are
 // similar to [GenerateResponse].
 type ChatResponse struct {
-	// Model is the model name that generated the response.
+	Model      string    `json:"model"`
-	Model string `json:"model"`
+	CreatedAt  time.Time `json:"created_at"`
 	Message    Message   `json:"message"`
 	DoneReason string    `json:"done_reason,omitempty"`
 	// RemoteModel is the name of the upstream model that generated the response.
 	RemoteModel string `json:"remote_model,omitempty"`
 	// RemoteHost is the URL of the upstream Ollama host that generated the response.
 	RemoteHost string `json:"remote_host,omitempty"`
 	// CreatedAt is the timestamp of the response.
 	CreatedAt time.Time `json:"created_at"`
 	// Message contains the message or part of a message from the model.
 	Message Message `json:"message"`
 	// Done specifies if the response is complete.
 	Done bool `json:"done"`
 	// DoneReason is the reason the model stopped generating text.
 	DoneReason string `json:"done_reason,omitempty"`
 	DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
 	Metrics
 }
 // DebugInfo contains debug information for template rendering
 type DebugInfo struct {
 	RenderedTemplate string `json:"rendered_template"`
 	ImageCount       int    `json:"image_count,omitempty"`
 }
 type Metrics struct {
 	TotalDuration      time.Duration `json:"total_duration,omitempty"`
 	LoadDuration       time.Duration `json:"load_duration,omitempty"`
@@ -403,6 +222,9 @@ type Options struct {
 	RepeatPenalty    float32  `json:"repeat_penalty,omitempty"`
 	PresencePenalty  float32  `json:"presence_penalty,omitempty"`
 	FrequencyPenalty float32  `json:"frequency_penalty,omitempty"`
 	Mirostat         int      `json:"mirostat,omitempty"`
 	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
 	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
 }
@@ -412,7 +234,12 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
 	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
 	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }
@@ -428,14 +255,10 @@ type EmbedRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 	// Truncate truncates the input to fit the model's max sequence length.
 	Truncate *bool `json:"truncate,omitempty"`
 	// Dimensions truncates the output embedding to the specified dimension.
 	Dimensions int `json:"dimensions,omitempty"`
 	// Options lists model-specific options.
-	Options map[string]any `json:"options"`
+	Options map[string]interface{} `json:"options"`
 }
 // EmbedResponse is the response from [Client.Embed].
@@ -461,7 +284,7 @@ type EmbeddingRequest struct {
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 	// Options lists model-specific options.
-	Options map[string]any `json:"options"`
+	Options map[string]interface{} `json:"options"`
 }
 // EmbeddingResponse is the response from [Client.Embeddings].
@@ -471,47 +294,18 @@ type EmbeddingResponse struct {
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
-	// Model is the model name to create.
+	Model    string `json:"model"`
-	Model string `json:"model"`
+	Stream   *bool  `json:"stream,omitempty"`
 	// Stream specifies whether the response is streaming; it is true by default.
 	Stream *bool `json:"stream,omitempty"`
 	// Quantize is the quantization format for the model; leave blank to not change the quantization level.
 	Quantize string `json:"quantize,omitempty"`
-	// From is the name of the model or file to use as the source.
+	From       string            `json:"from,omitempty"`
-	From string `json:"from,omitempty"`
+	Files      map[string]string `json:"files,omitempty"`
-
+	Adapters   map[string]string `json:"adapters,omitempty"`
-	// RemoteHost is the URL of the upstream ollama API for the model (if any).
+	Template   string            `json:"template,omitempty"`
-	RemoteHost string `json:"remote_host,omitempty"`
+	License    any               `json:"license,omitempty"`
-
+	System     string            `json:"system,omitempty"`
-	// Files is a map of files include when creating the model.
+	Parameters map[string]any    `json:"parameters,omitempty"`
-	Files map[string]string `json:"files,omitempty"`
+	Messages   []Message         `json:"messages,omitempty"`
 	// Adapters is a map of LoRA adapters to include when creating the model.
 	Adapters map[string]string `json:"adapters,omitempty"`
 	// Template is the template used when constructing a request to the model.
 	Template string `json:"template,omitempty"`
 	// License is a string or list of strings for licenses.
 	License any `json:"license,omitempty"`
 	// System is the system prompt for the model.
 	System string `json:"system,omitempty"`
 	// Parameters is a map of hyper-parameters which are applied to the model.
 	Parameters map[string]any `json:"parameters,omitempty"`
 	// Messages is a list of messages added to the model before chat and generation requests.
 	Messages []Message `json:"messages,omitempty"`
 	Renderer string `json:"renderer,omitempty"`
 	Parser   string `json:"parser,omitempty"`
 	// Info is a map of additional information for the model
 	Info map[string]any `json:"info,omitempty"`
 	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
@@ -536,7 +330,7 @@ type ShowRequest struct {
 	Template string `json:"template"`
 	Verbose  bool   `json:"verbose"`
-	Options map[string]any `json:"options"`
+	Options map[string]interface{} `json:"options"`
 	// Deprecated: set the model name with Model instead
 	Name string `json:"name"`
@@ -544,22 +338,16 @@ type ShowRequest struct {
 // ShowResponse is the response returned from [Client.Show].
 type ShowResponse struct {
-	License       string             `json:"license,omitempty"`
+	License       string         `json:"license,omitempty"`
-	Modelfile     string             `json:"modelfile,omitempty"`
+	Modelfile     string         `json:"modelfile,omitempty"`
-	Parameters    string             `json:"parameters,omitempty"`
+	Parameters    string         `json:"parameters,omitempty"`
-	Template      string             `json:"template,omitempty"`
+	Template      string         `json:"template,omitempty"`
-	System        string             `json:"system,omitempty"`
+	System        string         `json:"system,omitempty"`
-	Renderer      string             `json:"renderer,omitempty"`
+	Details       ModelDetails   `json:"details,omitempty"`
-	Parser        string             `json:"parser,omitempty"`
+	Messages      []Message      `json:"messages,omitempty"`
-	Details       ModelDetails       `json:"details,omitempty"`
+	ModelInfo     map[string]any `json:"model_info,omitempty"`
-	Messages      []Message          `json:"messages,omitempty"`
+	ProjectorInfo map[string]any `json:"projector_info,omitempty"`
-	RemoteModel   string             `json:"remote_model,omitempty"`
+	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
 	RemoteHost    string             `json:"remote_host,omitempty"`
 	ModelInfo     map[string]any     `json:"model_info,omitempty"`
 	ProjectorInfo map[string]any     `json:"projector_info,omitempty"`
 	Tensors       []Tensor           `json:"tensors,omitempty"`
 	Capabilities  []model.Capability `json:"capabilities,omitempty"`
 	ModifiedAt    time.Time          `json:"modified_at,omitempty"`
 }
 // CopyRequest is the request passed to [Client.Copy].
@@ -571,9 +359,9 @@ type CopyRequest struct {
 // PullRequest is the request passed to [Client.Pull].
 type PullRequest struct {
 	Model    string `json:"model"`
-	Insecure bool   `json:"insecure,omitempty"` // Deprecated: ignored
+	Insecure bool   `json:"insecure,omitempty"`
-	Username string `json:"username"`           // Deprecated: ignored
+	Username string `json:"username"`
-	Password string `json:"password"`           // Deprecated: ignored
+	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`
 	// Deprecated: set the model name with Model instead
@@ -613,26 +401,30 @@ type ProcessResponse struct {
 // ListModelResponse is a single model description in [ListResponse].
 type ListModelResponse struct {
-	Name        string       `json:"name"`
+	Name       string       `json:"name"`
-	Model       string       `json:"model"`
+	Model      string       `json:"model"`
-	RemoteModel string       `json:"remote_model,omitempty"`
+	ModifiedAt time.Time    `json:"modified_at"`
-	RemoteHost  string       `json:"remote_host,omitempty"`
+	Size       int64        `json:"size"`
-	ModifiedAt  time.Time    `json:"modified_at"`
+	Digest     string       `json:"digest"`
-	Size        int64        `json:"size"`
+	Details    ModelDetails `json:"details,omitempty"`
 	Digest      string       `json:"digest"`
 	Details     ModelDetails `json:"details,omitempty"`
 }
 // ProcessModelResponse is a single model description in [ProcessResponse].
 type ProcessModelResponse struct {
-	Name          string       `json:"name"`
+	Name      string       `json:"name"`
-	Model         string       `json:"model"`
+	Model     string       `json:"model"`
-	Size          int64        `json:"size"`
+	Size      int64        `json:"size"`
-	Digest        string       `json:"digest"`
+	Digest    string       `json:"digest"`
-	Details       ModelDetails `json:"details,omitempty"`
+	Details   ModelDetails `json:"details,omitempty"`
-	ExpiresAt     time.Time    `json:"expires_at"`
+	ExpiresAt time.Time    `json:"expires_at"`
-	SizeVRAM      int64        `json:"size_vram"`
+	SizeVRAM  int64        `json:"size_vram"`
-	ContextLength int          `json:"context_length"`
+}
 // RetrieveModelResponse describes a single model by identifier, object kind,
 // creation time and owner. None of the fields is tagged omitempty, so all four
 // keys are always present in the encoded JSON.
 // NOTE(review): the field set looks like an OpenAI-style model object —
 // confirm against the handler that produces it.
 type RetrieveModelResponse struct {
 	Id      string `json:"id"`
 	Object  string `json:"object"`
 	Created int64  `json:"created"` // NOTE(review): presumably a Unix timestamp; confirm the unit with the producer
 	OwnedBy string `json:"owned_by"`
 }
 type TokenResponse struct {
@@ -644,22 +436,12 @@ type GenerateResponse struct {
 	// Model is the model name that generated the response.
 	Model string `json:"model"`
 	// RemoteModel is the name of the upstream model that generated the response.
 	RemoteModel string `json:"remote_model,omitempty"`
 	// RemoteHost is the URL of the upstream Ollama host that generated the response.
 	RemoteHost string `json:"remote_host,omitempty"`
 	// CreatedAt is the timestamp of the response.
 	CreatedAt time.Time `json:"created_at"`
 	// Response is the textual response itself.
 	Response string `json:"response"`
 	// Thinking contains the text that was inside thinking tags in the
 	// original model output when ChatRequest.Think is enabled.
 	Thinking string `json:"thinking,omitempty"`
 	// Done specifies if the response is complete.
 	Done bool `json:"done"`
@@ -671,10 +453,6 @@ type GenerateResponse struct {
 	Context []int `json:"context,omitempty"`
 	Metrics
 	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
 	DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
 }
 // ModelDetails provides details about a model.
@@ -687,25 +465,6 @@ type ModelDetails struct {
 	QuantizationLevel string   `json:"quantization_level"`
 }
 // UserResponse provides information about a user.
 //
 // ID, Email and Name are always present in the encoded JSON. The remaining
 // fields are tagged omitempty and are dropped when they hold the empty string.
 type UserResponse struct {
 	ID        uuid.UUID `json:"id"`
 	Email     string    `json:"email"`
 	Name      string    `json:"name"`
 	Bio       string    `json:"bio,omitempty"`
 	AvatarURL string    `json:"avatarurl,omitempty"` // note: JSON key is "avatarurl", without an underscore
 	FirstName string    `json:"firstname,omitempty"` // note: JSON key is "firstname", without an underscore
 	LastName  string    `json:"lastname,omitempty"`  // note: JSON key is "lastname", without an underscore
 	Plan      string    `json:"plan,omitempty"`
 }
 // Tensor describes the metadata for a given tensor.
 //
 // All three fields are always serialized (none is tagged omitempty).
 type Tensor struct {
 	// Name is the tensor's name.
 	Name string `json:"name"`
 	// Type is a string label for the tensor's element type.
 	Type string `json:"type"`
 	// Shape lists the tensor's dimensions.
 	// NOTE(review): dimension ordering is not visible here — confirm with the producer.
 	Shape []uint64 `json:"shape"`
 }
 func (m *Metrics) Summary() {
 	if m.TotalDuration > 0 {
 		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
@@ -734,7 +493,7 @@ func (m *Metrics) Summary() {
 	}
 }
-func (opts *Options) FromMap(m map[string]any) error {
+func (opts *Options) FromMap(m map[string]interface{}) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
 	typeOpts := reflect.TypeOf(opts).Elem()   // types of the fields in the options struct
@@ -791,12 +550,12 @@ func (opts *Options) FromMap(m map[string]any) error {
 				}
 				field.SetString(val)
 			case reflect.Slice:
-				// JSON unmarshals to []any, not []string
+				// JSON unmarshals to []interface{}, not []string
-				val, ok := val.([]any)
+				val, ok := val.([]interface{})
 				if !ok {
 					return fmt.Errorf("option %q must be of type array", key)
 				}
-				// convert []any to []string
+				// convert []interface{} to []string
 				slice := make([]string, len(val))
 				for i, item := range val {
 					str, ok := item.(string)
@@ -843,126 +602,24 @@ func DefaultOptions() Options {
 		RepeatPenalty:    1.1,
 		PresencePenalty:  0.0,
 		FrequencyPenalty: 0.0,
 		Mirostat:         0,
 		MirostatTau:      5.0,
 		MirostatEta:      0.1,
 		Seed:             -1,
 		Runner: Runner{
 			// options set when the model is loaded
-			NumCtx:    int(envconfig.ContextLength()),
+			NumCtx:    2048,
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
 }
 // ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low")
 type ThinkValue struct {
 	// Value can be a bool or string
 	Value interface{}
 }
 // IsValid checks if the ThinkValue is valid
 func (t *ThinkValue) IsValid() bool {
 	if t == nil || t.Value == nil {
 		return true // nil is valid (means not set)
 	}
 	switch v := t.Value.(type) {
 	case bool:
 		return true
 	case string:
 		return v == "high" || v == "medium" || v == "low"
 	default:
 		return false
 	}
 }
 // IsBool returns true if the value is a boolean
 func (t *ThinkValue) IsBool() bool {
 	if t == nil || t.Value == nil {
 		return false
 	}
 	_, ok := t.Value.(bool)
 	return ok
 }
 // IsString returns true if the value is a string
 func (t *ThinkValue) IsString() bool {
 	if t == nil || t.Value == nil {
 		return false
 	}
 	_, ok := t.Value.(string)
 	return ok
 }
 // Bool returns the value as a bool (true if enabled in any way)
 func (t *ThinkValue) Bool() bool {
 	if t == nil || t.Value == nil {
 		return false
 	}
 	switch v := t.Value.(type) {
 	case bool:
 		return v
 	case string:
 		// Any string value ("high", "medium", "low") means thinking is enabled
 		return v == "high" || v == "medium" || v == "low"
 	default:
 		return false
 	}
 }
 // String returns the value as a string
 func (t *ThinkValue) String() string {
 	if t == nil || t.Value == nil {
 		return ""
 	}
 	switch v := t.Value.(type) {
 	case string:
 		return v
 	case bool:
 		if v {
 			return "medium" // Default level when just true
 		}
 		return ""
 	default:
 		return ""
 	}
 }
 // UnmarshalJSON implements json.Unmarshaler
 func (t *ThinkValue) UnmarshalJSON(data []byte) error {
 	// Try to unmarshal as bool first
 	var b bool
 	if err := json.Unmarshal(data, &b); err == nil {
 		t.Value = b
 		return nil
 	}
 	// Try to unmarshal as string
 	var s string
 	if err := json.Unmarshal(data, &s); err == nil {
 		// Validate string values
 		if s != "high" && s != "medium" && s != "low" {
 			return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", true, or false)", s)
 		}
 		t.Value = s
 		return nil
 	}
 	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\", true, or false)")
 }
 // MarshalJSON implements json.Marshaler
 func (t *ThinkValue) MarshalJSON() ([]byte, error) {
 	if t == nil || t.Value == nil {
 		return []byte("null"), nil
 	}
 	return json.Marshal(t.Value)
 }
 // Duration wraps time.Duration by embedding it, so the type can carry its
 // own UnmarshalJSON method (defined on *Duration in this file) while still
 // exposing the embedded time.Duration value and methods.
 type Duration struct {
 	time.Duration
 }
@@ -987,7 +644,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 		if t < 0 {
 			d.Duration = time.Duration(math.MaxInt64)
 		} else {
-			d.Duration = time.Duration(t * float64(time.Second))
+			d.Duration = time.Duration(int(t) * int(time.Second))
 		}
 	case string:
 		d.Duration, err = time.ParseDuration(t)
@@ -1005,7 +662,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 }
 // FormatParams converts specified parameter options to their correct types
-func FormatParams(params map[string][]string) (map[string]any, error) {
+func FormatParams(params map[string][]string) (map[string]interface{}, error) {
 	opts := Options{}
 	valueOpts := reflect.ValueOf(&opts).Elem() // names of the fields in the options struct
 	typeOpts := reflect.TypeOf(opts)           // types of the fields in the options struct
@@ -1019,7 +676,7 @@ func FormatParams(params map[string][]string) (map[string]any, error) {
 		}
 	}
-	out := make(map[string]any)
+	out := make(map[string]interface{})
 	// iterate params and set values based on json struct tags
 	for key, vals := range params {
 		if opt, ok := jsonOpts[key]; !ok {
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -17,11 +17,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 		req  string
 		exp  *Duration
 	}{
 		{
 			name: "Unset",
 			req:  `{ }`,
 			exp:  nil,
 		},
 		{
 			name: "Positive Integer",
 			req:  `{ "keep_alive": 42 }`,
@@ -30,7 +25,7 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 		{
 			name: "Positive Float",
 			req:  `{ "keep_alive": 42.5 }`,
-			exp:  &Duration{42500 * time.Millisecond},
+			exp:  &Duration{42 * time.Second},
 		},
 		{
 			name: "Positive Integer String",
@@ -139,7 +134,7 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			var oMap map[string]any
+			var oMap map[string]interface{}
 			err := json.Unmarshal([]byte(test.req), &oMap)
 			require.NoError(t, err)
 			opts := DefaultOptions()
@@ -236,279 +231,3 @@ func TestMessage_UnmarshalJSON(t *testing.T) {
 		}
 	}
 }
 // TestToolFunction_UnmarshalJSON decodes tool definitions whose property
 // carries an "enum" list and compares the decode outcome with each case:
 // an empty wantErr means decoding must succeed, otherwise the error text
 // must contain wantErr.
 func TestToolFunction_UnmarshalJSON(t *testing.T) {
 	cases := []struct {
 		name    string
 		input   string
 		wantErr string
 	}{
 		{
 			name: "valid enum with same types",
 			input: `{
 				"name": "test",
 				"description": "test function",
 				"parameters": {
 					"type": "object",
 					"required": ["test"],
 					"properties": {
 						"test": {
 							"type": "string",
 							"description": "test prop",
 							"enum": ["a", "b", "c"]
 						}
 					}
 				}
 			}`,
 			wantErr: "",
 		},
 		{
 			name: "empty enum array",
 			input: `{
 				"name": "test",
 				"description": "test function",
 				"parameters": {
 					"type": "object",
 					"required": ["test"],
 					"properties": {
 						"test": {
 							"type": "string",
 							"description": "test prop",
 							"enum": []
 						}
 					}
 				}
 			}`,
 			wantErr: "",
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			var decoded ToolFunction
 			err := json.Unmarshal([]byte(tc.input), &decoded)
 			// Success path first; the failure expectations follow.
 			if tc.wantErr == "" {
 				require.NoError(t, err)
 				return
 			}
 			require.Error(t, err)
 			assert.Contains(t, err.Error(), tc.wantErr)
 		})
 	}
 }
 // TestToolCallFunction_IndexAlwaysMarshals verifies that the "index" key is
 // present in the encoded JSON both for the zero value and for a non-zero
 // Index.
 func TestToolCallFunction_IndexAlwaysMarshals(t *testing.T) {
 	call := ToolCallFunction{
 		Name:      "echo",
 		Arguments: ToolCallFunctionArguments{"message": "hi"},
 	}
 	// expectIndex encodes the current state of call, decodes it into a
 	// generic map and checks the "index" entry (JSON numbers decode as float64).
 	expectIndex := func(want float64) {
 		encoded, err := json.Marshal(call)
 		require.NoError(t, err)
 		fields := map[string]any{}
 		require.NoError(t, json.Unmarshal(encoded, &fields))
 		require.Contains(t, fields, "index")
 		assert.Equal(t, want, fields["index"])
 	}
 	expectIndex(float64(0))
 	call.Index = 3
 	expectIndex(float64(3))
 }
 // TestPropertyType_UnmarshalJSON checks that a PropertyType decodes from
 // either a single JSON string or a JSON array of strings, and that the
 // decoded elements match the expected ones position by position.
 func TestPropertyType_UnmarshalJSON(t *testing.T) {
 	tests := []struct {
 		name     string
 		input    string
 		expected PropertyType
 	}{
 		{
 			name:     "string type",
 			input:    `"string"`,
 			expected: PropertyType{"string"},
 		},
 		{
 			name:     "array of types",
 			input:    `["string", "number"]`,
 			expected: PropertyType{"string", "number"},
 		},
 		{
 			name:     "array with single type",
 			input:    `["string"]`,
 			expected: PropertyType{"string"},
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			var pt PropertyType
 			if err := json.Unmarshal([]byte(test.input), &pt); err != nil {
 				// Nothing meaningful can be compared after a failed decode.
 				t.Fatalf("Unexpected error: %v", err)
 			}
 			// Stop on a length mismatch: the loop below indexes test.expected
 			// by position and would panic if pt were the longer slice.
 			if len(pt) != len(test.expected) {
 				t.Fatalf("Length mismatch: got %v, expected %v", len(pt), len(test.expected))
 			}
 			for i, v := range pt {
 				if v != test.expected[i] {
 					t.Errorf("Value mismatch at index %d: got %v, expected %v", i, v, test.expected[i])
 				}
 			}
 		})
 	}
 }
 // TestPropertyType_MarshalJSON checks the JSON form of a PropertyType: a
 // single element encodes as a bare string, several elements as an array,
 // and an empty value as an empty array.
 func TestPropertyType_MarshalJSON(t *testing.T) {
 	cases := []struct {
 		name     string
 		input    PropertyType
 		expected string
 	}{
 		{name: "single type", input: PropertyType{"string"}, expected: `"string"`},
 		{name: "multiple types", input: PropertyType{"string", "number"}, expected: `["string","number"]`},
 		{name: "empty type", input: PropertyType{}, expected: `[]`},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			encoded, err := json.Marshal(tc.input)
 			if err != nil {
 				t.Errorf("Unexpected error: %v", err)
 			}
 			if got := string(encoded); got != tc.expected {
 				t.Errorf("Marshaled data mismatch: got %v, expected %v", got, tc.expected)
 			}
 		})
 	}
 }
 // TestThinking_UnmarshalJSON decodes GenerateRequest bodies and inspects the
 // resulting Think field: booleans and the three level strings are accepted,
 // a missing key leaves Think nil, and any other string is a decode error.
 func TestThinking_UnmarshalJSON(t *testing.T) {
 	cases := []struct {
 		name             string
 		input            string
 		expectedThinking *ThinkValue
 		expectedError    bool
 	}{
 		{name: "true", input: `{ "think": true }`, expectedThinking: &ThinkValue{Value: true}},
 		{name: "false", input: `{ "think": false }`, expectedThinking: &ThinkValue{Value: false}},
 		{name: "unset", input: `{ }`, expectedThinking: nil},
 		{name: "string_high", input: `{ "think": "high" }`, expectedThinking: &ThinkValue{Value: "high"}},
 		{name: "string_medium", input: `{ "think": "medium" }`, expectedThinking: &ThinkValue{Value: "medium"}},
 		{name: "string_low", input: `{ "think": "low" }`, expectedThinking: &ThinkValue{Value: "low"}},
 		{name: "invalid_string", input: `{ "think": "invalid" }`, expectedThinking: nil, expectedError: true},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			var req GenerateRequest
 			err := json.Unmarshal([]byte(tc.input), &req)
 			// Error cases only assert that decoding failed.
 			if tc.expectedError {
 				require.Error(t, err)
 				return
 			}
 			require.NoError(t, err)
 			if tc.expectedThinking == nil {
 				assert.Nil(t, req.Think)
 				return
 			}
 			require.NotNil(t, req.Think)
 			assert.Equal(t, tc.expectedThinking.Value, req.Think.Value)
 		})
 	}
 }
 // TestToolFunctionParameters_String checks the string form of
 // ToolFunctionParameters: a plain object is expected to render as compact
 // JSON, and a value that cannot be marshaled is expected to render as "".
 func TestToolFunctionParameters_String(t *testing.T) {
 	// A self-referencing value: json.Marshal fails on the cycle.
 	type selfRef struct {
 		Self *selfRef
 	}
 	cyclic := &selfRef{}
 	cyclic.Self = cyclic
 	cases := []struct {
 		name     string
 		params   ToolFunctionParameters
 		expected string
 	}{
 		{
 			name: "simple object with string property",
 			params: ToolFunctionParameters{
 				Type:     "object",
 				Required: []string{"name"},
 				Properties: map[string]ToolProperty{
 					"name": {Type: PropertyType{"string"}, Description: "The name of the person"},
 				},
 			},
 			expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
 		},
 		{
 			name: "marshal failure returns empty string",
 			params: ToolFunctionParameters{
 				Type:       "object",
 				Defs:       cyclic,
 				Properties: map[string]ToolProperty{},
 			},
 			expected: "",
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			assert.Equal(t, tc.expected, tc.params.String())
 		})
 	}
 }
--- a/api/types_typescript_test.go
+++ b/api/types_typescript_test.go
@@ -1,142 +0,0 @@
 package api
 import (
 	"testing"
 )
 // TestToolParameterToTypeScriptType maps ToolProperty values to the
 // TypeScript type expression returned by ToTypeScriptType. It covers single
 // types, multi-type unions, anyOf branches (flat and nested), and the "any"
 // fallback for empty or unrecognized types.
 func TestToolParameterToTypeScriptType(t *testing.T) {
 	cases := []struct {
 		name string
 		prop ToolProperty
 		want string
 	}{
 		// single types
 		{name: "single string type", prop: ToolProperty{Type: PropertyType{"string"}}, want: "string"},
 		{name: "single number type", prop: ToolProperty{Type: PropertyType{"number"}}, want: "number"},
 		{name: "integer maps to number", prop: ToolProperty{Type: PropertyType{"integer"}}, want: "number"},
 		{name: "boolean type", prop: ToolProperty{Type: PropertyType{"boolean"}}, want: "boolean"},
 		{name: "array type", prop: ToolProperty{Type: PropertyType{"array"}}, want: "any[]"},
 		{name: "object type", prop: ToolProperty{Type: PropertyType{"object"}}, want: "Record<string, any>"},
 		{name: "null type", prop: ToolProperty{Type: PropertyType{"null"}}, want: "null"},
 		// unions from a multi-element Type
 		{name: "multiple types as union", prop: ToolProperty{Type: PropertyType{"string", "number"}}, want: "string | number"},
 		{name: "string or null union", prop: ToolProperty{Type: PropertyType{"string", "null"}}, want: "string | null"},
 		// unions from anyOf
 		{
 			name: "anyOf with single types",
 			prop: ToolProperty{AnyOf: []ToolProperty{
 				{Type: PropertyType{"string"}},
 				{Type: PropertyType{"number"}},
 			}},
 			want: "string | number",
 		},
 		{
 			name: "anyOf with multiple types in each branch",
 			prop: ToolProperty{AnyOf: []ToolProperty{
 				{Type: PropertyType{"string", "null"}},
 				{Type: PropertyType{"number"}},
 			}},
 			want: "string | null | number",
 		},
 		{
 			name: "nested anyOf",
 			prop: ToolProperty{AnyOf: []ToolProperty{
 				{Type: PropertyType{"boolean"}},
 				{AnyOf: []ToolProperty{
 					{Type: PropertyType{"string"}},
 					{Type: PropertyType{"number"}},
 				}},
 			}},
 			want: "boolean | string | number",
 		},
 		// fallbacks and mixed unions
 		{name: "empty type returns any", prop: ToolProperty{Type: PropertyType{}}, want: "any"},
 		{name: "unknown type maps to any", prop: ToolProperty{Type: PropertyType{"unknown_type"}}, want: "any"},
 		{name: "multiple types including array", prop: ToolProperty{Type: PropertyType{"string", "array", "null"}}, want: "string | any[] | null"},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			if got := tc.prop.ToTypeScriptType(); got != tc.want {
 				t.Errorf("ToTypeScriptType() = %q, want %q", got, tc.want)
 			}
 		})
 	}
 }
--- a/app/README.md
+++ b/app/README.md
@@ -17,6 +17,6 @@ If you want to build the installer, youll need to install
 In the top directory of this repo, run the following powershell script
 to build the ollama CLI, ollama app, and ollama installer.
-```powershell
+```
 powershell -ExecutionPolicy Bypass -File .\scripts\build_windows.ps1
 ```
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -4,14 +4,20 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/logutil"
 )
 func InitLogging() {
 	level := slog.LevelInfo
 	if envconfig.Debug() {
 		level = slog.LevelDebug
 	}
 	var logFile *os.File
 	var err error
 	// Detect if we're a GUI app on windows, and if not, send logs to console
@@ -27,8 +33,20 @@ func InitLogging() {
 			return
 		}
 	}
 	handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
 		Level:     level,
 		AddSource: true,
 		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
 			if attr.Key == slog.SourceKey {
 				source := attr.Value.Any().(*slog.Source)
 				source.File = filepath.Base(source.File)
 			}
 			return attr
 		},
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 	slog.Info("ollama app started")
 }
--- a/auth/auth.go
+++ b/auth/auth.go
@@ -18,13 +18,21 @@ import (
 const defaultPrivateKey = "id_ed25519"
-func GetPublicKey() (string, error) {
+func keyPath() (string, error) {
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
 	}
-	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
+	return filepath.Join(home, ".ollama", defaultPrivateKey), nil
 }
 func GetPublicKey() (string, error) {
 	keyPath, err := keyPath()
 	if err != nil {
 		return "", err
 	}
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
@@ -51,12 +59,11 @@ func NewNonce(r io.Reader, length int) (string, error) {
 }
 func Sign(ctx context.Context, bts []byte) (string, error) {
-	home, err := os.UserHomeDir()
+	keyPath, err := keyPath()
 	if err != nil {
 		return "", err
 	}
 	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
--- a/cache/cache.go
+++ b/cache/cache.go
@@ -0,0 +1,63 @@
 package cache
 import (
 	"github.com/ollama/ollama/ml"
 )
 // Options carries the per-call settings passed to Cache.Put.
 type Options struct {
 	// Position is used by Simple.Put to compute the write offset into the
 	// cache buffers and the number of entries exposed in the returned views.
 	// NOTE(review): presumably the sequence position of the first new token —
 	// confirm with the callers.
 	Position int
 }
 // Cache stores key/value tensors and hands back tensors to use in their place.
 type Cache interface {
 	// Sub returns the cache associated with index i.
 	// NOTE(review): i presumably identifies a model layer — confirm with callers.
 	Sub(i int) Cache
 	// Put records key and value according to opts and returns a key tensor
 	// and a value tensor, in that order.
 	Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
 }
 // Simple is a Cache implementation backed by two parallel slices of tensors.
 type Simple struct {
 	// DType is the data type passed to ctx.Zeros when Put allocates a buffer.
 	DType ml.DType
 	// Capacity scales the size of each allocated buffer and caps the number
 	// of entries exposed by the views Put returns.
 	Capacity int
 	// keys and values hold one slot per index handed to Sub; a slot stays nil
 	// until Put allocates it.
 	keys, values []ml.Tensor
 }
 // Sub returns a Simple cache restricted to slot i of the receiver. The
 // receiver's slot slices are grown with nil entries first when i lies beyond
 // their current length. The result shares DType and Capacity with the
 // receiver and holds one-element sub-slices of keys and values.
 func (c *Simple) Sub(i int) Cache {
 	if missing := i + 1 - len(c.keys); missing > 0 {
 		c.keys = append(c.keys, make([]ml.Tensor, missing)...)
 		c.values = append(c.values, make([]ml.Tensor, i+1-len(c.values))...)
 	}
 	sub := &Simple{
 		DType:    c.DType,
 		Capacity: c.Capacity,
 	}
 	sub.keys = c.keys[i : i+1]
 	sub.values = c.values[i : i+1]
 	return sub
 }
 // Put copies key and value into the cache buffers at an offset derived from
 // opts.Position and returns views over the buffers to use in their place.
 //
 // Only slot 0 of keys/values is accessed, so the receiver must hold at least
 // one slot — as the caches returned by Sub do; otherwise indexing panics.
 // NOTE(review): the meaning of the View arguments (offset, then dim/stride
 // pairs) is defined by the ml package and is not visible here — confirm the
 // units (elements vs. bytes) against ml.Tensor.View.
 func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
 	// Lazily allocate zeroed buffers sized Dim(0)*Dim(1)*Capacity on first use.
 	if c.keys[0] == nil || c.values[0] == nil {
 		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
 		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
 	}
 	// Schedule copies of the incoming tensors into a window of each buffer
 	// that starts at Stride(2)*Position and spans Dim(0)*Dim(1)*Dim(2).
 	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
 	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
 	// n is the extent of the last view dimension: entries up to and including
 	// the ones just written, capped at Capacity.
 	n := min(c.Capacity, int(key.Dim(2))+opts.Position)
 	// Rebind key/value to views over the buffers starting at offset 0; the
 	// Dim/Stride calls on the right-hand side still read the incoming tensors.
 	key = c.keys[0].View(ctx, 0,
 		int(key.Dim(0)), int(key.Stride(1)),
 		int(key.Dim(1)), int(key.Stride(2)),
 		n,
 	)
 	value = c.values[0].View(ctx, 0,
 		int(value.Dim(0)), int(value.Stride(1)),
 		int(value.Dim(1)), int(value.Stride(2)),
 		n,
 	)
 	// TODO shift context if necessary
 	return key, value
 }
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -2,22 +2,19 @@ package cmd
 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"reflect"
 	"strings"
 	"testing"
 	"time"
 	"github.com/google/go-cmp/cmp"
 	"github.com/spf13/cobra"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/types/model"
 )
 func TestShowInfo(t *testing.T) {
@@ -29,7 +26,7 @@ func TestShowInfo(t *testing.T) {
 				ParameterSize:     "7B",
 				QuantizationLevel: "FP16",
 			},
-		}, false, &b); err != nil {
+		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -59,7 +56,7 @@ func TestShowInfo(t *testing.T) {
 				ParameterSize:     "7B",
 				QuantizationLevel: "FP16",
 			},
-		}, false, &b); err != nil {
+		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -70,60 +67,6 @@ func TestShowInfo(t *testing.T) {
    embedding length    0       
    quantization        FP16    
 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
 			t.Errorf("unexpected output (-want +got):\n%s", diff)
 		}
 	})
 	t.Run("verbose model", func(t *testing.T) {
 		var b bytes.Buffer
 		if err := showInfo(&api.ShowResponse{
 			Details: api.ModelDetails{
 				Family:            "test",
 				ParameterSize:     "8B",
 				QuantizationLevel: "FP16",
 			},
 			Parameters: `
 			stop up`,
 			ModelInfo: map[string]any{
 				"general.architecture":    "test",
 				"general.parameter_count": float64(8_000_000_000),
 				"some.true_bool":          true,
 				"some.false_bool":         false,
 				"test.context_length":     float64(1000),
 				"test.embedding_length":   float64(11434),
 			},
 			Tensors: []api.Tensor{
 				{Name: "blk.0.attn_k.weight", Type: "BF16", Shape: []uint64{42, 3117}},
 				{Name: "blk.0.attn_q.weight", Type: "FP16", Shape: []uint64{3117, 42}},
 			},
 		}, true, &b); err != nil {
 			t.Fatal(err)
 		}
 		expect := `  Model
    architecture        test     
    parameters          8B       
    context length      1000     
    embedding length    11434    
    quantization        FP16     
  Parameters
    stop    up    
  Metadata
    general.architecture       test     
    general.parameter_count    8e+09    
    some.false_bool            false    
    some.true_bool             true     
    test.context_length        1000     
    test.embedding_length      11434    
  Tensors
    blk.0.attn_k.weight    BF16    [42 3117]    
    blk.0.attn_q.weight    FP16    [3117 42]    
 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
 			t.Errorf("unexpected output (-want +got):\n%s", diff)
@@ -145,7 +88,7 @@ func TestShowInfo(t *testing.T) {
 			stop you
 			stop up
 			temperature 99`,
-		}, false, &b); err != nil {
+		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -182,7 +125,7 @@ func TestShowInfo(t *testing.T) {
 				"clip.vision.embedding_length": float64(0),
 				"clip.vision.projection_dim":   float64(0),
 			},
-		}, false, &b); err != nil {
+		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -215,7 +158,7 @@ func TestShowInfo(t *testing.T) {
 Ahoy, matey!
 Weigh anchor!
 			`,
-		}, false, &b); err != nil {
+		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -227,7 +170,6 @@ Weigh anchor!
  System
    You are a pirate!    
    Ahoy, matey!         
    ...                  
 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
@@ -245,7 +187,7 @@ Weigh anchor!
 				QuantizationLevel: "FP16",
 			},
 			License: license,
-		}, false, &b); err != nil {
+		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -263,34 +205,6 @@ Weigh anchor!
 			t.Errorf("unexpected output (-want +got):\n%s", diff)
 		}
 	})
 	t.Run("capabilities", func(t *testing.T) {
 		var b bytes.Buffer
 		if err := showInfo(&api.ShowResponse{
 			Details: api.ModelDetails{
 				Family:            "test",
 				ParameterSize:     "7B",
 				QuantizationLevel: "FP16",
 			},
 			Capabilities: []model.Capability{model.CapabilityVision, model.CapabilityTools},
 		}, false, &b); err != nil {
 			t.Fatal(err)
 		}
 		expect := "  Model\n" +
 			"    architecture    test    \n" +
 			"    parameters      7B      \n" +
 			"    quantization    FP16    \n" +
 			"\n" +
 			"  Capabilities\n" +
 			"    vision    \n" +
 			"    tools     \n" +
 			"\n"
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
 			t.Errorf("unexpected output (-want +got):\n%s", diff)
 		}
 	})
 }
 func TestDeleteHandler(t *testing.T) {
@@ -306,8 +220,6 @@ func TestDeleteHandler(t *testing.T) {
 				w.WriteHeader(http.StatusOK)
 			} else {
 				w.WriteHeader(http.StatusNotFound)
 				errPayload := `{"error":"model '%s' not found"}`
 				w.Write([]byte(fmt.Sprintf(errPayload, req.Name)))
 			}
 			return
 		}
@@ -341,7 +253,7 @@ func TestDeleteHandler(t *testing.T) {
 	t.Cleanup(mockServer.Close)
 	cmd := &cobra.Command{}
-	cmd.SetContext(t.Context())
+	cmd.SetContext(context.TODO())
 	if err := DeleteHandler(cmd, []string{"test-model"}); err != nil {
 		t.Fatalf("DeleteHandler failed: %v", err)
 	}
@@ -350,7 +262,7 @@ func TestDeleteHandler(t *testing.T) {
 	}
 	err := DeleteHandler(cmd, []string{"test-model-not-found"})
-	if err == nil || !strings.Contains(err.Error(), "model 'test-model-not-found' not found") {
+	if err == nil || !strings.Contains(err.Error(), "unable to stop existing running model \"test-model-not-found\"") {
 		t.Fatalf("DeleteHandler failed: expected error about stopping non-existent model, got %v", err)
 	}
 }
@@ -403,6 +315,11 @@ func TestGetModelfileName(t *testing.T) {
 			var expectedFilename string
 			if tt.fileExists {
 				tempDir, err := os.MkdirTemp("", "modelfiledir")
 				defer os.RemoveAll(tempDir)
 				if err != nil {
 					t.Fatalf("temp modelfile dir creation failed: %v", err)
 				}
 				var fn string
 				if tt.modelfileName != "" {
 					fn = tt.modelfileName
@@ -410,11 +327,10 @@ func TestGetModelfileName(t *testing.T) {
 					fn = "Modelfile"
 				}
-				tempFile, err := os.CreateTemp(t.TempDir(), fn)
+				tempFile, err := os.CreateTemp(tempDir, fn)
 				if err != nil {
 					t.Fatalf("temp modelfile creation failed: %v", err)
 				}
 				defer tempFile.Close()
 				expectedFilename = tempFile.Name()
 				err = cmd.Flags().Set("file", expectedFilename)
@@ -492,35 +408,9 @@ func TestPushHandler(t *testing.T) {
 						w.(http.Flusher).Flush()
 					}
 				},
 				"/api/me": func(w http.ResponseWriter, r *http.Request) {
 					if r.Method != http.MethodPost {
 						t.Errorf("expected POST request, got %s", r.Method)
 					}
 				},
 			},
 			expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n",
 		},
 		{
 			name:      "not signed in push",
 			modelName: "notsignedin-model",
 			serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
 				"/api/me": func(w http.ResponseWriter, r *http.Request) {
 					if r.Method != http.MethodPost {
 						t.Errorf("expected POST request, got %s", r.Method)
 					}
 					w.Header().Set("Content-Type", "application/json")
 					w.WriteHeader(http.StatusUnauthorized)
 					err := json.NewEncoder(w).Encode(map[string]string{
 						"error":      "unauthorized",
 						"signin_url": "https://somethingsomething",
 					})
 					if err != nil {
 						t.Fatal(err)
 					}
 				},
 			},
 			expectedOutput: "You need to be signed in to push",
 		},
 		{
 			name:      "unauthorized push",
 			modelName: "unauthorized-model",
@@ -529,17 +419,12 @@ func TestPushHandler(t *testing.T) {
 					w.Header().Set("Content-Type", "application/json")
 					w.WriteHeader(http.StatusUnauthorized)
 					err := json.NewEncoder(w).Encode(map[string]string{
-						"error": "403: {\"errors\":[{\"code\":\"ACCESS DENIED\", \"message\":\"access denied\"}]}",
+						"error": "access denied",
 					})
 					if err != nil {
 						t.Fatal(err)
 					}
 				},
 				"/api/me": func(w http.ResponseWriter, r *http.Request) {
 					if r.Method != http.MethodPost {
 						t.Errorf("expected POST request, got %s", r.Method)
 					}
 				},
 			},
 			expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own",
 		},
@@ -557,14 +442,10 @@ func TestPushHandler(t *testing.T) {
 			defer mockServer.Close()
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
 			tmpDir := t.TempDir()
 			t.Setenv("HOME", tmpDir)
 			t.Setenv("USERPROFILE", tmpDir)
 			initializeKeypair()
 			cmd := &cobra.Command{}
 			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())
 			// Redirect stderr to capture progress output
 			oldStderr := os.Stderr
@@ -596,7 +477,7 @@ func TestPushHandler(t *testing.T) {
 					t.Errorf("expected no error, got %v", err)
 				}
 				if tt.expectedOutput != "" {
-					if got := string(stdout); !strings.Contains(got, tt.expectedOutput) {
+					if got := string(stdout); got != tt.expectedOutput {
 						t.Errorf("expected output %q, got %q", tt.expectedOutput, got)
 					}
 				}
@@ -609,96 +490,6 @@ func TestPushHandler(t *testing.T) {
 	}
 }
 func TestListHandler(t *testing.T) {
 	tests := []struct {
 		name           string
 		args           []string
 		serverResponse []api.ListModelResponse
 		expectedError  string
 		expectedOutput string
 	}{
 		{
 			name: "list all models",
 			args: []string{},
 			serverResponse: []api.ListModelResponse{
 				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
 				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-48 * time.Hour)},
 			},
 			expectedOutput: "NAME      ID              SIZE      MODIFIED     \n" +
 				"model1    sha256:abc12    1.0 KB    24 hours ago    \n" +
 				"model2    sha256:def45    2.0 KB    2 days ago      \n",
 		},
 		{
 			name: "filter models by prefix",
 			args: []string{"model1"},
 			serverResponse: []api.ListModelResponse{
 				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
 				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-24 * time.Hour)},
 			},
 			expectedOutput: "NAME      ID              SIZE      MODIFIED     \n" +
 				"model1    sha256:abc12    1.0 KB    24 hours ago    \n",
 		},
 		{
 			name:          "server error",
 			args:          []string{},
 			expectedError: "server error",
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				if r.URL.Path != "/api/tags" || r.Method != http.MethodGet {
 					t.Errorf("unexpected request to %s %s", r.Method, r.URL.Path)
 					http.Error(w, "not found", http.StatusNotFound)
 					return
 				}
 				if tt.expectedError != "" {
 					http.Error(w, tt.expectedError, http.StatusInternalServerError)
 					return
 				}
 				response := api.ListResponse{Models: tt.serverResponse}
 				if err := json.NewEncoder(w).Encode(response); err != nil {
 					t.Fatal(err)
 				}
 			}))
 			defer mockServer.Close()
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
 			cmd := &cobra.Command{}
 			cmd.SetContext(t.Context())
 			// Capture stdout
 			oldStdout := os.Stdout
 			r, w, _ := os.Pipe()
 			os.Stdout = w
 			err := ListHandler(cmd, tt.args)
 			// Restore stdout and get output
 			w.Close()
 			os.Stdout = oldStdout
 			output, _ := io.ReadAll(r)
 			if tt.expectedError == "" {
 				if err != nil {
 					t.Errorf("expected no error, got %v", err)
 				}
 				if got := string(output); got != tt.expectedOutput {
 					t.Errorf("expected output:\n%s\ngot:\n%s", tt.expectedOutput, got)
 				}
 			} else {
 				if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
 					t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
 				}
 			}
 		})
 	}
 }
 func TestCreateHandler(t *testing.T) {
 	tests := []struct {
 		name           string
@@ -724,7 +515,7 @@ func TestCreateHandler(t *testing.T) {
 						return
 					}
-					if req.Model != "test-model" {
+					if req.Name != "test-model" {
 						t.Errorf("expected model name 'test-model', got %s", req.Name)
 					}
@@ -764,7 +555,7 @@ func TestCreateHandler(t *testing.T) {
 			}))
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
 			t.Cleanup(mockServer.Close)
-			tempFile, err := os.CreateTemp(t.TempDir(), "modelfile")
+			tempFile, err := os.CreateTemp("", "modelfile")
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -784,7 +575,7 @@ func TestCreateHandler(t *testing.T) {
 			}
 			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())
 			// Redirect stderr to capture progress output
 			oldStderr := os.Stderr
@@ -825,415 +616,3 @@ func TestCreateHandler(t *testing.T) {
 		})
 	}
 }
 func TestNewCreateRequest(t *testing.T) {
 	tests := []struct {
 		name     string
 		from     string
 		opts     runOptions
 		expected *api.CreateRequest
 	}{
 		{
 			"basic test",
 			"newmodel",
 			runOptions{
 				Model:       "mymodel",
 				ParentModel: "",
 				Prompt:      "You are a fun AI agent",
 				Messages:    []api.Message{},
 				WordWrap:    true,
 			},
 			&api.CreateRequest{
 				From:  "mymodel",
 				Model: "newmodel",
 			},
 		},
 		{
 			"parent model test",
 			"newmodel",
 			runOptions{
 				Model:       "mymodel",
 				ParentModel: "parentmodel",
 				Messages:    []api.Message{},
 				WordWrap:    true,
 			},
 			&api.CreateRequest{
 				From:  "parentmodel",
 				Model: "newmodel",
 			},
 		},
 		{
 			"parent model as filepath test",
 			"newmodel",
 			runOptions{
 				Model:       "mymodel",
 				ParentModel: "/some/file/like/etc/passwd",
 				Messages:    []api.Message{},
 				WordWrap:    true,
 			},
 			&api.CreateRequest{
 				From:  "mymodel",
 				Model: "newmodel",
 			},
 		},
 		{
 			"parent model as windows filepath test",
 			"newmodel",
 			runOptions{
 				Model:       "mymodel",
 				ParentModel: "D:\\some\\file\\like\\etc\\passwd",
 				Messages:    []api.Message{},
 				WordWrap:    true,
 			},
 			&api.CreateRequest{
 				From:  "mymodel",
 				Model: "newmodel",
 			},
 		},
 		{
 			"options test",
 			"newmodel",
 			runOptions{
 				Model:       "mymodel",
 				ParentModel: "parentmodel",
 				Options: map[string]any{
 					"temperature": 1.0,
 				},
 			},
 			&api.CreateRequest{
 				From:  "parentmodel",
 				Model: "newmodel",
 				Parameters: map[string]any{
 					"temperature": 1.0,
 				},
 			},
 		},
 		{
 			"messages test",
 			"newmodel",
 			runOptions{
 				Model:       "mymodel",
 				ParentModel: "parentmodel",
 				System:      "You are a fun AI agent",
 				Messages: []api.Message{
 					{
 						Role:    "user",
 						Content: "hello there!",
 					},
 					{
 						Role:    "assistant",
 						Content: "hello to you!",
 					},
 				},
 				WordWrap: true,
 			},
 			&api.CreateRequest{
 				From:   "parentmodel",
 				Model:  "newmodel",
 				System: "You are a fun AI agent",
 				Messages: []api.Message{
 					{
 						Role:    "user",
 						Content: "hello there!",
 					},
 					{
 						Role:    "assistant",
 						Content: "hello to you!",
 					},
 				},
 			},
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			actual := NewCreateRequest(tt.from, tt.opts)
 			if !cmp.Equal(actual, tt.expected) {
 				t.Errorf("expected output %#v, got %#v", tt.expected, actual)
 			}
 		})
 	}
 }
 func TestRunOptions_Copy(t *testing.T) {
 	// Setup test data
 	originalKeepAlive := &api.Duration{Duration: 5 * time.Minute}
 	originalThink := &api.ThinkValue{Value: "test reasoning"}
 	original := runOptions{
 		Model:       "test-model",
 		ParentModel: "parent-model",
 		Prompt:      "test prompt",
 		Messages: []api.Message{
 			{Role: "user", Content: "hello"},
 			{Role: "assistant", Content: "hi there"},
 		},
 		WordWrap: true,
 		Format:   "json",
 		System:   "system prompt",
 		Images: []api.ImageData{
 			[]byte("image1"),
 			[]byte("image2"),
 		},
 		Options: map[string]any{
 			"temperature": 0.7,
 			"max_tokens":  1000,
 			"top_p":       0.9,
 		},
 		MultiModal:   true,
 		KeepAlive:    originalKeepAlive,
 		Think:        originalThink,
 		HideThinking: false,
 		ShowConnect:  true,
 	}
 	// Test the copy
 	copied := original.Copy()
 	// Test 1: Verify the copy is not the same instance
 	if &copied == &original {
 		t.Error("Copy should return a different instance")
 	}
 	// Test 2: Verify all fields are copied correctly
 	tests := []struct {
 		name string
 		got  interface{}
 		want interface{}
 	}{
 		{"Model", copied.Model, original.Model},
 		{"ParentModel", copied.ParentModel, original.ParentModel},
 		{"Prompt", copied.Prompt, original.Prompt},
 		{"WordWrap", copied.WordWrap, original.WordWrap},
 		{"Format", copied.Format, original.Format},
 		{"System", copied.System, original.System},
 		{"MultiModal", copied.MultiModal, original.MultiModal},
 		{"HideThinking", copied.HideThinking, original.HideThinking},
 		{"ShowConnect", copied.ShowConnect, original.ShowConnect},
 	}
 	for _, tt := range tests {
 		if !reflect.DeepEqual(tt.got, tt.want) {
 			t.Errorf("%s mismatch: got %v, want %v", tt.name, tt.got, tt.want)
 		}
 	}
 	// Test 3: Verify Messages slice is deeply copied
 	if len(copied.Messages) != len(original.Messages) {
 		t.Errorf("Messages length mismatch: got %d, want %d", len(copied.Messages), len(original.Messages))
 	}
 	if len(copied.Messages) > 0 && &copied.Messages[0] == &original.Messages[0] {
 		t.Error("Messages should be different instances")
 	}
 	// Modify original to verify independence
 	if len(original.Messages) > 0 {
 		originalContent := original.Messages[0].Content
 		original.Messages[0].Content = "modified"
 		if len(copied.Messages) > 0 && copied.Messages[0].Content == "modified" {
 			t.Error("Messages should be independent after copy")
 		}
 		// Restore for other tests
 		original.Messages[0].Content = originalContent
 	}
 	// Test 4: Verify Images slice is deeply copied
 	if len(copied.Images) != len(original.Images) {
 		t.Errorf("Images length mismatch: got %d, want %d", len(copied.Images), len(original.Images))
 	}
 	if len(copied.Images) > 0 && &copied.Images[0] == &original.Images[0] {
 		t.Error("Images should be different instances")
 	}
 	// Modify original to verify independence
 	if len(original.Images) > 0 {
 		originalImage := original.Images[0]
 		original.Images[0] = []byte("modified")
 		if len(copied.Images) > 0 && string(copied.Images[0]) == "modified" {
 			t.Error("Images should be independent after copy")
 		}
 		// Restore for other tests
 		original.Images[0] = originalImage
 	}
 	// Test 5: Verify Options map is deeply copied
 	if len(copied.Options) != len(original.Options) {
 		t.Errorf("Options length mismatch: got %d, want %d", len(copied.Options), len(original.Options))
 	}
 	if len(copied.Options) > 0 && &copied.Options == &original.Options {
 		t.Error("Options map should be different instances")
 	}
 	// Modify original to verify independence
 	if len(original.Options) > 0 {
 		originalTemp := original.Options["temperature"]
 		original.Options["temperature"] = 0.9
 		if copied.Options["temperature"] == 0.9 {
 			t.Error("Options should be independent after copy")
 		}
 		// Restore for other tests
 		original.Options["temperature"] = originalTemp
 	}
 	// Test 6: Verify KeepAlive pointer is copied (shallow copy)
 	if copied.KeepAlive != original.KeepAlive {
 		t.Error("KeepAlive pointer should be the same (shallow copy)")
 	}
 	// Test 7: Verify Think pointer creates a new instance
 	if original.Think != nil && copied.Think == original.Think {
 		t.Error("Think should be a different instance")
 	}
 	if original.Think != nil && copied.Think != nil {
 		if !reflect.DeepEqual(copied.Think.Value, original.Think.Value) {
 			t.Errorf("Think.Value mismatch: got %v, want %v", copied.Think.Value, original.Think.Value)
 		}
 	}
 	// Test 8: Test with zero values
 	zeroOriginal := runOptions{}
 	zeroCopy := zeroOriginal.Copy()
 	if !reflect.DeepEqual(zeroCopy, zeroOriginal) {
 		fmt.Printf("orig: %#v\ncopy: %#v\n", zeroOriginal, zeroCopy)
 		t.Error("Copy of zero value should equal original zero value")
 	}
 }
 func TestRunOptions_Copy_EmptySlicesAndMaps(t *testing.T) {
 	// Test with empty slices and maps
 	original := runOptions{
 		Messages: []api.Message{},
 		Images:   []api.ImageData{},
 		Options:  map[string]any{},
 	}
 	copied := original.Copy()
 	if copied.Messages == nil {
 		t.Error("Empty Messages slice should remain empty, not nil")
 	}
 	if copied.Images == nil {
 		t.Error("Empty Images slice should remain empty, not nil")
 	}
 	if copied.Options == nil {
 		t.Error("Empty Options map should remain empty, not nil")
 	}
 	if len(copied.Messages) != 0 {
 		t.Error("Empty Messages slice should remain empty")
 	}
 	if len(copied.Images) != 0 {
 		t.Error("Empty Images slice should remain empty")
 	}
 	if len(copied.Options) != 0 {
 		t.Error("Empty Options map should remain empty")
 	}
 }
 func TestRunOptions_Copy_NilPointers(t *testing.T) {
 	// Test with nil pointers
 	original := runOptions{
 		KeepAlive: nil,
 		Think:     nil,
 	}
 	copied := original.Copy()
 	if copied.KeepAlive != nil {
 		t.Error("Nil KeepAlive should remain nil")
 	}
 	if copied.Think != nil {
 		t.Error("Nil Think should remain nil")
 	}
 }
 func TestRunOptions_Copy_ThinkValueVariants(t *testing.T) {
 	tests := []struct {
 		name  string
 		think *api.ThinkValue
 	}{
 		{"nil Think", nil},
 		{"bool true", &api.ThinkValue{Value: true}},
 		{"bool false", &api.ThinkValue{Value: false}},
 		{"string value", &api.ThinkValue{Value: "reasoning text"}},
 		{"int value", &api.ThinkValue{Value: 42}},
 		{"nil value", &api.ThinkValue{Value: nil}},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			original := runOptions{Think: tt.think}
 			copied := original.Copy()
 			if tt.think == nil {
 				if copied.Think != nil {
 					t.Error("Nil Think should remain nil")
 				}
 				return
 			}
 			if copied.Think == nil {
 				t.Error("Non-nil Think should not become nil")
 				return
 			}
 			if copied.Think == original.Think {
 				t.Error("Think should be a different instance")
 			}
 			if !reflect.DeepEqual(copied.Think.Value, original.Think.Value) {
 				t.Errorf("Think.Value mismatch: got %v, want %v", copied.Think.Value, original.Think.Value)
 			}
 		})
 	}
 }
 func TestRunOptions_Copy_Independence(t *testing.T) {
 	// Test that modifications to original don't affect copy
 	originalThink := &api.ThinkValue{Value: "original"}
 	original := runOptions{
 		Model:    "original-model",
 		Messages: []api.Message{{Role: "user", Content: "original"}},
 		Options:  map[string]any{"key": "value"},
 		Think:    originalThink,
 	}
 	copied := original.Copy()
 	// Modify original
 	original.Model = "modified-model"
 	if len(original.Messages) > 0 {
 		original.Messages[0].Content = "modified"
 	}
 	original.Options["key"] = "modified"
 	if original.Think != nil {
 		original.Think.Value = "modified"
 	}
 	// Verify copy is unchanged
 	if copied.Model == "modified-model" {
 		t.Error("Copy Model should not be affected by original modification")
 	}
 	if len(copied.Messages) > 0 && copied.Messages[0].Content == "modified" {
 		t.Error("Copy Messages should not be affected by original modification")
 	}
 	if copied.Options["key"] == "modified" {
 		t.Error("Copy Options should not be affected by original modification")
 	}
 	if copied.Think != nil && copied.Think.Value == "modified" {
 		t.Error("Copy Think should not be affected by original modification")
 	}
 }
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -18,7 +18,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 )
 type MultilineState int
@@ -44,7 +43,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
 		if opts.MultiModal {
-			fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
+			fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
 		}
 		fmt.Fprintln(os.Stderr, "")
@@ -62,8 +61,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set noformat          Disable formatting")
 		fmt.Fprintln(os.Stderr, "  /set verbose           Show LLM stats")
 		fmt.Fprintln(os.Stderr, "  /set quiet             Disable LLM stats")
 		fmt.Fprintln(os.Stderr, "  /set think             Enable thinking")
 		fmt.Fprintln(os.Stderr, "  /set nothink           Disable thinking")
 		fmt.Fprintln(os.Stderr, "")
 	}
@@ -130,7 +127,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	var sb strings.Builder
 	var multiline MultilineState
 	var thinkExplicitlySet bool = opts.Think != nil
 	for {
 		line, err := scanner.Readline()
@@ -195,30 +191,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				fmt.Println("Usage:\n  /load <modelname>")
 				continue
 			}
 			origOpts := opts.Copy()
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
 			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
 			if err != nil {
 				if strings.Contains(err.Error(), "not found") {
 					fmt.Printf("Couldn't find model '%s'\n", opts.Model)
 					opts = origOpts.Copy()
 					continue
 				}
 				return err
 			}
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				if strings.Contains(err.Error(), "not found") {
 					fmt.Printf("Couldn't find model '%s'\n", opts.Model)
 					opts = origOpts.Copy()
 					continue
 				}
 				if strings.Contains(err.Error(), "does not support thinking") {
 					fmt.Printf("error: %v\n", err)
 					continue
 				}
 				return err
 			}
 			continue
@@ -279,35 +255,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						return err
 					}
 					fmt.Println("Set 'quiet' mode.")
 				case "think":
 					thinkValue := api.ThinkValue{Value: true}
 					var maybeLevel string
 					if len(args) > 2 {
 						maybeLevel = args[2]
 					}
 					if maybeLevel != "" {
 						// TODO(drifkin): validate the level, could be model dependent
 						// though... It will also be validated on the server once a call is
 						// made.
 						thinkValue.Value = maybeLevel
 					}
 					opts.Think = &thinkValue
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
 					if maybeLevel != "" {
 						fmt.Printf("Set 'think' mode to '%s'.\n", maybeLevel)
 					} else {
 						fmt.Println("Set 'think' mode.")
 					}
 				case "nothink":
 					opts.Think = &api.ThinkValue{Value: false}
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
 					fmt.Println("Set 'nothink' mode.")
 				case "format":
 					if len(args) < 3 || args[2] != "json" {
 						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
@@ -396,7 +343,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				switch args[1] {
 				case "info":
-					_ = showInfo(resp, false, os.Stderr)
+					_ = showInfo(resp, os.Stderr)
 				case "license":
 					if resp.License == "" {
 						fmt.Println("No license was specified for this model.")
@@ -406,21 +353,18 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				case "modelfile":
 					fmt.Println(resp.Modelfile)
 				case "parameters":
 					fmt.Println("Model defined parameters:")
 					if resp.Parameters == "" {
-						fmt.Println("  No additional parameters were specified for this model.")
+						fmt.Println("No parameters were specified for this model.")
 					} else {
-						for _, l := range strings.Split(resp.Parameters, "\n") {
+						if len(opts.Options) > 0 {
-							fmt.Printf("  %s\n", l)
+							fmt.Println("User defined parameters:")
 							for k, v := range opts.Options {
 								fmt.Printf("%-*s %v\n", 30, k, v)
 							}
 							fmt.Println()
 						}
-					}
+						fmt.Println("Model defined parameters:")
-					fmt.Println()
+						fmt.Println(resp.Parameters)
 					if len(opts.Options) > 0 {
 						fmt.Println("User defined parameters:")
 						for k, v := range opts.Options {
 							fmt.Printf("  %-*s %v\n", 30, k, v)
 						}
 						fmt.Println()
 					}
 				case "system":
 					switch {
@@ -499,12 +443,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			assistant, err := chat(cmd, opts)
 			if err != nil {
 				if strings.Contains(err.Error(), "does not support thinking") ||
 					strings.Contains(err.Error(), "invalid think value") {
 					fmt.Printf("error: %v\n", err)
 					sb.Reset()
 					continue
 				}
 				return err
 			}
 			if assistant != nil {
@@ -517,16 +455,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 }
 func NewCreateRequest(name string, opts runOptions) *api.CreateRequest {
 	parentModel := opts.ParentModel
 	modelName := model.ParseName(parentModel)
 	if !modelName.IsValid() {
 		parentModel = ""
 	}
 	req := &api.CreateRequest{
-		Model: name,
+		Name: name,
-		From:  cmp.Or(parentModel, opts.Model),
+		From: cmp.Or(opts.ParentModel, opts.Model),
 	}
 	if opts.System != "" {
@@ -560,7 +491,6 @@ func normalizeFilePath(fp string) string {
 		"\\\\", "\\", // Escaped backslash
 		"\\*", "*", // Escaped asterisk
 		"\\?", "?", // Escaped question mark
 		"\\~", "~", // Escaped tilde
 	).Replace(fp)
 }
@@ -568,7 +498,7 @@ func extractFileNames(input string) []string {
 	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
 	// and followed by more characters and a file extension
 	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
 	re := regexp.MustCompile(regexPattern)
 	return re.FindAllString(input, -1)
@@ -588,8 +518,6 @@ func extractFileData(input string) (string, []api.ImageData, error) {
 			return "", imgs, err
 		}
 		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
 		input = strings.ReplaceAll(input, "'"+nfp+"'", "")
 		input = strings.ReplaceAll(input, "'"+fp+"'", "")
 		input = strings.ReplaceAll(input, fp, "")
 		imgs = append(imgs, data)
 	}
@@ -610,7 +538,7 @@ func getImageData(filePath string) ([]byte, error) {
 	}
 	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
 	if !slices.Contains(allowedTypes, contentType) {
 		return nil, fmt.Errorf("invalid image type: %s", contentType)
 	}
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -1,8 +1,6 @@
 package cmd
 import (
 	"os"
 	"path/filepath"
 	"testing"
 	"github.com/stretchr/testify/assert"
@@ -12,17 +10,14 @@ func TestExtractFilenames(t *testing.T) {
 	// Unix style paths
 	input := ` some preamble 
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
 /unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
 	res := extractFileNames(input)
-	assert.Len(t, res, 7)
+	assert.Len(t, res, 5)
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
 	assert.Contains(t, res[4], "five.JPG")
 	assert.Contains(t, res[5], "six.webp")
 	assert.Contains(t, res[6], "seven.WEBP")
 	assert.NotContains(t, res[4], '"')
 	assert.NotContains(t, res, "inbetween1")
 	assert.NotContains(t, res, "./1.svg")
@@ -33,12 +28,10 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 
- d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
+ d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
 c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
 d:\path with\spaces\thirteen.WEBP some ending
 `
 	res = extractFileNames(input)
-	assert.Len(t, res, 13)
+	assert.Len(t, res, 10)
 	assert.NotContains(t, res, "inbetween2")
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[0], "c:")
@@ -56,31 +49,4 @@ d:\path with\spaces\thirteen.WEBP some ending
 	assert.Contains(t, res[8], "d:")
 	assert.Contains(t, res[9], "ten.PNG")
 	assert.Contains(t, res[9], "E:")
 	assert.Contains(t, res[10], "eleven.webp")
 	assert.Contains(t, res[10], "c:")
 	assert.Contains(t, res[11], "twelve.WebP")
 	assert.Contains(t, res[11], "c:")
 	assert.Contains(t, res[12], "thirteen.WEBP")
 	assert.Contains(t, res[12], "d:")
 }
 // Ensure that file paths wrapped in single quotes are removed with the quotes.
 func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
 	dir := t.TempDir()
 	fp := filepath.Join(dir, "img.jpg")
 	data := make([]byte, 600)
 	copy(data, []byte{
 		0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
 		0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 		0xff, 0xd9,
 	})
 	if err := os.WriteFile(fp, data, 0o600); err != nil {
 		t.Fatalf("failed to write test image: %v", err)
 	}
 	input := "before '" + fp + "' after"
 	cleaned, imgs, err := extractFileData(input)
 	assert.NoError(t, err)
 	assert.Len(t, imgs, 1)
 	assert.Equal(t, cleaned, "before  after")
 }
--- a/cmd/runner/main.go
+++ b/cmd/runner/main.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"os"
-	"github.com/ollama/ollama/runner"
+	"github.com/ollama/ollama/llama/runner"
 )
 func main() {
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@@ -5,7 +5,7 @@ import (
 	"errors"
 	"os"
 	"os/exec"
-	"regexp"
+	"strings"
 	"github.com/ollama/ollama/api"
 )
@@ -19,12 +19,11 @@ func startApp(ctx context.Context, client *api.Client) error {
 	if err != nil {
 		return err
 	}
-	r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
+	if !strings.Contains(link, "Ollama.app") {
 	m := r.FindStringSubmatch(link)
 	if len(m) != 1 {
 		return errors.New("could not find ollama app")
 	}
-	if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
+	path := strings.Split(link, "Ollama.app")
 	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
 		return err
 	}
 	return waitForServer(ctx, client)
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -4,27 +4,17 @@ import (
 	"context"
 	"errors"
 	"fmt"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path"
 	"path/filepath"
 	"strings"
 	"syscall"
 	"unsafe"
 	"github.com/ollama/ollama/api"
 	"golang.org/x/sys/windows"
 )
 const (
 	Installer = "OllamaSetup.exe"
 )
 func startApp(ctx context.Context, client *api.Client) error {
-	if len(isProcRunning(Installer)) > 0 {
+	// log.Printf("XXX Attempting to find and start ollama app")
 		return fmt.Errorf("upgrade in progress...")
 	}
 	AppName := "ollama app.exe"
 	exe, err := os.Executable()
 	if err != nil {
@@ -45,11 +35,14 @@ func startApp(ctx context.Context, client *api.Client) error {
 			}
 		}
 	}
 	// log.Printf("XXX attempting to start app %s", appExe)
 	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
+	cmd := exec.Command(cmd_path, "/c", appExe)
 	// TODO - these hide flags aren't working - still pops up a command window for some reason
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
 	// TODO this didn't help either...
 	cmd.Stdin = strings.NewReader("")
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -63,50 +56,3 @@ func startApp(ctx context.Context, client *api.Client) error {
 	}
 	return waitForServer(ctx, client)
 }
 func isProcRunning(procName string) []uint32 {
 	pids := make([]uint32, 2048)
 	var ret uint32
 	if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
 		slog.Debug("failed to check for running installers", "error", err)
 		return nil
 	}
 	if ret > uint32(len(pids)) {
 		pids = make([]uint32, ret+10)
 		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
 			slog.Debug("failed to check for running installers", "error", err)
 			return nil
 		}
 	}
 	if ret < uint32(len(pids)) {
 		pids = pids[:ret]
 	}
 	var matches []uint32
 	for _, pid := range pids {
 		if pid == 0 {
 			continue
 		}
 		hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
 		if err != nil {
 			continue
 		}
 		defer windows.CloseHandle(hProcess)
 		var module windows.Handle
 		var cbNeeded uint32
 		cb := (uint32)(unsafe.Sizeof(module))
 		if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
 			continue
 		}
 		var sz uint32 = 1024 * 8
 		moduleName := make([]uint16, sz)
 		cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
 		if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
 			continue
 		}
 		exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
 		if strings.EqualFold(exeFile, procName) {
 			matches = append(matches, pid)
 		}
 	}
 	return matches
 }
--- a/cmd/warn_thinking_test.go
+++ b/cmd/warn_thinking_test.go
@@ -1,63 +0,0 @@
 package cmd
 import (
 	"encoding/json"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"strings"
 	"testing"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/types/model"
 )
 // Test that a warning is printed when thinking is requested but not supported.
 func TestWarnMissingThinking(t *testing.T) {
 	cases := []struct {
 		capabilities []model.Capability
 		expectWarn   bool
 	}{
 		{capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
 		{capabilities: []model.Capability{}, expectWarn: true},
 	}
 	for _, tc := range cases {
 		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
 				t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method)
 			}
 			var req api.ShowRequest
 			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 				t.Fatalf("decode request: %v", err)
 			}
 			resp := api.ShowResponse{Capabilities: tc.capabilities}
 			if err := json.NewEncoder(w).Encode(resp); err != nil {
 				t.Fatalf("encode response: %v", err)
 			}
 		}))
 		defer srv.Close()
 		t.Setenv("OLLAMA_HOST", srv.URL)
 		client, err := api.ClientFromEnvironment()
 		if err != nil {
 			t.Fatal(err)
 		}
 		oldStderr := os.Stderr
 		r, w, _ := os.Pipe()
 		os.Stderr = w
 		ensureThinkingSupport(t.Context(), client, "m")
 		w.Close()
 		os.Stderr = oldStderr
 		out, _ := io.ReadAll(r)
 		warned := strings.Contains(string(out), "warning:")
 		if tc.expectWarn && !warned {
 			t.Errorf("expected warning, got none")
 		}
 		if !tc.expectWarn && warned {
 			t.Errorf("did not expect warning, got: %s", string(out))
 		}
 	}
 }
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -1,14 +1,12 @@
 package convert
 import (
 	"cmp"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
 	"io/fs"
 	"log/slog"
 	"os"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
@@ -17,10 +15,6 @@ import (
 type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
 	TextModel struct {
 		VocabSize uint32 `json:"vocab_size"`
 	} `json:"text_config"`
 }
 type AdapterParameters struct {
@@ -53,11 +47,8 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	}
 	for _, sv := range t.SpecialVocabulary {
 		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
 		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
-		if len(sv.IDs) > 0 {
+		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
 			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
 		}
 	}
 	return kv
@@ -88,17 +79,27 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }
 // writeFile writes the key-values and tensors to ws by delegating directly to
 // ggml.WriteGGUF and returns whatever error that call reports.
 func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
 	return ggml.WriteGGUF(ws, kv, ts)
 }
 // writeFile writes the adapter key-values and tensors to ws by delegating
 // directly to ggml.WriteGGUF and returns whatever error that call reports.
 func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
 	return ggml.WriteGGUF(ws, kv, ts)
 }
 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []*ggml.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
 	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }
 type moreParser interface {
@@ -109,13 +110,15 @@ type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(ggml.KV) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []*ggml.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }
-func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
@@ -150,14 +153,14 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
 		return err
 	}
-	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
+	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
 }
 // ConvertModel writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, f *os.File) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
@@ -174,38 +177,24 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 	var conv ModelConverter
 	switch p.Architectures[0] {
-	case "LlamaForCausalLM":
+	case "LlamaForCausalLM", "MistralForCausalLM":
 		conv = &llamaModel{}
 	case "MllamaForConditionalGeneration":
 		conv = &mllamaModel{}
 	case "Llama4ForConditionalGeneration":
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
 		conv = &mistral3Model{}
 	case "MixtralForCausalLM":
 		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
 		conv = &gemmaModel{}
 	case "Gemma2ForCausalLM":
 		conv = &gemma2Model{}
 	case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration":
 		conv = &gemma3Model{Architecture: p.Architectures[0]}
 	case "Gemma3nForConditionalGeneration":
 		conv = &gemma3nModel{}
 	case "Phi3ForCausalLM":
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
 		conv = &qwen2Model{}
 	case "Qwen2_5_VLForConditionalGeneration":
 		conv = &qwen25VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
 		conv = &commandrModel{}
 	case "GptOssForCausalLM":
 		conv = &gptossModel{}
 	default:
-		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
+		return errors.New("unsupported architecture")
 	}
 	if err := json.Unmarshal(bts, conv); err != nil {
@@ -223,22 +212,17 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		return err
 	}
-	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
+	vocabSize := int(p.VocabSize)
 	switch {
 	case vocabSize == 0:
 		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
 	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
 	case vocabSize < len(t.Vocabulary.Tokens):
-		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
+		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
 		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
 		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
 	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}
@@ -248,13 +232,5 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		return err
 	}
-	return writeFile(f, conv.KV(t), conv.Tensors(ts))
+	return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts))
 }
 // writeFile reverses the dimension order of every tensor shape and then
 // writes the key-values and tensors to f through ggml.WriteGGUF. Each shape
 // is cloned before it is reversed, so the slice the tensor held on entry is
 // left unmodified.
 func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
 	for _, cur := range ts {
 		reversed := slices.Clone(cur.Shape)
 		slices.Reverse(reversed)
 		cur.Shape = reversed
 	}
 	return ggml.WriteGGUF(f, kv, ts)
 }
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -28,7 +28,6 @@ type bertModel struct {
 	LayerNormEPS          float32 `json:"layer_norm_eps"`
 	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
 	NormEpsilon           float32 `json:"norm_epsilon"`
 	normalizeEmbeddings   bool
 	PoolingType uint32
 }
@@ -55,11 +54,9 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	var pooling string
 	for _, m := range modules {
-		switch m.Type {
+		if m.Type == "sentence_transformers.models.Pooling" {
 		case "sentence_transformers.models.Pooling":
 			pooling = m.Path
-		case "sentence_transformers.models.Normalize":
+			break
 			p.normalizeEmbeddings = true
 		}
 	}
@@ -93,7 +90,6 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
 	kv["bert.pooling_type"] = p.PoolingType
 	kv["bert.normalize_embeddings"] = p.normalizeEmbeddings
 	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -136,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
-func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -147,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
 			continue
 		}
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_commandr.go
+++ b/convert/convert_commandr.go
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
-func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
-func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for _, t := range ts {
-		if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
+		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }
-func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma3.go
+++ b/convert/convert_gemma3.go
@@ -1,142 +0,0 @@
 package convert
 import (
 	"cmp"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // gemma3Model describes the configuration of a Gemma 3 model. It embeds
 // gemmaModel and adds the nested "text_config" and "vision_config" objects
 // plus several top-level fields that KV consults.
 type gemma3Model struct {
 	gemmaModel
 	// Architecture has no JSON tag; ConvertModel fills it in with the
 	// architecture name when it constructs the converter.
 	Architecture string
 	// TextModel is decoded from the nested "text_config" object.
 	TextModel    struct {
 		HeadDim          uint32 `json:"head_dim"`
 		HiddenSize       uint32 `json:"hidden_size"`
 		HiddenLayers     uint32 `json:"num_hidden_layers"`
 		IntermediateSize uint32 `json:"intermediate_size"`
 		SlidingWindow    uint32 `json:"sliding_window"`
 	} `json:"text_config"`
 	// VisionModel is decoded from the nested "vision_config" object.
 	VisionModel struct {
 		NumAttentionHeads uint32  `json:"num_attention_heads"` // attention.head_count 16
 		LayerNormEpsilon  float32 `json:"layer_norm_eps"`      // attention.layer_norm_epsilon 1e-05
 		NumHiddenLayers   uint32  `json:"num_hidden_layers"`   // block_count 32
 		HiddenSize        uint32  `json:"hidden_size"`         // embedding_length 1280
 		IntermediateSize  uint32  `json:"intermediate_size"`   // feed_forward_length 5120
 		ImageSize         uint32  `json:"image_size"`          // image_size 560
 		NumChannels       uint32  `json:"num_channels"`        // num_channels 3
 		PatchSize         uint32  `json:"patch_size"`          // patch_size 14
 	} `json:"vision_config"`
 	// Top-level fields; KV reads most of them only in the
 	// "Gemma3ForCausalLM" branch.
 	MaxPositionEmbeddings    uint32  `json:"max_position_embeddings"`
 	NumAttentionHeads        uint32  `json:"num_attention_heads"`
 	NumKeyValueHeads         uint32  `json:"num_key_value_heads"`
 	RMSNormEPS               float32 `json:"rms_norm_eps"`
 	HeadDim                  uint32  `json:"head_dim"`
 	FinalLogitSoftcap        float32 `json:"final_logit_softcapping"`
 	RopeLocalTheta           float32 `json:"rope_local_base_freq"`
 	RopeGlobalTheta          float32 `json:"rope_global_base_freq"`
 	SlidingWindow            uint32  `json:"sliding_window"`
 	MultiModalTokensPerImage uint32  `json:"mm_tokens_per_image"`
 }
 // Layer counts that gemma3Model.KV matches against the model's block count
 // to choose hard-coded attention head counts instead of the configured ones.
 // NOTE(review): the 4B/12B/27B names suggest model sizes — confirm.
 const (
 	gemma4BLayerCount  = 34
 	gemma12BLayerCount = 48
 	gemma27BLayerCount = 62
 )
 // KV builds the key-value metadata for a gemma3 model on top of the values
 // produced by ModelParameters.KV.
 //
 // The block count comes from the top-level hidden layer count, falling back
 // to the text config. Attention head counts are hard-coded for the three
 // known layer counts and otherwise taken from the configuration. The
 // remaining keys depend on Architecture: "Gemma3ForCausalLM" reads top-level
 // fields, every other value reads the text and vision sub-configs.
 func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma3"
 	blockCount := cmp.Or(p.HiddenLayers, p.TextModel.HiddenLayers)
 	kv["gemma3.block_count"] = blockCount
 	// Start from the configured head counts and override them when the
 	// block count matches one of the known layer counts.
 	heads, kvHeads := p.NumAttentionHeads, p.NumKeyValueHeads
 	switch blockCount {
 	case gemma4BLayerCount:
 		heads, kvHeads = 8, 4
 	case gemma12BLayerCount:
 		heads, kvHeads = 16, 8
 	case gemma27BLayerCount:
 		heads, kvHeads = 32, 16
 	}
 	kv["gemma3.attention.head_count"] = heads
 	kv["gemma3.attention.head_count_kv"] = kvHeads
 	if p.Architecture == "Gemma3ForCausalLM" {
 		// All values come from the top-level configuration.
 		kv["gemma3.context_length"] = p.MaxPositionEmbeddings
 		kv["gemma3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
 		kv["gemma3.attention.key_length"] = p.HeadDim
 		kv["gemma3.attention.value_length"] = p.HeadDim
 		kv["gemma3.attention.sliding_window"] = p.SlidingWindow
 		kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)
 		kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
 		kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
 		kv["gemma3.embedding_length"] = p.HiddenSize
 		kv["gemma3.feed_forward_length"] = p.IntermediateSize
 	} else {
 		// Text values come from text_config, vision values from
 		// vision_config; zero values fall back to fixed defaults.
 		kv["gemma3.context_length"] = cmp.Or(p.MaxPositionEmbeddings, 131072)
 		kv["gemma3.embedding_length"] = p.TextModel.HiddenSize
 		kv["gemma3.feed_forward_length"] = p.TextModel.IntermediateSize
 		kv["gemma3.attention.sliding_window"] = p.TextModel.SlidingWindow
 		kv["gemma3.vision.block_count"] = p.VisionModel.NumHiddenLayers
 		kv["gemma3.vision.embedding_length"] = p.VisionModel.HiddenSize
 		kv["gemma3.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
 		kv["gemma3.vision.image_size"] = p.VisionModel.ImageSize
 		kv["gemma3.vision.patch_size"] = p.VisionModel.PatchSize
 		kv["gemma3.vision.num_channels"] = cmp.Or(p.VisionModel.NumChannels, 3)
 		kv["gemma3.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
 		kv["gemma3.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, 1e-6)
 		kv["gemma3.attention.key_length"] = cmp.Or(p.TextModel.HeadDim, 256)
 		kv["gemma3.attention.value_length"] = cmp.Or(p.TextModel.HeadDim, 256)
 	}
 	if tokensPerImage := p.MultiModalTokensPerImage; tokensPerImage > 0 {
 		kv["gemma3.mm.tokens_per_image"] = tokensPerImage
 	}
 	return kv
 }
 // Replacements returns a flat list of old/new string pairs used to rename
 // tensor names for gemma3 models.
 // NOTE(review): several entries are prefixes of earlier ones (for example
 // "vision_tower.vision_model" after "vision_tower.vision_model.embeddings"),
 // so the order presumably matters to the consumer — confirm before sorting.
 func (p *gemma3Model) Replacements() []string {
 	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
 		// vision tower prefixes collapse to "v"
 		"vision_tower.vision_model.embeddings", "v",
 		"vision_tower.vision_model", "v",
 		"vision_model.vision_model.embeddings", "v",
 		"vision_model.vision_model", "v",
 		// the "language_model." prefix is dropped entirely
 		"language_model.", "",
 		"model.layers", "blk",
 		"encoder.layers", "blk",
 		"input_layernorm", "attn_norm",
 		"self_attn.q_proj", "attn_q",
 		"self_attn.q_norm", "attn_q_norm",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.k_norm", "attn_k_norm",
 		"self_attn.v_proj", "attn_v",
 		"self_attn.o_proj", "attn_output",
 		"self_attn.out_proj", "attn_output",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "post_attention_norm",
 		"pre_feedforward_layernorm", "ffn_norm",
 		"post_feedforward_layernorm", "post_ffw_norm",
 		"input_projection_weight", "input_projection.weight",
 		"multi_modal_projector", "mm",
 	}
 }
--- a/convert/convert_gemma3n.go
+++ b/convert/convert_gemma3n.go
@@ -1,165 +0,0 @@
 package convert
 import (
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"gonum.org/v1/gonum/stat/distuv"
 )
 // gemma3nModel describes the configuration of a Gemma 3n model. It embeds
 // ModelParameters; every value KV emits is read from the nested
 // "text_config" object.
 type gemma3nModel struct {
 	ModelParameters
 	// TextModel is decoded from the nested "text_config" object.
 	TextModel struct {
 		ActivationSparsityPattern []float32 `json:"activation_sparsity_pattern"`
 		AltupActiveIdx            uint32    `json:"altup_active_idx"`
 		// AltupCoefClip, when positive, is the clamp bound Tensors applies
 		// to the altup predict/correct coefficient tensors.
 		AltupCoefClip             float32   `json:"altup_coef_clip"`
 		AltupCorrectScale         bool      `json:"altup_correct_scale"`
 		AltupLRMultiplier         float32   `json:"altup_lr_multiplier"`
 		AltupNumInputs            uint32    `json:"altup_num_inputs"`
 		HeadDim                   uint32    `json:"head_dim"`
 		HiddenSize                uint32    `json:"hidden_size"`
 		HiddenSizePerLayerInput   uint32    `json:"hidden_size_per_layer_input"`
 		IntermediateSize          uint32    `json:"intermediate_size"`
 		MaxPositionEmbeddings     uint32    `json:"max_position_embeddings"`
 		NumAttentionHeads         uint32    `json:"num_attention_heads"`
 		NumHiddenLayers           uint32    `json:"num_hidden_layers"`
 		NumKeyValueHeads          uint32    `json:"num_key_value_heads"`
 		NumKVSharedLayers         uint32    `json:"num_kv_shared_layers"`
 		RMSNormEPS                float32   `json:"rms_norm_eps"`
 		RopeLocalBaseFreq         float32   `json:"rope_local_base_freq"`
 		RopeTheta                 float32   `json:"rope_theta"`
 		SlidingWindow             uint32    `json:"sliding_window"`
 		// LayerTypes has one entry per layer; KV compares each entry
 		// against "sliding_attention".
 		LayerTypes                []string  `json:"layer_types"`
 	} `json:"text_config"`
 	// VisionModel is declared without fields, so nothing from
 	// "vision_config" is retained.
 	VisionModel struct{} `json:"vision_config"`
 }
 // KV builds the key-value metadata for a gemma3n model on top of the values
 // produced by ModelParameters.KV. Two entries are derived rather than copied:
 // the activation sparsity scale (the standard normal quantile of each
 // sparsity pattern value) and the sliding window pattern (one bool per layer
 // type, true for "sliding_attention").
 func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
 	text := &m.TextModel
 	// Maps every sparsity pattern entry through the quantile function of a
 	// normal distribution with mean 0 and standard deviation 1.
 	sparsityScales := func(yield func(float32) bool) {
 		stdNormal := distuv.Normal{Mu: 0, Sigma: 1}
 		for _, fraction := range text.ActivationSparsityPattern {
 			if !yield(float32(stdNormal.Quantile(float64(fraction)))) {
 				return
 			}
 		}
 	}
 	// Yields true for each layer whose type is "sliding_attention".
 	slidingLayers := func(yield func(bool) bool) {
 		for _, layerType := range text.LayerTypes {
 			if !yield(layerType == "sliding_attention") {
 				return
 			}
 		}
 	}
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma3n"
 	kv["gemma3n.activation_sparsity_scale"] = slices.Collect(sparsityScales)
 	// altup settings
 	kv["gemma3n.altup.active_idx"] = text.AltupActiveIdx
 	kv["gemma3n.altup.correct_scale"] = text.AltupCorrectScale
 	kv["gemma3n.altup.lr_multiplier"] = text.AltupLRMultiplier
 	kv["gemma3n.altup.num_inputs"] = text.AltupNumInputs
 	// attention settings
 	kv["gemma3n.attention.head_count_kv"] = text.NumKeyValueHeads
 	kv["gemma3n.attention.head_count"] = text.NumAttentionHeads
 	kv["gemma3n.attention.layer_norm_rms_epsilon"] = text.RMSNormEPS
 	kv["gemma3n.attention.sliding_window"] = text.SlidingWindow
 	kv["gemma3n.attention.sliding_window_pattern"] = slices.Collect(slidingLayers)
 	kv["gemma3n.attention.shared_kv_layers"] = text.NumKVSharedLayers
 	// model dimensions
 	kv["gemma3n.block_count"] = text.NumHiddenLayers
 	kv["gemma3n.context_length"] = text.MaxPositionEmbeddings
 	kv["gemma3n.embedding_length_per_layer_input"] = text.HiddenSizePerLayerInput
 	kv["gemma3n.embedding_length"] = text.HiddenSize
 	kv["gemma3n.feed_forward_length"] = text.IntermediateSize
 	kv["gemma3n.head_dim"] = text.HeadDim
 	// rope settings
 	kv["gemma3n.rope.freq_base_local"] = text.RopeLocalBaseFreq
 	kv["gemma3n.rope.freq_base"] = text.RopeTheta
 	return kv
 }
 // Tensors converts the input tensors into ggml tensors for a gemma3n model.
 //
 // The altup projection tensors are first combined through mergeTensors, which
 // returns the merged output plus the tensors left to process. Of those, audio
 // and vision tensors are dropped, altup coefficient tensors optionally get a
 // clamping repacker, and everything that was not dropped is appended.
 func (m *gemma3nModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	out, ts := mergeTensors(ts,
 		merge{"altup_proj.*.weight", "altup_proj.weight"},
 		merge{"altup_unembd_proj.*.weight", "altup_unembd_proj.weight"},
 	)
 	for _, t := range ts {
 		switch {
 		case strings.Contains(t.Name(), "audio_tower"),
 			strings.Contains(t.Name(), "embed_audio"),
 			strings.Contains(t.Name(), "vision_tower"),
 			strings.Contains(t.Name(), "embed_vision"):
 			// TODO: handle audio and vision towers
 			continue
 		case strings.Contains(t.Name(), "altup_predict_coef"),
 			strings.Contains(t.Name(), "altup_correct_coef"):
 			// Only install the repacker when a positive clip value is set.
 			if m.TextModel.AltupCoefClip > 0 {
 				t.SetRepacker(func(name string, data []float32, shape []uint64) (_ []float32, err error) {
 					dims := make([]int, len(shape))
 					for i := range shape {
 						dims[i] = int(shape[i])
 					}
 					// This t is a tensor.Tensor local to the closure; it
 					// shadows the loop variable of the same name.
 					var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 					// Clamp to [-AltupCoefClip, +AltupCoefClip].
 					t, err = tensor.Clamp(t, -m.TextModel.AltupCoefClip, m.TextModel.AltupCoefClip)
 					if err != nil {
 						return nil, err
 					}
 					// Flatten to one dimension before extracting the values.
 					if err := t.Reshape(t.Shape().TotalSize()); err != nil {
 						return nil, err
 					}
 					return native.VectorF32(t.(*tensor.Dense))
 				})
 			}
 		}
 		out = append(out, &ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
 		})
 	}
 	return out
 }
 // Replacements returns a flat list of old/new string pairs used to rename
 // tensor names for gemma3n models. The pairs are listed one per line; the
 // element order is significant to the caller and must not be changed.
 func (m *gemma3nModel) Replacements() []string {
 	pairs := []string{
 		// model-level tensors
 		"model.language_model.embed_tokens_per_layer", "per_layer_token_embd",
 		"model.language_model.embed_tokens", "token_embd",
 		"model.language_model.per_layer_model_projection", "per_layer_model_proj",
 		"model.language_model.per_layer_projection_norm", "per_layer_proj_norm",
 		"model.language_model.altup_projections", "altup_proj",
 		"model.language_model.altup_unembed_projections", "altup_unembd_proj",
 		"model.language_model.norm", "output_norm",
 		"model.language_model.layers", "blk",
 		// attention
 		"input_layernorm", "attn_norm",
 		"self_attn.q_proj", "attn_q",
 		"self_attn.q_norm", "attn_q_norm",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.k_norm", "attn_k_norm",
 		"self_attn.v_proj", "attn_v",
 		"self_attn.o_proj", "attn_output",
 		"post_attention_layernorm", "post_attention_norm",
 		// feed forward
 		"pre_feedforward_layernorm", "ffn_norm",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.up_proj", "ffn_up",
 		"mlp.down_proj", "ffn_down",
 		"post_feedforward_layernorm", "post_ffw_norm",
 		// per-layer input, altup and laurel
 		"per_layer_input_gate", "inp_gate",
 		"per_layer_projection", "proj",
 		"post_per_layer_input_norm", "post_norm",
 		"altup.", "altup_",
 		"modality_router", "router",
 		"prediction_coefs", "predict_coef",
 		"correction_coefs", "correct_coef",
 		"correct_output_scale", "correct_scale.weight",
 		"laurel.", "laurel_",
 		"linear_left", "l",
 		"linear_right", "r",
 		"post_laurel_norm", "post_norm",
 	}
 	return pairs
 }
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -1,266 +0,0 @@
 package convert
 import (
 	"bytes"
 	"cmp"
 	"encoding/binary"
 	"io"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 )
 // gptossModel describes the configuration of a gpt-oss model. Several
 // settings have two alternative JSON keys (for example "num_experts" and
 // "num_local_experts"); KV uses the first non-zero value of each pair.
 type gptossModel struct {
 	ModelParameters
 	HiddenLayers          uint32  `json:"num_hidden_layers"`
 	// MaxPositionEmbeddings also selects the tensor naming scheme in
 	// Replacements: a value above zero picks the "hf flavored" names.
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 	HiddenSize            uint32  `json:"hidden_size"`
 	IntermediateSize      uint32  `json:"intermediate_size"`
 	AttentionHeads        uint32  `json:"num_attention_heads"`
 	KeyValueHeads         uint32  `json:"num_key_value_heads"`
 	HeadDim               uint32  `json:"head_dim"`
 	Experts               uint32  `json:"num_experts"`
 	LocalExperts          uint32  `json:"num_local_experts"`
 	ExpertsPerToken       uint32  `json:"experts_per_token"`
 	RMSNormEpsilon        float32 `json:"rms_norm_eps"`
 	InitialContextLength  uint32  `json:"initial_context_length"`
 	RopeTheta             float32 `json:"rope_theta"`
 	// RopeScalingFactor and RopeScaling.Factor are alternative sources for
 	// the rope scaling factor.
 	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
 	RopeScaling           struct {
 		Factor float32 `json:"factor"`
 	} `json:"rope_scaling"`
 	SlidingWindow uint32 `json:"sliding_window"`
 }
 // Compile-time check that *gptossModel implements ModelConverter.
 var _ ModelConverter = (*gptossModel)(nil)
 // KV builds the key-value metadata for a gptoss model on top of the values
 // produced by ModelParameters.KV. Settings that have two possible sources use
 // the first non-zero one, and the bos/eos tokenizer keys are overwritten with
 // fixed token ids.
 func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	// Values with a fallback, resolved up front.
 	scaledContext := uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
 	contextLength := cmp.Or(m.MaxPositionEmbeddings, scaledContext)
 	expertCount := cmp.Or(m.Experts, m.LocalExperts)
 	rmsEpsilon := cmp.Or(m.RMSNormEpsilon, 1e-5)
 	scalingFactor := cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
 	// model dimensions
 	kv["gptoss.context_length"] = contextLength
 	kv["gptoss.block_count"] = m.HiddenLayers
 	kv["gptoss.embedding_length"] = m.HiddenSize
 	kv["gptoss.feed_forward_length"] = m.IntermediateSize
 	kv["gptoss.expert_count"] = expertCount
 	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
 	// attention
 	kv["gptoss.attention.head_count"] = m.AttentionHeads
 	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
 	kv["gptoss.attention.key_length"] = m.HeadDim
 	kv["gptoss.attention.value_length"] = m.HeadDim
 	kv["gptoss.attention.layer_norm_rms_epsilon"] = rmsEpsilon
 	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
 	// rope
 	kv["gptoss.rope.freq_base"] = m.RopeTheta
 	kv["gptoss.rope.scaling.factor"] = scalingFactor
 	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
 	// tokenizer overrides
 	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
 	kv["tokenizer.ggml.add_bos_token"] = false
 	kv["tokenizer.ggml.eos_token_id"] = uint32(199999) // <|endoftext|>
 	kv["tokenizer.ggml.add_eos_token"] = false
 	kv["tokenizer.ggml.eos_token_ids"] = []int32{
 		199999, // <|endoftext|>
 		200002, // <|return|>
 		200012, // <|call|>
 	}
 	return kv
 }
 // Tensors converts the input tensors into ggml tensors for a gptoss model.
 //
 // Tensors whose names end in ".blocks" or ".scales" are grouped by their
 // shared name prefix and emitted afterwards as MXFP4 ".weight" tensors. The
 // interleaved "gate_up_exps.bias" tensor is split into separate gate and up
 // tensors. Every other tensor is passed through unchanged.
 func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	// Pending blocks/scales pairs, keyed by the tensor name without its
 	// final ".blocks" / ".scales" suffix.
 	mxfp4s := make(map[string]*mxfp4)
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), ".blocks") || strings.HasSuffix(t.Name(), ".scales") {
 			dot := strings.LastIndex(t.Name(), ".")
 			name, suffix := t.Name()[:dot], t.Name()[dot+1:]
 			if _, ok := mxfp4s[name]; !ok {
 				mxfp4s[name] = &mxfp4{}
 			}
 			switch suffix {
 			case "blocks":
 				mxfp4s[name].blocks = t
 			case "scales":
 				mxfp4s[name].scales = t
 			}
 		} else if strings.HasSuffix(t.Name(), "gate_up_exps.bias") {
 			// gate_up_exps is interleaved, need to split into gate_exps and up_exps
 			// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
 			out = append(out, slices.Collect(splitDim(t, 1,
 				split{
 					Replacer: strings.NewReplacer("gate_up_exps", "gate_exps"),
 					slices:   []tensor.Slice{nil, tensor.S(0, int(t.Shape()[1]), 2)},
 				},
 				split{
 					Replacer: strings.NewReplacer("gate_up_exps", "up_exps"),
 					slices:   []tensor.Slice{nil, tensor.S(1, int(t.Shape()[1]), 2)},
 				},
 			))...)
 		} else {
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			})
 		}
 	}
 	// NOTE(review): Go does not specify map iteration order, so the order in
 	// which these MXFP4 tensors are appended can differ between runs.
 	// NOTE(review): an entry that received only a ".scales" tensor has a nil
 	// blocks field, and the Shape() call below would then panic — confirm
 	// inputs always provide both halves.
 	// The loop variable mxfp4 shadows the mxfp4 type inside this loop.
 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
 		if strings.Contains(name, "ffn_down_exps") {
 			// The last two block dimensions are flattened and doubled.
 			// NOTE(review): presumably each byte packs two 4-bit values — confirm.
 			out = append(out, &ggml.Tensor{
 				Name:     name + ".weight",
 				Kind:     uint32(ggml.TensorTypeMXFP4),
 				Shape:    []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
 				WriterTo: mxfp4,
 			})
 		} else if strings.Contains(name, "ffn_gate_up_exps") {
 			// gate_up_exps is interleaved, need to split into gate_exps and up_exps
 			// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
 			out = append(out, &ggml.Tensor{
 				Name:     strings.Replace(name, "gate_up", "gate", 1) + ".weight",
 				Kind:     uint32(ggml.TensorTypeMXFP4),
 				Shape:    []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
 				WriterTo: mxfp4.slice(1, 0, int(dims[1]), 2),
 			}, &ggml.Tensor{
 				Name:     strings.Replace(name, "gate_up", "up", 1) + ".weight",
 				Kind:     uint32(ggml.TensorTypeMXFP4),
 				Shape:    []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
 				WriterTo: mxfp4.slice(1, 1, int(dims[1]), 2),
 			})
 		}
 		// Entries matching neither name are dropped without being emitted.
 	}
 	return out
 }
 // Replacements returns a flat list of old/new string pairs used to rename
 // tensor names for gptoss models. Which table is returned depends on
 // MaxPositionEmbeddings: a value above zero selects the "hf flavored" names,
 // otherwise the alternative naming scheme is used.
 func (m *gptossModel) Replacements() []string {
 	if m.MaxPositionEmbeddings > 0 {
 		// hf flavored model
 		return []string{
 			"lm_head", "output",
 			"model.embed_tokens", "token_embd",
 			"model.layers", "blk",
 			"input_layernorm", "attn_norm",
 			"self_attn.q_proj", "attn_q",
 			"self_attn.k_proj", "attn_k",
 			"self_attn.v_proj", "attn_v",
 			"self_attn.o_proj", "attn_out",
 			"self_attn.sinks", "attn_sinks",
 			"post_attention_layernorm", "ffn_norm",
 			"mlp.router", "ffn_gate_inp",
 			"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
 			"mlp.experts.down_proj_", "ffn_down_exps.",
 			"model.norm", "output_norm",
 		}
 	}
 	return []string{
 		// noop replacements so other replacements will not be applied
 		".blocks", ".blocks",
 		".scales", ".scales",
 		// real replacements
 		"block", "blk",
 		"attn.norm", "attn_norm",
 		"attn.qkv", "attn_qkv",
 		"attn.sinks", "attn_sinks",
 		"attn.out", "attn_out",
 		"mlp.norm", "ffn_norm",
 		"mlp.gate", "ffn_gate_inp",
 		"mlp.mlp1_", "ffn_gate_up_exps.",
 		"mlp.mlp2_", "ffn_down_exps.",
 		"embedding", "token_embd",
 		"norm", "output_norm",
 		"unembedding", "output",
 		"scale", "weight",
 	}
 }
 // mxfp4 pairs the blocks tensor and scales tensor that together make up one
 // MXFP4 weight, and writes them out as a single tensor through WriteTo.
 type mxfp4 struct {
 	// slices, when non-empty, is applied to the combined tensor in WriteTo.
 	slices []tensor.Slice
 	blocks, scales Tensor
 }
 // slice returns a new mxfp4 that shares blocks and scales with m and carries
 // a selection of start:end:step on dimension dim. Every other dimension has
 // a nil selector. The receiver is not modified.
 func (m *mxfp4) slice(dim, start, end, step int) *mxfp4 {
 	// One selector per blocks dimension; the zero value of each is nil.
 	selectors := make([]tensor.Slice, len(m.blocks.Shape()))
 	selectors[dim] = tensor.S(start, end, step)
 	return &mxfp4{slices: selectors, blocks: m.blocks, scales: m.scales}
 }
 // WriteTo writes the combined MXFP4 tensor to w.
 //
 // The blocks are read into memory and their nibbles rearranged in 16-byte
 // groups. The scales are then concatenated in front of the blocks along
 // axis 3, the optional slices are applied, and the flattened bytes are
 // written. It returns the number of bytes in the flattened result, or 0 and
 // the error if any step fails.
 func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
 	var b bytes.Buffer
 	if _, err := m.blocks.WriteTo(&b); err != nil {
 		return 0, err
 	}
 	blocksDims := make([]int, len(m.blocks.Shape()))
 	for i, d := range m.blocks.Shape() {
 		blocksDims[i] = int(d)
 	}
 	// bts aliases the buffer contents, so the loop below rewrites it in place.
 	bts := b.Bytes()
 	var tmp [16]byte
 	// NOTE(review): the indexing assumes the byte length is a multiple of 16;
 	// a shorter trailing group would index out of range.
 	for i := 0; i < b.Len(); i += 16 {
 		for j := range 8 {
 			// transform a1b2c3 ... x7y8z9 -> 71xa82yb93zc
 			// Byte j is paired with byte j+8: their low nibbles form output
 			// byte 2j and their high nibbles form output byte 2j+1. The
 			// local b shadows the buffer b only inside this inner loop.
 			a, b := bts[i+j], bts[i+j+8]
 			tmp[2*j+0] = (a & 0x0F) | (b << 4)
 			tmp[2*j+1] = (a >> 4) | (b & 0xF0)
 		}
 		copy(bts[i:i+16], tmp[:])
 	}
 	var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(bts))
 	var s bytes.Buffer
 	if _, err := m.scales.WriteTo(&s); err != nil {
 		return 0, err
 	}
 	// Give the scales the same rank as the blocks by padding the trailing
 	// dimensions with 1, so the two can be concatenated.
 	scalesDims := slices.Repeat([]int{1}, len(m.blocks.Shape()))
 	for i, d := range m.scales.Shape() {
 		scalesDims[i] = int(d)
 	}
 	var scales tensor.Tensor = tensor.New(tensor.WithShape(scalesDims...), tensor.WithBacking(s.Bytes()))
 	// Scales come first along axis 3, followed by the block bytes.
 	out, err := tensor.Concat(3, scales, blocks)
 	if err != nil {
 		return 0, err
 	}
 	if len(m.slices) > 0 {
 		out, err = out.Slice(m.slices...)
 		if err != nil {
 			return 0, err
 		}
 	}
 	// NOTE(review): Materialize presumably copies a sliced view into
 	// contiguous storage before the reshape — confirm against the tensor
 	// package.
 	out = tensor.Materialize(out)
 	if err := out.Reshape(out.Shape().TotalSize()); err != nil {
 		return 0, err
 	}
 	u8s, err := native.VectorU8(out.(*tensor.Dense))
 	if err != nil {
 		return 0, err
 	}
 	if err := binary.Write(w, binary.LittleEndian, u8s); err != nil {
 		return 0, err
 	}
 	return int64(len(u8s)), nil
 }
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -28,12 +28,12 @@ type llamaModel struct {
 	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
 	RopeTheta             float32 `json:"rope_theta"`
 	RopeScaling           struct {
-		Type                          string  `json:"type"`
+		Type                            string  `json:"type"`
-		RopeType                      string  `json:"rope_type"`
+		RopeType                        string  `json:"rope_type"`
-		Factor                        float32 `json:"factor"`
+		Factor                          float32 `json:"factor"`
-		LowFrequencyFactor            float32 `json:"low_freq_factor"`
+		LowFrequencyFactor              float32 `json:"low_freq_factor"`
-		HighFrequencyFactor           float32 `json:"high_freq_factor"`
+		HighFrequencyFactor             float32 `json:"high_freq_factor"`
-		OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
+		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
 		factors ropeFactor
 	} `json:"rope_scaling"`
@@ -42,8 +42,6 @@ type llamaModel struct {
 	LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
 	NormEpsilon      float32 `json:"norm_epsilon"`
 	HeadDim          uint32  `json:"head_dim"`
 	skipRepack bool
 }
 var _ ModelConverter = (*llamaModel)(nil)
@@ -72,10 +70,6 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 		kv["llama.rope.dimension_count"] = p.HiddenSize / headCount
 	}
 	if p.HeadDim > 0 {
 		kv["llama.attention.head_dim"] = p.HeadDim
 	}
 	if p.RopeTheta > 0 {
 		kv["llama.rope.freq_base"] = p.RopeTheta
 	}
@@ -90,7 +84,7 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 			factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
 			factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
-			original := cmp.Or(p.RopeScaling.OriginalMaxPositionEmbeddings, 8192)
+			original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
 			lambdaLow := float32(original) / factorLow
 			lambdaHigh := float32(original) / factorHigh
@@ -126,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
-func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	if p.RopeScaling.factors != nil {
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -139,14 +133,12 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	}
 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
+		if strings.HasSuffix(t.Name(), "attn_q.weight") ||
-			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
+			strings.HasSuffix(t.Name(), "attn_k.weight") {
-			if !p.skipRepack {
+			t.SetRepacker(p.repack)
 				t.SetRepacker(p.repack)
 			}
 		}
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
@@ -182,9 +174,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
 	}
 	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_llama4.go
+++ b/convert/convert_llama4.go
@@ -1,169 +0,0 @@
 package convert
 import (
 	"slices"
 	"strings"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // llama4Model holds the configuration decoded for a Llama 4 checkpoint: the
 // shared ModelParameters plus a "text_config" and a "vision_config" section.
 type llama4Model struct {
 	ModelParameters
 	// TextModel is decoded from "text_config". It embeds llamaModel and adds the
 	// mixture-of-experts and attention settings that KV emits under "llama4.".
 	TextModel struct {
 		llamaModel
 		NumExpertsPerToken     uint32 `json:"num_experts_per_tok"`
 		NumLocalExperts        uint32 `json:"num_local_experts"`
 		InterleaveMOELayerStep uint32 `json:"interleave_moe_layer_step"`
 		UseQKNorm              bool   `json:"use_qk_norm"`
 		// IntermediateSizeMLP is emitted as "llama4.feed_forward_length"; the
 		// embedded llamaModel's IntermediateSize is emitted as
 		// "llama4.expert_feed_forward_length".
 		IntermediateSizeMLP    uint32 `json:"intermediate_size_mlp"`
 		AttentionChunkSize     uint32 `json:"attention_chunk_size"`
 	} `json:"text_config"`
 	// VisionModel is decoded from "vision_config"; every field is emitted by KV
 	// under the "llama4.vision." prefix.
 	VisionModel struct {
 		NumHiddenLayers   uint32  `json:"num_hidden_layers"`
 		HiddenSize        uint32  `json:"hidden_size"`
 		IntermediateSize  uint32  `json:"intermediate_size"`
 		NumAttentionHeads uint32  `json:"num_attention_heads"`
 		ImageSize         uint32  `json:"image_size"`
 		PatchSize         uint32  `json:"patch_size"`
 		RopeTheta         float32 `json:"rope_theta"`
 		NormEpsilon       float32 `json:"norm_eps"`
 		PixelShuffleRatio float32 `json:"pixel_shuffle_ratio"`
 	} `json:"vision_config"`
 }
 // KV implements ModelConverter.
 //
 // It starts from the shared model parameters, re-publishes every
 // "llama."-prefixed key produced by the embedded text model under the
 // "llama4." name, and then adds the Llama 4 text and vision settings.
 func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
 	text, vision := &p.TextModel, &p.VisionModel
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama4"
 	for key, value := range text.KV(t) {
 		if !strings.HasPrefix(key, "llama.") {
 			continue
 		}
 		kv[strings.ReplaceAll(key, "llama.", "llama4.")] = value
 	}
 	// text model: dense and expert feed-forward sizes, MoE layout, attention
 	kv["llama4.feed_forward_length"] = text.IntermediateSizeMLP
 	kv["llama4.expert_feed_forward_length"] = text.IntermediateSize
 	kv["llama4.expert_count"] = text.NumLocalExperts
 	kv["llama4.expert_used_count"] = text.NumExpertsPerToken
 	kv["llama4.interleave_moe_layer_step"] = text.InterleaveMOELayerStep
 	kv["llama4.use_qk_norm"] = text.UseQKNorm
 	kv["llama4.attention.chunk_size"] = text.AttentionChunkSize
 	// vision model
 	kv["llama4.vision.block_count"] = vision.NumHiddenLayers
 	kv["llama4.vision.embedding_length"] = vision.HiddenSize
 	kv["llama4.vision.feed_forward_length"] = vision.IntermediateSize
 	kv["llama4.vision.attention.head_count"] = vision.NumAttentionHeads
 	kv["llama4.vision.image_size"] = vision.ImageSize
 	kv["llama4.vision.patch_size"] = vision.PatchSize
 	kv["llama4.vision.rope.freq_base"] = vision.RopeTheta
 	kv["llama4.vision.layer_norm_epsilon"] = vision.NormEpsilon
 	kv["llama4.vision.pixel_shuffle_ratio"] = vision.PixelShuffleRatio
 	return kv
 }
 // Replacements implements ModelConverter.
 //
 // The result is the text model's replacement pairs followed by the Llama 4
 // specific (old, new) pairs, in the order listed below.
 func (p *llama4Model) Replacements() []string {
 	llama4Pairs := []string{
 		"language_model.", "",
 		"vision_model", "v",
 		"multi_modal_projector", "mm",
 		"feed_forward.down_proj", "ffn_down",
 		"feed_forward.up_proj", "ffn_up",
 		"feed_forward.gate_proj", "ffn_gate",
 		"feed_forward.", "ffn_",
 		"shared_expert.down_proj", "down_shexp",
 		"shared_expert.gate_proj", "gate_shexp",
 		"shared_expert.up_proj", "up_shexp",
 		"experts.down_proj", "down_exps.weight",
 		"experts.gate_up_proj", "gate_up_exps.weight",
 		"router", "gate_inp",
 		"patch_embedding.linear", "patch_embedding",
 	}
 	return append(p.TextModel.Replacements(), llama4Pairs...)
 }
 // Tensors implements ModelConverter.
 //
 // Vision ("v.") and projector ("mm.") tensors are passed through unchanged.
 // Fused expert gate/up tensors are split into two tensors, expert down
 // tensors get their last two dimensions swapped, and everything else is
 // handed to the embedded text model with its own q/k repacking disabled.
 func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
 	var (
 		out         []*ggml.Tensor
 		textTensors []Tensor
 	)
 	for _, t := range ts {
 		switch {
 		case strings.HasPrefix(t.Name(), "v."), strings.HasPrefix(t.Name(), "mm."):
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			})
 		case strings.Contains(t.Name(), "ffn_gate_up_exps"):
 			// gate and up projections are fused along the last dimension:
 			// [experts, hidden_size, intermediate_size * 2]
 			// each half becomes [experts, intermediate_size, hidden_size]
 			halfDim := int(t.Shape()[2]) / 2
 			splitShape := slices.Clone(t.Shape())
 			splitShape[1], splitShape[2] = splitShape[2]/2, splitShape[1]
 			for half, target := range []string{"ffn_gate_exps", "ffn_up_exps"} {
 				// every half needs its own repacker, hence its own clone
 				part := t.Clone()
 				part.SetRepacker(p.repack(nil, nil, tensor.S(half*halfDim, (half+1)*halfDim)))
 				out = append(out, &ggml.Tensor{
 					Name:     strings.ReplaceAll(part.Name(), "ffn_gate_up_exps", target),
 					Kind:     part.Kind(),
 					Shape:    splitShape,
 					WriterTo: part,
 				})
 			}
 		case strings.Contains(t.Name(), "ffn_down_exps"):
 			// [experts, intermediate_size, hidden_size] --> [experts, hidden_size, intermediate_size]
 			t.SetRepacker(p.repack())
 			swapped := slices.Clone(t.Shape())
 			swapped[1], swapped[2] = swapped[2], swapped[1]
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    swapped,
 				WriterTo: t,
 			})
 		default:
 			textTensors = append(textTensors, t)
 		}
 	}
 	p.TextModel.skipRepack = true
 	return append(out, p.TextModel.Tensors(textTensors)...)
 }
 // repack returns a Repacker that views data with the given shape, applies the
 // optional slice to it, swaps the last two of its three axes, and returns the
 // materialized result as a flat []float32.
 func (p *llama4Model) repack(slice ...tensor.Slice) Repacker {
 	return func(name string, data []float32, shape []uint64) ([]float32, error) {
 		dims := make([]int, 0, len(shape))
 		for _, d := range shape {
 			dims = append(dims, int(d))
 		}
 		var view tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 		view, err := view.Slice(slice...)
 		if err != nil {
 			return nil, err
 		}
 		if err = view.T(0, 2, 1); err != nil {
 			return nil, err
 		}
 		view = tensor.Materialize(view)
 		// flatten the tensor so it can be returned as a vector
 		if err = view.Reshape(view.Shape().TotalSize()); err != nil {
 			return nil, err
 		}
 		return native.VectorF32(view.(*tensor.Dense))
 	}
 }
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }
-func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    shape,
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -1,190 +0,0 @@
 package convert
 import (
 	"cmp"
 	"fmt"
 	"strings"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // mistral3Model holds the configuration decoded for a Mistral 3 checkpoint:
 // the shared ModelParameters, a few top-level multimodal settings, and a
 // "text_config" and a "vision_config" section.
 type mistral3Model struct {
 	ModelParameters
 	ImageTokenIndex    uint32 `json:"image_token_index"`
 	SpatialMergeSize   uint32 `json:"spatial_merge_size"`
 	VisionFeatureLayer int32  `json:"vision_feature_layer"`
 	// TextModel is decoded from "text_config". repack reads NumAttentionHeads
 	// and NumKeyValueHeads from here when permuting attn_q / attn_k weights.
 	TextModel          struct {
 		NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 		MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 		HiddenSize            uint32  `json:"hidden_size"`
 		IntermediateSize      uint32  `json:"intermediate_size"`
 		NumAttentionHeads     uint32  `json:"num_attention_heads"`
 		NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
 		RopeTheta             float32 `json:"rope_theta"`
 		RMSNormEPS            float32 `json:"rms_norm_eps"`
 		// HeadDim is emitted as both the attention key and value length.
 		HeadDim               uint32  `json:"head_dim"`
 		SlidingWindow         *uint32 `json:"sliding_window"`
 		HiddenAct             string  `json:"hidden_act"`
 		VocabSize             uint32  `json:"vocab_size"`
 	} `json:"text_config"`
 	// VisionModel is decoded from "vision_config".
 	VisionModel struct {
 		NumAttentionHeads uint32  `json:"num_attention_heads"`
 		NumHiddenLayers   uint32  `json:"num_hidden_layers"`
 		HiddenSize        uint32  `json:"hidden_size"`
 		IntermediateSize  uint32  `json:"intermediate_size"`
 		ImageSize         uint32  `json:"image_size"`
 		NumChannels       uint32  `json:"num_channels"`
 		PatchSize         uint32  `json:"patch_size"`
 		HeadDim           uint32  `json:"head_dim"`
 		HiddenAct         string  `json:"hidden_act"`
 		RopeTheta         float32 `json:"rope_theta"`
 	} `json:"vision_config"`
 	MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
 	// ProjectorHiddenAct is only emitted by KV when it is non-empty.
 	ProjectorHiddenAct      string `json:"projector_hidden_act"`
 }
 // KV builds the "mistral3." metadata from the shared model parameters plus
 // the text, vision and multimodal sections of the configuration.
 func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	text, vision := &p.TextModel, &p.VisionModel
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "mistral3"
 	kv["mistral3.vocab_size"] = text.VocabSize
 	// Text configuration
 	kv["mistral3.block_count"] = text.NumHiddenLayers
 	kv["mistral3.context_length"] = text.MaxPositionEmbeddings
 	kv["mistral3.embedding_length"] = text.HiddenSize
 	kv["mistral3.feed_forward_length"] = text.IntermediateSize
 	kv["mistral3.attention.head_count"] = text.NumAttentionHeads
 	kv["mistral3.attention.head_count_kv"] = text.NumKeyValueHeads
 	kv["mistral3.attention.layer_norm_rms_epsilon"] = text.RMSNormEPS
 	kv["mistral3.attention.key_length"] = text.HeadDim
 	kv["mistral3.attention.value_length"] = text.HeadDim
 	// NOTE(review): this divides by the layer count rather than a head count;
 	// confirm that is intended. It also panics when NumHiddenLayers is zero.
 	kv["mistral3.rope.dimension_count"] = text.HiddenSize / text.NumHiddenLayers
 	kv["mistral3.rope.freq_base"] = text.RopeTheta
 	// Vision configuration
 	kv["mistral3.vision.block_count"] = vision.NumHiddenLayers
 	kv["mistral3.vision.embedding_length"] = vision.HiddenSize
 	kv["mistral3.vision.feed_forward_length"] = vision.IntermediateSize
 	kv["mistral3.vision.attention.head_count"] = vision.NumAttentionHeads
 	kv["mistral3.vision.attention.key_length"] = vision.HeadDim
 	kv["mistral3.vision.image_size"] = vision.ImageSize
 	kv["mistral3.vision.patch_size"] = vision.PatchSize
 	kv["mistral3.vision.num_channels"] = vision.NumChannels
 	// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
 	kv["mistral3.vision.rope.freq_base"] = vision.RopeTheta
 	// Multimodal configuration
 	kv["mistral3.image_token_index"] = p.ImageTokenIndex
 	kv["mistral3.spatial_merge_size"] = p.SpatialMergeSize
 	kv["mistral3.mm.projector_bias"] = p.MultiModalProjectorBias
 	if act := p.ProjectorHiddenAct; act != "" {
 		kv["mistral3.mm.projector_hidden_act"] = act
 	}
 	return kv
 }
 // Tensors converts every input tensor to a ggml.Tensor. Non-vision tensors
 // (names not starting with "v.") whose name ends in ".attn_q.weight" or
 // ".attn_k.weight" are given the repack function first.
 func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	for _, t := range ts {
 		name := t.Name()
 		isQK := strings.HasSuffix(name, ".attn_q.weight") || strings.HasSuffix(name, ".attn_k.weight")
 		if isQK && !strings.HasPrefix(name, "v.") {
 			t.SetRepacker(p.repack)
 		}
 		converted := &ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
 		}
 		out = append(out, converted)
 	}
 	return out
 }
 // Replacements returns the tensor-name rewrite table as a flat list of
 // (old, new) string pairs.
 //
 // NOTE(review): presumably these pairs are passed to strings.NewReplacer, where
 // argument order decides which pattern wins at a given position, so entries
 // should not be reordered without checking — confirm against the caller.
 func (p *mistral3Model) Replacements() []string {
 	return []string{
 		// language model prefixes; the more specific forms are listed first
 		"language_model.model.norm", "output_norm",
 		"language_model.model.", "",
 		"language_model.", "",
 		"layers", "blk",
 		"transformer.layers", "blk",
 		"vision_tower", "v",
 		"ln_pre", "encoder_norm",
 		"input_layernorm", "attn_norm",
 		"post_attention_layernorm", "ffn_norm",
 		"embed_tokens", "token_embd",
 		// attention and MLP projections using the "self_attn." / "mlp." naming
 		"self_attn.q_proj", "attn_q",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.v_proj", "attn_v",
 		"self_attn.o_proj", "attn_output",
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.up_proj", "ffn_up",
 		// the same projections using the "attention." / "feed_forward." naming
 		"attention.q_proj", "attn_q",
 		"attention.k_proj", "attn_k",
 		"attention.v_proj", "attn_v",
 		"attention.o_proj", "attn_output",
 		"attention_norm", "attn_norm",
 		"feed_forward.gate_proj", "ffn_gate",
 		"feed_forward.down_proj", "ffn_down",
 		"feed_forward.up_proj", "ffn_up",
 		"multi_modal_projector", "mm",
 		// maps the name to itself
 		"ffn_norm", "ffn_norm",
 		"lm_head", "output",
 	}
 }
 // repack permutes an attn_q or attn_k weight: the first dimension is split
 // into (heads, 2, rest), the two inner axes are swapped, the original shape is
 // restored, and the transposed result is returned as a flat []float32.
 //
 // The head count comes from the text model: NumAttentionHeads for attn_q,
 // NumKeyValueHeads (falling back to NumAttentionHeads) for attn_k. Any other
 // tensor name is an error.
 func (p *mistral3Model) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for i := range shape {
 		dims = append(dims, int(shape[i]))
 	}
 	var heads uint32
 	switch {
 	case strings.HasSuffix(name, ".attn_q.weight"):
 		heads = p.TextModel.NumAttentionHeads
 	case strings.HasSuffix(name, ".attn_k.weight"):
 		heads = cmp.Or(p.TextModel.NumKeyValueHeads, p.TextModel.NumAttentionHeads)
 	default:
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
 	}
 	dense := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 	headCount := int(heads)
 	perHead := append([]int{headCount, 2, dims[0] / headCount / 2}, dims[1:]...)
 	if err := dense.Reshape(perHead...); err != nil {
 		return nil, err
 	}
 	if err := dense.T(0, 2, 1, 3); err != nil {
 		return nil, err
 	}
 	if err := dense.Reshape(dims...); err != nil {
 		return nil, err
 	}
 	if err := dense.Transpose(); err != nil {
 		return nil, err
 	}
 	rows, err := native.SelectF32(dense, 1)
 	if err != nil {
 		return nil, err
 	}
 	// concatenate the selected rows into a single vector
 	var flat []float32
 	for _, row := range rows {
 		flat = append(flat, row...)
 	}
 	return flat, nil
 }
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -2,6 +2,9 @@ package convert
 import (
 	"fmt"
 	"io"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -26,39 +29,66 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
-func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
-	merges := make([]merge, 0, p.NumHiddenLayers*6)
+	oldnew := []string{
-	for i := range p.NumHiddenLayers {
+		"model.layers", "blk",
-		merges = append(merges, merge{
+		"w1", "ffn_gate_exps",
-			fmt.Sprintf("blk.%d.*.w1.weight", i),
+		"w2", "ffn_down_exps",
-			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
+		"w3", "ffn_up_exps",
-		}, merge{
+	}
-			fmt.Sprintf("blk.%d.*.w1.bias", i),
+
-			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
+	for i := range p.NumLocalExperts {
-		}, merge{
+		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
-			fmt.Sprintf("blk.%d.*.w2.weight", i),
+	}
-			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
+
-		}, merge{
+	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
-			fmt.Sprintf("blk.%d.*.w2.bias", i),
+	namer := strings.NewReplacer(oldnew...)
-			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
+	experts := make(map[string]experts)
-		}, merge{
+
-			fmt.Sprintf("blk.%d.*.w3.weight", i),
+	// merge experts into a single tensor while removing them from ts
-			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
+	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
-		}, merge{
+		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
-			fmt.Sprintf("blk.%d.*.w3.bias", i),
+			return false
-			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
+		}
 		name := namer.Replace(t.Name())
 		experts[name] = append(experts[name], t)
 		return true
 	})
 	var out []ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
 		out = append(out, ggml.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
 			WriterTo: e,
 		})
 	}
 	out, ts := mergeTensors(ts, merges...)
 	return append(out, p.llamaModel.Tensors(ts)...)
 }
 // Replacements returns the llama replacement pairs followed by the
 // Mixtral-specific (old, new) pairs for layer, router and expert names.
 func (p *mixtralModel) Replacements() []string {
 	mixtralPairs := []string{
 		"model.layers", "blk",
 		"block_sparse_moe.gate", "ffn_gate_inp",
 		"block_sparse_moe.experts.", ".",
 	}
 	return append(p.llamaModel.Replacements(), mixtralPairs...)
 }
 // experts is a group of per-expert tensors that are written out back to back
 // as if they were one merged tensor.
 type experts []Tensor
 // WriteTo writes every expert tensor to w in slice order and returns the sum
 // of the byte counts reported by the individual tensors. It stops at the
 // first error and returns the count accumulated so far together with it.
 func (e experts) WriteTo(w io.Writer) (int64, error) {
 	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
 	var written int64
 	for _, t := range e {
 		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
 		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
 		// this accomplishes the same thing by writing each expert tensor in sequence
 		n, err := t.WriteTo(w)
 		written += n
 		if err != nil {
 			return written, err
 		}
 	}
 	return written, nil
 }
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@@ -1,179 +0,0 @@
 package convert
 import (
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 )
 // mllamaModel holds the configuration decoded for an mllama checkpoint: the
 // shared ModelParameters plus a "text_config" and a "vision_config" section.
 type mllamaModel struct {
 	ModelParameters
 	// TextModel is decoded from "text_config"; it embeds llamaModel and adds the
 	// list emitted as "mllama.attention.cross_attention_layers".
 	TextModel struct {
 		llamaModel
 		CrossAttentionLayers []int32 `json:"cross_attention_layers"`
 	} `json:"text_config"`
 	// VisionModel is decoded from "vision_config".
 	VisionModel struct {
 		NumHiddenLayers           uint32  `json:"num_hidden_layers"`
 		NumGlobalLayers           uint32  `json:"num_global_layers"`
 		IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
 		HiddenSize       uint32 `json:"hidden_size"`
 		IntermediateSize uint32 `json:"intermediate_size"`
 		// AttentionHeads is also the head count repack uses for the vision
 		// attn_q / attn_k weights.
 		AttentionHeads uint32 `json:"attention_heads"`
 		ImageSize   uint32  `json:"image_size"`
 		PatchSize   uint32  `json:"patch_size"`
 		NumChannels uint32  `json:"num_channels"`
 		MaxNumTiles uint32  `json:"max_num_tiles"`
 		NormEpsilon float32 `json:"norm_eps"`
 		// NOTE(review): RopeTheta is not read by KV in this file, and its JSON
 		// tag looks like an output key rather than a config key — confirm.
 		RopeTheta   float32 `json:"rope.freq_base"`
 	} `json:"vision_config"`
 }
 // KV builds the "mllama." metadata: the shared model parameters, every
 // "llama."-prefixed key of the embedded text model re-published under
 // "mllama.", the cross-attention layer list, and the vision settings.
 func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
 	text, vision := &m.TextModel, &m.VisionModel
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "mllama"
 	for key, value := range text.KV(t) {
 		if !strings.HasPrefix(key, "llama.") {
 			continue
 		}
 		kv[strings.ReplaceAll(key, "llama.", "mllama.")] = value
 	}
 	kv["mllama.attention.cross_attention_layers"] = text.CrossAttentionLayers
 	// vision model
 	kv["mllama.vision.block_count"] = vision.NumHiddenLayers
 	kv["mllama.vision.global.block_count"] = vision.NumGlobalLayers
 	kv["mllama.vision.intermediate_layers_indices"] = vision.IntermediateLayersIndices
 	kv["mllama.vision.embedding_length"] = vision.HiddenSize
 	kv["mllama.vision.feed_forward_length"] = vision.IntermediateSize
 	kv["mllama.vision.attention.head_count"] = vision.AttentionHeads
 	kv["mllama.vision.attention.layer_norm_epsilon"] = vision.NormEpsilon
 	kv["mllama.vision.image_size"] = vision.ImageSize
 	kv["mllama.vision.patch_size"] = vision.PatchSize
 	kv["mllama.vision.max_num_tiles"] = vision.MaxNumTiles
 	kv["mllama.vision.num_channels"] = vision.NumChannels
 	return kv
 }
 // Replacements returns the text model's replacement pairs followed by the
 // mllama-specific (old, new) pairs, in the order listed below.
 func (m *mllamaModel) Replacements() []string {
 	mllamaPairs := []string{
 		"language_model.", "",
 		"gate_attn", "attn_gate",
 		"gate_ffn", "ffn_gate",
 		"cross_attn.", "cross_attn_",
 		"vision_model", "v",
 		"class_embedding", "class_embd",
 		"patch_embedding", "patch_embd",
 		"gated_positional_embedding.tile_embedding", "tile_position_embd",
 		"gated_positional_embedding.embedding", "position_embd.weight",
 		"gated_positional_embedding", "position_embd",
 		"embedding.weight", "weight",
 		"pre_tile_positional_embedding", "pre_tile_position_embd",
 		"post_tile_positional_embedding", "post_tile_position_embd",
 		"layernorm_pre", "pre_ln",
 		"layernorm_post", "post_ln",
 		"global_transformer.layers", "global.blk",
 		"transformer.layers", "blk",
 		"mlp.fc1", "ffn_up",
 		"mlp.fc2", "ffn_down",
 		"multi_modal_projector", "mm.0",
 	}
 	return append(m.TextModel.Replacements(), mllamaPairs...)
 }
 // Tensors converts the vision ("v.") and projector ("mm.") tensors here and
 // delegates every other tensor to the embedded text model, whose results are
 // appended after the vision ones.
 //
 // "v.position_embd.gate" is emitted twice, once under its own name and once
 // as "v.tile_position_embd.gate", each clone with its own repacker. Tile gates,
 // attention/ffn gates and attn_q / attn_k weights are repacked in place.
 func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var (
 		out  []*ggml.Tensor
 		text []Tensor
 	)
 	for _, t := range ts {
 		name := t.Name()
 		isVision := strings.HasPrefix(name, "v.") || strings.HasPrefix(name, "mm.")
 		switch {
 		case !isVision:
 			text = append(text, t)
 		case name == "v.position_embd.gate":
 			for _, gate := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
 				clone := t.Clone()
 				clone.SetRepacker(m.repack(gate))
 				out = append(out, &ggml.Tensor{
 					Name:     gate,
 					Kind:     t.Kind(),
 					Shape:    t.Shape(),
 					WriterTo: clone,
 				})
 			}
 		default:
 			needsRepack := name == "v.pre_tile_position_embd.gate" ||
 				name == "v.post_tile_position_embd.gate" ||
 				strings.HasSuffix(name, "attn_q.weight") ||
 				strings.HasSuffix(name, "attn_k.weight") ||
 				strings.HasSuffix(name, "attn_gate") ||
 				strings.HasSuffix(name, "ffn_gate")
 			if needsRepack {
 				t.SetRepacker(m.repack(t.Name()))
 			}
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			})
 		}
 	}
 	return append(out, m.TextModel.Tensors(text)...)
 }
 // repack returns a Repacker bound to the given output tensor name.
 //
 // The returned function ignores the name it is called with and decides what to
 // do from the captured name instead; this is what lets Tensors attach
 // different repackers to two clones of the same source tensor.
 //
 //   - names ending in "attn_q.weight" / "attn_k.weight": the first dimension is
 //     split into (heads, 2, rest) using the vision head count, the two inner
 //     axes are swapped, the original shape is restored and the result is
 //     transposed.
 //   - any other name: tanh is applied element-wise; for "v.position_embd.gate"
 //     the result is additionally replaced by 1 - tanh(x).
 //
 // In both cases the result is materialized and returned as a flat []float32.
 func (m *mllamaModel) repack(name string) Repacker {
 	return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
 		dims := make([]int, len(shape))
 		for i, dim := range shape {
 			dims[i] = int(dim)
 		}
 		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 		if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
 			heads := m.VisionModel.AttentionHeads
 			// integer division: panics if heads is zero
 			if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
 				return nil, err
 			}
 			if err := t.T(0, 2, 1, 3); err != nil {
 				return nil, err
 			}
 			if err := t.Reshape(dims...); err != nil {
 				return nil, err
 			}
 			if err := t.Transpose(); err != nil {
 				return nil, err
 			}
 		} else {
 			// these assignments use the named return err, not a shadowed one
 			t, err = tensor.Tanh(t)
 			if err != nil {
 				return nil, err
 			}
 			if name == "v.position_embd.gate" {
 				t, err = tensor.Sub(float32(1), t)
 				if err != nil {
 					return nil, err
 				}
 			}
 		}
 		t = tensor.Materialize(t)
 		// flatten the tensor so it can be returned as a vector
 		if err := t.Reshape(t.Shape().TotalSize()); err != nil {
 			return nil, err
 		}
 		return native.VectorF32(t.(*tensor.Dense))
 	}
 }
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
-func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 	var addRopeFactors sync.Once
-	out := make([]*ggml.Tensor, 0, len(ts)+2)
+	out := make([]ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, &ggml.Tensor{
+				out = append(out, ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, &ggml.Tensor{
+				}, ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 			})
 		}
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
@@ -118,5 +118,6 @@ func (p *phi3Model) Replacements() []string {
 type ropeFactor []float32
 func (r ropeFactor) WriteTo(w io.Writer) (int64, error) {
-	return 0, binary.Write(w, binary.LittleEndian, r)
+	err := binary.Write(w, binary.LittleEndian, r)
 	return 0, err
 }
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -15,7 +15,6 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
 		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -40,18 +39,16 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
 	case "mrope", "default":
 		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
 	return kv
 }
-func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
+func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -1,102 +0,0 @@
 package convert
 import (
 	"cmp"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // qwen25VLModel holds the configuration decoded for a Qwen 2.5 VL checkpoint:
 // the embedded qwen2Model for the text side plus a "vision_config" section.
 type qwen25VLModel struct {
 	qwen2Model
 	// VisionModel is decoded from "vision_config". KV substitutes defaults for
 	// several of these fields when they are left at their zero value.
 	VisionModel struct {
 		Depth               uint32  `json:"depth"`
 		HiddenSize          uint32  `json:"hidden_size"`
 		NumHeads            uint32  `json:"num_heads"`
 		InChannels          uint32  `json:"in_chans"`
 		PatchSize           uint32  `json:"patch_size"`
 		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
 		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
 		WindowSize          uint32  `json:"window_size"`
 		// RMSNormEps is read from "layer_norm_epsilon" and emitted as
 		// "qwen25vl.vision.attention.layer_norm_epsilon".
 		RMSNormEps          float32 `json:"layer_norm_epsilon"`
 		RopeTheta           float32 `json:"rope_theta"`
 		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
 		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
 	} `json:"vision_config"`
 }
 var _ ModelConverter = (*qwen25VLModel)(nil)
 // KV builds the "qwen25vl." metadata: the shared model parameters, every
 // "qwen2."-prefixed key of the embedded text model re-published under
 // "qwen25vl.", and the vision settings.
 //
 // Vision fields left at their zero value fall back to the defaults given in
 // the cmp.Or calls below. A missing "fullatt_block_indexes" list falls back to
 // {7, 15, 23, 31}; the default is resolved before the key is written so that a
 // later assignment cannot replace it with a nil list.
 func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen25vl"
 	for k, v := range q.qwen2Model.KV(t) {
 		if strings.HasPrefix(k, "qwen2.") {
 			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
 		}
 	}
 	fullAttentionBlocks := q.VisionModel.FullAttentionBlocks
 	if fullAttentionBlocks == nil {
 		fullAttentionBlocks = []int32{7, 15, 23, 31}
 	}
 	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
 	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
 	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
 	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
 	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
 	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
 	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
 	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
 	kv["qwen25vl.vision.fullatt_block_indexes"] = fullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
 	return kv
 }
 // Tensors converts the input tensors to ggml tensors.
 //
 // A "patch_embed.proj" tensor is split in two along dimension 2 (named
 // patch_embd_0 / patch_embd_1) with size-1 dimensions dropped from each part.
 // A fused "attn.qkv" tensor is split in three along dimension 0 into attn_q,
 // attn_k and attn_v. Everything else is passed through unchanged.
 func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	for _, t := range ts {
 		name := t.Name()
 		switch {
 		case strings.Contains(name, "patch_embed.proj"):
 			parts := splitDim(t, 2,
 				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
 				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
 			)
 			for part := range parts {
 				part.Shape = slices.DeleteFunc(part.Shape, func(dim uint64) bool { return dim == 1 })
 				out = append(out, part)
 			}
 		case strings.Contains(name, "attn.qkv"):
 			parts := splitDim(t, 0,
 				split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
 				split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
 				split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
 			)
 			for part := range parts {
 				out = append(out, part)
 			}
 		default:
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			})
 		}
 	}
 	return out
 }
 // Replacements returns the qwen2 replacement pairs followed by the (old, new)
 // pairs for the vision tower's tensor names.
 func (p *qwen25VLModel) Replacements() []string {
 	visionPairs := []string{
 		"visual", "v",
 		"blocks", "blk",
 		"attn.proj", "attn_out",
 		"norm1", "ln1",
 		"norm2", "ln2",
 	}
 	return append(p.qwen2Model.Replacements(), visionPairs...)
 }
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -11,14 +11,15 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
-	"maps"
+	"math"
 	"os"
 	"path/filepath"
 	"slices"
 	"strings"
 	"testing"
-	"github.com/google/go-cmp/cmp"
+	"golang.org/x/exp/maps"
 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -47,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })
-	m, err := ggml.Decode(r, -1)
+	m, _, err := ggml.Decode(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -130,14 +131,15 @@ func TestConvertModel(t *testing.T) {
 			if err != nil {
 				t.Fatal(err)
 			}
 			defer expectFile.Close()
 			var expect map[string]string
 			if err := json.NewDecoder(expectFile).Decode(&expect); err != nil {
 				t.Fatal(err)
 			}
-			for _, k := range slices.Sorted(maps.Keys(expect)) {
+			keys := maps.Keys(expect)
 			slices.Sort(keys)
 			for _, k := range keys {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != expect[k] {
@@ -330,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()
-			m, err := ggml.Decode(r, -1)
+			m, _, err := ggml.Decode(r, math.MaxInt)
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -340,8 +342,15 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
-			if diff := cmp.Diff(c.Expected, actual); diff != "" {
+
-				t.Errorf("mismatch (-want +got):\n%s", diff)
+			keys := maps.Keys(c.Expected)
 			slices.Sort(keys)
 			for _, k := range keys {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != c.Expected[k] {
 					t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
 				}
 			}
 		})
 	}
--- a/convert/fs.go
+++ b/convert/fs.go
@@ -0,0 +1,58 @@
 package convert
 import (
 	"archive/zip"
 	"errors"
 	"io"
 	"io/fs"
 	"os"
 	"path/filepath"
 )
 type ZipReader struct {
 	r *zip.Reader
 	p string
 	// limit is the maximum size of a file that can be read directly
 	// from the zip archive. Files larger than this size will be extracted
 	limit int64
 }
 func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
 	return &ZipReader{r, p, limit}
 }
 func (z *ZipReader) Open(name string) (fs.File, error) {
 	r, err := z.r.Open(name)
 	if err != nil {
 		return nil, err
 	}
 	defer r.Close()
 	if fi, err := r.Stat(); err != nil {
 		return nil, err
 	} else if fi.Size() < z.limit {
 		return r, nil
 	}
 	if !filepath.IsLocal(name) {
 		return nil, zip.ErrInsecurePath
 	}
 	n := filepath.Join(z.p, name)
 	if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
 		w, err := os.Create(n)
 		if err != nil {
 			return nil, err
 		}
 		defer w.Close()
 		if _, err := io.Copy(w, r); err != nil {
 			return nil, err
 		}
 	} else if err != nil {
 		return nil, err
 	}
 	return os.Open(n)
 }
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -11,15 +11,14 @@ type Tensor interface {
 	Name() string
 	Shape() []uint64
 	Kind() uint32
-	SetRepacker(Repacker)
+	SetRepacker(repacker)
 	WriteTo(io.Writer) (int64, error)
 	Clone() Tensor
 }
 type tensorBase struct {
-	name     string
+	name  string
-	shape    []uint64
+	shape []uint64
-	repacker Repacker
+	repacker
 }
 func (t tensorBase) Name() string {
@@ -31,46 +30,42 @@ func (t tensorBase) Shape() []uint64 {
 }
 const (
-	tensorKindFP32 uint32 = iota
+	tensorKindF32 uint32 = iota
-	tensorKindFP16
+	tensorKindF16
 	tensorKindBF16  = 30
 	tensorKindMXFP4 = 39
 )
 func (t tensorBase) Kind() uint32 {
 	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
-		strings.HasSuffix(t.name, ".bias") ||
+		t.name == "token_types.weight" {
 		t.name == "token_types.weight" ||
 		t.name == "v.positional_embedding_vlm" ||
 		t.name == "v.tile_position_embd.weight" ||
 		t.name == "v.pre_tile_position_embd.weight" ||
 		t.name == "v.post_tile_position_embd.weight" {
 		// these tensors are always F32
-		return tensorKindFP32
+		return 0
 	}
 	switch len(t.shape) {
 	case 0:
 		panic("invalid tensor shape")
 	case 1:
-		return tensorKindFP32
+		return tensorKindF32
 	default:
-		return tensorKindFP16
+		return tensorKindF16
 	}
 }
-func (t *tensorBase) SetRepacker(fn Repacker) {
+func (t *tensorBase) SetRepacker(fn repacker) {
 	t.repacker = fn
 }
-type Repacker func(string, []float32, []uint64) ([]float32, error)
+type repacker func(string, []float32, []uint64) ([]float32, error)
 func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 	patterns := []struct {
 		Pattern string
 		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
 	}{
-		{"*.safetensors", parseSafetensors},
+		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
 		{"adapters.safetensors", parseSafetensors},
 		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -1,7 +1,6 @@
 package convert
 import (
 	"bufio"
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
@@ -9,12 +8,12 @@ import (
 	"fmt"
 	"io"
 	"io/fs"
 	"maps"
 	"slices"
 	"strings"
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
 	"golang.org/x/exp/maps"
 )
 type safetensorMetadata struct {
@@ -47,7 +46,8 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 			return nil, err
 		}
-		keys := slices.Sorted(maps.Keys(headers))
+		keys := maps.Keys(headers)
 		slices.Sort(keys)
 		names := make(map[string]struct{}, len(keys))
@@ -94,30 +94,6 @@ type safetensor struct {
 	*tensorBase
 }
 // Kind reports the output kind for the tensor. The base kind is kept for
 // vision tensors (names starting with "v."), for sources that are not BF16,
 // and whenever the base kind is FP32; every other BF16 source stays BF16.
 func (st safetensor) Kind() uint32 {
 	base := st.tensorBase.Kind()
 	switch {
 	case strings.HasPrefix(st.name, "v."), st.dtype != "BF16", base == tensorKindFP32:
 		return base
 	default:
 		return tensorKindBF16
 	}
 }
 // Clone returns a copy of the tensor with its own tensorBase and its own
 // copy of the shape, so the clone's repacker and shape can change without
 // touching the receiver.
 func (st safetensor) Clone() Tensor {
 	base := &tensorBase{
 		name:     st.name,
 		repacker: st.repacker,
 		shape:    slices.Clone(st.shape),
 	}
 	dup := safetensor{
 		fs:         st.fs,
 		path:       st.path,
 		dtype:      st.dtype,
 		offset:     st.offset,
 		size:       st.size,
 		tensorBase: base,
 	}
 	return &dup
 }
 func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	f, err := st.fs.Open(st.path)
 	if err != nil {
@@ -125,41 +101,26 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	}
 	defer f.Close()
-	r, err := func() (io.Reader, error) {
+	if seeker, ok := f.(io.Seeker); ok {
-		if readerAt, ok := f.(io.ReaderAt); ok {
+		if _, err := seeker.Seek(st.offset, io.SeekStart); err != nil {
-			return io.NewSectionReader(readerAt, st.offset, st.size), nil
+			return 0, err
-		} else if seeker, ok := f.(io.Seeker); ok {
+		}
-			_, err := seeker.Seek(st.offset, io.SeekStart)
+	} else {
-			return f, err
+		if _, err := io.CopyN(io.Discard, f, st.offset); err != nil {
-		} else {
+			return 0, err
 			_, err := io.CopyN(io.Discard, f, st.offset)
 			return f, err
 		}
 	}()
 	if err != nil {
 		return 0, err
 	}
 	br := bufio.NewReaderSize(r, min(32<<10, int(st.size)))
 	// special case when input and output are same type and the
 	// tensor doesn't need repacking
 	if (st.repacker == nil) &&
 		((st.dtype == "F32" && st.Kind() == tensorKindFP32) ||
 			(st.dtype == "F16" && st.Kind() == tensorKindFP16) ||
 			(st.dtype == "U8")) {
 		return io.CopyN(w, br, st.size)
 	}
 	var f32s []float32
 	switch st.dtype {
 	case "F32":
 		f32s = make([]float32, st.size/4)
-		if err = binary.Read(br, binary.LittleEndian, f32s); err != nil {
+		if err = binary.Read(f, binary.LittleEndian, f32s); err != nil {
 			return 0, err
 		}
 	case "F16":
 		u16s := make([]uint16, st.size/2)
-		if err = binary.Read(br, binary.LittleEndian, u16s); err != nil {
+		if err = binary.Read(f, binary.LittleEndian, u16s); err != nil {
 			return 0, err
 		}
@@ -170,7 +131,7 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	case "BF16":
 		u8s := make([]uint8, st.size)
-		if err = binary.Read(br, binary.LittleEndian, u8s); err != nil {
+		if err = binary.Read(f, binary.LittleEndian, u8s); err != nil {
 			return 0, err
 		}
@@ -187,18 +148,15 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	}
 	switch st.Kind() {
-	case tensorKindFP32:
+	case tensorKindF32:
-		return int64(len(f32s) * 4), binary.Write(w, binary.LittleEndian, f32s)
+		return 0, binary.Write(w, binary.LittleEndian, f32s)
-	case tensorKindFP16:
+	case tensorKindF16:
 		f16s := make([]uint16, len(f32s))
 		for i := range f32s {
 			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
 		}
-		return int64(len(f16s) * 2), binary.Write(w, binary.LittleEndian, f16s)
+		return 0, binary.Write(w, binary.LittleEndian, f16s)
 	case tensorKindBF16:
 		u8s := bfloat16.EncodeFloat32(f32s)
 		return int64(len(u8s)), binary.Write(w, binary.LittleEndian, u8s)
 	default:
 		return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
 	}
--- a/convert/reader_test.go
+++ b/convert/reader_test.go
@@ -1,294 +0,0 @@
 package convert
 import (
 	"bytes"
 	"encoding/binary"
 	"os"
 	"path/filepath"
 	"testing"
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/google/go-cmp/cmp"
 	"github.com/x448/float16"
 )
 // TestSafetensors checks the bytes safetensor.WriteTo emits for each source
 // dtype. Every case writes a raw payload of the values 0..31 to a file in a
 // temporary root, points a safetensor at that file, and compares the written
 // output against want. Case names read "<source dtype>-<output kind>".
 func TestSafetensors(t *testing.T) {
 	t.Parallel()
 	root, err := os.OpenRoot(t.TempDir())
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer root.Close()
 	cases := []struct {
 		name,
 		dtype string
 		offset,
 		size int64
 		shape []uint64
 		// setup writes the source payload for the case into the file
 		setup func(*testing.T, *os.File)
 		// want is the exact byte sequence WriteTo is expected to produce
 		want  []byte
 	}{
 		{
 			name:  "fp32-fp32",
 			dtype: "F32",
 			size:  32 * 4, // 32 floats, each 4 bytes
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
 				f32s := make([]float32, 32)
 				for i := range f32s {
 					f32s[i] = float32(i)
 				}
 				if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40,
 				0x00, 0x00, 0x80, 0x40, 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40,
 				0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41,
 				0x00, 0x00, 0x40, 0x41, 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41,
 				0x00, 0x00, 0x80, 0x41, 0x00, 0x00, 0x88, 0x41, 0x00, 0x00, 0x90, 0x41, 0x00, 0x00, 0x98, 0x41,
 				0x00, 0x00, 0xa0, 0x41, 0x00, 0x00, 0xa8, 0x41, 0x00, 0x00, 0xb0, 0x41, 0x00, 0x00, 0xb8, 0x41,
 				0x00, 0x00, 0xc0, 0x41, 0x00, 0x00, 0xc8, 0x41, 0x00, 0x00, 0xd0, 0x41, 0x00, 0x00, 0xd8, 0x41,
 				0x00, 0x00, 0xe0, 0x41, 0x00, 0x00, 0xe8, 0x41, 0x00, 0x00, 0xf0, 0x41, 0x00, 0x00, 0xf8, 0x41,
 			},
 		},
 		{
 			name:  "fp32-fp16",
 			dtype: "F32",
 			size:  32 * 4, // 32 floats, each 4 bytes
 			shape: []uint64{16, 2},
 			setup: func(t *testing.T, f *os.File) {
 				f32s := make([]float32, 32)
 				for i := range f32s {
 					f32s[i] = float32(i)
 				}
 				if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x00, 0x00, 0x3c, 0x00, 0x40, 0x00, 0x42, 0x00, 0x44, 0x00, 0x45, 0x00, 0x46, 0x00, 0x47,
 				0x00, 0x48, 0x80, 0x48, 0x00, 0x49, 0x80, 0x49, 0x00, 0x4a, 0x80, 0x4a, 0x00, 0x4b, 0x80, 0x4b,
 				0x00, 0x4c, 0x40, 0x4c, 0x80, 0x4c, 0xc0, 0x4c, 0x00, 0x4d, 0x40, 0x4d, 0x80, 0x4d, 0xc0, 0x4d,
 				0x00, 0x4e, 0x40, 0x4e, 0x80, 0x4e, 0xc0, 0x4e, 0x00, 0x4f, 0x40, 0x4f, 0x80, 0x4f, 0xc0, 0x4f,
 			},
 		},
 		{
 			name:  "fp16-fp16",
 			dtype: "F16",
 			size:  32 * 2, // 32 half floats, each 2 bytes
 			shape: []uint64{16, 2},
 			setup: func(t *testing.T, f *os.File) {
 				u16s := make([]uint16, 32)
 				for i := range u16s {
 					u16s[i] = float16.Fromfloat32(float32(i)).Bits()
 				}
 				if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x00, 0x00, 0x3c, 0x00, 0x40, 0x00, 0x42, 0x00, 0x44, 0x00, 0x45, 0x00, 0x46, 0x00, 0x47,
 				0x00, 0x48, 0x80, 0x48, 0x00, 0x49, 0x80, 0x49, 0x00, 0x4a, 0x80, 0x4a, 0x00, 0x4b, 0x80, 0x4b,
 				0x00, 0x4c, 0x40, 0x4c, 0x80, 0x4c, 0xc0, 0x4c, 0x00, 0x4d, 0x40, 0x4d, 0x80, 0x4d, 0xc0, 0x4d,
 				0x00, 0x4e, 0x40, 0x4e, 0x80, 0x4e, 0xc0, 0x4e, 0x00, 0x4f, 0x40, 0x4f, 0x80, 0x4f, 0xc0, 0x4f,
 			},
 		},
 		{
 			name:  "fp16-fp32",
 			dtype: "F16",
 			size:  32 * 2, // 32 half floats, each 2 bytes
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
 				u16s := make([]uint16, 32)
 				for i := range u16s {
 					u16s[i] = float16.Fromfloat32(float32(i)).Bits()
 				}
 				if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40,
 				0x00, 0x00, 0x80, 0x40, 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40,
 				0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41,
 				0x00, 0x00, 0x40, 0x41, 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41,
 				0x00, 0x00, 0x80, 0x41, 0x00, 0x00, 0x88, 0x41, 0x00, 0x00, 0x90, 0x41, 0x00, 0x00, 0x98, 0x41,
 				0x00, 0x00, 0xa0, 0x41, 0x00, 0x00, 0xa8, 0x41, 0x00, 0x00, 0xb0, 0x41, 0x00, 0x00, 0xb8, 0x41,
 				0x00, 0x00, 0xc0, 0x41, 0x00, 0x00, 0xc8, 0x41, 0x00, 0x00, 0xd0, 0x41, 0x00, 0x00, 0xd8, 0x41,
 				0x00, 0x00, 0xe0, 0x41, 0x00, 0x00, 0xe8, 0x41, 0x00, 0x00, 0xf0, 0x41, 0x00, 0x00, 0xf8, 0x41,
 			},
 		},
 		{
 			name:  "bf16-bf16",
 			dtype: "BF16",
 			size:  32 * 2, // 32 brain floats, each 2 bytes
 			shape: []uint64{16, 2},
 			setup: func(t *testing.T, f *os.File) {
 				f32s := make([]float32, 32)
 				for i := range f32s {
 					f32s[i] = float32(i)
 				}
 				if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x00, 0x80, 0x3f, 0x00, 0x40, 0x40, 0x40, 0x80, 0x40, 0xa0, 0x40, 0xc0, 0x40, 0xe0, 0x40,
 				0x00, 0x41, 0x10, 0x41, 0x20, 0x41, 0x30, 0x41, 0x40, 0x41, 0x50, 0x41, 0x60, 0x41, 0x70, 0x41,
 				0x80, 0x41, 0x88, 0x41, 0x90, 0x41, 0x98, 0x41, 0xa0, 0x41, 0xa8, 0x41, 0xb0, 0x41, 0xb8, 0x41,
 				0xc0, 0x41, 0xc8, 0x41, 0xd0, 0x41, 0xd8, 0x41, 0xe0, 0x41, 0xe8, 0x41, 0xf0, 0x41, 0xf8, 0x41,
 			},
 		},
 		{
 			name:  "bf16-fp32",
 			dtype: "BF16",
 			size:  32 * 2, // 32 brain floats, each 2 bytes
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
 				f32s := make([]float32, 32)
 				for i := range f32s {
 					f32s[i] = float32(i)
 				}
 				if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40,
 				0x00, 0x00, 0x80, 0x40, 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40,
 				0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41,
 				0x00, 0x00, 0x40, 0x41, 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41,
 				0x00, 0x00, 0x80, 0x41, 0x00, 0x00, 0x88, 0x41, 0x00, 0x00, 0x90, 0x41, 0x00, 0x00, 0x98, 0x41,
 				0x00, 0x00, 0xa0, 0x41, 0x00, 0x00, 0xa8, 0x41, 0x00, 0x00, 0xb0, 0x41, 0x00, 0x00, 0xb8, 0x41,
 				0x00, 0x00, 0xc0, 0x41, 0x00, 0x00, 0xc8, 0x41, 0x00, 0x00, 0xd0, 0x41, 0x00, 0x00, 0xd8, 0x41,
 				0x00, 0x00, 0xe0, 0x41, 0x00, 0x00, 0xe8, 0x41, 0x00, 0x00, 0xf0, 0x41, 0x00, 0x00, 0xf8, 0x41,
 			},
 		},
 		{
 			name:  "u8-u8",
 			dtype: "U8",
 			size:  32, // 32 unsigned bytes, each 1 byte
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
 				u8s := make([]uint8, 32)
 				for i := range u8s {
 					u8s[i] = uint8(i)
 				}
 				if err := binary.Write(f, binary.LittleEndian, u8s); err != nil {
 					t.Fatal(err)
 				}
 			},
 			want: []byte{
 				0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 				0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
 			},
 		},
 	}
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
 			// the fixture file is named after the subtest
 			path := filepath.Base(t.Name())
 			st := safetensor{
 				fs:     root.FS(),
 				path:   path,
 				dtype:  tt.dtype,
 				offset: tt.offset,
 				size:   tt.size,
 				tensorBase: &tensorBase{
 					name:  tt.name,
 					shape: tt.shape,
 				},
 			}
 			f, err := root.Create(path)
 			if err != nil {
 				t.Fatal(err)
 			}
 			defer f.Close()
 			tt.setup(t, f)
 			var b bytes.Buffer
 			if _, err := st.WriteTo(&b); err != nil {
 				t.Fatal(err)
 			}
 			if diff := cmp.Diff(tt.want, b.Bytes()); diff != "" {
 				t.Errorf("safetensor.WriteTo() mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}
 }
 // TestSafetensorKind checks how safetensor.Kind combines the source dtype,
 // the tensor name prefix and the base kind derived from the shape.
 func TestSafetensorKind(t *testing.T) {
 	tests := []struct {
 		name     string
 		st       safetensor
 		expected uint32
 	}{
 		{
 			name: "BF16 dtype with non-v. prefix and non-FP32 base kind should return BF16",
 			st: safetensor{
 				tensorBase: &tensorBase{
 					name:  "weight.matrix",
 					shape: []uint64{10, 10}, // will default to FP16
 				},
 				dtype: "BF16",
 			},
 			expected: tensorKindBF16,
 		},
 		{
 			name: "BF16 dtype with v. prefix should return base kind",
 			st: safetensor{
 				tensorBase: &tensorBase{
 					name:  "v.weight.matrix",
 					shape: []uint64{10, 10}, // will default to FP16
 				},
 				dtype: "BF16",
 			},
 			expected: tensorKindFP16,
 		},
 		{
 			name: "BF16 dtype with FP32 base kind should return FP32",
 			st: safetensor{
 				tensorBase: &tensorBase{
 					name:  "weight.matrix",
 					shape: []uint64{10}, // will default to FP32
 				},
 				dtype: "BF16",
 			},
 			expected: tensorKindFP32,
 		},
 		{
 			name: "Non-BF16 dtype should return base kind",
 			st: safetensor{
 				tensorBase: &tensorBase{
 					name:  "weight.matrix",
 					shape: []uint64{10, 10}, // will default to FP16
 				},
 				// safetensors spells half precision "F16"; "FP16" is not a
 				// dtype the format defines
 				dtype: "F16",
 			},
 			expected: tensorKindFP16,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := tt.st.Kind()
 			if result != tt.expected {
 				t.Errorf("Kind() = %d, expected %d", result, tt.expected)
 			}
 		})
 	}
 }
--- a/convert/reader_torch.go
+++ b/convert/reader_torch.go
@@ -43,17 +43,6 @@ type torch struct {
 	*tensorBase
 }
 // Clone returns a torch tensor that shares the receiver's storage and shape
 // slice but carries its own tensorBase, so a repacker set on the clone does
 // not affect the receiver.
 func (t torch) Clone() Tensor {
 	base := tensorBase{
 		name:     t.name,
 		shape:    t.shape,
 		repacker: t.repacker,
 	}
 	return torch{storage: t.storage, tensorBase: &base}
 }
 // WriteTo writes nothing to w; it always reports zero bytes written and a
 // nil error.
 func (pt torch) WriteTo(w io.Writer) (int64, error) {
 	return 0, nil
 }
--- a/convert/sentencepiece/sentencepiece_model.pb.go
+++ b/convert/sentencepiece/sentencepiece_model.pb.go
@@ -1360,7 +1360,7 @@ func file_sentencepiece_model_proto_rawDescGZIP() []byte {
 var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2)
 var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6)
-var file_sentencepiece_model_proto_goTypes = []any{
+var file_sentencepiece_model_proto_goTypes = []interface{}{
 	(TrainerSpec_ModelType)(0),         // 0: sentencepiece.TrainerSpec.ModelType
 	(ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type
 	(*TrainerSpec)(nil),                // 2: sentencepiece.TrainerSpec
@@ -1392,7 +1392,7 @@ func file_sentencepiece_model_proto_init() {
 		return
 	}
 	if !protoimpl.UnsafeEnabled {
-		file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v any, i int) any {
+		file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*TrainerSpec); i {
 			case 0:
 				return &v.state
@@ -1406,7 +1406,7 @@ func file_sentencepiece_model_proto_init() {
 				return nil
 			}
 		}
-		file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v any, i int) any {
+		file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*NormalizerSpec); i {
 			case 0:
 				return &v.state
@@ -1420,7 +1420,7 @@ func file_sentencepiece_model_proto_init() {
 				return nil
 			}
 		}
-		file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v any, i int) any {
+		file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*SelfTestData); i {
 			case 0:
 				return &v.state
@@ -1434,7 +1434,7 @@ func file_sentencepiece_model_proto_init() {
 				return nil
 			}
 		}
-		file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v any, i int) any {
+		file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*ModelProto); i {
 			case 0:
 				return &v.state
@@ -1448,7 +1448,7 @@ func file_sentencepiece_model_proto_init() {
 				return nil
 			}
 		}
-		file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v any, i int) any {
+		file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*SelfTestData_Sample); i {
 			case 0:
 				return &v.state
@@ -1460,7 +1460,7 @@ func file_sentencepiece_model_proto_init() {
 				return nil
 			}
 		}
-		file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v any, i int) any {
+		file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*ModelProto_SentencePiece); i {
 			case 0:
 				return &v.state
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -1,133 +0,0 @@
 package convert
 import (
 	"cmp"
 	"io"
 	"iter"
 	"path"
 	"slices"
 	"strings"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // split describes one output tensor produced by splitDim.
 type split struct {
 	// Replacer rewrites the source tensor's name into the name of this part
 	*strings.Replacer
 	// dim is the size of this part along the split dimension; zero means an
 	// equal share of the dimension (its size divided by the number of splits)
 	dim    int
 	// slices, when non-empty, is applied to the source tensor as given
 	// instead of a contiguous range computed along the split dimension
 	slices []tensor.Slice
 	// fn is an optional function to apply to the tensor after slicing
 	fn func(tensor.Tensor) (tensor.Tensor, error)
 }
 // splitDim yields one ggml tensor per entry in splits, each covering a part
 // of t along dim. A split with a non-zero dim takes that many elements;
 // otherwise the dimension is divided evenly by the number of splits. The
 // slicing itself is installed as the repacker of a clone of t, so it runs
 // when the yielded tensor is written.
 func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 	return func(yield func(*ggml.Tensor) bool) {
 		var start int
 		for _, sp := range splits {
 			part := t.Clone()
 			partShape := slices.Clone(part.Shape())
 			partShape[dim] = cmp.Or(uint64(sp.dim), partShape[dim]/uint64(len(splits)))
 			window := sp.slices
 			if len(window) == 0 {
 				// no explicit slices: take the next contiguous range along dim
 				end := start + int(partShape[dim])
 				window = slices.Repeat([]tensor.Slice{nil}, len(partShape))
 				window[dim] = tensor.S(start, end)
 				start = end
 			}
 			part.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
 				dims := make([]int, 0, len(shape))
 				for _, d := range shape {
 					dims = append(dims, int(d))
 				}
 				var piece tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 				piece, err := piece.Slice(window...)
 				if err != nil {
 					return nil, err
 				}
 				piece = tensor.Materialize(piece)
 				if sp.fn != nil {
 					if piece, err = sp.fn(piece); err != nil {
 						return nil, err
 					}
 				}
 				// flatten tensor so it can be written as a vector
 				if err := piece.Reshape(piece.Shape().TotalSize()); err != nil {
 					return nil, err
 				}
 				return native.VectorF32(piece.(*tensor.Dense))
 			})
 			out := &ggml.Tensor{
 				Name:     sp.Replace(part.Name()),
 				Kind:     part.Kind(),
 				Shape:    partShape,
 				WriterTo: part,
 			}
 			if !yield(out) {
 				return
 			}
 		}
 	}
 }
 // merge pairs a path.Match pattern for selecting tensors by name with the
 // name given to the tensor they are merged into.
 type merge struct {
 	pattern, name string
 }
 // mergeTensors groups tensors by merge pattern. For every merge with at
 // least one matching tensor it emits a single tensor whose shape is the
 // first match's shape with the group size prepended. Tensors matching no
 // pattern are returned as the second result.
 func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
 	for _, m := range merges {
 		var group []Tensor
 		group, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
 			ok, _ := path.Match(m.pattern, t.Name())
 			return ok
 		})
 		if len(group) == 0 {
 			continue
 		}
 		first := group[0]
 		out = append(out, &ggml.Tensor{
 			Name:     m.name,
 			Kind:     first.Kind(),
 			Shape:    append([]uint64{uint64(len(group))}, first.Shape()...),
 			WriterTo: mergeGroup(group),
 		})
 	}
 	return out, unmatched
 }
 // slicesSplitFunc partitions s by the predicate fn: elements for which fn
 // returns true go to matched, the rest to unmatched. Relative order is kept
 // in both results, and a result with no elements is nil. Elements are never
 // compared, so any element type is accepted.
 func slicesSplitFunc[S ~[]E, E any](s S, fn func(e E) bool) (matched, unmatched S) {
 	for _, e := range s {
 		if fn(e) {
 			matched = append(matched, e)
 		} else {
 			unmatched = append(unmatched, e)
 		}
 	}
 	return matched, unmatched
 }
 // mergeGroup is a list of tensors that are written back to back as one.
 type mergeGroup []Tensor
 // WriteTo writes every tensor of the group to w in order and returns the
 // sum of the byte counts the members report. It stops at the first error
 // and returns it together with the count accumulated so far.
 func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
 	var total int64
 	for _, t := range g {
 		n, err := t.WriteTo(w)
 		// count partial writes too, as io.WriterTo requires
 		total += n
 		if err != nil {
 			return total, err
 		}
 	}
 	return total, nil
 }
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -1,953 +0,0 @@
 package convert
 import (
 	"bytes"
 	"encoding/binary"
 	"io"
 	"iter"
 	"slices"
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 )
 // fakeTensor is an in-memory tensor used as a test double: its values live
 // in data and are written out as little-endian float32s.
 type fakeTensor struct {
 	name  string
 	shape []uint64
 	data  []float32
 	// repacker, when set, transforms data before it is written
 	repacker Repacker
 }
 // Name returns the tensor name.
 func (f fakeTensor) Name() string {
 	return f.name
 }
 // Shape returns the tensor shape.
 func (f fakeTensor) Shape() []uint64 {
 	return f.shape
 }
 // Kind always returns 0.
 func (f fakeTensor) Kind() uint32 {
 	return 0
 }
 // SetRepacker installs fn as the function applied to the data in WriteTo.
 func (f *fakeTensor) SetRepacker(fn Repacker) {
 	f.repacker = fn
 }
 // Clone returns a copy with its own shape and data slices; the repacker is
 // carried over.
 func (f fakeTensor) Clone() Tensor {
 	return &fakeTensor{
 		name:     f.name,
 		shape:    slices.Clone(f.shape),
 		data:     slices.Clone(f.data),
 		repacker: f.repacker,
 	}
 }
 // WriteTo runs the repacker, if one is set, over the tensor data and writes
 // the result to w as little-endian float32s. On success it returns the
 // number of bytes written; on any error it returns 0 and the error.
 func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
 	payload := f.data
 	if f.repacker != nil {
 		if payload, err = f.repacker(f.name, payload, f.shape); err != nil {
 			return 0, err
 		}
 	}
 	if err = binary.Write(w, binary.LittleEndian, payload); err != nil {
 		return 0, err
 	}
 	// every float32 occupies four bytes
 	return int64(len(payload) * 4), nil
 }
 // mul returns the product of all dimensions in shape, i.e. the number of
 // elements a tensor of that shape holds. An empty shape yields 1.
 func mul(shape []uint64) int {
 	product := 1
 	for i := 0; i < len(shape); i++ {
 		product *= int(shape[i])
 	}
 	return product
 }
 func TestSplitDim(t *testing.T) {
 	t.Run("2d", func(t *testing.T) {
 		r := fakeTensor{
 			name:  "a.b",
 			shape: []uint64{3, 4},
 			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
 		}
 		t.Run("no split", func(t *testing.T) {
 			for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
 				if tt.Name != "x.b" {
 					t.Fatalf("expected name 'x', got '%s'", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{3, 4}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 		})
 		t.Run("even split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 1,
 				split{Replacer: strings.NewReplacer("a", "x")},
 				split{Replacer: strings.NewReplacer("b", "y")},
 			))
 			defer stop()
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "x.b" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "a.y" {
 					t.Fatal("expected name 'a.y', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{2, 3, 6, 7, 10, 11}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 		})
 		t.Run("uneven split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 0,
 				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
 				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 			))
 			defer stop()
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "x.b" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{2, 4}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "a.y" {
 					t.Fatal("expected name 'a.y', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 		})
 		t.Run("three way split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 0,
 				split{Replacer: strings.NewReplacer("a", "x"), dim: 1},
 				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
 			))
 			defer stop()
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "x.b" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{0, 1, 2, 3}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "a.y" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{4, 5, 6, 7}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "a.z" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{1, 4}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{8, 9, 10, 11}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 		})
 		t.Run("uneven three way split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 1,
 				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
 				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
 			))
 			defer stop()
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "x.b" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{3, 2}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{0, 1, 4, 5, 8, 9}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "a.y" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{3, 1}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{2, 6, 10}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			{
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != "a.z" {
 					t.Fatal("expected name 'x.b', got", tt.Name)
 				}
 				if diff := cmp.Diff(tt.Shape, []uint64{3, 1}); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(f32s, []float32{3, 7, 11}); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 		})
 		// Splits r along dimension 1 into two tensors; the second split additionally passes through a transpose fn.
 		t.Run("split with transpose", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 1,
 				split{Replacer: strings.NewReplacer("a", "x")},
 				split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
 					return tensor.Transpose(tt, 1, 0)
 				}},
 			))
 			defer stop()
 			// expectNext pulls the next split from the iterator and checks its name, shape and decoded float32 payload.
 			expectNext := func(wantName string, wantShape []uint64, wantData []float32) {
 				t.Helper()
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != wantName {
 					t.Fatal("expected name '"+wantName+"', got", tt.Name)
 				}
 				// cmp.Diff(x, y) prefixes x with '-' and y with '+', so the wanted value goes first to match the label.
 				if diff := cmp.Diff(wantShape, tt.Shape); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				// Serialize the split and decode it back as little-endian float32s to compare the payload.
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(wantData, f32s); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			expectNext("x.b", []uint64{3, 2}, []float32{0, 1, 4, 5, 8, 9})
 			expectNext("a.y", []uint64{3, 2}, []float32{2, 6, 10, 3, 7, 11})
 		})
 	})
 	t.Run("3d", func(t *testing.T) {
 		r := fakeTensor{
 			name:  "a.b",
 			shape: []uint64{3, 4, 2},
 			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
 		}
 		// A single split along dimension 0 must yield the whole tensor, renamed by the replacer.
 		t.Run("no split", func(t *testing.T) {
 			for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
 				if tt.Name != "x.b" {
 					// The message names the full expected tensor name, matching the comparison above.
 					t.Fatalf("expected name 'x.b', got '%s'", tt.Name)
 				}
 				// cmp.Diff(x, y) prefixes x with '-' and y with '+', so the wanted value goes first to match the label.
 				if diff := cmp.Diff([]uint64{3, 4, 2}, tt.Shape); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				// Serialize the split and decode it back as little-endian float32s to compare the payload.
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff([]float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, f32s); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 		})
 		// Splits the {3, 4, 2} tensor along dimension 1 into two equal halves.
 		t.Run("even split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 1,
 				split{Replacer: strings.NewReplacer("a", "x")},
 				split{Replacer: strings.NewReplacer("b", "y")},
 			))
 			defer stop()
 			// expectNext pulls the next split from the iterator and checks its name, shape and decoded float32 payload.
 			expectNext := func(wantName string, wantShape []uint64, wantData []float32) {
 				t.Helper()
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != wantName {
 					t.Fatal("expected name '"+wantName+"', got", tt.Name)
 				}
 				// cmp.Diff(x, y) prefixes x with '-' and y with '+', so the wanted value goes first to match the label.
 				if diff := cmp.Diff(wantShape, tt.Shape); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				// Serialize the split and decode it back as little-endian float32s to compare the payload.
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(wantData, f32s); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			expectNext("x.b", []uint64{3, 2, 2}, []float32{0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19})
 			expectNext("a.y", []uint64{3, 2, 2}, []float32{4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23})
 		})
 		// Splits the {3, 4, 2} tensor along dimension 0 into pieces of size 2 and 1.
 		t.Run("uneven split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 0,
 				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
 				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 			))
 			defer stop()
 			// expectNext pulls the next split from the iterator and checks its name, shape and decoded float32 payload.
 			expectNext := func(wantName string, wantShape []uint64, wantData []float32) {
 				t.Helper()
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != wantName {
 					t.Fatal("expected name '"+wantName+"', got", tt.Name)
 				}
 				// cmp.Diff(x, y) prefixes x with '-' and y with '+', so the wanted value goes first to match the label.
 				if diff := cmp.Diff(wantShape, tt.Shape); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				// Serialize the split and decode it back as little-endian float32s to compare the payload.
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(wantData, f32s); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			expectNext("x.b", []uint64{2, 4, 2}, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15})
 			expectNext("a.y", []uint64{1, 4, 2}, []float32{16, 17, 18, 19, 20, 21, 22, 23})
 		})
 		// Splits the {3, 4, 2} tensor along dimension 0 into three pieces of size 1.
 		t.Run("three way split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 0,
 				split{Replacer: strings.NewReplacer("a", "x"), dim: 1},
 				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
 			))
 			defer stop()
 			// expectNext pulls the next split from the iterator and checks its name, shape and decoded float32 payload.
 			// The failure message is built from wantName so it always names the tensor actually being checked.
 			expectNext := func(wantName string, wantShape []uint64, wantData []float32) {
 				t.Helper()
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != wantName {
 					t.Fatal("expected name '"+wantName+"', got", tt.Name)
 				}
 				// cmp.Diff(x, y) prefixes x with '-' and y with '+', so the wanted value goes first to match the label.
 				if diff := cmp.Diff(wantShape, tt.Shape); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				// Serialize the split and decode it back as little-endian float32s to compare the payload.
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(wantData, f32s); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			expectNext("x.b", []uint64{1, 4, 2}, []float32{0, 1, 2, 3, 4, 5, 6, 7})
 			expectNext("a.y", []uint64{1, 4, 2}, []float32{8, 9, 10, 11, 12, 13, 14, 15})
 			expectNext("a.z", []uint64{1, 4, 2}, []float32{16, 17, 18, 19, 20, 21, 22, 23})
 		})
 		// Splits the {3, 4, 2} tensor along dimension 1 into pieces of size 2, 1 and 1.
 		t.Run("uneven three way split", func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, 1,
 				split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
 				split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 				split{Replacer: strings.NewReplacer("b", "z"), dim: 1},
 			))
 			defer stop()
 			// expectNext pulls the next split from the iterator and checks its name, shape and decoded float32 payload.
 			// The failure message is built from wantName so it always names the tensor actually being checked.
 			expectNext := func(wantName string, wantShape []uint64, wantData []float32) {
 				t.Helper()
 				tt, ok := next()
 				if !ok {
 					t.Fatal("expected at least one split")
 				}
 				if tt.Name != wantName {
 					t.Fatal("expected name '"+wantName+"', got", tt.Name)
 				}
 				// cmp.Diff(x, y) prefixes x with '-' and y with '+', so the wanted value goes first to match the label.
 				if diff := cmp.Diff(wantShape, tt.Shape); diff != "" {
 					t.Errorf("unexpected shape (-want +got):\n%s", diff)
 				}
 				// Serialize the split and decode it back as little-endian float32s to compare the payload.
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(wantData, f32s); diff != "" {
 					t.Errorf("unexpected data (-want +got):\n%s", diff)
 				}
 			}
 			expectNext("x.b", []uint64{3, 2, 2}, []float32{0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19})
 			expectNext("a.y", []uint64{3, 1, 2}, []float32{4, 5, 12, 13, 20, 21})
 			expectNext("a.z", []uint64{3, 1, 2}, []float32{6, 7, 14, 15, 22, 23})
 		})
 	})
 }
 // TestMerge exercises mergeTensors against a fixed set of five {5, 2} tensors:
 // two matching "a.*.b", two matching "c.*.d" and one ("e.0.f") that has no partner.
 func TestMerge(t *testing.T) {
 	unmatched := []Tensor{
 		&fakeTensor{
 			name:  "a.0.b",
 			shape: []uint64{5, 2},
 			data:  []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
 		},
 		&fakeTensor{
 			name:  "a.1.b",
 			shape: []uint64{5, 2},
 			data:  []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
 		},
 		&fakeTensor{
 			name:  "c.0.d",
 			shape: []uint64{5, 2},
 			data:  []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
 		},
 		&fakeTensor{
 			name:  "c.1.d",
 			shape: []uint64{5, 2},
 			data:  []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
 		},
 		&fakeTensor{
 			name:  "e.0.f",
 			shape: []uint64{5, 2},
 			data:  []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
 		},
 	}
 	// checkMatched verifies that the first n merged tensors have shape {2, 5, 2} and that the i-th one
 	// holds the 20 consecutive values starting at 10 + 20*i (i.e. its two source tensors back to back).
 	checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
 		t.Helper()
 		// The callers only report a length mismatch with t.Error, so guard here: indexing past the
 		// end of matched would otherwise panic instead of producing a clean test failure.
 		if len(matched) < n {
 			t.Fatalf("cannot check %d merged tensors, only got %d", n, len(matched))
 		}
 		for i := range n {
 			got := matched[i]
 			if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
 				t.Errorf("unexpected (-want +got):\n%s", diff)
 			}
 			// Serialize the merged tensor and decode it back as little-endian float32s.
 			var b bytes.Buffer
 			if _, err := got.WriteTo(&b); err != nil {
 				t.Fatal(err)
 			}
 			f32s := make([]float32, 20)
 			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 				t.Fatal(err)
 			}
 			offset := 10 + (i * 20)
 			want := make([]float32, 20)
 			for j := range 20 {
 				want[j] = float32(offset + j)
 			}
 			if diff := cmp.Diff(want, f32s); diff != "" {
 				t.Errorf("unexpected data (-want +got):\n%s", diff)
 			}
 		}
 	}
 	t.Run("single merge", func(t *testing.T) {
 		matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
 		if len(unmatched) != 3 {
 			t.Error("expected 3 remaining tensors, got", len(unmatched))
 		}
 		if len(matched) != 1 {
 			t.Error("expected 1 merged tensor, got", len(matched))
 		}
 		checkMatched(t, 1, matched)
 	})
 	t.Run("multiple merges", func(t *testing.T) {
 		matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
 		if len(unmatched) != 1 {
 			t.Error("expected 1 remaining tensors, got", len(unmatched))
 		}
 		if len(matched) != 2 {
 			t.Error("expected 2 merged tensor, got", len(matched))
 		}
 		checkMatched(t, 2, matched)
 	})
 	t.Run("no match", func(t *testing.T) {
 		matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
 		if len(unmatched) != 5 {
 			t.Error("expected 5 remaining tensors, got", len(unmatched))
 		}
 		if len(matched) != 0 {
 			t.Error("expected no merged tensors, got", len(matched))
 		}
 	})
 }
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -8,10 +8,11 @@ import (
 	"fmt"
 	"io/fs"
 	"log/slog"
 	"maps"
 	"os"
 	"slices"
 	"strings"
 	"golang.org/x/exp/maps"
 )
 const (
@@ -109,7 +110,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 	}
 	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
 		// noop
 	} else if err != nil {
 		return nil, err
 	} else {
@@ -171,34 +171,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}
 	}
 	if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
 	} else if err != nil {
 		return nil, err
 	} else {
 		defer f.Close()
 		var p map[string]json.RawMessage
 		if err := json.NewDecoder(f).Decode(&p); err != nil {
 			return nil, err
 		}
 		for _, st := range specialTokenTypes {
 			if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
 				var ids []int32
 				if err := json.Unmarshal(bts, &ids); err != nil {
 					// value is not a list so the existing ID is used
 					continue
 				}
 				if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
 					return sv.Type == st
 				}); i >= 0 {
 					t.SpecialVocabulary[i].IDs = ids
 				}
 			}
 		}
 	}
 	return t, nil
 }
@@ -259,8 +231,11 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		tokens[token.ID] = token
 	}
 	keys := maps.Keys(tokens)
 	slices.Sort(keys)
 	v := Vocabulary{Model: "gpt2"}
-	for _, k := range slices.Sorted(maps.Keys(tokens)) {
+	for _, k := range keys {
 		token := tokens[k]
 		v.Tokens = append(v.Tokens, token.Content)
 		v.Scores = append(v.Scores, float32(token.ID))
@@ -305,9 +280,6 @@ type SpecialVocabulary struct {
 	ID       int
 	Content  string
 	AddToken bool
 	// IDs is populated by generation_config.json
 	IDs []int32
 }
 func (sv SpecialVocabulary) Key() string {
--- a/convert/tokenizer_spm.go
+++ b/convert/tokenizer_spm.go
@@ -6,9 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
 	"log/slog"
 	"os"
 	"reflect"
 	"slices"
 	"google.golang.org/protobuf/proto"
@@ -17,8 +15,6 @@ import (
 )
 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 	slog.Debug("using spm vocabulary")
 	ast, err := parseAdditionalSpecialTokens(fsys)
 	if err != nil {
 		return nil, err
@@ -47,19 +43,10 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			v.Types = append(v.Types, int32(t))
 		default:
 			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
-
+			if slices.Contains(ast, piece.GetPiece()) {
 			// temporary fix to handle gemma3 broken configs
 			if slices.Contains([]string{"<end_of_turn>", "<start_of_turn>"}, piece.GetPiece()) {
 				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
 			}
 			for _, t := range ast {
 				if t.Content == piece.GetPiece() {
 					tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
 					break
 				}
 			}
 			v.Types = append(v.Types, tt)
 		}
 	}
@@ -91,16 +78,10 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 		return cmp.Compare(i.id, j.id)
 	})
-	for _, t := range ts {
+	n := len(v.Tokens)
-		if t.id < len(v.Tokens) {
+	for i, t := range ts {
-			if v.Tokens[t.id] == t.content {
+		if t.id != i+n {
-				slog.Warn("tokenizer", "duplicate token", t.content, "id", t.id)
+			return nil, fmt.Errorf("invalid token id: %d", t.id)
 				continue
 			}
 			return nil, fmt.Errorf("token mismatch: %s != %s at pos [%d]", t.content, v.Tokens[t.id], t.id)
 		}
 		if t.id != len(v.Tokens) {
 			return nil, fmt.Errorf("invalid token id: [%d] as pos [%d]", t.id, len(v.Tokens))
 		}
 		v.Tokens = append(v.Tokens, t.content)
@@ -111,15 +92,7 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 	return &v, nil
 }
-type specialToken struct {
+func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
 	Content    string `json:"content"`
 	Lstrip     bool   `json:"lstrip"`
 	Normalized bool   `json:"normalized"`
 	Rstrip     bool   `json:"rstrip"`
 	SingleWord bool   `json:"single_word"`
 }
 func parseAdditionalSpecialTokens(fsys fs.FS) ([]specialToken, error) {
 	f, err := fsys.Open("special_tokens_map.json")
 	if errors.Is(err, os.ErrNotExist) {
 		return nil, nil
@@ -129,43 +102,12 @@ func parseAdditionalSpecialTokens(fsys fs.FS) ([]specialToken, error) {
 	defer f.Close()
 	var m struct {
-		AdditionalSpecialTokens any `json:"additional_special_tokens"`
+		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
 	}
 	if err := json.NewDecoder(f).Decode(&m); err != nil {
 		return nil, err
 	}
-	var ast []specialToken
+	return m.AdditionalSpecialTokens, nil
 	switch st := m.AdditionalSpecialTokens.(type) {
 	case []string:
 		for _, s := range st {
 			ast = append(ast, specialToken{Content: s})
 		}
 	case []any:
 		for _, s := range st {
 			// marshal and unmarshal the object to get the special token
 			tMap := s.(map[string]any)
 			data, err := json.Marshal(tMap)
 			if err != nil {
 				return nil, err
 			}
 			var token specialToken
 			err = json.Unmarshal(data, &token)
 			if err != nil {
 				return nil, err
 			}
 			ast = append(ast, token)
 		}
 	default:
 		slog.Warn("special token", "unknown token", reflect.TypeOf(st))
 	}
 	slog.Debug("spm tokenizer", "additional tokens", ast)
 	return ast, nil
 }
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -247,67 +247,6 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
 		{
 			name: "generation config eos token ids",
 			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
 				"tokenizer.json": strings.NewReader(`{
 					"added_tokens": [
 						{
 							"id": 0,
 							"content": "<bos>",
 							"special": true
 						},
 						{
 							"id": 1,
 							"content": "<eos>",
 							"special": true
 						},
 						{
 							"id": 2,
 							"content": "<eot>",
 							"special": true
 						},
 						{
 							"id": 3,
 							"content": "<eom>",
 							"special": true
 						}
 					],
 					"model": {
 						"vocab": {
 							"<bos>": 0,
 							"<eos>": 1,
 							"<eot>": 2,
 							"<eom>": 3
 						}
 					}
 				}`),
 				"tokenizer_config.json": strings.NewReader(`{
 					"add_bos_token": true,
 					"add_eos_token": false,
 					"bos_token": "<bos>",
 					"eos_token": "<eos>"
 				}`),
 				"generation_config.json": strings.NewReader(`{
 					"bos_token_id": 0,
 					"eos_token_id": [1, 2, 3]
 				}`),
 			}),
 			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
 			want: &Tokenizer{
 				Vocabulary: &Vocabulary{
 					Model:  "gpt2",
 					Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
 					Scores: []float32{0, 1, 2, 3},
 					Types:  []int32{3, 3, 3, 3},
 				},
 				SpecialVocabulary: []*SpecialVocabulary{
 					{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
 					{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
 				},
 				Pre: "default",
 			},
 		},
 	}
 	for _, tt := range cases {
--- a/discover/amd_common.go
+++ b/discover/amd_common.go
@@ -0,0 +1,83 @@
 //go:build linux || windows
 package discover
 import (
 	"errors"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strings"
 )
 // rocmLibUsable reports whether libDir can serve as a ROCm library directory:
 // every pattern in ROCmLibGlobs must match at least one entry beneath it.
 func rocmLibUsable(libDir string) bool {
 	slog.Debug("evaluating potential rocm lib dir " + libDir)
 	for _, pattern := range ROCmLibGlobs {
 		// A glob error is deliberately ignored; it yields no matches and so rejects the directory.
 		if matches, _ := filepath.Glob(filepath.Join(libDir, pattern)); len(matches) == 0 {
 			return false
 		}
 	}
 	return true
 }
 // GetSupportedGFX lists the gfx targets found under libDir by globbing
 // rocblas/library for TensileLibrary_lazy_gfx*.dat files and stripping the
 // fixed prefix and extension from each file name (leaving e.g. "gfx####").
 // It returns a nil slice when nothing matches and an error only if the glob pattern is malformed.
 func GetSupportedGFX(libDir string) ([]string, error) {
 	const (
 		filePrefix = "TensileLibrary_lazy_"
 		fileSuffix = ".dat"
 	)
 	pattern := filepath.Join(libDir, "rocblas", "library", filePrefix+"gfx*"+fileSuffix)
 	matches, err := filepath.Glob(pattern)
 	if err != nil {
 		return nil, err
 	}
 	var targets []string
 	for _, match := range matches {
 		name := strings.TrimPrefix(filepath.Base(match), filePrefix)
 		targets = append(targets, strings.TrimSuffix(name, fileSuffix))
 	}
 	return targets, nil
 }
 // commonAMDValidateLibDir returns the first directory that rocmLibUsable accepts, trying in order:
 // the "rocm" directory under LibOllamaPath, $HIP_PATH/bin, each entry of LD_LIBRARY_PATH
 // (PATH on Windows), and finally RocmStandardLocations. It returns an error when none qualifies.
 func commonAMDValidateLibDir() (string, error) {
 	// Favor our bundled version (installer payload location when running the installed binary).
 	if bundled := filepath.Join(LibOllamaPath, "rocm"); rocmLibUsable(bundled) {
 		slog.Debug("detected ROCM next to ollama executable " + bundled)
 		return bundled, nil
 	}
 	// Next, honor an explicit HIP env var.
 	if hipPath := os.Getenv("HIP_PATH"); hipPath != "" {
 		if hipBin := filepath.Join(hipPath, "bin"); rocmLibUsable(hipBin) {
 			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
 			return hipBin, nil
 		}
 	}
 	// Then scan the platform's library search path variable.
 	searchVar := "LD_LIBRARY_PATH"
 	if runtime.GOOS == "windows" {
 		searchVar = "PATH"
 	}
 	for _, entry := range filepath.SplitList(os.Getenv(searchVar)) {
 		// Entries that cannot be made absolute are skipped rather than treated as fatal.
 		if abs, err := filepath.Abs(entry); err == nil && rocmLibUsable(abs) {
 			return abs, nil
 		}
 	}
 	// Finally fall back to the well known location(s).
 	for _, known := range RocmStandardLocations {
 		if rocmLibUsable(known) {
 			return known, nil
 		}
 	}
 	return "", errors.New("no suitable rocm found, falling back to CPU")
 }
--- a/discover/amd_hip_windows.go
+++ b/discover/amd_hip_windows.go
@@ -0,0 +1,147 @@
 package discover
 import (
 	"errors"
 	"fmt"
 	"log/slog"
 	"syscall"
 	"unsafe"
 	"golang.org/x/sys/windows"
 )
 // Status codes compared against the first return value of the HIP calls made in this file.
 const (
 	// hipSuccess is the status every wrapper in this file treats as a successful call.
 	hipSuccess       = 0
 	// hipErrorNoDevice is handled by HipGetDeviceCount as "no devices found" rather than as a failure.
 	hipErrorNoDevice = 100
 )
 // hipDevicePropMinimal is the buffer HipGetDeviceProperties hands to hipGetDeviceProperties.
 // Only a few fields are named; the unused* arrays are padding between them.
 // NOTE(review): presumably the field offsets mirror the native HIP device-properties struct — confirm
 // against the HIP headers for the amdhip64_6.dll in use before changing any size here.
 type hipDevicePropMinimal struct {
 	Name        [256]byte
 	unused1     [140]byte
 	GcnArchName [256]byte // gfx####
 	// NOTE(review): Go int is 8 bytes on 64-bit targets; if the native field is a 4-byte C int the
 	// upper half overlaps whatever follows it — confirm.
 	iGPU        int       // Doesn't seem to actually report correctly
 	unused2     [128]byte
 }
 // Wrap the amdhip64.dll library for GPU discovery
 //
 // A HipLib is created by NewHipLib, which loads the DLL and resolves each
 // procedure address below; Release unloads the DLL and zeroes dll, after which
 // the wrapper methods report "dll has been unloaded".
 type HipLib struct {
 	// dll is the loaded library handle; 0 means the library has been released.
 	dll                    windows.Handle
 	// The remaining fields are procedure addresses obtained with GetProcAddress
 	// for the export of the same name, invoked through syscall.SyscallN.
 	hipGetDeviceCount      uintptr
 	hipGetDeviceProperties uintptr
 	hipMemGetInfo          uintptr
 	hipSetDevice           uintptr
 	hipDriverGetVersion    uintptr
 }
 // NewHipLib loads amdhip64_6.dll and resolves the HIP entry points used for GPU discovery.
 // It returns an error if the DLL cannot be loaded or any entry point is missing; in the
 // latter case the DLL is unloaded again so the handle does not leak.
 func NewHipLib() (*HipLib, error) {
 	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
 	h, err := windows.LoadLibrary("amdhip64_6.dll")
 	if err != nil {
 		return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
 	}
 	hl := &HipLib{dll: h}
 	// Each export name is paired with the field that receives its address.
 	procs := []struct {
 		name string
 		addr *uintptr
 	}{
 		{"hipGetDeviceCount", &hl.hipGetDeviceCount},
 		{"hipGetDeviceProperties", &hl.hipGetDeviceProperties},
 		{"hipMemGetInfo", &hl.hipMemGetInfo},
 		{"hipSetDevice", &hl.hipSetDevice},
 		{"hipDriverGetVersion", &hl.hipDriverGetVersion},
 	}
 	for _, p := range procs {
 		addr, err := windows.GetProcAddress(h, p.name)
 		if err != nil {
 			// No HipLib is returned on this path, so nobody else can release the handle.
 			if freeErr := windows.FreeLibrary(h); freeErr != nil {
 				slog.Warn("failed to unload amdhip64_6.dll", "error", freeErr)
 			}
 			return nil, fmt.Errorf("unable to resolve %s in amdhip64_6.dll: %w", p.name, err)
 		}
 		*p.addr = addr
 	}
 	return hl, nil
 }
 // The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
 // so we have to unload/reset the library after we do our initial discovery
 // to make sure our updates to that variable are processed by llama.cpp
 //
 // Release unloads the DLL and zeroes the handle. Calling it again on an
 // already released HipLib is a no-op.
 func (hl *HipLib) Release() {
 	if hl.dll == 0 {
 		// Already released; freeing a zero handle would only log a spurious warning.
 		return
 	}
 	if err := windows.FreeLibrary(hl.dll); err != nil {
 		slog.Warn("failed to unload amdhip64.dll", "error", err)
 	}
 	// The handle is cleared even when unloading fails, so later calls report "dll has been unloaded".
 	hl.dll = 0
 }
 // AMDDriverVersion calls hipDriverGetVersion and splits the packed value into
 // major (version / 10000000) and minor (remainder / 100000) components.
 // It returns an error if the DLL has been released or the call does not return hipSuccess.
 func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	if hl.dll == 0 {
 		return 0, 0, errors.New("dll has been unloaded")
 	}
 	// hipDriverGetVersion writes through a C int*, which is 32 bits wide on Windows;
 	// use an int32 so the out-parameter matches exactly what the callee writes.
 	var version int32
 	status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
 	if status != hipSuccess {
 		return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
 	}
 	slog.Debug("hipDriverGetVersion", "version", version)
 	driverMajor = int(version) / 10000000
 	driverMinor = (int(version) - (driverMajor * 10000000)) / 100000
 	return driverMajor, driverMinor, nil
 }
 // HipGetDeviceCount returns the number of devices reported by hipGetDeviceCount.
 // It returns 0 when the DLL has been released or HIP reports hipErrorNoDevice;
 // any other non-success status is logged and the (possibly zero) count is returned as-is.
 func (hl *HipLib) HipGetDeviceCount() int {
 	if hl.dll == 0 {
 		slog.Error("dll has been unloaded")
 		return 0
 	}
 	// hipGetDeviceCount writes through a C int*, which is 32 bits wide on Windows;
 	// use an int32 so the out-parameter matches exactly what the callee writes.
 	var count int32
 	status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count)))
 	if status == hipErrorNoDevice {
 		slog.Info("AMD ROCm reports no devices found")
 		return 0
 	}
 	if status != hipSuccess {
 		slog.Warn("failed call to hipGetDeviceCount", "status", status, "error", err)
 	}
 	return int(count)
 }
 // HipSetDevice invokes hipSetDevice with the given device index.
 // It returns an error if the DLL has been released or the call does not return hipSuccess.
 func (hl *HipLib) HipSetDevice(device int) error {
 	if hl.dll == 0 {
 		return errors.New("dll has been unloaded")
 	}
 	rc, _, callErr := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
 	if rc == hipSuccess {
 		return nil
 	}
 	return fmt.Errorf("failed call to hipSetDevice: %d %s", rc, callErr)
 }
 // HipGetDeviceProperties invokes hipGetDeviceProperties for the given device index
 // and returns the filled-in property buffer.
 // It returns an error if the DLL has been released or the call does not return hipSuccess.
 func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
 	if hl.dll == 0 {
 		return nil, errors.New("dll has been unloaded")
 	}
 	result := new(hipDevicePropMinimal)
 	rc, _, callErr := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(result)), uintptr(device))
 	if rc == hipSuccess {
 		return result, nil
 	}
 	return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", rc, callErr)
 }
 // HipMemGetInfo invokes hipMemGetInfo and returns (free, total, err), in that order.
 // It returns an error if the DLL has been released or the call does not return hipSuccess.
 func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
 	if hl.dll == 0 {
 		return 0, 0, errors.New("dll has been unloaded")
 	}
 	var (
 		free  uint64
 		total uint64
 	)
 	// The free-memory pointer is passed first and the total-memory pointer second.
 	rc, _, callErr := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&free)), uintptr(unsafe.Pointer(&total)))
 	if rc == hipSuccess {
 		return free, total, nil
 	}
 	return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", rc, callErr)
 }
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -0,0 +1,538 @@
 package discover
 import (
 	"bufio"
 	"errors"
 	"fmt"
 	"io"
 	"io/fs"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"regexp"
 	"slices"
 	"sort"
 	"strconv"
 	"strings"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )
 // Discovery logic for AMD/ROCm GPUs
 // The constants below are the sysfs paths, globs and file names consulted
 // while enumerating amdgpu devices on Linux.
 const (
 	DriverVersionFile     = "/sys/module/amdgpu/version"
 	AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
 	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
 	// Prefix with the node dir
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	// Direct Rendering Manager sysfs location
 	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile  = "mem_info_vram_used"
 	// In hex; properties file is in decimal
 	// (the three files below are parsed as base 16 when matched against the kfd node values)
 	DRMUniqueIDFile = "unique_id"
 	DRMVendorFile   = "vendor"
 	DRMDeviceFile   = "device"
 )
 var (
 	// Used to validate if the given ROCm lib is usable
 	ROCmLibGlobs          = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
 	// Linux directories listed as standard ROCm install locations
 	// NOTE(review): presumably searched by the shared lib-dir validation helper - confirm against its definition
 	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
 )
 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
 // Only called once during bootstrap
 //
 // The kfd topology nodes are walked in numeric order. CPU nodes and masked
 // (gfx000) nodes are skipped; every remaining node is assigned the next
 // zero-based GPU index, matched to a DRM sysfs directory to read VRAM totals,
 // and then run through the iGPU, minimum-gfx, visible-devices, library and
 // supported-gfx checks. GPUs that fail a check are appended to unsupportedGPUs
 // with a reason. An error is returned when the driver is absent, no GPU passes
 // every check, the ROCm library cannot be validated, or /dev/kfd is not accessible.
 func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
 		return resp, fmt.Errorf("AMD GPUs not detected")
 	}
 	// Opportunistic logging of driver version to aid in troubleshooting
 	driverMajor, driverMinor, err := AMDDriverVersion()
 	if err != nil {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
 		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
 	}
 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
 	hipVD := envconfig.HipVisibleDevices()   // zero based index only
 	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
 	gpuDO := envconfig.GpuDeviceOrdinal()    // zero based index
 	switch {
 	case rocrVD != "":
 		visibleDevices = strings.Split(rocrVD, ",")
 	case hipVD != "":
 		visibleDevices = strings.Split(hipVD, ",")
 	case gpuDO != "":
 		visibleDevices = strings.Split(gpuDO, ",")
 	}
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	var supported []string
 	var libDir string
 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
 	sort.Slice(matches, func(i, j int) bool {
 		// /sys/class/kfd/kfd/topology/nodes/<number>/properties
 		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
 		if err != nil {
 			slog.Debug("parse err", "error", err, "match", matches[i])
 			return false
 		}
 		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
 		if err != nil {
 			// Report the entry that actually failed to parse (j, not i)
 			slog.Debug("parse err", "error", err, "match", matches[j])
 			return false
 		}
 		return a < b
 	})
 	gpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)
 		fp, err := os.Open(match)
 		if err != nil {
 			slog.Debug("failed to open sysfs node", "file", match, "error", err)
 			continue
 		}
 		scanner := bufio.NewScanner(fp)
 		isCPU := false
 		var major, minor, patch uint64
 		var vendor, device, uniqueID uint64
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
 			if strings.HasPrefix(line, "gfx_target_version") {
 				ver := strings.Fields(line)
 				// Detect CPUs
 				if len(ver) == 2 && ver[1] == "0" {
 					slog.Debug("detected CPU " + match)
 					isCPU = true
 					break
 				}
 				if len(ver) != 2 || len(ver[1]) < 5 {
 					slog.Warn("malformed "+match, "gfx_target_version", line)
 					// If this winds up being a CPU, our offsets may be wrong
 					continue
 				}
 				// The value is split from the right: last two digits are the patch,
 				// the two before that the minor, and whatever remains the major
 				l := len(ver[1])
 				var err1, err2, err3 error
 				patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
 				minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
 				major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
 				if err1 != nil || err2 != nil || err3 != nil {
 					slog.Debug("malformed int " + line)
 					continue
 				}
 			} else if strings.HasPrefix(line, "vendor_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
 					slog.Debug("malformed", "vendor_id", line)
 					continue
 				}
 				vendor, err = strconv.ParseUint(ver[1], 10, 64)
 				if err != nil {
 					slog.Debug("malformed", "vendor_id", line, "error", err)
 				}
 			} else if strings.HasPrefix(line, "device_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
 					slog.Debug("malformed", "device_id", line)
 					continue
 				}
 				device, err = strconv.ParseUint(ver[1], 10, 64)
 				if err != nil {
 					slog.Debug("malformed", "device_id", line, "error", err)
 				}
 			} else if strings.HasPrefix(line, "unique_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
 					slog.Debug("malformed", "unique_id", line)
 					continue
 				}
 				uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
 				if err != nil {
 					slog.Debug("malformed", "unique_id", line, "error", err)
 				}
 			}
 			// TODO - any other properties we want to extract and record?
 			// vendor_id + device_id -> pci lookup for "Name"
 			// Other metrics that may help us understand relative performance between multiple GPUs
 		}
 		// Close right away instead of deferring: a defer inside this loop would keep
 		// every node's file handle open until the whole function returns
 		fp.Close()
 		// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
 		// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
 		// do reliably report VRAM usage.
 		if isCPU {
 			continue
 		}
 		// Skip over any GPUs that are masked
 		if major == 0 && minor == 0 && patch == 0 {
 			slog.Debug("skipping gpu with gfx000")
 			continue
 		}
 		// Keep track of numeric IDs based on valid GPUs
 		gpuID := gpuCount
 		gpuCount += 1
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
 		var usedFile string
 		mapping := []struct {
 			id       uint64
 			filename string
 		}{
 			{vendor, DRMVendorFile},
 			{device, DRMDeviceFile},
 			{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
 		}
 		slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
 		// Map over to DRM location to find the total/free memory
 		drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
 		for _, devDir := range drmMatches {
 			matched := true
 			for _, m := range mapping {
 				if m.id == 0 {
 					// Null ID means it didn't populate, so we can't use it to match
 					continue
 				}
 				filename := filepath.Join(devDir, m.filename)
 				buf, err := os.ReadFile(filename)
 				if err != nil {
 					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
 					matched = false
 					break
 				}
 				// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
 				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
 				if err != nil {
 					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
 					matched = false
 					break
 				}
 				if cmp != m.id {
 					matched = false
 					break
 				}
 			}
 			if !matched {
 				continue
 			}
 			// Found the matching DRM directory
 			slog.Debug("matched", "amdgpu", match, "drm", devDir)
 			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
 			buf, err := os.ReadFile(totalFile)
 			if err != nil {
 				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
 				break
 			}
 			totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
 			if err != nil {
 				slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
 				break
 			}
 			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
 			// Despite its name, getFreeMemory returns the value parsed from the "used" file
 			usedMemory, err = getFreeMemory(usedFile)
 			if err != nil {
 				slog.Debug("failed to update used memory", "error", err)
 			}
 			break
 		}
 		var name string
 		// TODO - PCI ID lookup
 		if vendor > 0 && device > 0 {
 			name = fmt.Sprintf("%04x:%04x", vendor, device)
 		}
 		// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
 		var ID string
 		if uniqueID != 0 {
 			ID = fmt.Sprintf("GPU-%016x", uniqueID)
 		} else {
 			ID = strconv.Itoa(gpuID)
 		}
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
 				memInfo: memInfo{
 					TotalMemory: totalMemory,
 					FreeMemory:  (totalMemory - usedMemory),
 				},
 				ID:            ID,
 				Name:          name,
 				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 				MinimumMemory: rocmMinimumMemory,
 				DriverMajor:   driverMajor,
 				DriverMinor:   driverMinor,
 			},
 			usedFilepath: usedFile,
 			index:        gpuID,
 		}
 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
 		if totalMemory < IGPUMemLimit {
 			reason := "unsupported Radeon iGPU detected skipping"
 			slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 				GpuInfo: gpuInfo.GpuInfo,
 				Reason:  reason,
 			})
 			continue
 		}
 		minVer, err := strconv.Atoi(RocmComputeMajorMin)
 		if err != nil {
 			slog.Error("invalid RocmComputeMajorMin setting", "value", RocmComputeMajorMin, "error", err)
 		}
 		if int(major) < minVer {
 			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
 			slog.Warn(reason, "gpu", gpuID)
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 				GpuInfo: gpuInfo.GpuInfo,
 				Reason:  reason,
 			})
 			continue
 		}
 		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
 		if len(visibleDevices) > 0 {
 			include := false
 			for _, visible := range visibleDevices {
 				if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
 					include = true
 					break
 				}
 			}
 			if !include {
 				reason := "filtering out device per user request"
 				slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
 				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 					GpuInfo: gpuInfo.GpuInfo,
 					Reason:  reason,
 				})
 				continue
 			}
 		}
 		// Final validation is gfx compatibility - load the library if we haven't already loaded it
 		// even if the user overrides, we still need to validate the library
 		if libDir == "" {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				err = fmt.Errorf("unable to verify rocm library: %w", err)
 				slog.Warn(err.Error())
 				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 					GpuInfo: gpuInfo.GpuInfo,
 					Reason:  err.Error(),
 				})
 				return nil, err
 			}
 		}
 		gpuInfo.DependencyPath = []string{libDir}
 		if gfxOverride == "" {
 			// Only load supported list once
 			if len(supported) == 0 {
 				supported, err = GetSupportedGFX(libDir)
 				if err != nil {
 					err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
 					slog.Warn(err.Error())
 					unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 						GpuInfo: gpuInfo.GpuInfo,
 						Reason:  err.Error(),
 					})
 					return nil, err
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
 			gfx := gpuInfo.Compute
 			if !slices.Contains[[]string, string](supported, gfx) {
 				reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
 				slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
 				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 					GpuInfo: gpuInfo.GpuInfo,
 					Reason:  reason,
 				})
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
 				continue
 			} else {
 				slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
 			}
 		} else {
 			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 		}
 		// Check for env var workarounds
 		if name == "1002:687f" { // Vega RX 56
 			gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
 		}
 		// The GPU has passed all the verification steps and is supported
 		resp = append(resp, gpuInfo)
 	}
 	if len(resp) == 0 {
 		err := fmt.Errorf("no compatible amdgpu devices detected")
 		slog.Info(err.Error())
 		return nil, err
 	}
 	if err := verifyKFDDriverAccess(); err != nil {
 		err = fmt.Errorf("amdgpu devices detected but permission problems block access: %w", err)
 		slog.Error(err.Error())
 		return nil, err
 	}
 	return resp, nil
 }
 // AMDDetected is a cheap probe for the amdgpu kernel driver so that the rest
 // of amdgpu discovery can be skipped when it is absent. It reports true only
 // when the driver's sysfs module directory can be stat'ed without error.
 func AMDDetected() bool {
 	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
 	moduleDir := filepath.Dir(DriverVersionFile)
 	switch _, statErr := os.Stat(moduleDir); {
 	case statErr == nil:
 		return true
 	case errors.Is(statErr, os.ErrNotExist):
 		slog.Debug("amdgpu driver not detected " + moduleDir)
 	default:
 		slog.Debug("error looking up amd driver", "path", moduleDir, "error", statErr)
 	}
 	return false
 }
 // AMDValidateLibDir picks the ROCm library directory to use on Linux.
 // The result of commonAMDValidateLibDir wins when it succeeds; otherwise the
 // well known ollama installer path is tried. When neither is usable the user
 // is told how to install ROCm and an error is returned.
 func AMDValidateLibDir() (string, error) {
 	if dir, err := commonAMDValidateLibDir(); err == nil {
 		return dir, nil
 	}
 	// Well known ollama installer path
 	const installerRocmDir = "/usr/share/ollama/lib/rocm"
 	if !rocmLibUsable(installerRocmDir) {
 		// If we still haven't found a usable rocm, the user will have to install it on their own
 		slog.Warn("amdgpu detected, but no compatible rocm library found.  Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
 		return "", errors.New("no suitable rocm found, falling back to CPU")
 	}
 	return installerRocmDir, nil
 }
 // AMDDriverVersion reads DriverVersionFile and extracts the leading
 // "<major>.<minor>" pair from its contents. It returns an error when the file
 // is missing or unreadable, when the contents do not start with two
 // dot-separated numbers, or when either number fails integer conversion.
 func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	if _, err = os.Stat(DriverVersionFile); err != nil {
 		return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
 	}
 	versionFile, err := os.Open(DriverVersionFile)
 	if err != nil {
 		return 0, 0, err
 	}
 	defer versionFile.Close()
 	contents, err := io.ReadAll(versionFile)
 	if err != nil {
 		return 0, 0, err
 	}
 	// Anchored at the start of the text: digits, a dot, digits, then anything
 	versionRe := regexp.MustCompile(`\A(\d+)\.(\d+).*`)
 	groups := versionRe.FindStringSubmatch(string(contents))
 	if len(groups) < 2 {
 		return 0, 0, fmt.Errorf("malformed version string %s", string(contents))
 	}
 	if driverMajor, err = strconv.Atoi(groups[1]); err != nil {
 		return 0, 0, err
 	}
 	if driverMinor, err = strconv.Atoi(groups[2]); err != nil {
 		return 0, 0, err
 	}
 	return driverMajor, driverMinor, nil
 }
 // RefreshFreeMemory re-reads the used-memory sysfs file recorded for each GPU
 // in the list and stores TotalMemory minus that value as the GPU's FreeMemory.
 // The first read or parse failure aborts the refresh and is returned.
 func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 	for idx := range gpus {
 		// Despite its name, getFreeMemory returns the value parsed from the "used" file
 		used, err := getFreeMemory(gpus[idx].usedFilepath)
 		if err != nil {
 			return err
 		}
 		freeNow := gpus[idx].TotalMemory - used
 		slog.Debug("updating rocm free memory", "gpu", gpus[idx].ID, "name", gpus[idx].Name, "before", format.HumanBytes2(gpus[idx].FreeMemory), "now", format.HumanBytes2(freeNow))
 		gpus[idx].FreeMemory = freeNow
 	}
 	return nil
 }
 func getFreeMemory(usedFile string) (uint64, error) {
 	buf, err := os.ReadFile(usedFile)
 	if err != nil {
 		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
 	}
 	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
 	if err != nil {
 		slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
 		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
 	}
 	return usedMemory, nil
 }
 // verifyKFDDriverAccess checks that /dev/kfd can be opened read-write, i.e.
 // that we are either running as root or have group access to the driver.
 // The returned error distinguishes a permission problem, a missing device
 // node, and any other open failure.
 func verifyKFDDriverAccess() error {
 	kfd, openErr := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
 	switch {
 	case openErr == nil:
 		kfd.Close()
 		return nil
 	case errors.Is(openErr, fs.ErrPermission):
 		return fmt.Errorf("permissions not set up properly.  Either run ollama as root, or add you user account to the render group. %w", openErr)
 	case errors.Is(openErr, fs.ErrNotExist):
 		// Container runtime failure?
 		return fmt.Errorf("kfd driver not loaded.  If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
 	default:
 		return fmt.Errorf("failed to check permission on /dev/kfd: %w", openErr)
 	}
 }
 // rocmGetVisibleDevicesEnv returns the environment variable name and the
 // comma-joined IDs of the rocm entries in gpuInfo. Entries whose Library is
 // not "rocm" are logged and left out.
 func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	rocmIDs := []string{}
 	for i := range gpuInfo {
 		if lib := gpuInfo[i].Library; lib != "rocm" {
 			// TODO shouldn't happen if things are wired correctly...
 			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", lib)
 			continue
 		}
 		rocmIDs = append(rocmIDs, gpuInfo[i].ID)
 	}
 	// There are 3 potential env vars to use to select GPUs.
 	// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
 	// GPU_DEVICE_ORDINAL supports numeric IDs only
 	// HIP_VISIBLE_DEVICES supports numeric IDs only
 	joined := strings.Join(rocmIDs, ",")
 	return "ROCR_VISIBLE_DEVICES", joined
 }
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@@ -0,0 +1,218 @@
 package discover
 import (
 	"bytes"
 	"errors"
 	"fmt"
 	"log/slog"
 	"path/filepath"
 	"slices"
 	"strconv"
 	"strings"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )
 const (
 	// TODO  We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
 	// The comparison against the reported device name is case-insensitive.
 	iGPUName = "AMD Radeon(TM) Graphics"
 )
 var (
 	// Used to validate if the given ROCm lib is usable
 	ROCmLibGlobs          = []string{"hipblas.dll", "rocblas"}                 // This is not sufficient to discern v5 vs v6
 	// Windows directory listed as the standard ROCm install location (pinned to the 6.1 path)
 	// NOTE(review): presumably searched by the shared lib-dir validation helper - confirm against its definition
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
 )
 // AMDGetGPUInfo enumerates AMD GPUs on Windows through the HIP library.
 // Only called once during bootstrap
 //
 // For every device HIP reports, the name, gfx architecture and memory figures
 // are collected. Devices that look like iGPUs or whose gfx type is not in the
 // supported list are appended to unsupportedGPUs with a reason; the rest are
 // returned. An error is returned when the HIP library cannot be loaded, no
 // device is reported, or the ROCm library directory / supported gfx list
 // cannot be determined.
 func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
 		return nil, err
 	}
 	defer hl.Release()
 	driverMajor, driverMinor, err := hl.AMDDriverVersion()
 	if err != nil {
 		// For now this is benign, but we may eventually need to fail compatibility checks
 		slog.Debug("error looking up amd driver version", "error", err)
 	}
 	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
 	if count == 0 {
 		err := fmt.Errorf("no compatible amdgpu devices detected")
 		slog.Info(err.Error())
 		return nil, err
 	}
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
 		err = fmt.Errorf("unable to verify rocm library: %w", err)
 		slog.Warn(err.Error())
 		return nil, err
 	}
 	var supported []string
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
 			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
 			slog.Warn(err.Error())
 			return nil, err
 		}
 	} else {
 		// NOTE(review): supported stays empty on this path, yet the per-device check below
 		// still runs, so every device is reported as unsupported when the override is set.
 		// Confirm whether that is the intended Windows behavior.
 		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 	}
 	// cString converts a fixed-size, NUL-padded buffer to a string. If no NUL
 	// terminator is present the whole buffer is used instead of slicing with -1,
 	// which would panic.
 	cString := func(buf []byte) string {
 		if end := bytes.IndexByte(buf, 0); end >= 0 {
 			return string(buf[:end])
 		}
 		return string(buf)
 	}
 	slog.Debug("detected hip devices", "count", count)
 	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
 	for i := range count {
 		err = hl.HipSetDevice(i)
 		if err != nil {
 			slog.Warn("set device", "id", i, "error", err)
 			continue
 		}
 		props, err := hl.HipGetDeviceProperties(i)
 		if err != nil {
 			slog.Warn("get properties", "id", i, "error", err)
 			continue
 		}
 		name := cString(props.Name[:])
 		// TODO is UUID actually populated on windows?
 		// Can luid be used on windows for setting visible devices (and is it actually set?)
 		gfx := cString(props.GcnArchName[:])
 		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
 		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
 		// TODO  Why isn't props.iGPU accurate!?
 		// Memory is queried for the device selected by HipSetDevice above
 		freeMemory, totalMemory, err := hl.HipMemGetInfo()
 		if err != nil {
 			slog.Warn("get mem info", "id", i, "error", err)
 			continue
 		}
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
 				memInfo: memInfo{
 					TotalMemory: totalMemory,
 					FreeMemory:  freeMemory,
 				},
 				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
 				UnreliableFreeMemory: true,
 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
 				DependencyPath: []string{libDir},
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
 				DriverMajor:    driverMajor,
 				DriverMinor:    driverMinor,
 			},
 			index: i,
 		}
 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
 		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
 			reason := "unsupported Radeon iGPU detected skipping"
 			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 				GpuInfo: gpuInfo.GpuInfo,
 				Reason:  reason,
 			})
 			continue
 		}
 		// Strip off Target Features when comparing
 		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
 			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
 			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 				GpuInfo: gpuInfo.GpuInfo,
 				Reason:  reason,
 			})
 			// HSA_OVERRIDE_GFX_VERSION not supported on windows
 			continue
 		} else {
 			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
 		}
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		resp = append(resp, gpuInfo)
 	}
 	return resp, nil
 }
 // AMDValidateLibDir picks the ROCm library directory to use on Windows.
 // The result of commonAMDValidateLibDir wins when it succeeds; otherwise the
 // "rocm" directory under LibOllamaPath is tried. When neither is usable a
 // warning is logged and an error is returned.
 func AMDValidateLibDir() (string, error) {
 	if dir, err := commonAMDValidateLibDir(); err == nil {
 		return dir, nil
 	}
 	// Installer payload (if we're running from some other location)
 	bundledRocmDir := filepath.Join(LibOllamaPath, "rocm")
 	if !rocmLibUsable(bundledRocmDir) {
 		// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
 		slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
 		return "", errors.New("no suitable rocm found, falling back to CPU")
 	}
 	slog.Debug("detected ollama installed ROCm at " + bundledRocmDir)
 	return bundledRocmDir, nil
 }
 // RefreshFreeMemory updates FreeMemory for every GPU in the list by loading
 // the HIP library, selecting each GPU by its stored index and querying its
 // memory info. A failure to load the library or to select a device aborts
 // the refresh; a failed memory query is logged and that GPU is skipped.
 func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 	// Nothing to refresh, so don't bother loading the library
 	if len(gpus) == 0 {
 		return nil
 	}
 	hl, loadErr := NewHipLib()
 	if loadErr != nil {
 		slog.Debug(loadErr.Error())
 		return loadErr
 	}
 	defer hl.Release()
 	for idx := range gpus {
 		if setErr := hl.HipSetDevice(gpus[idx].index); setErr != nil {
 			return setErr
 		}
 		freeNow, _, memErr := hl.HipMemGetInfo()
 		if memErr != nil {
 			slog.Warn("get mem info", "id", idx, "error", memErr)
 			continue
 		}
 		slog.Debug("updating rocm free memory", "gpu", gpus[idx].ID, "name", gpus[idx].Name, "before", format.HumanBytes2(gpus[idx].FreeMemory), "now", format.HumanBytes2(freeNow))
 		gpus[idx].FreeMemory = freeNow
 	}
 	return nil
 }
 // rocmGetVisibleDevicesEnv returns the environment variable name and the
 // comma-joined IDs of the rocm entries in gpuInfo. Entries whose Library is
 // not "rocm" are logged and left out.
 func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	rocmIDs := []string{}
 	for i := range gpuInfo {
 		if lib := gpuInfo[i].Library; lib != "rocm" {
 			// TODO shouldn't happen if things are wired correctly...
 			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", lib)
 			continue
 		}
 		rocmIDs = append(rocmIDs, gpuInfo[i].ID)
 	}
 	// There are 3 potential env vars to use to select GPUs.
 	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
 	// HIP_VISIBLE_DEVICES supports numeric IDs only
 	// GPU_DEVICE_ORDINAL supports numeric IDs only
 	joined := strings.Join(rocmIDs, ",")
 	return "HIP_VISIBLE_DEVICES", joined
 }
--- a/discover/cpu_common.go
+++ b/discover/cpu_common.go
@@ -0,0 +1,24 @@
 package discover
 import (
 	"os"
 	"path/filepath"
 	"runtime"
 	"strings"
 )
 func IsNUMA() bool {
 	if runtime.GOOS != "linux" {
 		// numa support in llama.cpp is linux only
 		return false
 	}
 	ids := map[string]interface{}{}
 	packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")
 	for _, packageId := range packageIds {
 		id, err := os.ReadFile(packageId)
 		if err == nil {
 			ids[strings.TrimSpace(string(id))] = struct{}{}
 		}
 	}
 	return len(ids) > 1
 }
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@@ -0,0 +1,64 @@
 //go:build linux || windows
 package discover
 import (
 	"log/slog"
 	"os"
 	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
 )
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 // The value is read from the environment once, when the package is initialized;
 // it is empty when the variable is not set.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 // cudaGetVisibleDevicesEnv returns "CUDA_VISIBLE_DEVICES" together with the
 // comma-joined IDs of the cuda entries in gpuInfo. Entries whose Library is
 // not "cuda" are logged and left out.
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	cudaIDs := []string{}
 	for i := range gpuInfo {
 		if lib := gpuInfo[i].Library; lib != "cuda" {
 			// TODO shouldn't happen if things are wired correctly...
 			slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", lib)
 			continue
 		}
 		cudaIDs = append(cudaIDs, gpuInfo[i].ID)
 	}
 	joined := strings.Join(cudaIDs, ",")
 	return "CUDA_VISIBLE_DEVICES", joined
 }
 // cudaVariant returns the name of the CUDA variant to use for the given GPU.
 // On linux/arm64 a "jetpackN" variant is returned when JETSON_JETPACK is set
 // (N is the text before the first dot) or when /etc/nv_tegra_release names a
 // known L4T release. Otherwise "v11" is returned for compute major below 6,
 // driver major below 12, or driver 12.0, and "v12" for everything else.
 func cudaVariant(gpuInfo CudaGPUInfo) string {
 	if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
 		if CudaTegra != "" {
 			if parts := strings.Split(CudaTegra, "."); len(parts) > 0 {
 				return "jetpack" + parts[0]
 			}
 		} else if release, readErr := os.ReadFile("/etc/nv_tegra_release"); readErr == nil {
 			groups := regexp.MustCompile(` R(\d+) `).FindSubmatch(release)
 			if len(groups) == 2 {
 				if l4t, convErr := strconv.Atoi(string(groups[1])); convErr == nil {
 					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
 					// https://developer.nvidia.com/embedded/jetpack-archive
 					switch l4t {
 					case 35:
 						return "jetpack5"
 					case 36:
 						return "jetpack6"
 					default:
 						slog.Info("unsupported L4T version", "nv_tegra_release", string(release))
 					}
 				}
 			} else {
 				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
 			}
 		}
 	}
 	// Anything that did not resolve to a jetpack variant falls through to here
 	needsV11 := gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0)
 	if needsV11 {
 		return "v11"
 	}
 	return "v12"
 }
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -1,73 +1,718 @@
 //go:build linux || windows
 package discover
 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -lpthread
 #include "gpu_info.h"
 */
 import "C"
 import (
 	"fmt"
 	"log/slog"
 	"os"
-	"regexp"
+	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
-	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )
-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+type cudaHandles struct {
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+	deviceCount int
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
+	cudart      *C.cudart_handle_t
-
+	nvcuda      *C.nvcuda_handle_t
-// GetSystemInfo returns the last cached state of the GPUs on the system
+	nvml        *C.nvml_handle_t
 func GetSystemInfo() ml.SystemInfo {
 	memInfo, err := GetCPUMem()
 	if err != nil {
 		slog.Warn("error looking up system memory", "error", err)
 	}
 	var threadCount int
 	cpus := GetCPUDetails()
 	for _, c := range cpus {
 		threadCount += c.CoreCount - c.EfficiencyCoreCount
 	}
 	if threadCount == 0 {
 		// Fall back to Go's num CPU
 		threadCount = runtime.NumCPU()
 	}
 	return ml.SystemInfo{
 		ThreadCount: threadCount,
 		TotalMemory: memInfo.TotalMemory,
 		FreeMemory:  memInfo.FreeMemory,
 		FreeSwap:    memInfo.FreeSwap,
 	}
 }
-func cudaJetpack() string {
+type oneapiHandles struct {
-	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+	oneapi      *C.oneapi_handle_t
-		if CudaTegra != "" {
+	deviceCount int
-			ver := strings.Split(CudaTegra, ".")
+}
-			if len(ver) > 0 {
+
-				return "jetpack" + ver[0]
+const (
 	cudaMinimumMemory = 457 * format.MebiByte
 	rocmMinimumMemory = 457 * format.MebiByte
 	// TODO OneAPI minimum memory
 )
 var (
 	gpuMutex      sync.Mutex
 	bootstrapped  bool
 	cpus          []CPUInfo
 	cudaGPUs      []CudaGPUInfo
 	nvcudaLibPath string
 	cudartLibPath string
 	oneapiLibPath string
 	nvmlLibPath   string
 	rocmGPUs      []RocmGPUInfo
 	oneapiGPUs    []OneapiGPUInfo
 	// If any discovered GPUs are incompatible, report why
 	unsupportedGPUs []UnsupportedGPUInfo
 	// Keep track of errors during bootstrapping so that if GPUs which were
 	// expected to be present are missing, this may explain why
 	bootstrapErrors []error
 )
 // With our current CUDA compile flags, older than 5.0 will not work properly
 // (string values used to allow ldflags overrides at build time)
 var (
 	CudaComputeMajorMin = "5"
 	CudaComputeMinorMin = "0"
 )
 var RocmComputeMajorMin = "9"
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // iGPUs typically report around 512M (the earlier note said "512G", presumably a typo - TODO confirm), so anything less than 1G must be iGPU
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 	cHandles := &cudaHandles{}
 	// Short Circuit if we already know which library to use
 	// ignore bootstrap errors in this case since we already recorded them
 	if nvmlLibPath != "" {
 		cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
 		return cHandles
 	}
 	if nvcudaLibPath != "" {
 		cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
 	}
 	if cudartLibPath != "" {
 		cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
 		return cHandles
 	}
 	slog.Debug("searching for GPU discovery libraries for NVIDIA")
 	var cudartMgmtPatterns []string
 	// Aligned with driver, we can't carry as payloads
 	nvcudaMgmtPatterns := NvcudaGlobs
 	cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
 	if len(NvmlGlobs) > 0 {
 		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
 			nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
 				slog.Debug("nvidia-ml loaded", "library", libPath)
 				cHandles.nvml = nvml
 				nvmlLibPath = libPath
 			}
-		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			if err != nil {
-			r := regexp.MustCompile(` R(\d+) `)
+				bootstrapErrors = append(bootstrapErrors, err)
-			m := r.FindSubmatch(data)
+			}
-			if len(m) != 2 {
+		}
-				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
+	}
-			} else {
+
-				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
-					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+	if len(nvcudaLibPaths) > 0 {
-					// https://developer.nvidia.com/embedded/jetpack-archive
+		deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
-					switch l4t {
+		if nvcuda != nil {
-					case 35:
+			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
-						return "jetpack5"
+			cHandles.nvcuda = nvcuda
-					case 36:
+			cHandles.deviceCount = deviceCount
-						return "jetpack6"
+			nvcudaLibPath = libPath
-					default:
+			return cHandles
-						// Newer Jetson systems use the SBSU runtime
+		}
-						slog.Debug("unrecognized L4T version", "nv_tegra_release", string(data))
+		if err != nil {
 			bootstrapErrors = append(bootstrapErrors, err)
 		}
 	}
 	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			cHandles.cudart = cudart
 			cHandles.deviceCount = deviceCount
 			cudartLibPath = libPath
 			return cHandles
 		}
 		if err != nil {
 			bootstrapErrors = append(bootstrapErrors, err)
 		}
 	}
 	return cHandles
 }
 // initOneAPIHandles loads the oneAPI management library and returns the
 // resulting handle and device count. When oneapiLibPath is already known only
 // that library is loaded; otherwise the candidates found by FindGPULibs are
 // tried, oneapiLibPath is updated with the result and any load error is
 // appended to bootstrapErrors.
 //
 // Note: gpuMutex must already be held
 func initOneAPIHandles() *oneapiHandles {
 	handles := new(oneapiHandles)
 	if oneapiLibPath == "" {
 		candidates := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 		if len(candidates) == 0 {
 			return handles
 		}
 		var loadErr error
 		handles.deviceCount, handles.oneapi, oneapiLibPath, loadErr = loadOneapiMgmt(candidates)
 		if loadErr != nil {
 			bootstrapErrors = append(bootstrapErrors, loadErr)
 		}
 		return handles
 	}
 	// Short Circuit if we already know which library to use
 	// ignore bootstrap errors in this case since we already recorded them
 	handles.deviceCount, handles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
 	return handles
 }
 // GetCPUInfo returns a single-element list holding the GpuInfo of the first
 // CPU entry. If discovery has not been bootstrapped yet, GetGPUInfo is called
 // first (after releasing gpuMutex, which GetGPUInfo acquires itself).
 func GetCPUInfo() GpuInfoList {
 	gpuMutex.Lock()
 	needsBootstrap := !bootstrapped
 	gpuMutex.Unlock()
 	if needsBootstrap {
 		GetGPUInfo()
 	}
 	return GpuInfoList{cpus[0].GpuInfo}
 }
 // GetGPUInfo returns the GpuInfo of every discovered CUDA, ROCm and oneAPI
 // GPU, or the CPU entry when no GPU was found.
 //
 // The first call bootstraps discovery: it records CPU details, loads the
 // NVIDIA (and, when envconfig.IntelGPU() is set, Intel) management libraries,
 // enumerates devices and asks AMDGetGPUInfo for ROCm devices. Later calls only
 // refresh the free memory figures of the already discovered devices. Library
 // handles opened during a call are released before it returns.
 //
 // gpuMutex is held for the whole call.
 func GetGPUInfo() GpuInfoList {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
 	needRefresh := true
 	var cHandles *cudaHandles
 	var oHandles *oneapiHandles
 	defer func() {
 		if cHandles != nil {
 			if cHandles.cudart != nil {
 				C.cudart_release(*cHandles.cudart)
 			}
 			if cHandles.nvcuda != nil {
 				C.nvcuda_release(*cHandles.nvcuda)
 			}
 			if cHandles.nvml != nil {
 				C.nvml_release(*cHandles.nvml)
 			}
 		}
 		if oHandles != nil {
 			if oHandles.oneapi != nil {
 				// TODO - is this needed?
 				C.oneapi_release(*oHandles.oneapi)
 			}
 		}
 	}()
 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
 		cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
 		if err != nil {
 			slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
 		}
 		cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
 		if err != nil {
 			slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
 		}
 		bootstrapErrors = []error{}
 		needRefresh = false
 		var memInfo C.mem_info_t
 		mem, err := GetCPUMem()
 		if err != nil {
 			slog.Warn("error looking up system memory", "error", err)
 		}
 		details, err := GetCPUDetails()
 		if err != nil {
 			slog.Warn("failed to lookup CPU details", "error", err)
 		}
 		cpus = []CPUInfo{
 			{
 				GpuInfo: GpuInfo{
 					memInfo: mem,
 					Library: "cpu",
 					ID:      "0",
 				},
 				CPUs: details,
 			},
 		}
 		// Load ALL libraries
 		cHandles = initCudaHandles()
 		// NVIDIA
 		for i := range cHandles.deviceCount {
 			if cHandles.cudart != nil || cHandles.nvcuda != nil {
 				gpuInfo := CudaGPUInfo{
 					GpuInfo: GpuInfo{
 						Library: "cuda",
 					},
 					index: i,
 				}
 				var driverMajor int
 				var driverMinor int
 				if cHandles.cudart != nil {
 					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
 				} else {
 					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
 					driverMajor = int(cHandles.nvcuda.driver_major)
 					driverMinor = int(cHandles.nvcuda.driver_minor)
 				}
 				if memInfo.err != nil {
 					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 					C.free(unsafe.Pointer(memInfo.err))
 					continue
 				}
 				gpuInfo.TotalMemory = uint64(memInfo.total)
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
 				gpuInfo.computeMajor = int(memInfo.major)
 				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
 				// Start with our bundled libraries
 				if variant != "" {
 					variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
 					if _, err := os.Stat(variantPath); err == nil {
 						// Put the variant directory first in the search path to avoid runtime linking to the wrong library
 						gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
 					}
 				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.Variant = variant
 				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
 					unsupportedGPUs = append(unsupportedGPUs,
 						UnsupportedGPUInfo{
 							GpuInfo: gpuInfo.GpuInfo,
 						})
 					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
 					continue
 				}
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
 				if cHandles.nvml != nil {
 					uuid := C.CString(gpuInfo.ID)
 					defer C.free(unsafe.Pointer(uuid))
 					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 					if memInfo.err != nil {
 						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 						C.free(unsafe.Pointer(memInfo.err))
 					} else {
 						if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
 							gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
 							slog.Info("detected OS VRAM overhead",
 								"id", gpuInfo.ID,
 								"library", gpuInfo.Library,
 								"compute", gpuInfo.Compute,
 								"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
 								"name", gpuInfo.Name,
 								"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
 							)
 						}
 					}
 				}
 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 				cudaGPUs = append(cudaGPUs, gpuInfo)
 			}
 		}
 		// Intel
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
 				for d := range oHandles.oneapi.num_drivers {
 					devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
 					for i := range devCount {
 						gpuInfo := OneapiGPUInfo{
 							GpuInfo: GpuInfo{
 								Library: "oneapi",
 							},
 							driverIndex: int(d),
 							gpuIndex:    int(i),
 						}
 						// TODO - split bootstrapping from updating free memory
 						C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
 						// TODO - convert this to MinimumMemory based on testing...
 						var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 						memInfo.free = C.uint64_t(totalFreeMem)
 						gpuInfo.TotalMemory = uint64(memInfo.total)
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 						gpuInfo.DependencyPath = []string{LibOllamaPath}
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
 			}
 		}
 		rocmGPUs, err = AMDGetGPUInfo()
 		if err != nil {
 			bootstrapErrors = append(bootstrapErrors, err)
 		}
 		bootstrapped = true
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
 		}
 		// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
 	}
 	// For detected GPUs, load library if not loaded
 	// Refresh free memory usage
 	if needRefresh {
 		mem, err := GetCPUMem()
 		if err != nil {
 			slog.Warn("error looking up system memory", "error", err)
 		} else {
 			slog.Debug("updating system memory data",
 				slog.Group(
 					"before",
 					"total", format.HumanBytes2(cpus[0].TotalMemory),
 					"free", format.HumanBytes2(cpus[0].FreeMemory),
 					"free_swap", format.HumanBytes2(cpus[0].FreeSwap),
 				),
 				slog.Group(
 					"now",
 					"total", format.HumanBytes2(mem.TotalMemory),
 					"free", format.HumanBytes2(mem.FreeMemory),
 					"free_swap", format.HumanBytes2(mem.FreeSwap),
 				),
 			)
 			cpus[0].FreeMemory = mem.FreeMemory
 			cpus[0].FreeSwap = mem.FreeSwap
 		}
 		var memInfo C.mem_info_t
 		if cHandles == nil && len(cudaGPUs) > 0 {
 			cHandles = initCudaHandles()
 		}
 		for i, gpu := range cudaGPUs {
 			if cHandles.nvml != nil {
 				uuid := C.CString(gpu.ID)
 				defer C.free(unsafe.Pointer(uuid))
 				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 			} else if cHandles.nvcuda != nil {
 				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
 				memInfo.used = memInfo.total - memInfo.free
 			} else {
 				// shouldn't happen
 				slog.Warn("no valid cuda library loaded to refresh vram usage")
 				break
 			}
 			if memInfo.err != nil {
 				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 				C.free(unsafe.Pointer(memInfo.err))
 				continue
 			}
 			if memInfo.free == 0 {
 				slog.Warn("error looking up nvidia GPU memory")
 				continue
 			}
 			if cHandles.nvml != nil && gpu.OSOverhead > 0 {
 				// When using the management library update based on recorded overhead.
 				// The arithmetic is unsigned, so clamp at zero instead of letting a
 				// larger overhead wrap around to a huge bogus free value.
 				if uint64(memInfo.free) > gpu.OSOverhead {
 					memInfo.free -= C.uint64_t(gpu.OSOverhead)
 				} else {
 					memInfo.free = 0
 				}
 			}
 			slog.Debug("updating cuda memory data",
 				"gpu", gpu.ID,
 				"name", gpu.Name,
 				"overhead", format.HumanBytes2(gpu.OSOverhead),
 				slog.Group(
 					"before",
 					"total", format.HumanBytes2(gpu.TotalMemory),
 					"free", format.HumanBytes2(gpu.FreeMemory),
 				),
 				slog.Group(
 					"now",
 					"total", format.HumanBytes2(uint64(memInfo.total)),
 					"free", format.HumanBytes2(uint64(memInfo.free)),
 					"used", format.HumanBytes2(uint64(memInfo.used)),
 				),
 			)
 			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 		if oHandles == nil && len(oneapiGPUs) > 0 {
 			oHandles = initOneAPIHandles()
 		}
 		for i, gpu := range oneapiGPUs {
 			if oHandles.oneapi == nil {
 				// shouldn't happen
 				slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
 				continue
 			}
 			C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
 			// TODO - convert this to MinimumMemory based on testing...
 			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 			memInfo.free = C.uint64_t(totalFreeMem)
 			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
 		if err != nil {
 			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
 	}
 	resp := []GpuInfo{}
 	for _, gpu := range cudaGPUs {
 		resp = append(resp, gpu.GpuInfo)
 	}
 	for _, gpu := range rocmGPUs {
 		resp = append(resp, gpu.GpuInfo)
 	}
 	for _, gpu := range oneapiGPUs {
 		resp = append(resp, gpu.GpuInfo)
 	}
 	if len(resp) == 0 {
 		resp = append(resp, cpus[0].GpuInfo)
 	}
 	return resp
 }
 // FindGPULibs returns the de-duplicated, symlink-resolved paths of every
 // library matching baseLibName, in search order: the bundled LibOllamaPath
 // first, then each directory listed in PATH (windows) or LD_LIBRARY_PATH
 // (linux), and finally the caller supplied defaultPatterns. All entries are
 // treated as glob patterns; glob errors are ignored and patterns containing
 // "PhysX" are skipped.
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Upper bound on symlink hops followed per match so that a link cycle
 	// cannot hang discovery
 	const maxSymlinkHops = 40
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)
 	// search our bundled libraries first
 	patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}
 	// SplitList yields no entries for an unset/empty variable. A plain
 	// strings.Split would yield one empty entry, which filepath.Abs turns into
 	// the current working directory.
 	var ldPaths []string
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = filepath.SplitList(os.Getenv("PATH"))
 	case "linux":
 		ldPaths = filepath.SplitList(os.Getenv("LD_LIBRARY_PATH"))
 	}
 	// then search the system's LD_LIBRARY_PATH
 	for _, dir := range ldPaths {
 		absDir, err := filepath.Abs(dir)
 		if err != nil {
 			continue
 		}
 		patterns = append(patterns, filepath.Join(absDir, baseLibName))
 	}
 	// finally, search the default patterns provided by the caller
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
 	for _, pattern := range patterns {
 		// Nvidia PhysX known to return bogus results
 		if strings.Contains(pattern, "PhysX") {
 			slog.Debug("skipping PhysX cuda library path", "path", pattern)
 			continue
 		}
 		// Ignore glob discovery errors
 		matches, _ := filepath.Glob(pattern)
 		for _, match := range matches {
 			// Resolve any links so we don't try the same lib multiple times
 			// and weed out any dups across globs
 			libPath := match
 			for hops := 0; hops < maxSymlinkHops; hops++ {
 				target, err := os.Readlink(libPath)
 				if err != nil {
 					// not a symlink (or unreadable): libPath is final
 					break
 				}
 				if !filepath.IsAbs(target) {
 					// relative link targets are relative to the link's directory
 					target = filepath.Join(filepath.Dir(libPath), target)
 				}
 				libPath = target
 			}
 			alreadyFound := false
 			for _, known := range gpuLibPaths {
 				if known == libPath {
 					alreadyFound = true
 					break
 				}
 			}
 			if !alreadyFound {
 				gpuLibPaths = append(gpuLibPaths, libPath)
 			}
 		}
 	}
 	slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
 	return gpuLibPaths
 }
 // loadCUDARTMgmt bootstraps the CUDA runtime library, trying each candidate
 // path in order and stopping at the first one cudart_init accepts.
 // Returns: num devices, handle, libPath, error
 // When no candidate loads, the handle is nil and the error describes the
 // last failure (nil if the list was empty).
 func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
 	var (
 		initResp C.cudart_init_resp_t
 		lastErr  error
 	)
 	initResp.ch.verbose = getVerboseState()
 	for _, candidate := range cudartLibPaths {
 		cPath := C.CString(candidate)
 		defer C.free(unsafe.Pointer(cPath))
 		C.cudart_init(cPath, &initResp)
 		if initResp.err == nil {
 			return int(initResp.num_devices), &initResp.ch, candidate, nil
 		}
 		lastErr = fmt.Errorf("Unable to load cudart library %s: %s", candidate, C.GoString(initResp.err))
 		slog.Debug(lastErr.Error())
 		C.free(unsafe.Pointer(initResp.err))
 	}
 	return 0, nil, "", lastErr
 }
 // loadNVCUDAMgmt bootstraps the CUDA driver library, trying each candidate
 // path in order and stopping at the first one nvcuda_init accepts.
 // Returns: num devices, handle, libPath, error
 // When no candidate loads, the handle is nil and the error describes the
 // most recent failure that was recorded (32bit libraries are skipped without
 // recording an error).
 func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
 	var resp C.nvcuda_init_resp_t
 	resp.ch.verbose = getVerboseState()
 	var err error
 	for _, libPath := range nvcudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvcuda_init(lib, &resp)
 		if resp.err == nil {
 			return int(resp.num_devices), &resp.ch, libPath, nil
 		}
 		// Decide what log level based on the type of error message to help users understand why
 		switch resp.cudaErr {
 		case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
 			err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
 			slog.Warn(err.Error())
 		case C.CUDA_ERROR_NO_DEVICE:
 			err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
 			slog.Info(err.Error())
 		case C.CUDA_ERROR_UNKNOWN:
 			err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
 			slog.Warn(err.Error())
 		default:
 			msg := C.GoString(resp.err)
 			if strings.Contains(msg, "wrong ELF class") {
 				slog.Debug("skipping 32bit library", "library", libPath)
 			} else {
 				// This is the driver (nvcuda) loader, so name that library
 				// rather than cudart in the reported error
 				err = fmt.Errorf("Unable to load nvcuda library %s: %s", libPath, msg)
 				slog.Info(err.Error())
 			}
 		}
 		C.free(unsafe.Pointer(resp.err))
 	}
 	return 0, nil, "", err
 }
 // loadNVMLMgmt bootstraps the NVML management library, trying each candidate
 // path in order and stopping at the first one nvml_init accepts.
 // Returns: handle, libPath, error
 // When no candidate loads, the handle is nil and the error describes the
 // last failure (nil if the list was empty).
 func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
 	var (
 		initResp C.nvml_init_resp_t
 		lastErr  error
 	)
 	initResp.ch.verbose = getVerboseState()
 	for _, candidate := range nvmlLibPaths {
 		cPath := C.CString(candidate)
 		defer C.free(unsafe.Pointer(cPath))
 		C.nvml_init(cPath, &initResp)
 		if initResp.err == nil {
 			return &initResp.ch, candidate, nil
 		}
 		lastErr = fmt.Errorf("Unable to load NVML management library %s: %s", candidate, C.GoString(initResp.err))
 		slog.Info(lastErr.Error())
 		C.free(unsafe.Pointer(initResp.err))
 	}
 	return nil, "", lastErr
 }
 // loadOneapiMgmt bootstraps the Intel GPU library, trying each candidate path
 // in order and stopping at the first one oneapi_init accepts. The device
 // count is the sum of oneapi_get_device_count over all reported drivers.
 // Returns: num devices, handle, libPath, error
 func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
 	var (
 		initResp C.oneapi_init_resp_t
 		lastErr  error
 	)
 	initResp.oh.verbose = getVerboseState()
 	for _, candidate := range oneapiLibPaths {
 		cPath := C.CString(candidate)
 		defer C.free(unsafe.Pointer(cPath))
 		C.oneapi_init(cPath, &initResp)
 		if initResp.err != nil {
 			lastErr = fmt.Errorf("Unable to load oneAPI management library %s: %s", candidate, C.GoString(initResp.err))
 			slog.Debug(lastErr.Error())
 			C.free(unsafe.Pointer(initResp.err))
 			continue
 		}
 		total := 0
 		for driver := range initResp.oh.num_drivers {
 			total += int(C.oneapi_get_device_count(initResp.oh, C.int(driver)))
 		}
 		return total, &initResp.oh, candidate, nil
 	}
 	return 0, nil, "", lastErr
 }
 // getVerboseState returns 1 when envconfig.Debug() reports true and 0
 // otherwise, as the C type stored in the handles' verbose field.
 func getVerboseState() C.uint16_t {
 	var verbose C.uint16_t
 	if envconfig.Debug() {
 		verbose = 1
 	}
 	return verbose
 }
 // GetVisibleDevicesEnv returns the name and value of the visible devices
 // environment variable for the GPUs this instantiation is targeted for.
 //
 // The library of the first entry decides which helper is used; if different
 // libraries are present, the first one wins. An empty list, or a library
 // without a filter, yields two empty strings.
 func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	if len(l) == 0 {
 		return "", ""
 	}
 	library := l[0].Library
 	if library == "cuda" {
 		return cudaGetVisibleDevicesEnv(l)
 	}
 	if library == "rocm" {
 		return rocmGetVisibleDevicesEnv(l)
 	}
 	if library == "oneapi" {
 		return oneapiGetVisibleDevicesEnv(l)
 	}
 	slog.Debug("no filter required for library " + library)
 	return "", ""
 }
 // GetSystemInfo refreshes GPU discovery through GetGPUInfo and returns the
 // first CPU entry, the discovered GPUs, the unsupported GPUs and the
 // bootstrap errors rendered as strings. When GetGPUInfo reports only the CPU
 // fallback entry, GPUs is returned as an empty list.
 func GetSystemInfo() SystemInfo {
 	// GetGPUInfo takes gpuMutex itself, so it must run before we lock
 	gpus := GetGPUInfo()
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
 	discoveryErrors := make([]string, 0, len(bootstrapErrors))
 	for _, err := range bootstrapErrors {
 		discoveryErrors = append(discoveryErrors, err.Error())
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		gpus = []GpuInfo{}
 	}
 	return SystemInfo{
 		System:          cpus[0],
 		GPUs:            gpus,
 		UnsupportedGPUs: unsupportedGPUs,
 		DiscoveryErrors: discoveryErrors,
 	}
 }
--- a/discover/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@@ -1,3 +1,5 @@
 //go:build darwin
 package discover
 /*
@@ -9,6 +11,7 @@ import "C"
 import (
 	"log/slog"
 	"runtime"
 	"syscall"
 	"github.com/ollama/ollama/format"
@@ -18,6 +21,39 @@ const (
 	metalMinimumMemory = 512 * format.MebiByte
 )
 // GetGPUInfo returns a single entry describing the device to use on darwin.
 // On amd64 builds that is a "cpu" entry carrying the system memory info; on
 // every other architecture it is a "metal" entry whose total and free memory
 // are both set to the value of getRecommendedMaxVRAM.
 func GetGPUInfo() GpuInfoList {
 	mem, _ := GetCPUMem()
 	if runtime.GOARCH != "amd64" {
 		metal := GpuInfo{Library: "metal", ID: "0"}
 		metal.TotalMemory = uint64(C.getRecommendedMaxVRAM())
 		// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
 		metal.FreeMemory = metal.TotalMemory
 		metal.MinimumMemory = metalMinimumMemory
 		return []GpuInfo{metal}
 	}
 	cpuOnly := GpuInfo{Library: "cpu", memInfo: mem}
 	return []GpuInfo{cpuOnly}
 }
 // GetCPUInfo returns a single "cpu" entry carrying the memory info reported
 // by GetCPUMem; any error from GetCPUMem is ignored.
 func GetCPUInfo() GpuInfoList {
 	mem, _ := GetCPUMem()
 	cpu := GpuInfo{Library: "cpu", memInfo: mem}
 	return []GpuInfo{cpu}
 }
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
@@ -26,7 +62,13 @@ func GetCPUMem() (memInfo, error) {
 	}, nil
 }
-func GetCPUDetails() []CPU {
+func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	// No-op on darwin
 	return "", ""
 }
 func GetSystemInfo() SystemInfo {
 	mem, _ := GetCPUMem()
 	query := "hw.perflevel0.physicalcpu"
 	perfCores, err := syscall.SysctlUint32(query)
 	if err != nil {
@@ -39,16 +81,19 @@ func GetCPUDetails() []CPU {
 	query = "hw.logicalcpu"
 	logicalCores, _ := syscall.SysctlUint32(query)
-	return []CPU{
+	return SystemInfo{
-		{
+		System: CPUInfo{
-			CoreCount:           int(perfCores + efficiencyCores),
+			GpuInfo: GpuInfo{
-			EfficiencyCoreCount: int(efficiencyCores),
+				memInfo: mem,
-			ThreadCount:         int(logicalCores),
+			},
 			CPUs: []CPU{
 				{
 					CoreCount:           int(perfCores + efficiencyCores),
 					EfficiencyCoreCount: int(efficiencyCores),
 					ThreadCount:         int(logicalCores),
 				},
 			},
 		},
 		GPUs: GetGPUInfo(),
 	}
 }
 // IsNUMA always reports false on darwin.
 func IsNUMA() bool {
 	// numa support in ggml is linux only
 	const numaSupported = false
 	return numaSupported
 }
--- a/discover/gpu_info.h
+++ b/discover/gpu_info.h
@@ -0,0 +1,70 @@
 #ifndef __APPLE__
 #ifndef __GPU_INFO_H__
 #define __GPU_INFO_H__
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #ifndef _WIN32
 #include <dlfcn.h>
 #define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
 #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
 #define LOAD_ERR() strdup(dlerror())
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #else
 #include <windows.h>
 #define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
 #define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
 #define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
 #define LOAD_ERR() ({\
  LPSTR messageBuffer = NULL; \
  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
  char *resp = strdup(messageBuffer); \
  LocalFree(messageBuffer); \
  resp; \
 })
 #endif
 #define LOG(verbose, ...) \
  do { \
    if (verbose) { \
      fprintf(stderr, __VA_ARGS__); \
    } \
  } while (0)
 #ifdef __cplusplus
 extern "C" {
 #endif
 #define GPU_ID_LEN 64
 #define GPU_NAME_LEN 96
 // Result record shared by the GPU discovery helpers: an optional error
 // string plus the identity, memory figures and compute capability of one
 // device.
 typedef struct mem_info {
  char *err;  // If non-NULL, caller responsible for freeing
  char gpu_id[GPU_ID_LEN];      // NUL-terminated device identifier
  char gpu_name[GPU_NAME_LEN];  // NUL-terminated device name
  uint64_t total;
  uint64_t free;
  uint64_t used;
  // Compute Capability
  int major;
  int minor;
  int patch;
 } mem_info_t;
 void cpu_check_ram(mem_info_t *resp);
 #ifdef __cplusplus
 }
 #endif
 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
 #include "gpu_info_nvml.h"
 #include "gpu_info_oneapi.h"
 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/discover/gpu_info_cudart.c
+++ b/discover/gpu_info_cudart.c
@@ -0,0 +1,183 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 #include <inttypes.h>
 #include <string.h>
 #include "gpu_info_cudart.h"
 // Load the CUDA runtime library at cudart_lib_path, resolve the entry points
 // used for discovery into resp->ch, select device 0 and query the device
 // count into resp->num_devices.
 //
 // On failure resp->err is set to a heap allocated message (caller frees), the
 // library is unloaded and resp->ch.handle is NULL. On success resp->err is
 // NULL.
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  cudartReturn_t ret;
  resp->err = NULL;
  resp->num_devices = 0;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Table of symbol name -> slot in the handle to store the resolved address
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
      {"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
      {NULL, NULL},
  };
  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
    snprintf(buf, buflen,
            "Unable to load %s library to query for Nvidia GPUs: %s",
            cudart_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }
  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!*(l[i].p)) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
              msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }
  ret = (*resp->ch.cudaSetDevice)(0);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    // ret is a cudartReturn_t, so compare against the runtime API constant
    if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
      resp->err = strdup("your nvidia driver is too old or missing.  If you have a CUDA GPU please upgrade to run ollama");
      return;
    }
    snprintf(buf, buflen, "cudart init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  int version = 0;
  cudartDriverVersion_t driverVersion;
  driverVersion.major = 0;
  driverVersion.minor = 0;
  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cudaDriverGetVersion)(&version);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  } else {
    // version is decoded as 1000 * major + 10 * minor
    driverVersion.major = version / 1000;
    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  }
  ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
 }
 // Query device i through the CUDA runtime and fill resp with its id, compute
 // capability and memory figures.
 //
 // resp->gpu_id is "GPU-<uuid>" when the device reports a non-zero UUID and
 // the decimal device index otherwise. If the device properties lookup fails,
 // the compute capability is reported as 0.0 and the memory query still runs.
 // On failure resp->err is set to a heap allocated message (caller frees);
 // otherwise it is NULL.
 void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  cudartMemory_t memInfo = {0,0,0};
  cudartReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  if (h.handle == NULL) {
    resp->err = strdup("cudart handle isn't initialized");
    return;
  }
  ret = (*h.cudaSetDevice)(i);
  if (ret != CUDART_SUCCESS) {
    snprintf(buf, buflen, "cudart device failed to initialize");
    resp->err = strdup(buf);
    return;
  }
  cudaDeviceProp_t props;
  ret = (*h.cudaGetDeviceProperties)(&props, i);
  if (ret != CUDART_SUCCESS) {
    LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
    resp->major = 0;
    resp->minor = 0;
  } else {
    int allNull = 1;
    for (int j = 0; j < 16; j++) {
      if (props.uuid.bytes[j] != 0) {
        allNull = 0;
        break;
      }
    }
    if (allNull != 0) {
      // No UUID reported: fall back to the device index as the id
      snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
    } else {
      // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
      snprintf(&resp->gpu_id[0], GPU_ID_LEN,
          "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
          props.uuid.bytes[0],
          props.uuid.bytes[1],
          props.uuid.bytes[2],
          props.uuid.bytes[3],
          props.uuid.bytes[4],
          props.uuid.bytes[5],
          props.uuid.bytes[6],
          props.uuid.bytes[7],
          props.uuid.bytes[8],
          props.uuid.bytes[9],
          props.uuid.bytes[10],
          props.uuid.bytes[11],
          props.uuid.bytes[12],
          props.uuid.bytes[13],
          props.uuid.bytes[14],
          props.uuid.bytes[15]
        );
    }
    resp->major = props.major;
    resp->minor = props.minor;
    // TODO add other useful properties from props
  }
  ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
  if (ret != CUDART_SUCCESS) {
    snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
    resp->err = strdup(buf);
    return;
  }
  // cudaMemGetInfo only reports free and total, so derive used from them
  // (guarding the unsigned subtraction) instead of leaving it at zero
  memInfo.used = memInfo.total >= memInfo.free ? memInfo.total - memInfo.free : 0;
  resp->total = memInfo.total;
  resp->free = memInfo.free;
  resp->used = memInfo.used;
  // PRIu64 keeps the format correct where unsigned long is only 32 bits
  LOG(h.verbose, "[%s] CUDA totalMem %" PRIu64 "\n", resp->gpu_id, resp->total);
  LOG(h.verbose, "[%s] CUDA freeMem %" PRIu64 "\n", resp->gpu_id, resp->free);
  LOG(h.verbose, "[%s] CUDA usedMem %" PRIu64 "\n", resp->gpu_id, resp->used);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
 // Unload the CUDA runtime library referenced by h, if one is loaded.
 // h is passed by value, so the caller's copy of the handle cannot be cleared
 // here and must not be used again after this call.
 void cudart_release(cudart_handle_t h) {
  LOG(h.verbose, "releasing cudart library\n");
  if (h.handle != NULL) {
    UNLOAD_LIBRARY(h.handle);
  }
 }
 #endif  // __APPLE__
--- a/discover/gpu_info_cudart.h
+++ b/discover/gpu_info_cudart.h
@@ -0,0 +1,148 @@
 #ifndef __APPLE__
 #ifndef __GPU_INFO_CUDART_H__
 #define __GPU_INFO_CUDART_H__
 #include "gpu_info.h"
 // Just enough typedefs to dlopen/dlsym for memory information
 // Status codes returned through the function pointers in cudart_handle_t.
 // Only the values listed here are declared locally.
 // NOTE(review): the numeric values presumably mirror the CUDA runtime's own
 // error enum — confirm against the CUDA headers.
 typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
  CUDART_ERROR_INVALID_VALUE = 1,
  CUDART_ERROR_MEMORY_ALLOCATION = 2,
  CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
  // Other values omitted for now...
 } cudartReturn_t;
 // Attribute selectors accepted by the cudaDeviceGetAttribute function pointer
 // in cudart_handle_t.
 // NOTE(review): the numeric values presumably mirror the CUDA runtime's
 // device attribute enum — confirm against the CUDA headers.
 typedef enum cudartDeviceAttr_enum {
  cudartDevAttrComputeCapabilityMajor = 75,
  cudartDevAttrComputeCapabilityMinor = 76,
  // TODO - not yet wired up but may be useful for Jetson or other
  // integrated GPU scenarios with shared memory
  cudaDevAttrIntegrated = 18
 } cudartDeviceAttr_t;
 // Opaque device pointer type. None of the function pointers declared in
 // cudart_handle_t take it; they identify devices by int ordinal.
 typedef void *cudartDevice_t;  // Opaque is sufficient
 // Scratch struct for memory figures. cudart_bootstrap passes &free and &total
 // to cudaMemGetInfo and copies all three fields into the response.
 typedef struct cudartMemory_st {
  size_t total;
  size_t free;
  size_t used;
 } cudartMemory_t;
 // Major/minor pair for a driver version.
 typedef struct cudartDriverVersion {
  int major;
  int minor;
 } cudartDriverVersion_t;
 // 16 raw UUID bytes. cudart_bootstrap formats them as
 // "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" unless all bytes are zero.
 typedef struct cudaUUID {
    unsigned char bytes[16];
 } cudaUUID_t;
 // Device properties struct filled in through the cudaGetDeviceProperties
 // function pointer in cudart_handle_t. The visible part of cudart_bootstrap
 // reads only uuid, major and minor from it.
 // NOTE(review): the dynamically loaded library writes into this struct, so
 // field order and sizes presumably have to match the layout of the CUDA
 // runtime version being loaded — do not reorder or resize fields without
 // confirming against the CUDA headers.
 typedef struct cudaDeviceProp {
    char         name[256];                  /**< ASCII string identifying device */
    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
    int          regsPerBlock;               /**< 32-bit registers available per block */
    int          warpSize;                   /**< Warp size in threads */
    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
    int          clockRate;                  /**< Clock frequency in kilohertz */
    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
    int          major;                      /**< Major compute capability */
    int          minor;                      /**< Minor compute capability */
    size_t       textureAlignment;           /**< Alignment requirement for textures */
    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
    int          multiProcessorCount;        /**< Number of multiprocessors on device */
    int          kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */
    int          integrated;                 /**< Device is integrated as opposed to discrete */
    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
    int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
    int          maxTexture1D;               /**< Maximum 1D texture size */
    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
    int          maxTexture1DLinear;         /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
    int          maxSurface1D;               /**< Maximum 1D surface size */
    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
    int          ECCEnabled;                 /**< Device has ECC support enabled */
    int          pciBusID;                   /**< PCI bus ID of the device */
    int          pciDeviceID;                /**< PCI device ID of the device */
    int          pciDomainID;                /**< PCI domain ID of the device */
    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
    int          asyncEngineCount;           /**< Number of asynchronous engines */
    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
    int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
    int          memoryBusWidth;             /**< Global memory bus width in bits */
    int          l2CacheSize;                /**< Size of L2 cache in bytes */
    int          persistingL2CacheMaxSize;   /**< Device's maximum l2 persisting lines capacity setting in bytes */
    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
    int          managedMemory;              /**< Device supports allocating managed memory on this system */
    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
    int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
    int          computePreemptionSupported; /**< Device supports Compute Preemption */
    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
    int          cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
    int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
    int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
    size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
  } cudaDeviceProp_t;
 // State for a dynamically loaded cudart library: the library handle, the
 // verbosity value passed as the first argument to LOG, and the function
 // pointers that the cudart_* functions call through.
 // The struct is passed by value to cudart_bootstrap and cudart_release.
 typedef struct cudart_handle {
  void *handle;      // library handle; cudart_release passes it to UNLOAD_LIBRARY
  uint16_t verbose;  // first argument to every LOG call made with this handle
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);  // called as (&free, &total)
  cudartReturn_t (*cudaGetDeviceCount)(int *);
  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
  cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
 } cudart_handle_t;
 // Result of cudart_init: an error string (NULL on success), the populated
 // library handle, and a device count.
 typedef struct cudart_init_resp {
  char *err;  // If err is non-null handle is invalid
  cudart_handle_t ch;
  int num_devices;
 } cudart_init_resp_t;
 // Initializes the cudart library at cudart_lib_path and reports the outcome
 // in resp (see cudart_init_resp_t).
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
 // Fills resp with the id, compute capability and memory figures of device
 // device_id; on failure resp->err is set to an allocated message.
 void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
 // TODO - if we keep this library longer term, add cudart_get_free
 // Unloads the library referenced by ch. ch is taken by value, so the caller's
 // copy is left unchanged.
 void cudart_release(cudart_handle_t ch);
 #endif  // __GPU_INFO_CUDART_H__
 #endif  // __APPLE__
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@@ -0,0 +1,250 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 #include <string.h>
 #include "gpu_info_nvcuda.h"
 // Loads the CUDA driver library at nvcuda_lib_path, resolves every entry point
 // listed in l[] into resp->ch, calls cuInit(0), records the driver version
 // (best effort) and stores the device count in resp->num_devices.
 //
 // On a fatal failure resp->err is set to a strdup'd message and resp->cudaErr
 // is set (-1 for load/symbol failures, otherwise the failing CUresult). If the
 // library had already been loaded it is unloaded and resp->ch.handle is set
 // to NULL. resp->ch.verbose is read before anything is written, so the caller
 // must have set it.
 // NOTE(review): resp->err is heap allocated; presumably the caller frees it.
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
  CUresult ret;
  resp->err = NULL;
  resp->num_devices = 0;
  resp->cudaErr = CUDA_SUCCESS;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Table of symbol name -> address of the function pointer slot to fill.
  // Terminated by a {NULL, NULL} entry.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cuInit", (void *)&resp->ch.cuInit},
      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
      {NULL, NULL},
  };
  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    // msg is freed below, so LOAD_ERR is expected to return an allocated string
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
    snprintf(buf, buflen,
            "Unable to load %s library to query for Nvidia GPUs: %s",
            nvcuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    resp->cudaErr = -1;
    return;
  }
  // Resolve each symbol; a single missing symbol aborts initialization.
  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!*(l[i].p)) {
      // Fetch the error text first, then unload while the handle is still
      // valid, and only then clear the handle.
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
              msg);
      free(msg);
      resp->err = strdup(buf);
      resp->cudaErr = -1;
      return;
    }
    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
  }
  LOG(resp->ch.verbose, "calling cuInit\n");
  ret = (*resp->ch.cuInit)(0);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
    resp->err = strdup(buf);
    resp->cudaErr = ret;
    return;
  }
  int version = 0;
  resp->ch.driver_major = 0;
  resp->ch.driver_minor = 0;
  // Report driver version if we're in verbose mode, ignore errors
  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
    // Decode as major = version / 1000, minor = (remainder) / 10,
    // e.g. 12040 yields 12.4.
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }
  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    resp->cudaErr = ret;
    return;
  }
  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }
 // Size used for the error-message buffer in nvcuda_bootstrap.
 const int buflen = 256;
 // Queries device ordinal i through the CUDA driver API and fills resp with
 // its compute capability, id, name and total/free memory.
 //
 // - resp->gpu_id is "GPU-<uuid>" when the UUID lookup succeeds, otherwise the
 //   decimal ordinal i.
 // - resp->major / resp->minor are 0 unless both attribute lookups succeed.
 // - A temporary context is created to read memory info and destroyed again.
 // - On a fatal failure resp->err is set to a strdup'd message and the
 //   function returns early; resp->total / resp->free are then left untouched.
 void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  // Give the compute capability a defined value up front: it is logged at the
  // end of this function even when the attribute lookups below fail.
  resp->major = 0;
  resp->minor = 0;
  nvcudaMemory_t memInfo = {0,0};
  CUresult ret;
  CUdevice device = -1;
  CUcontext ctx = NULL;
  char buf[buflen + 1];
  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  if (h.handle == NULL) {
    resp->err = strdup("cuda driver library handle isn't initialized");
    return;
  }
  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library device failed to initialize");
    resp->err = strdup(buf);
    return;
  }
  // Compute capability is best effort: failures are logged, not fatal, and the
  // pair is only published when both halves were read successfully.
  int major = 0;
  int minor = 0;
  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  } else {
    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
    if (ret != CUDA_SUCCESS) {
      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
    } else {
      resp->minor = minor;
      resp->major = major;
    }
  }
  ret = (*h.cuDeviceGetUuid)(&uuid, device);
  if (ret != CUDA_SUCCESS) {
    // Fall back to the ordinal as the identifier
    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  } else {
    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        uuid.bytes[0],
        uuid.bytes[1],
        uuid.bytes[2],
        uuid.bytes[3],
        uuid.bytes[4],
        uuid.bytes[5],
        uuid.bytes[6],
        uuid.bytes[7],
        uuid.bytes[8],
        uuid.bytes[9],
        uuid.bytes[10],
        uuid.bytes[11],
        uuid.bytes[12],
        uuid.bytes[13],
        uuid.bytes[14],
        uuid.bytes[15]
      );
  }
  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
    resp->gpu_name[0] = '\0';
  }
  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
    resp->err = strdup(buf);
    return;
  }
  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
    resp->err = strdup(buf);
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }
  resp->total = memInfo.total;
  resp->free = memInfo.free;
  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to release device context %d\n", ret);
  }
 }
 // Reads the current free and total memory of device ordinal i.
 //
 // A temporary context is created for the query and destroyed afterwards.
 // On any failure the problem is logged and both *free and *total are 0.
 void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
  CUresult ret;
  CUcontext ctx = NULL;
  CUdevice device = -1;
  *free = 0;
  *total = 0;
  // Same guard as nvcuda_bootstrap: without a loaded library the function
  // pointers below must not be called.
  if (h.handle == NULL) {
    LOG(1, "cuda driver library handle isn't initialized\n");
    return;
  }
  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library device failed to initialize\n");
    return;
  }
  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to get device context %d\n", ret);
    return;
  }
  ret = (*h.cuMemGetInfo_v2)(free, total);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library device memory info lookup failure %d\n", ret);
    // Don't report values from a failed query
    *free = 0;
    *total = 0;
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }
  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to release device context %d\n", ret);
  }
 }
 // Unloads the CUDA driver library referenced by h.
 //
 // h is passed by value, so the caller's copy of the handle cannot be cleared
 // from here; the caller must stop using its handle once this returns.
 void nvcuda_release(nvcuda_handle_t h) {
  LOG(h.verbose, "releasing cuda driver library\n");
  // Nothing to unload: never hand a NULL handle to the platform unload call.
  if (h.handle == NULL) {
    return;
  }
  UNLOAD_LIBRARY(h.handle);
  // TODO and other context release logic?
 }
 #endif  // __APPLE__
--- a/discover/gpu_info_nvcuda.h
+++ b/discover/gpu_info_nvcuda.h
@@ -0,0 +1,79 @@
 #ifndef __APPLE__
 #ifndef __GPU_INFO_NVCUDA_H__
 #define __GPU_INFO_NVCUDA_H__
 #include "gpu_info.h"
 // Just enough typedefs to dlopen/dlsym for memory information
 // Status codes returned through the function pointers in nvcuda_handle_t.
 // Only the values listed here are declared locally.
 // NOTE(review): the numeric values presumably mirror the CUDA driver API's
 // CUresult — confirm against the CUDA headers.
 typedef enum cudaError_enum {
  CUDA_SUCCESS = 0,
  CUDA_ERROR_INVALID_VALUE = 1,
  CUDA_ERROR_OUT_OF_MEMORY = 2,
  CUDA_ERROR_NOT_INITIALIZED = 3,
  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
  CUDA_ERROR_NO_DEVICE = 100,
  CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
  CUDA_ERROR_UNKNOWN = 999,
  // Other values omitted for now...
 } CUresult;
 // Attribute selectors passed to cuDeviceGetAttribute. nvcuda_bootstrap uses
 // the two compute capability values to fill in major/minor.
 // NOTE(review): the numeric values presumably mirror the CUDA driver API's
 // CUdevice_attribute — confirm against the CUDA headers.
 typedef enum CUdevice_attribute_enum {
  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
  // TODO - not yet wired up but may be useful for Jetson or other
  // integrated GPU scenarios with shared memory
  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
 } CUdevice_attribute;
 // Opaque device pointer type. The function pointers in nvcuda_handle_t use
 // CUdevice rather than this type.
 typedef void *nvcudaDevice_t;  // Opaque is sufficient
 // Scratch struct for memory figures; nvcuda_bootstrap passes &free and &total
 // to cuMemGetInfo_v2.
 typedef struct nvcudaMemory_st {
  uint64_t total;
  uint64_t free;
 } nvcudaMemory_t;
 // Major/minor pair for a driver version.
 typedef struct nvcudaDriverVersion {
  int major;
  int minor;
 } nvcudaDriverVersion_t;
 // 16 raw UUID bytes as filled in by cuDeviceGetUuid; nvcuda_bootstrap formats
 // them as "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx".
 typedef struct CUuuid_st {
    unsigned char bytes[16];
 } CUuuid;
 // Device identifier obtained from cuDeviceGet for a given ordinal.
 typedef int CUdevice;
 // Opaque context pointer created by cuCtxCreate_v3 and released with
 // cuCtxDestroy.
 typedef void* CUcontext;
 // State for a dynamically loaded CUDA driver library: the library handle,
 // the verbosity value passed as the first argument to LOG, the driver version
 // decoded by nvcuda_init, and the function pointers nvcuda_init resolves by
 // name. The struct is passed by value to the other nvcuda_* functions.
 typedef struct nvcuda_handle {
  void *handle;      // library handle; set to NULL by nvcuda_init on failure
  uint16_t verbose;  // first argument to LOG calls made with this handle
  int driver_major;  // 0 unless cuDriverGetVersion succeeded in nvcuda_init
  int driver_minor;
  CUresult (*cuInit)(unsigned int Flags);
  CUresult (*cuDriverGetVersion)(int *driverVersion);
  CUresult (*cuDeviceGetCount)(int *);
  CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
  CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
  CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
  CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
  // Context specific aspects
  CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
  CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
  CUresult (*cuCtxDestroy)(CUcontext ctx);
 } nvcuda_handle_t;
 // Result of nvcuda_init: an error string (NULL on success), the populated
 // library handle, the device count, and an error code. nvcuda_init sets
 // cudaErr to CUDA_SUCCESS on success, -1 for library load / symbol lookup
 // failures, and the failing call's CUresult otherwise.
 typedef struct nvcuda_init_resp {
  char *err;  // If err is non-null handle is invalid
  nvcuda_handle_t ch;
  int num_devices;
  CUresult cudaErr;
 } nvcuda_init_resp_t;
 // Loads the driver library, resolves its symbols, calls cuInit and reports
 // the device count (or an error) in resp.
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
 // Fills resp with id, name, compute capability and memory figures for the
 // device with ordinal device_id; sets resp->err on fatal failure.
 void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
 // Writes the device's current free/total memory; both outputs start at 0
 // and failures are only logged.
 void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free, uint64_t *total);
 // Unloads the library referenced by ch. ch is taken by value, so the caller's
 // copy is left unchanged.
 void nvcuda_release(nvcuda_handle_t ch);
 #endif  // __GPU_INFO_NVCUDA_H__
 #endif  // __APPLE__
--- a/discover/gpu_info_nvml.c
+++ b/discover/gpu_info_nvml.c
@@ -0,0 +1,104 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 #include <string.h>
 #include "gpu_info_nvml.h"
 // Loads the NVML library at nvml_lib_path, resolves the entry points listed
 // in l[] into resp->ch and calls nvmlInit_v2.
 //
 // On failure resp->err is set to a strdup'd message. If the library had been
 // loaded it is unloaded again and resp->ch.handle is set to NULL.
 // resp->ch.verbose is read but never written here, so the caller must have
 // set it.
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Table of symbol name -> address of the function pointer slot to fill.
  // Terminated by a {NULL, NULL} entry.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
      {NULL, NULL},
  };
  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             nvml_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }
  // TODO once we've squashed the remaining corner cases remove this log
  // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!*(l[i].p)) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      // Unload while the handle is still valid and only then clear it;
      // clearing first would leak the library and unload a NULL handle.
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }
  ret = (*resp->ch.nvmlInit_v2)();
  if (ret != NVML_SUCCESS) {
    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
 }
 // Looks up the device with the given UUID string and writes its free, total
 // and used memory to the output pointers.
 //
 // All three outputs are set to 0 first, so on any failure (which is only
 // logged) the caller sees zeros rather than stale or uninitialized values.
 void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
    nvmlDevice_t device;
    nvmlMemory_t memInfo = {0};
    nvmlReturn_t ret;
    *free = 0;
    *total = 0;
    *used = 0;
    // nvml_init clears the handle on failure, in which case the function
    // pointers below may not have been resolved.
    if (h.handle == NULL) {
        LOG(1, "nvml library handle isn't initialized\n");
        return;
    }
    ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
    if (ret != NVML_SUCCESS) {
        LOG(1, "unable to get device handle %s: %d\n", uuid, ret);
        return;
    }
    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
        LOG(1, "device memory info lookup failure %s: %d\n", uuid, ret);
        return;
    }
    *free = memInfo.free;
    *total = memInfo.total;
    *used = memInfo.used;
 }
 // Shuts NVML down and unloads the library referenced by h.
 //
 // h is passed by value, so the caller's copy of the handle cannot be cleared
 // from here; the caller must stop using its handle once this returns.
 void nvml_release(nvml_handle_t h) {
  LOG(h.verbose, "releasing nvml library\n");
  // nvml_init clears the handle when it fails, possibly before nvmlShutdown
  // was resolved, so there is nothing safe to call or unload in that case.
  if (h.handle == NULL) {
    return;
  }
  nvmlReturn_t ret;
  ret = (*h.nvmlShutdown)();
  if (ret != NVML_SUCCESS) {
    LOG(1, "error during nvmlShutdown %d\n", ret);
  }
  UNLOAD_LIBRARY(h.handle);
 }
 #endif  // __APPLE__
--- a/discover/gpu_info_nvml.h
+++ b/discover/gpu_info_nvml.h
@@ -0,0 +1,48 @@
 #ifndef __APPLE__
 #ifndef __GPU_INFO_NVML_H__
 #define __GPU_INFO_NVML_H__
 #include "gpu_info.h"
 // Just enough typedefs to dlopen/dlsym for memory information
 // Status code returned through the function pointers in nvml_handle_t. Only
 // the success value is declared; callers compare against NVML_SUCCESS and
 // log any other value numerically.
 typedef enum nvmlReturn_enum {
  NVML_SUCCESS = 0,
  // Other values omitted for now...
 } nvmlReturn_t;
 // Opaque device handle filled in by nvmlDeviceGetHandleByUUID.
 typedef void *nvmlDevice_t;  // Opaque is sufficient
 // Memory figures filled in by nvmlDeviceGetMemoryInfo; nvml_get_free copies
 // the three fields to its outputs.
 // NOTE(review): units are presumably bytes — confirm against the NVML docs.
 typedef struct nvmlMemory_st {
  unsigned long long total;
  unsigned long long free;
  unsigned long long used;
 } nvmlMemory_t;
 // Brand type enum; only the UNKNOWN value is declared. Nothing in the nvml
 // function pointer table declared in this header refers to it.
 typedef enum nvmlBrandType_enum
 {
    NVML_BRAND_UNKNOWN          = 0,
 } nvmlBrandType_t;
 // State for a dynamically loaded NVML library: the library handle, the
 // verbosity value passed as the first argument to LOG, and the function
 // pointers nvml_init resolves by name. The struct is passed by value to
 // nvml_get_free and nvml_release.
 typedef struct nvml_handle {
  void *handle;      // library handle; set to NULL by nvml_init on failure
  uint16_t verbose;  // first argument to LOG calls made with this handle
  nvmlReturn_t (*nvmlInit_v2)(void);
  nvmlReturn_t (*nvmlShutdown)(void);
  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;
 // Result of nvml_init: an error string (NULL on success) and the populated
 // library handle.
 typedef struct nvml_init_resp {
  char *err;  // If err is non-null handle is invalid
  nvml_handle_t ch;
 } nvml_init_resp_t;
 // Error string plus major/minor compute capability pair. None of the nvml_*
 // functions declared in this header take or return it.
 typedef struct nvml_compute_capability {
  char *err;
  int major;
  int minor;
 } nvml_compute_capability_t;
 // Loads the NVML library, resolves its symbols and calls nvmlInit_v2;
 // reports the outcome in resp.
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
 // Writes free/total/used memory for the device identified by uuid; failures
 // are logged and *free is set to 0.
 void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
 // Calls nvmlShutdown and unloads the library referenced by ch. ch is taken
 // by value, so the caller's copy is left unchanged.
 void nvml_release(nvml_handle_t ch);
 #endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__
--- a/discover/gpu_info_oneapi.c
+++ b/discover/gpu_info_oneapi.c
@@ -0,0 +1,259 @@
 #ifndef __APPLE__
 #include "gpu_info_oneapi.h"
 #include <string.h>
 // Loads the Level-Zero library at oneapi_lib_path, resolves the Sysman entry
 // points listed in l[] into resp->oh, calls zesInit, and enumerates drivers
 // and the devices of each driver.
 //
 // On success resp->oh.drivers holds num_drivers driver handles,
 // resp->oh.num_devices[d] the device count of driver d and
 // resp->oh.devices[d] that driver's device handles.
 // On failure resp->err is set to a strdup'd message and everything acquired
 // so far is released. resp->oh.verbose is read but never written here, so
 // the caller must have set it.
 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
  ze_result_t ret;
  resp->err = NULL;
  resp->oh.devices = NULL;
  resp->oh.num_devices = NULL;
  resp->oh.drivers = NULL;
  resp->oh.num_drivers = 0;
  const int buflen = 256;
  char buf[buflen + 1];
  int i, d;
  // Table of symbol name -> address of the function pointer slot to fill.
  // Terminated by a {NULL, NULL} entry.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"zesInit", (void *)&resp->oh.zesInit},
      {"zesDriverGet", (void *)&resp->oh.zesDriverGet},
      {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
      {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
      {"zesDeviceEnumMemoryModules",
       (void *)&resp->oh.zesDeviceEnumMemoryModules},
      {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
      {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
      {NULL, NULL},
  };
  resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
  if (!resp->oh.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Intel GPUs: %s\n",
             oneapi_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }
  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->oh.verbose,
      "wiring Level-Zero management library functions in %s\n",
      oneapi_lib_path);
  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
    *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
    if (!*(l[i].p)) {
      char *msg = LOAD_ERR();
      LOG(resp->oh.verbose, "dlerr: %s\n", msg);
      // Unload while the handle is still valid and only then clear it;
      // clearing first would leak the library and unload a NULL handle.
      UNLOAD_LIBRARY(resp->oh.handle);
      resp->oh.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }
  LOG(resp->oh.verbose, "calling zesInit\n");
  ret = (*resp->oh.zesInit)(0);
  if (ret != ZE_RESULT_SUCCESS) {
    LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
    snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
    resp->err = strdup(buf);
    oneapi_release(resp->oh);
    return;
  }
  // First call with a NULL array only asks for the number of drivers
  LOG(resp->oh.verbose, "calling zesDriverGet\n");
  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
  if (ret != ZE_RESULT_SUCCESS) {
    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
    snprintf(buf, buflen, "unable to get driver count: %x", ret);
    resp->err = strdup(buf);
    oneapi_release(resp->oh);
    return;
  }
  LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
  // Zero-initialize all per-driver arrays: oneapi_release frees every non-NULL
  // devices[d], so entries that were never filled in must read as NULL if we
  // bail out part way through the enumeration below.
  resp->oh.drivers = calloc(resp->oh.num_drivers, sizeof(zes_driver_handle_t));
  resp->oh.num_devices = calloc(resp->oh.num_drivers, sizeof(uint32_t));
  resp->oh.devices =
      calloc(resp->oh.num_drivers, sizeof(zes_device_handle_t *));
  // A NULL result is only an error when something was actually requested
  if (resp->oh.num_drivers > 0 &&
      (resp->oh.drivers == NULL || resp->oh.num_devices == NULL ||
       resp->oh.devices == NULL)) {
    LOG(resp->oh.verbose, "oneapi driver table allocation failed\n");
    resp->err = strdup("unable to allocate memory for oneapi drivers");
    oneapi_release(resp->oh);
    return;
  }
  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, resp->oh.drivers);
  if (ret != ZE_RESULT_SUCCESS) {
    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
    snprintf(buf, buflen, "unable to get driver count: %x", ret);
    resp->err = strdup(buf);
    oneapi_release(resp->oh);
    return;
  }
  for (d = 0; d < resp->oh.num_drivers; d++) {
    // Same two-step pattern per driver: query the count, then the handles
    LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
                                   &resp->oh.num_devices[d], NULL);
    if (ret != ZE_RESULT_SUCCESS) {
      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
      snprintf(buf, buflen, "unable to get device count: %x", ret);
      resp->err = strdup(buf);
      oneapi_release(resp->oh);
      return;
    }
    resp->oh.devices[d] =
        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
    if (resp->oh.num_devices[d] > 0 && resp->oh.devices[d] == NULL) {
      LOG(resp->oh.verbose, "oneapi device table allocation failed\n");
      resp->err = strdup("unable to allocate memory for oneapi devices");
      oneapi_release(resp->oh);
      return;
    }
    ret = (*resp->oh.zesDeviceGet)(
        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
    if (ret != ZE_RESULT_SUCCESS) {
      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
      snprintf(buf, buflen, "unable to get device count: %x", ret);
      resp->err = strdup(buf);
      oneapi_release(resp->oh);
      return;
    }
  }
 }
 // Fills resp with the name, id and memory totals of one Level-Zero device.
 //
 // driver / device are zero-based indexes into h.devices. resp->total and
 // resp->free are the sums of size and free over all of the device's memory
 // modules. resp->gpu_id is the decimal device index.
 // On failure resp->err is set to a strdup'd message; totals accumulated
 // before the failure are left in place.
 void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
                       mem_info_t *resp) {
  ze_result_t ret;
  resp->err = NULL;
  resp->total = 0;
  resp->free = 0;
  const int buflen = 256;
  char buf[buflen + 1];
  uint32_t m;
  if (h.handle == NULL) {
    resp->err = strdup("Level-Zero handle not initialized");
    return;
  }
  // Valid indexes are 0 .. count-1. driver must be validated before it is
  // used to index num_devices.
  if (driver < 0 || driver >= h.num_drivers || device < 0 ||
      device >= h.num_devices[driver]) {
    resp->err = strdup("driver or device index out of bounds");
    return;
  }
  zes_device_ext_properties_t ext_props;
  ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
  ext_props.pNext = NULL;
  zes_device_properties_t props;
  props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  props.pNext = &ext_props;
  ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
  if (ret != ZE_RESULT_SUCCESS) {
    snprintf(buf, buflen, "unable to get device properties: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
  // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
  // (this is probably wrong...)
  // TODO - the driver isn't included - what if there are multiple drivers?
  snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
  if (h.verbose) {
    // When in verbose mode, report more information about
    // the card we discover.
    LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
        props.modelName);
    LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
        props.brandName);
    LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
        props.vendorName);
    LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
        props.serialNumber);
    LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
        props.boardNumber);
  }
  // TODO
  // Compute Capability equivalent in resp->major, resp->minor, resp->patch
  // First call with a NULL array only asks for the number of memory modules
  uint32_t memCount = 0;
  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
                                        NULL);
  if (ret != ZE_RESULT_SUCCESS) {
    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
             ret);
    resp->err = strdup(buf);
    return;
  }
  LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
  if (memCount == 0) {
    // Nothing to sum up; total and free stay at 0
    return;
  }
  zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
  if (mems == NULL) {
    resp->err = strdup("unable to allocate memory for Level-Zero memory modules");
    return;
  }
  // If this call fails mems holds uninitialized handles, so stop here rather
  // than passing them to zesMemoryGetState.
  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
                                        mems);
  if (ret != ZE_RESULT_SUCCESS) {
    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
             ret);
    resp->err = strdup(buf);
    free(mems);
    return;
  }
  for (m = 0; m < memCount; m++) {
    zes_mem_state_t state;
    state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
    state.pNext = NULL;
    ret = (*h.zesMemoryGetState)(mems[m], &state);
    if (ret != ZE_RESULT_SUCCESS) {
      snprintf(buf, buflen, "unable to get memory state: %x", ret);
      resp->err = strdup(buf);
      free(mems);
      return;
    }
    resp->total += state.size;
    resp->free += state.free;
  }
  free(mems);
 }
 // Frees the per-driver device tables, the driver arrays, and unloads the
 // Level-Zero library referenced by h.
 //
 // h is a by-value copy, so nothing written to it here would reach the
 // caller; the caller's own handle still holds the (now freed) pointers.
 void oneapi_release(oneapi_handle_t h) {
  LOG(h.verbose, "releasing oneapi library\n");
  if (h.devices != NULL) {
    int idx = 0;
    while (idx < h.num_drivers) {
      // free(NULL) is a no-op, so unset entries need no separate check
      free(h.devices[idx]);
      idx++;
    }
  }
  free(h.devices);
  free(h.num_devices);
  free(h.drivers);
  UNLOAD_LIBRARY(h.handle);
 }
 // Returns the number of devices found for the driver at index `driver`,
 // or 0 when the handle is not initialized or the index is out of range.
 int oneapi_get_device_count(oneapi_handle_t h, int driver) {
  if (h.handle == NULL || h.num_devices == NULL) {
    return 0;
  }
  // Valid indexes are 0 .. num_drivers-1; num_devices has num_drivers entries
  if (driver < 0 || driver >= h.num_drivers) {
    return 0;
  }
  return (int)h.num_devices[driver];
 }
 #endif // __APPLE__
--- a/discover/gpu_info_oneapi.h
+++ b/discover/gpu_info_oneapi.h
@@ -0,0 +1,203 @@
 #ifndef __APPLE__
 #ifndef __GPU_INFO_ONEAPI_H__
 #define __GPU_INFO_ONEAPI_H__
 #include "gpu_info.h"
 // Fixed buffer sizes used by the property structs declared in this header.
 #define ZE_MAX_DEVICE_NAME 256
 #define ZE_MAX_DEVICE_UUID_SIZE 16
 #define ZES_STRING_PROPERTY_SIZE 64
 // Mask with only bit _i set. The argument is parenthesized so an expression
 // argument such as ZE_BIT(a | b) shifts by the whole expression rather than
 // being split by operator precedence.
 #define ZE_BIT(_i) (1 << (_i))
 // Just enough typedefs to dlopen/dlsym for memory information
 // Result code returned by the function pointers declared in oneapi_handle_t.
 // Only the success value is declared here.
 typedef enum ze_result_t {
  ZE_RESULT_SUCCESS = 0,
  // Other values omitted for now...
 } ze_result_t;
 // Boolean stored as an 8-bit unsigned integer.
 typedef uint8_t ze_bool_t;
 // Opaque handles: pointers to struct types that this header never defines.
 typedef struct _zes_driver_handle_t *zes_driver_handle_t;
 typedef struct _zes_device_handle_t *zes_device_handle_t;
 typedef struct _zes_mem_handle_t *zes_mem_handle_t;
 // Type tag for the stype field of ze_device_properties_t. No named values
 // are declared here beyond the FORCE_UINT32 member.
 typedef enum _ze_structure_type_t {
  ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_structure_type_t;
 // Type tags stored in the stype field of the zes_* structs declared below.
 typedef enum _zes_structure_type_t {
  ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
  ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
  ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
  ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
  ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_structure_type_t;
 // Memory type; no named values are declared in this header.
 typedef enum _zes_mem_type_t {
  ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_mem_type_t;
 // Memory location: system or device.
 typedef enum _zes_mem_loc_t {
  ZES_MEM_LOC_SYSTEM = 0,
  ZES_MEM_LOC_DEVICE = 1,
  ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 } zes_mem_loc_t;
 // Memory health; no named values are declared in this header.
 typedef enum _zes_mem_health_t {
  ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 } zes_mem_health_t;
 // Device UUID as ZE_MAX_DEVICE_UUID_SIZE raw bytes (core API flavor).
 typedef struct _ze_device_uuid_t {
  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } ze_device_uuid_t;
 // Device UUID as ZE_MAX_DEVICE_UUID_SIZE raw bytes (zes flavor); same layout
 // as ze_device_uuid_t.
 typedef struct _zes_uuid_t {
  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } zes_uuid_t;
 // Device category reported in ze_device_properties_t.type.
 typedef enum _ze_device_type_t {
  ZE_DEVICE_TYPE_GPU = 1,
  ZE_DEVICE_TYPE_CPU = 2,
  ZE_DEVICE_TYPE_FPGA = 3,
  ZE_DEVICE_TYPE_MCA = 4,
  ZE_DEVICE_TYPE_VPU = 5,
  ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_device_type_t;
 // Device category reported in zes_device_ext_properties_t.type; the values
 // match ze_device_type_t one for one.
 typedef enum _zes_device_type_t {
  ZES_DEVICE_TYPE_GPU = 1,
  ZES_DEVICE_TYPE_CPU = 2,
  ZES_DEVICE_TYPE_FPGA = 3,
  ZES_DEVICE_TYPE_MCA = 4,
  ZES_DEVICE_TYPE_VPU = 5,
  ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_device_type_t;
 // Bitfield type for ze_device_properties_t.flags; the individual bits are
 // the ze_device_property_flag_t values.
 typedef uint32_t ze_device_property_flags_t;
 typedef enum _ze_device_property_flag_t {
  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
  ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
  ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
  ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
  ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } ze_device_property_flag_t;
 // Bitfield type for zes_device_ext_properties_t.flags; the bit assignments
 // match ze_device_property_flag_t.
 typedef uint32_t zes_device_property_flags_t;
 typedef enum _zes_device_property_flag_t {
  ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
  ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
  ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
  ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
  ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } zes_device_property_flag_t;
 // Core device properties; embedded as the `core` member of
 // zes_device_properties_t.
 // NOTE(review): field order and sizes presumably have to match the driver's
 // own definition, since the struct is filled in by the library - confirm
 // against the Level Zero headers before reordering anything.
 typedef struct _ze_device_properties_t {
  ze_structure_type_t stype;
  void *pNext;
  ze_device_type_t type;
  uint32_t vendorId;
  uint32_t deviceId;
  ze_device_property_flags_t flags;
  uint32_t subdeviceId;
  uint32_t coreClockRate;
  uint64_t maxMemAllocSize;
  uint32_t maxHardwareContexts;
  uint32_t maxCommandQueuePriority;
  uint32_t numThreadsPerEU;
  uint32_t physicalEUSimdWidth;
  uint32_t numEUsPerSubslice;
  uint32_t numSubslicesPerSlice;
  uint32_t numSlices;
  uint64_t timerResolution;
  uint32_t timestampValidBits;
  uint32_t kernelTimestampValidBits;
  ze_device_uuid_t uuid;
  char name[ZE_MAX_DEVICE_NAME];
 } ze_device_properties_t;
 // Device properties written by the zesDeviceGetProperties function pointer
 // in oneapi_handle_t. The string members are fixed-size buffers of
 // ZES_STRING_PROPERTY_SIZE bytes.
 typedef struct _zes_device_properties_t {
  zes_structure_type_t stype;
  void *pNext;
  ze_device_properties_t core;
  uint32_t numSubdevices;
  char serialNumber[ZES_STRING_PROPERTY_SIZE];
  char boardNumber[ZES_STRING_PROPERTY_SIZE];
  char brandName[ZES_STRING_PROPERTY_SIZE];
  char modelName[ZES_STRING_PROPERTY_SIZE];
  char vendorName[ZES_STRING_PROPERTY_SIZE];
  char driverVersion[ZES_STRING_PROPERTY_SIZE];
 } zes_device_properties_t;
 // Extended device properties: UUID, device type and property flags.
 // NOTE(review): presumably chained through pNext of zes_device_properties_t;
 // no use of it is visible in this part of the file - confirm.
 typedef struct _zes_device_ext_properties_t {
  zes_structure_type_t stype;
  void *pNext;
  zes_uuid_t uuid;
  zes_device_type_t type;
  zes_device_property_flags_t flags;
 } zes_device_ext_properties_t;
 // Static properties of one memory module, written by the
 // zesMemoryGetProperties function pointer in oneapi_handle_t.
 typedef struct _zes_mem_properties_t {
  zes_structure_type_t stype;
  void *pNext;
  zes_mem_type_t type;
  ze_bool_t onSubdevice;
  uint32_t subdeviceId;
  zes_mem_loc_t location;
  uint64_t physicalSize;
  int32_t busWidth;
  int32_t numChannels;
 } zes_mem_properties_t;
 // Current state of one memory module, written by zesMemoryGetState.
 // oneapi_check_vram sets stype to ZES_STRUCTURE_TYPE_MEM_STATE and pNext to
 // NULL before the call, then adds `size` to the reported total and `free`
 // to the reported free memory.
 typedef struct _zes_mem_state_t {
  zes_structure_type_t stype;
  const void *pNext;
  zes_mem_health_t health;
  uint64_t free;
  uint64_t size;
 } zes_mem_state_t;
 // State for one loaded oneAPI / Level-Zero library: the library handle,
 // the enumerated drivers and devices, and the entry points used to query
 // them. Passed by value to the oneapi_* functions declared below.
 typedef struct oneapi_handle {
  // Library handle; oneapi_release passes it to UNLOAD_LIBRARY.
  void *handle;
  // Passed as the first argument to LOG.
  uint16_t verbose;
  // Loop bound used when iterating over drivers.
  uint32_t num_drivers;
  zes_driver_handle_t *drivers;
  // Per-driver device counts, indexed by driver.
  uint32_t *num_devices;
  // Device handles, indexed as devices[driver][device].
  zes_device_handle_t **devices;
  // TODO Driver major, minor information
  // int driver_major;
  // int driver_minor;
  // Entry points into the library.
  // NOTE(review): presumably resolved by oneapi_init, which is not visible
  // here - confirm.
  ze_result_t (*zesInit)(int);
  ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
  ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
                              zes_device_handle_t *phDevices);
  ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
                                        zes_device_properties_t *pProperties);
  ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
                                            uint32_t *pCount,
                                            zes_mem_handle_t *phMemory);
  ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
                                        zes_mem_properties_t *pProperties);
  ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
                                   zes_mem_state_t *pState);
 } oneapi_handle_t;
 // Output of oneapi_init: an error string, or a usable handle.
 typedef struct oneapi_init_resp {
  char *err; // If err is non-null handle is invalid
  oneapi_handle_t oh;
 } oneapi_init_resp_t;
 // Version query result: a status code plus a string.
 typedef struct oneapi_version_resp {
  ze_result_t status;
  char *str; // Contains version or error string if status != 0
 } oneapi_version_resp_t;
 // Initializes a handle for the library at oneapi_lib_path and reports the
 // outcome through resp (see oneapi_init_resp_t).
 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
 // Sums the size and free values of every memory module of
 // devices[driver][device] into resp->total and resp->free. On failure
 // resp->err is set to a strdup'd message.
 void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
                       mem_info_t *resp);
 // Frees the driver/device arrays held by h and unloads the library.
 void oneapi_release(oneapi_handle_t h);
 // Returns the device count recorded for `driver`, or 0 if the handle is not
 // initialized or the index is rejected.
 int oneapi_get_device_count(oneapi_handle_t h, int driver);
 #endif // __GPU_INFO_ONEAPI_H__
 #endif // __APPLE__
--- a/discover/gpu_linux.go
+++ b/discover/gpu_linux.go
@@ -4,9 +4,7 @@ import (
 	"bufio"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"reflect"
 	"regexp"
 	"sort"
@@ -15,6 +13,47 @@ import (
 	"github.com/ollama/ollama/format"
 )
 // CudartGlobs lists glob patterns matching libcudart shared objects under
 // common Linux install locations (x86_64, aarch64 and WSL paths).
 var CudartGlobs = []string{
 	"/usr/local/cuda/lib64/libcudart.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
 	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
 	"/usr/lib/wsl/lib/libcudart.so*",
 	"/usr/lib/wsl/drivers/*/libcudart.so*",
 	"/opt/cuda/lib64/libcudart.so*",
 	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
 	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
 	"/usr/local/cuda/lib*/libcudart.so*",
 	"/usr/lib*/libcudart.so*",
 	"/usr/local/lib*/libcudart.so*",
 }
 // NvmlGlobs is empty on Linux; NvmlMgmtName below is likewise left blank.
 var NvmlGlobs = []string{}
 // NvcudaGlobs lists glob patterns matching libcuda shared objects.
 var NvcudaGlobs = []string{
 	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
 	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
 	"/usr/lib/*-linux-gnu/libcuda.so*",
 	"/usr/lib/wsl/lib/libcuda.so*",
 	"/usr/lib/wsl/drivers/*/libcuda.so*",
 	"/opt/cuda/lib*/libcuda.so*",
 	"/usr/local/cuda/lib*/libcuda.so*",
 	"/usr/lib*/libcuda.so*",
 	"/usr/local/lib*/libcuda.so*",
 }
 // OneapiGlobs lists glob patterns matching the libze_intel_gpu shared object.
 var OneapiGlobs = []string{
 	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
 	"/usr/lib*/libze_intel_gpu.so*",
 }
 // File-name patterns of the management libraries on Linux; each matches the
 // final path element of the corresponding glob list above.
 var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
 	OneapiMgmtName = "libze_intel_gpu.so*"
 )
 func GetCPUMem() (memInfo, error) {
 	var mem memInfo
 	var total, available, free, buffers, cached, freeSwap uint64
@@ -67,17 +106,15 @@ type linuxCpuInfo struct {
 	CoreID     string `cpuinfo:"core id"`
 }
-func GetCPUDetails() []CPU {
+func GetCPUDetails() ([]CPU, error) {
 	file, err := os.Open(CpuInfoFilename)
 	if err != nil {
-		slog.Warn("failed to get CPU details", "error", err)
+		return nil, err
 		return nil
 	}
 	defer file.Close()
 	return linuxCPUDetails(file)
 }
-func linuxCPUDetails(file io.Reader) []CPU {
+func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}
@@ -131,11 +168,13 @@ func linuxCPUDetails(file io.Reader) []CPU {
 	for id, s := range socketByID {
 		s.CoreCount = len(coreBySocket[id])
 		s.ThreadCount = 0
 		for _, tc := range threadsByCoreBySocket[id] {
 			s.ThreadCount += tc
 		}
 		// This only works if HT is enabled, consider a more reliable model, maybe cache size comparisons?
 		efficiencyCoreCount := 0
 		for _, threads := range threadsByCoreBySocket[id] {
 			s.ThreadCount += threads
 			if threads == 1 {
 				efficiencyCoreCount++
 			}
@@ -156,17 +195,5 @@ func linuxCPUDetails(file io.Reader) []CPU {
 	for _, k := range keys {
 		result = append(result, *socketByID[k])
 	}
-	return result
+	return result, nil
 }
 // IsNUMA reports whether the per-CPU physical_package_id files under
 // /sys/devices/system/cpu contain more than one distinct value. Files that
 // cannot be read are ignored, as is any error from the glob itself.
 func IsNUMA() bool {
 	matches, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")
 	distinct := make(map[string]struct{}, len(matches))
 	for _, path := range matches {
 		raw, err := os.ReadFile(path)
 		if err != nil {
 			continue
 		}
 		distinct[strings.TrimSpace(string(raw))] = struct{}{}
 	}
 	return len(distinct) > 1
 }
--- a/discover/gpu_linux_test.go
+++ b/discover/gpu_linux_test.go
@@ -2062,9 +2062,18 @@ power management:
 	for k, v := range testCases {
 		t.Run(k, func(t *testing.T) {
 			buf := bytes.NewBufferString(v.input)
-			cpus := linuxCPUDetails(buf)
+			cpus, err := linuxCPUDetails(buf)
 			if err != nil {
 				t.Fatal(err)
 			}
 			slog.Info("example", "scenario", k, "cpus", cpus)
 			si := SystemInfo{
 				System: CPUInfo{
 					CPUs: cpus,
 				},
 			}
 			threadCount := si.GetOptimalThreadCount()
 			if len(v.expCPUs) != len(cpus) {
 				t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
 			}
@@ -2079,6 +2088,10 @@ power management:
 					t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
 				}
 			}
 			if threadCount != v.expThreadCount {
 				t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
 			}
 		})
 	}
 }
--- a/discover/gpu_oneapi.go
+++ b/discover/gpu_oneapi.go
@@ -0,0 +1,21 @@
 //go:build linux || windows
 package discover
 import (
 	"log/slog"
 	"strings"
 )
 // oneapiGetVisibleDevicesEnv returns the environment variable name and value
 // that select the given oneapi devices: "ONEAPI_DEVICE_SELECTOR" and
 // "level_zero:" followed by the comma-joined device IDs. Entries whose
 // Library is not "oneapi" are logged at debug level and left out.
 func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	const envName = "ONEAPI_DEVICE_SELECTOR"
 	var selected []string
 	for i := range gpuInfo {
 		dev := &gpuInfo[i]
 		if dev.Library == "oneapi" {
 			selected = append(selected, dev.ID)
 			continue
 		}
 		// TODO shouldn't happen if things are wired correctly...
 		slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", dev.Library)
 	}
 	return envName, "level_zero:" + strings.Join(selected, ",")
 }
--- a/discover/gpu_test.go
+++ b/discover/gpu_test.go
@@ -0,0 +1,60 @@
 package discover
 import (
 	"runtime"
 	"testing"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 // TestBasicGetGPUInfo checks that GetGPUInfo reports at least one entry, that
 // the first entry's Library is an expected name, and that a non-CPU first
 // entry reports non-zero total and free memory.
 func TestBasicGetGPUInfo(t *testing.T) {
 	info := GetGPUInfo()
 	// require, not assert: assert only records the failure and carries on, so
 	// an empty result would panic on info[0] below instead of failing cleanly.
 	require.NotEmpty(t, info)
 	// NOTE(review): the haystack is one string, so this is a substring match
 	// (an empty Library would also pass) - confirm that is intended.
 	assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
 	if info[0].Library != "cpu" {
 		assert.Greater(t, info[0].TotalMemory, uint64(0))
 		assert.Greater(t, info[0].FreeMemory, uint64(0))
 	}
 }
 // TestCPUMemInfo requires GetCPUMem to succeed, then on linux and windows
 // checks that total and free memory are non-zero. darwin is skipped, and any
 // other OS returns without further checks.
 func TestCPUMemInfo(t *testing.T) {
 	mem, err := GetCPUMem()
 	require.NoError(t, err)
 	if runtime.GOOS == "darwin" {
 		t.Skip("CPU memory not populated on darwin")
 	}
 	if runtime.GOOS != "linux" && runtime.GOOS != "windows" {
 		return
 	}
 	assert.Greater(t, mem.TotalMemory, uint64(0))
 	assert.Greater(t, mem.FreeMemory, uint64(0))
 }
 // TestByLibrary checks how many groups GpuInfoList.ByLibrary returns for
 // several combinations of Library and Variant.
 func TestByLibrary(t *testing.T) {
 	cases := []struct {
 		name string
 		gpus []GpuInfo
 		want int
 	}{
 		{name: "empty", gpus: []GpuInfo{}, want: 0},
 		{name: "cpu", gpus: []GpuInfo{{Library: "cpu"}}, want: 1},
 		{name: "cpu + GPU", gpus: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, want: 2},
 		{name: "cpu + 2 GPU no variant", gpus: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, want: 2},
 		{name: "cpu + 2 GPU same variant", gpus: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, want: 2},
 		{name: "cpu + 2 GPU diff variant", gpus: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, want: 3},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			got := GpuInfoList(tc.gpus).ByLibrary()
 			if len(got) != tc.want {
 				t.Fatalf("expected length %d, got %d => %+v", tc.want, len(got), got)
 			}
 		})
 	}
 }
 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/discover/gpu_windows.go
+++ b/discover/gpu_windows.go
@@ -26,6 +26,29 @@ var (
 	GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
 )
 // CudartGlobs lists glob patterns matching the CUDA runtime DLL under the
 // NVIDIA GPU Computing Toolkit install directory.
 var CudartGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }
 // NvmlGlobs lists the path of nvml.dll under System32.
 var NvmlGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }
 // NvcudaGlobs lists glob patterns matching nvcuda.dll under the Windows
 // system directories.
 var NvcudaGlobs = []string{
 	"c:\\windows\\system*\\nvcuda.dll",
 }
 // OneapiGlobs lists glob patterns matching ze_intel_gpu64.dll in the driver
 // store.
 var OneapiGlobs = []string{
 	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
 }
 // File-name patterns of the management libraries on Windows; each matches
 // the final path element of the corresponding glob list above.
 var (
 	CudartMgmtName = "cudart64_*.dll"
 	NvcudaMgmtName = "nvcuda.dll"
 	NvmlMgmtName   = "nvml.dll"
 	OneapiMgmtName = "ze_intel_gpu64.dll"
 )
 func GetCPUMem() (memInfo, error) {
 	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
 	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
@@ -99,22 +122,27 @@ func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
 }
 func getLogicalProcessorInformationEx() ([]byte, error) {
-	buf := make([]byte, 1024)
+	buf := make([]byte, 1)
 	bufSize := len(buf)
-	var err error
+	ret, _, err := GetLogicalProcessorInformationEx.Call(
-	for range 3 {
+		uintptr(RelationAll),
-		var ret uintptr
+		uintptr(unsafe.Pointer(&buf[0])),
-		ret, _, err = GetLogicalProcessorInformationEx.Call(
+		uintptr(unsafe.Pointer(&bufSize)),
-			uintptr(RelationAll),
+	)
-			uintptr(unsafe.Pointer(&buf[0])),
+	if ret != 0 {
-			uintptr(unsafe.Pointer(&bufSize)),
+		return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
 		)
 		if ret == 1 && bufSize <= len(buf) {
 			return buf, nil
 		}
 		buf = make([]byte, bufSize)
 	}
-	return nil, fmt.Errorf("unable to determine CPU details: %w", err)
+
 	buf = make([]byte, bufSize)
 	ret, _, err = GetLogicalProcessorInformationEx.Call(
 		uintptr(RelationAll),
 		uintptr(unsafe.Pointer(&buf[0])),
 		uintptr(unsafe.Pointer(&bufSize)),
 	)
 	if ret == 0 {
 		return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
 	}
 	return buf, nil
 }
 func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
@@ -189,11 +217,10 @@ func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 	return packages
 }
-func GetCPUDetails() []CPU {
+func GetCPUDetails() ([]CPU, error) {
 	buf, err := getLogicalProcessorInformationEx()
 	if err != nil {
-		slog.Warn("failed to get CPU details", "error", err)
+		return nil, err
 		return nil
 	}
 	packages := processSystemLogicalProcessorInforationList(buf)
 	cpus := make([]CPU, len(packages))
@@ -203,10 +230,5 @@ func GetCPUDetails() []CPU {
 		cpus[i].EfficiencyCoreCount = pkg.efficiencyCoreCount
 		cpus[i].ThreadCount = pkg.threadCount
 	}
-	return cpus
+	return cpus, nil
 }
 // IsNUMA always reports false on Windows.
 func IsNUMA() bool {
 	// numa support in ggml is linux only
 	return false
 }
--- a/discover/gpu_windows_test.go
+++ b/discover/gpu_windows_test.go
--- a/discover/path.go
+++ b/discover/path.go
@@ -12,25 +12,24 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v12', 'rocm', etc.
+// 'cuda_v11', 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
 		return ""
 	}
-	if eval, err := filepath.EvalSymlinks(exe); err == nil {
+	exe, err = filepath.EvalSymlinks(exe)
-		exe = eval
+	if err != nil {
 		return ""
 	}
-	var libPath string
+	libPath := filepath.Dir(exe)
 	switch runtime.GOOS {
 	case "windows":
 		libPath = filepath.Join(filepath.Dir(exe), "lib", "ollama")
 	case "linux":
 		libPath = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
 	case "darwin":
 		libPath = filepath.Dir(exe)
 	}
 	cwd, err := os.Getwd()
@@ -38,19 +37,17 @@ var LibOllamaPath string = func() string {
 		return ""
 	}
-	paths := []string{
+	// build paths for development
-		libPath,
+	buildPaths := []string{
 		// build paths for development
 		filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"),
 		filepath.Join(cwd, "build", "lib", "ollama"),
 	}
-	for _, p := range paths {
+	for _, p := range buildPaths {
 		if _, err := os.Stat(p); err == nil {
 			return p
 		}
 	}
-	return filepath.Dir(exe)
+	return libPath
 }()
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -1,488 +0,0 @@
 package discover
 // Runner based GPU discovery
 import (
 	"context"
 	"io"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"runtime"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 )
 // Package-level discovery state. GPUDevices holds deviceMu for its whole
 // call while it reads and writes the other variables.
 var (
 	// deviceMu serializes calls to GPUDevices.
 	deviceMu     sync.Mutex
 	// devices is the current device list returned by GPUDevices.
 	devices      []ml.DeviceInfo
 	// libDirs is the set of runner library directories to probe.
 	libDirs      map[string]struct{}
 	// rocmDir is LibOllamaPath/rocm, or "" when that directory cannot be stat'ed.
 	rocmDir      string
 	// exe is the path of the running executable, with symlinks resolved when possible.
 	exe          string
 	// bootstrapped becomes true once the first full discovery pass has finished.
 	bootstrapped bool
 )
 // GPUDevices returns the package-level device list, holding deviceMu for the
 // duration of the call.
 //
 // On the first call (bootstrapped is false) it runs full discovery:
 //  1. a serial pass that starts a bootstrap runner per library directory and
 //     collects every enumerated device, duplicates included;
 //  2. a parallel pass that re-probes each non-Metal device in isolation and
 //     marks those that return no devices for deletion;
 //  3. filtering of Vulkan duplicates, overlapping library versions and
 //     cross-library duplicates.
 //
 // On later calls it only refreshes FreeMemory, first from the supplied
 // runners and then, for devices still not updated, from fresh bootstrap
 // runners. On darwin/arm64 the cached list is returned unchanged.
 func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
 	deviceMu.Lock()
 	defer deviceMu.Unlock()
 	startDiscovery := time.Now()
 	msg := "overall device VRAM discovery took"
 	// msg is reassigned on the bootstrap path below; the closure logs whichever
 	// label is current when the function returns.
 	defer func() {
 		slog.Debug(msg, "duration", time.Since(startDiscovery))
 	}()
 	if !bootstrapped {
 		msg = "GPU bootstrap discovery took"
 		libDirs = make(map[string]struct{})
 		var err error
 		exe, err = os.Executable()
 		if err != nil {
 			slog.Error("unable to lookup executable path", "error", err)
 			return nil
 		}
 		if eval, err := filepath.EvalSymlinks(exe); err == nil {
 			exe = eval
 		}
 		files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
 		if err != nil {
 			slog.Debug("unable to lookup runner library directories", "error", err)
 		}
 		for _, file := range files {
 			libDirs[filepath.Dir(file)] = struct{}{}
 		}
 		// Our current packaging model places ggml-hip in the main directory
 		// but keeps rocm in an isolated directory.  We have to add it to
 		// the [LD_LIBRARY_]PATH so ggml-hip will load properly
 		rocmDir = filepath.Join(LibOllamaPath, "rocm")
 		if _, err := os.Stat(rocmDir); err != nil {
 			rocmDir = ""
 		}
 		// No library directories found: probe with the empty entry, which maps
 		// to LibOllamaPath alone in the loop below.
 		if len(libDirs) == 0 {
 			libDirs[""] = struct{}{}
 		}
 		slog.Info("discovering available GPUs...")
 		requested := envconfig.LLMLibrary()
 		jetpack := cudaJetpack()
 		// For our initial discovery pass, we gather all the known GPUs through
 		// all the libraries that were detected. This pass may include GPUs that
 		// are enumerated, but not actually supported.
 		// We run this in serial to avoid potentially initializing a GPU multiple
 		// times concurrently leading to memory contention
 		// TODO refactor so we group the lib dirs and do serial per version, but parallel for different libs
 		for dir := range libDirs {
 			bootstrapTimeout := 30 * time.Second
 			var dirs []string
 			if dir != "" {
 				if requested != "" && filepath.Base(dir) != requested {
 					slog.Debug("skipping available library at users request", "requested", requested, "libDir", dir)
 					continue
 				} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
 					continue
 				}
 			}
 			if dir == "" {
 				dirs = []string{LibOllamaPath}
 			} else {
 				dirs = []string{LibOllamaPath, dir}
 			}
 			// ROCm can take a long time on some systems, so give it more time before giving up
 			if dir != "" && strings.Contains(filepath.Base(dir), "rocm") {
 				bootstrapTimeout = 60 * time.Second
 			}
 			// Typically bootstrapping takes < 1s, but on some systems, with devices
 			// in low power/idle mode, initialization can take multiple seconds.  We
 			// set a long timeout just for bootstrap discovery to reduce the chance
 			// of giving up too quickly
 			ctx1stPass, cancel := context.WithTimeout(ctx, bootstrapTimeout)
 			// NOTE(review): deferred inside the loop, so every cancel runs only
 			// when GPUDevices returns, not at the end of each iteration.
 			defer cancel()
 			// For this pass, we retain duplicates in case any are incompatible with some libraries
 			devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
 		}
 		// In the second pass, we more deeply initialize the GPUs to weed out devices that
 		// aren't supported by a given library.  We run this phase in parallel to speed up discovery.
 		slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
 		ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
 		defer cancel()
 		var wg sync.WaitGroup
 		// needsDelete[i] marks devices[i] for removal; indices refer to the
 		// pre-deletion device list.
 		needsDelete := make([]bool, len(devices))
 		supportedMu := sync.Mutex{}
 		supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
 		for i := range devices {
 			libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
 			if devices[i].Library == "Metal" {
 				continue
 			}
 			slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
 			wg.Add(1)
 			go func(i int) {
 				defer wg.Done()
 				var envVar string
 				id := devices[i].ID
 				if devices[i].Library == "ROCm" {
 					if runtime.GOOS != "linux" {
 						envVar = "HIP_VISIBLE_DEVICES"
 					} else {
 						envVar = "ROCR_VISIBLE_DEVICES"
 					}
 				} else if devices[i].Library == "CUDA" {
 					envVar = "CUDA_VISIBLE_DEVICES"
 				} else if devices[i].Library == "Vulkan" {
 					id = devices[i].FilteredID
 					envVar = "GGML_VK_VISIBLE_DEVICES"
 				} else {
 					// NOTE(review): envVar stays "" on this path, so the map
 					// below gets an empty-string key - confirm that is harmless.
 					slog.Error("Unknown Library:" + devices[i].Library)
 				}
 				extraEnvs := map[string]string{
 					"GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
 					envVar:           id,  // Filter to just this one GPU
 				}
 				// Each goroutine writes only its own needsDelete[i]; the shared
 				// supported map is guarded by supportedMu.
 				if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
 					needsDelete[i] = true
 				} else {
 					supportedMu.Lock()
 					if _, ok := supported[devices[i].Library]; !ok {
 						supported[devices[i].Library] = make(map[string]map[string]int)
 					}
 					if _, ok := supported[devices[i].Library][libDir]; !ok {
 						supported[devices[i].Library][libDir] = make(map[string]int)
 					}
 					supported[devices[i].Library][libDir][devices[i].ID] = i
 					supportedMu.Unlock()
 				}
 			}(i)
 		}
 		wg.Wait()
 		logutil.Trace("supported GPU library combinations", "supported", supported)
 		filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)
 		// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
 		filterOverlapByLibrary(supported, needsDelete)
 		// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
 		rocmID := 0
 		// devices and needsDelete are compacted in lockstep; i-- re-visits the
 		// index that now holds the element shifted down by the removal.
 		for i := 0; i < len(needsDelete); i++ {
 			if needsDelete[i] {
 				logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
 				devices = append(devices[:i], devices[i+1:]...)
 				needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
 				i--
 			} else if devices[i].Library == "ROCm" {
 				if _, err := strconv.Atoi(devices[i].ID); err == nil {
 					// Replace the numeric ID with the post-filtered IDs
 					devices[i].FilteredID = devices[i].ID
 					devices[i].ID = strconv.Itoa(rocmID)
 				}
 				rocmID++
 			}
 		}
 		// Now filter out any overlap with different libraries (favor CUDA/HIP over others)
 		for i := 0; i < len(devices); i++ {
 			for j := i + 1; j < len(devices); j++ {
 				// For this pass, we only drop exact duplicates
 				switch devices[i].Compare(devices[j]) {
 				case ml.SameBackendDevice:
 					// Same library and device, skip it
 					devices = append(devices[:j], devices[j+1:]...)
 					j--
 					continue
 				case ml.DuplicateDevice:
 					// Different library, choose based on priority
 					var droppedDevice ml.DeviceInfo
 					if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
 						droppedDevice = devices[j]
 					} else {
 						// Keep the j entry in slot i, then remove slot j below.
 						droppedDevice = devices[i]
 						devices[i] = devices[j]
 					}
 					devices = append(devices[:j], devices[j+1:]...)
 					j--
 					typeStr := "discrete"
 					if droppedDevice.Integrated {
 						typeStr = "iGPU"
 					}
 					slog.Debug("dropping duplicate device",
 						"id", droppedDevice.ID,
 						"library", droppedDevice.Library,
 						"compute", droppedDevice.Compute(),
 						"name", droppedDevice.Name,
 						"description", droppedDevice.Description,
 						"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
 						"driver", droppedDevice.Driver(),
 						"pci_id", droppedDevice.PCIID,
 						"type", typeStr,
 						"total", format.HumanBytes2(droppedDevice.TotalMemory),
 						"available", format.HumanBytes2(droppedDevice.FreeMemory),
 					)
 					continue
 				}
 			}
 		}
 		// Reset the libDirs to what we actually wind up using for future refreshes
 		libDirs = make(map[string]struct{})
 		for _, dev := range devices {
 			dir := dev.LibraryPath[len(dev.LibraryPath)-1]
 			if dir != LibOllamaPath {
 				libDirs[dir] = struct{}{}
 			}
 		}
 		if len(libDirs) == 0 {
 			libDirs[""] = struct{}{}
 		}
 		bootstrapped = true
 	} else {
 		if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 			// metal never updates free VRAM
 			return devices
 		}
 		slog.Debug("refreshing free memory")
 		// updated[i] records whether devices[i].FreeMemory was refreshed during
 		// this call.
 		updated := make([]bool, len(devices))
 		allDone := func() bool {
 			allDone := true
 			for _, done := range updated {
 				if !done {
 					allDone = false
 					break
 				}
 			}
 			return allDone
 		}
 		// First try to use existing runners to refresh VRAM since they're already
 		// active on GPU(s)
 		for _, runner := range runners {
 			if runner == nil {
 				continue
 			}
 			deviceIDs := runner.GetActiveDeviceIDs()
 			if len(deviceIDs) == 0 {
 				// Skip this runner since it doesn't have active GPU devices
 				continue
 			}
 			// Check to see if this runner is active on any devices that need a refresh
 			skip := true
 		devCheck:
 			for _, dev := range deviceIDs {
 				for i := range devices {
 					if dev == devices[i].DeviceID {
 						if !updated[i] {
 							skip = false
 							break devCheck
 						}
 					}
 				}
 			}
 			if skip {
 				continue
 			}
 			// Typical refresh on existing runner is ~500ms but allow longer if the system
 			// is under stress before giving up and using stale data.
 			ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
 			defer cancel()
 			start := time.Now()
 			updatedDevices := runner.GetDeviceInfos(ctx)
 			slog.Debug("existing runner discovery took", "duration", time.Since(start))
 			for _, u := range updatedDevices {
 				for i := range devices {
 					if u.DeviceID == devices[i].DeviceID {
 						updated[i] = true
 						devices[i].FreeMemory = u.FreeMemory
 						break
 					}
 				}
 			}
 			// Short circuit if we've updated all the devices
 			if allDone() {
 				break
 			}
 		}
 		if !allDone() {
 			slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
 			// Bootstrapping may take longer in some cases (AMD windows), but we
 			// would rather use stale free data to get the model running sooner
 			ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
 			defer cancel()
 			for dir := range libDirs {
 				updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil)
 				for _, u := range updatedDevices {
 					for i := range devices {
 						if u.DeviceID == devices[i].DeviceID {
 							updated[i] = true
 							devices[i].FreeMemory = u.FreeMemory
 							break
 						}
 					}
 					// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
 				}
 				if allDone() {
 					break
 				}
 			}
 			if !allDone() {
 				slog.Warn("unable to refresh free memory, using old values")
 			}
 		}
 	}
 	return devices
 }
 // filterOutVulkanThatAreSupportedByOtherGPU marks a Vulkan entry of the
 // package-level devices list for deletion when another entry with the same
 // non-empty PCI ID comes from a non-Vulkan library and is not itself marked
 // for deletion. needsDelete is indexed like devices and is updated in place.
 func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
 	for vk := range devices {
 		candidate := &devices[vk]
 		if candidate.Library != "Vulkan" || needsDelete[vk] || candidate.PCIID == "" {
 			continue
 		}
 		for other := range devices {
 			if other == vk {
 				continue
 			}
 			peer := &devices[other]
 			if peer.PCIID == "" || peer.PCIID != candidate.PCIID || peer.Library == "Vulkan" || needsDelete[other] {
 				continue
 			}
 			// First surviving non-Vulkan peer on the same PCI ID wins.
 			needsDelete[vk] = true
 			slog.Debug("dropping Vulkan duplicate by PCI ID",
 				"vulkan_id", candidate.ID,
 				"vulkan_libdir", candidate.LibraryPath[len(candidate.LibraryPath)-1],
 				"pci_id", candidate.PCIID,
 				"kept_library", peer.Library,
 				"kept_id", peer.ID,
 			)
 			break
 		}
 	}
 }
 // filterOverlapByLibrary resolves devices that were enumerated through more
 // than one library directory of the same library.
 //
 // supported maps [Library][libDir][device ID] to an index into needsDelete.
 // For each library, directories are ordered newest first (descending string
 // order). The newest directory is kept when every directory lists exactly
 // the same device IDs; otherwise the last (oldest) directory is kept. Any
 // device in another directory that the kept directory also lists is marked
 // in needsDelete.
 func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
 	// sameDevices reports whether two directories list identical device IDs.
 	sameDevices := func(a, b map[string]int) bool {
 		if len(a) != len(b) {
 			return false
 		}
 		for id := range a {
 			if _, ok := b[id]; !ok {
 				return false
 			}
 		}
 		return true
 	}
 	for _, perDir := range supported {
 		ordered := make([]string, 0, len(perDir))
 		for name := range perDir {
 			ordered = append(ordered, name)
 		}
 		sort.Sort(sort.Reverse(sort.StringSlice(ordered)))
 		// Pick the directory to keep: newest if all agree, else the oldest.
 		keep := ""
 		if len(ordered) > 0 {
 			keep = ordered[len(ordered)-1]
 			allAgree := true
 			for _, other := range ordered[1:] {
 				if !sameDevices(perDir[ordered[0]], perDir[other]) {
 					allAgree = false
 					break
 				}
 			}
 			if allAgree {
 				keep = ordered[0]
 			}
 		}
 		// Now we can mark overlaps for deletion
 		for _, dir := range ordered {
 			if dir == keep {
 				continue
 			}
 			for id, idx := range perDir[dir] {
 				if _, dup := perDir[keep][id]; dup {
 					needsDelete[idx] = true
 				}
 			}
 		}
 	}
 }
 type bootstrapRunner struct {
 	port int
 	cmd  *exec.Cmd
 }
 func (r *bootstrapRunner) GetPort() int {
 	return r.port
 }
 func (r *bootstrapRunner) HasExited() bool {
 	if r.cmd != nil && r.cmd.ProcessState != nil {
 		return true
 	}
 	return false
 }
 // bootstrapDevices starts a short-lived runner with no model loaded, asks it
 // for the devices it can see, and returns that list.
 //
 // ollamaLibDirs is passed to llm.StartRunner as the library directories and
 // extraEnvs as extra environment entries. It returns nil when the runner
 // cannot be started. The runner process is killed when the function returns.
 func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
 	// Runner output goes to stderr only at trace log level; otherwise out
 	// stays nil.
 	var out io.Writer
 	if envconfig.LogLevel() == logutil.LevelTrace {
 		out = os.Stderr
 	}
 	start := time.Now()
 	defer func() {
 		slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
 	}()
 	logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
 	cmd, port, err := llm.StartRunner(
 		true, // ollama engine
 		"",   // no model
 		ollamaLibDirs,
 		out,
 		extraEnvs,
 	)
 	if err != nil {
 		slog.Debug("failed to start runner to discovery GPUs", "error", err)
 		return nil
 	}
 	// Wait in the background so the process is reaped when it exits.
 	// NOTE(review): cmd.ProcessState is read below while this goroutine may
 	// still be inside Wait - confirm that access is safe.
 	go func() {
 		cmd.Wait() // exit status ignored
 	}()
 	defer cmd.Process.Kill()
 	// This local devices shadows the package-level devices variable.
 	devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
 	if err != nil {
 		if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
 			// Expected during bootstrapping while we filter out unsupported AMD GPUs
 			logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
 		} else {
 			slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
 		}
 	}
 	// An error above is only logged; whatever devices value came back is
 	// still returned.
 	logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
 	return devices
 }
--- a/discover/runner_test.go
+++ b/discover/runner_test.go
@@ -1,108 +0,0 @@
 package discover
 import (
 	"testing"
 	"github.com/ollama/ollama/app/lifecycle"
 )
 // init calls lifecycle.InitLogging when the package is initialized, which
 // happens before any test in this file runs.
 func init() {
 	lifecycle.InitLogging()
 }
 // TestFilterOverlapByLibrary drives filterOverlapByLibrary with a table of
 // library -> libDir -> device ID -> index maps and verifies which indexes
 // are flagged in the needsDelete slice.
 func TestFilterOverlapByLibrary(t *testing.T) {
 	// Device IDs shared by the scenarios below.
 	const (
 		gpuA = "GPU-d7b00605-c0c8-152d-529d-e03726d5dc52"
 		gpuB = "GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e"
 	)
 	// Aliases (not new types) so literals stay assignable to the helper's parameter.
 	type devIndex = map[string]int
 	type byLibDir = map[string]devIndex
 	type byLibrary = map[string]byLibDir
 	cases := []struct {
 		name string
 		inp  byLibrary
 		exp  []bool // expected needsDelete flags, by index
 	}{
 		{
 			name: "empty",
 			inp:  byLibrary{},
 			exp:  []bool{},
 		},
 		{
 			name: "single no overlap",
 			inp: byLibrary{
 				"CUDA": {
 					"cuda_v12": {gpuA: 0},
 				},
 			},
 			exp: []bool{false},
 		},
 		{
 			name: "100% overlap pick 2nd",
 			inp: byLibrary{
 				"CUDA": {
 					"cuda_v12": {gpuA: 0, gpuB: 1},
 					"cuda_v13": {gpuA: 2, gpuB: 3},
 				},
 			},
 			exp: []bool{true, true, false, false},
 		},
 		{
 			name: "100% overlap pick 1st",
 			inp: byLibrary{
 				"CUDA": {
 					"cuda_v13": {gpuA: 0, gpuB: 1},
 					"cuda_v12": {gpuA: 2, gpuB: 3},
 				},
 			},
 			exp: []bool{false, false, true, true},
 		},
 		{
 			name: "partial overlap pick older",
 			inp: byLibrary{
 				"CUDA": {
 					"cuda_v13": {gpuA: 0},
 					"cuda_v12": {gpuA: 1, gpuB: 2},
 				},
 			},
 			exp: []bool{true, false, false},
 		},
 		{
 			name: "no overlap",
 			inp: byLibrary{
 				"CUDA": {
 					"cuda_v13": {gpuA: 0},
 					"cuda_v12": {gpuB: 1},
 				},
 			},
 			exp: []bool{false, false},
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			got := make([]bool, len(tc.exp))
 			filterOverlapByLibrary(tc.inp, got)
 			for i := range tc.exp {
 				if got[i] != tc.exp[i] {
 					t.Fatalf("expected: %v\ngot: %v", tc.exp, got)
 				}
 			}
 		})
 	}
 }
--- a/discover/types.go
+++ b/discover/types.go
@@ -1,12 +1,10 @@
 package discover
 import (
 	"fmt"
 	"log/slog"
 	"path/filepath"
 	"strings"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/ml"
 )
 type memInfo struct {
@@ -15,6 +13,52 @@ type memInfo struct {
 	FreeSwap    uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
 }
 // Beginning of an `ollama info` command
 //
 // GpuInfo describes a single inference device: the library (and optional
 // variant) used to drive it, its identity and compute level, driver version,
 // and the memory figures promoted from the embedded memInfo.
 type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// Embedded memory figures; its fields are promoted onto GpuInfo.
 	memInfo
 	// Library name; combined with Variant by RunnerName.
 	Library string `json:"library,omitempty"`
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant"`
 	// MinimumMemory represents the minimum memory required to use the GPU.
 	// Excluded from JSON output.
 	MinimumMemory uint64 `json:"-"`
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath []string `json:"lib_path,omitempty"`
 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
 	// Set to true if we can NOT reliably discover FreeMemory.  A value of true indicates
 	// the FreeMemory is best effort, and may over or under report actual memory usage
 	// False indicates FreeMemory can generally be trusted on this GPU
 	// (No JSON tag, so it is encoded under its Go field name.)
 	UnreliableFreeMemory bool
 	// GPU information
 	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
 	Name    string `json:"name"`    // user friendly name if available
 	Compute string `json:"compute"` // Compute Capability or gfx
 	// Driver Information - TODO no need to put this on each GPU
 	DriverMajor int `json:"driver_major,omitempty"`
 	DriverMinor int `json:"driver_minor,omitempty"`
 	// TODO other performance capability info to help in scheduling decisions
 }
 // RunnerName returns the Library name, suffixed with "_" and the Variant
 // when a variant is set.
 func (gpu GpuInfo) RunnerName() string {
 	if gpu.Variant == "" {
 		return gpu.Library
 	}
 	return gpu.Library + "_" + gpu.Variant
 }
 // CPUInfo is a GpuInfo extended with the list of CPU packages.
 type CPUInfo struct {
 	GpuInfo
 	// One entry per CPU package; summed by SystemInfo.GetOptimalThreadCount.
 	CPUs []CPU
 }
 // CPU type represents a CPU Package occupying a socket
 type CPU struct {
 	ID                  string `cpuinfo:"processor"`
@@ -25,47 +69,115 @@ type CPU struct {
 	ThreadCount         int
 }
-func LogDetails(devices []ml.DeviceInfo) {
+type CudaGPUInfo struct {
-	for _, dev := range devices {
+	GpuInfo
-		var libs []string
+	OSOverhead   uint64 // Memory overhead between the driver library and management library
-		for _, dir := range dev.LibraryPath {
+	index        int    //nolint:unused,nolintlint
-			if strings.Contains(dir, filepath.Join("lib", "ollama")) {
+	computeMajor int    //nolint:unused,nolintlint
-				libs = append(libs, filepath.Base(dir))
+	computeMinor int    //nolint:unused,nolintlint
 }
 // CudaGPUInfoList is a slice of CudaGPUInfo.
 type CudaGPUInfoList []CudaGPUInfo
 // RocmGPUInfo embeds GpuInfo and adds unexported fields that are marked as
 // possibly unused for the linter.
 type RocmGPUInfo struct {
 	GpuInfo
 	usedFilepath string //nolint:unused,nolintlint
 	index        int    //nolint:unused,nolintlint
 }
 // RocmGPUInfoList is a slice of RocmGPUInfo.
 type RocmGPUInfoList []RocmGPUInfo
 // OneapiGPUInfo embeds GpuInfo and adds unexported driver and GPU index
 // fields that are marked as possibly unused for the linter.
 type OneapiGPUInfo struct {
 	GpuInfo
 	driverIndex int //nolint:unused,nolintlint
 	gpuIndex    int //nolint:unused,nolintlint
 }
 // OneapiGPUInfoList is a slice of OneapiGPUInfo.
 type OneapiGPUInfoList []OneapiGPUInfo
 // GpuInfoList is a slice of GpuInfo; the list-level helper methods in this
 // file are declared on it.
 type GpuInfoList []GpuInfo
 // UnsupportedGPUInfo is a GpuInfo accompanied by a Reason string, serialized
 // under the "reason" JSON key.
 type UnsupportedGPUInfo struct {
 	GpuInfo
 	Reason string `json:"reason"`
 }
 // Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
 	libs := []string{}
 	for _, info := range l {
 		found := false
 		requested := info.Library
 		if info.Variant != "" {
 			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
 			if lib == requested {
 				resp[i] = append(resp[i], info)
 				found = true
 				break
 			}
 		}
-		typeStr := "discrete"
+		if !found {
-		if dev.Integrated {
+			libs = append(libs, requested)
-			typeStr = "iGPU"
+			resp = append(resp, []GpuInfo{info})
 		}
 		slog.Info("inference compute",
 			"id", dev.ID,
 			"library", dev.Library,
 			"compute", dev.Compute(),
 			"name", dev.Name,
 			"description", dev.Description,
 			"libdirs", strings.Join(libs, ","),
 			"driver", dev.Driver(),
 			"pci_id", dev.PCIID,
 			"type", typeStr,
 			"total", format.HumanBytes2(dev.TotalMemory),
 			"available", format.HumanBytes2(dev.FreeMemory),
 		)
 	}
-	// CPU inference
+	return resp
-	if len(devices) == 0 {
+}
-		dev, _ := GetCPUMem()
+
 // Report the GPU information to the log at Info level
 func (l GpuInfoList) LogDetails() {
 	for _, g := range l {
 		slog.Info("inference compute",
-			"id", "cpu",
+			"id", g.ID,
-			"library", "cpu",
+			"library", g.Library,
-			"compute", "",
+			"variant", g.Variant,
-			"name", "cpu",
+			"compute", g.Compute,
-			"description", "cpu",
+			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
-			"libdirs", "ollama",
+			"name", g.Name,
-			"driver", "",
+			"total", format.HumanBytes2(g.TotalMemory),
-			"pci_id", "",
+			"available", format.HumanBytes2(g.FreeMemory),
 			"type", "",
 			"total", format.HumanBytes2(dev.TotalMemory),
 			"available", format.HumanBytes2(dev.FreeMemory),
 		)
 	}
 }
 // ByFreeMemory provides the Len/Swap/Less methods needed to sort a GpuInfo
 // slice by ascending FreeMemory.
 type ByFreeMemory []GpuInfo
 // Len returns the number of entries.
 func (a ByFreeMemory) Len() int {
 	return len(a)
 }
 // Swap exchanges the entries at positions i and j.
 func (a ByFreeMemory) Swap(i, j int) {
 	a[j], a[i] = a[i], a[j]
 }
 // Less reports whether entry i has strictly less free memory than entry j.
 func (a ByFreeMemory) Less(i, j int) bool {
 	return a[j].FreeMemory > a[i].FreeMemory
 }
 // SystemInfo bundles the system CPU information, the list of GPUs, the GPUs
 // reported as unsupported (each carrying a reason), and a list of discovery
 // error strings. All fields carry JSON tags.
 type SystemInfo struct {
 	System          CPUInfo              `json:"system"`
 	GPUs            []GpuInfo            `json:"gpus"`
 	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
 	DiscoveryErrors []string             `json:"discovery_errors"`
 }
 // GetOptimalThreadCount returns the thread count to use for inference: the
 // sum, over all CPU packages, of the core count minus the efficiency core
 // count. With no CPU packages the result is 0.
 func (si SystemInfo) GetOptimalThreadCount() int {
 	threads := 0
 	// An empty CPU list leaves the sum at zero.
 	for i := range si.System.CPUs {
 		pkg := si.System.CPUs[i]
 		threads += pkg.CoreCount - pkg.EfficiencyCoreCount
 	}
 	return threads
 }
 // FlashAttentionSupported reports whether every GPU in the list supports
 // flash attention. A GPU qualifies when its library is "metal" or "rocm", or
 // when it is "cuda" with DriverMajor of at least 7. An empty list yields true.
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		switch {
 		case gpu.Library == "metal", gpu.Library == "rocm":
 			continue
 		case gpu.Library == "cuda" && gpu.DriverMajor >= 7:
 			continue
 		default:
 			// The first unsupported GPU decides the answer.
 			return false
 		}
 	}
 	return true
 }
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,7 +4,6 @@
 * [Quickstart](../README.md#quickstart)
 * [Examples](./examples.md)
 * [Importing models](./import.md)
 * [MacOS Documentation](./macos.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](./docker.md)
--- a/docs/api.md
+++ b/docs/api.md
@@ -19,7 +19,7 @@
 ### Model names
-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
 ### Durations
@@ -31,7 +31,7 @@ Certain endpoints stream responses as JSON objects. Streaming can be disabled by
 ## Generate a completion
-```
+```shell
 POST /api/generate
 ```
@@ -43,7 +43,6 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `prompt`: the prompt to generate a response for
 - `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
 - `think`: (for thinking models) should the model think before responding?
 Advanced parameters (optional):
@@ -174,7 +173,7 @@ curl http://localhost:11434/api/generate -d '{
 ##### Response
-```json5
+```json
 {
  "model": "codellama:code",
  "created_at": "2024-07-22T20:47:51.147561Z",
@@ -307,7 +306,7 @@ curl http://localhost:11434/api/generate -d '{
 #### Response
-```json
+```
 {
  "model": "llava",
  "created_at": "2023-11-03T15:36:02.583064Z",
@@ -395,6 +394,9 @@ curl http://localhost:11434/api/generate -d '{
    "repeat_penalty": 1.2,
    "presence_penalty": 1.5,
    "frequency_penalty": 1.0,
    "mirostat": 1,
    "mirostat_tau": 0.8,
    "mirostat_eta": 0.6,
    "penalize_newline": true,
    "stop": ["\n", "user:"],
    "numa": false,
@@ -402,7 +404,10 @@ curl http://localhost:11434/api/generate -d '{
    "num_batch": 2,
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
    "num_thread": 8
  }
 }'
@@ -480,7 +485,7 @@ A single JSON object is returned:
 ## Generate a chat completion
-```
+```shell
 POST /api/chat
 ```
@@ -490,40 +495,29 @@ Generate the next message in a chat with a provided model. This is a streaming e
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
- `tools`: list of tools in JSON for the model to use if supported
+- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false`
 - `think`: (for thinking models) should the model think before responding?
 The `message` object has the following fields:
 - `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
 - `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
+- `tool_calls` (optional): a list of tools the model wants to use
 - `tool_name` (optional): add the name of the tool that was executed to inform the model of the result
 Advanced parameters (optional):
- `format`: the format to return a response in. Format can be `json` or a JSON schema.
+- `format`: the format to return a response in. Format can be `json` or a JSON schema. 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 ### Tool calling
 Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
 Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
 [See models with tool calling capabilities](https://ollama.com/search?c=tool).
 ### Structured outputs
 Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
 ### Examples
-#### Chat request (Streaming)
+#### Chat Request (Streaming)
 ##### Request
@@ -564,10 +558,6 @@ Final response:
 {
  "model": "llama3.2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "message": {
    "role": "assistant",
    "content": ""
  },
  "done": true,
  "total_duration": 4883583458,
  "load_duration": 1334875,
@@ -578,88 +568,6 @@ Final response:
 }
 ```
 #### Chat request (Streaming with tools)
 ##### Request
 ```shell
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
      "content": "what is the weather in tokyo?"
    }
  ],
  "tools": [
    {
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the weather in a given city",
        "parameters": {
          "type": "object",
          "properties": {
            "city": {
              "type": "string",
              "description": "The city to get the weather for"
            }
          },
          "required": ["city"]
        }
      }
    }
  ],
  "stream": true
 }'
 ```
 ##### Response
 A stream of JSON objects is returned:
 ```json
 {
    "model": "llama3.2",
    "created_at": "2025-07-07T20:22:19.184789Z",
    "message": {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "function": {
                    "name": "get_weather",
                    "arguments": {
                        "city": "Tokyo"
                    }
                 }
            }
        ]
    },
    "done": false
 }
 ```
 Final response:
 ```json
 {
  "model":"llama3.2",
  "created_at":"2025-07-07T20:22:19.19314Z",
  "message": {
    "role": "assistant",
    "content": ""
  },
  "done_reason": "stop",
  "done": true,
  "total_duration": 182242375,
  "load_duration": 41295167,
  "prompt_eval_count": 169,
  "prompt_eval_duration": 24573166,
  "eval_count": 15,
  "eval_duration": 115959084
 }
 ```
 #### Chat request (No streaming)
 ##### Request
@@ -697,74 +605,6 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
 #### Chat request (No streaming, with tools)
 ##### Request
 ```shell
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
      "content": "what is the weather in tokyo?"
    }
  ],
  "tools": [
    {
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the weather in a given city",
        "parameters": {
          "type": "object",
          "properties": {
            "city": {
              "type": "string",
              "description": "The city to get the weather for"
            }
          },
          "required": ["city"]
        }
      }
    }
  ],
  "stream": false 
 }'
 ```
 ##### Response
 ```json
 {
  "model": "llama3.2",
  "created_at": "2025-07-07T20:32:53.844124Z",
  "message": {
    "role": "assistant",
    "content": "",
    "tool_calls": [
      {
        "function": {
          "name": "get_weather",
          "arguments": {
            "city": "Tokyo"
          }
         }
      }
    ]
  },
  "done_reason": "stop",
  "done": true,
  "total_duration": 3244883583,
  "load_duration": 2969184542,
  "prompt_eval_count": 169,
  "prompt_eval_duration": 141656333,
  "eval_count": 18,
  "eval_duration": 133293625
 }
 ```
 #### Chat request (Structured outputs)
 ##### Request
@@ -871,87 +711,6 @@ Final response:
 }
 ```
 #### Chat request (With history, with tools)
 ##### Request
 ```shell
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
      "content": "what is the weather in Toronto?"
    },
    // the message from the model appended to history
    {
      "role": "assistant",
      "content": "",
      "tool_calls": [
        {
          "function": {
            "name": "get_temperature",
            "arguments": {
              "city": "Toronto"
            }
           }
        }
      ]
    },
    // the tool call result appended to history
    {
      "role": "tool",
      "content": "11 degrees celsius",
       "tool_name": "get_temperature"
    }
  ],
  "stream": false,
  "tools": [
    {
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the weather in a given city",
        "parameters": {
          "type": "object",
          "properties": {
            "city": {
              "type": "string",
              "description": "The city to get the weather for"
            }
          },
          "required": ["city"]
        }
      }
    }
  ]
 }'
 ```
 ##### Response
 ```json
 {
  "model": "llama3.2",
  "created_at": "2025-07-07T20:43:37.688511Z",
  "message": {
    "role": "assistant",
    "content": "The current temperature in Toronto is 11°C."
  },
  "done_reason": "stop",
  "done": true,
  "total_duration": 890771750,
  "load_duration": 707634750,
  "prompt_eval_count": 94,
  "prompt_eval_duration": 91703208,
  "eval_count": 11,
  "eval_duration": 90282125
 }
 ```
 #### Chat request (with images)
 ##### Request
@@ -1036,7 +795,7 @@ curl http://localhost:11434/api/chat -d '{
 ##### Request
-```shell
+```
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [
@@ -1111,7 +870,7 @@ If the messages array is empty, the model will be loaded into memory.
 ##### Request
-```shell
+```
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": []
@@ -1119,7 +878,6 @@ curl http://localhost:11434/api/chat -d '{
 ```
 ##### Response
 ```json
 {
  "model": "llama3.2",
@@ -1139,7 +897,7 @@ If the messages array is empty and the `keep_alive` parameter is set to `0`, a m
 ##### Request
-```shell
+```
 curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [],
@@ -1166,7 +924,7 @@ A single JSON object is returned:
 ## Create a Model
-```
+```shell
 POST /api/create
 ```
@@ -1195,8 +953,19 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
 | Type | Recommended |
 | --- | :-: |
 | q2_K | |
 | q3_K_L | |
 | q3_K_M | |
 | q3_K_S | |
 | q4_0 | |
 | q4_1 | |
 | q4_K_M | * |
 | q4_K_S | |
 | q5_0 | |
 | q5_1 | |
 | q5_K_M | |
 | q5_K_S | |
 | q6_K | |
 | q8_0 | * |
 ### Examples
@@ -1241,8 +1010,8 @@ Quantize a non-quantized model.
 ```shell
 curl http://localhost:11434/api/create -d '{
-  "model": "llama3.2:quantized",
+  "model": "llama3.1:quantized",
-  "from": "llama3.2:3b-instruct-fp16",
+  "from": "llama3.1:8b-instruct-fp16",
  "quantize": "q4_K_M"
 }'
 ```
@@ -1251,15 +1020,13 @@ curl http://localhost:11434/api/create -d '{
 A stream of JSON objects is returned:
-```json
+```
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
+{"status":"quantizing F16 model to Q4_K_M"}
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
+{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
-{"status":"verifying conversion"}
+{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
-{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
+{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
 {"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
 {"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
 {"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
 {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
 {"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
 {"status":"writing manifest"}
 {"status":"success"}
 ```
@@ -1284,7 +1051,7 @@ curl http://localhost:11434/api/create -d '{
 A stream of JSON objects is returned:
-```json
+```
 {"status":"parsing GGUF"}
 {"status":"using existing layer sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"}
 {"status":"writing manifest"}
@@ -1351,7 +1118,7 @@ Return 200 OK if the blob exists, 404 Not Found if it does not.
 ## Push a Blob
-```
+```shell
 POST /api/blobs/:digest
 ```
@@ -1375,7 +1142,7 @@ Return 201 Created if the blob was successfully created, 400 Bad Request if the
 ## List Local Models
-```
+```shell
 GET /api/tags
 ```
@@ -1397,37 +1164,29 @@ A single JSON object will be returned.
 {
  "models": [
    {
-      "name": "deepseek-r1:latest",
+      "name": "codellama:13b",
-      "model": "deepseek-r1:latest",
+      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
-      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
+      "size": 7365960935,
-      "size": 4683075271,
+      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
      "details": {
        "parent_model": "",
        "format": "gguf",
-        "family": "qwen2",
+        "family": "llama",
-        "families": [
+        "families": null,
-          "qwen2"
+        "parameter_size": "13B",
-        ],
+        "quantization_level": "Q4_0"
        "parameter_size": "7.6B",
        "quantization_level": "Q4_K_M"
      }
    },
    {
-      "name": "llama3.2:latest",
+      "name": "llama3:latest",
-      "model": "llama3.2:latest",
+      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
-      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
+      "size": 3825819519,
-      "size": 2019393189,
+      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
      "details": {
        "parent_model": "",
        "format": "gguf",
        "family": "llama",
-        "families": [
+        "families": null,
-          "llama"
+        "parameter_size": "7B",
-        ],
+        "quantization_level": "Q4_0"
        "parameter_size": "3.2B",
        "quantization_level": "Q4_K_M"
      }
    }
  ]
@@ -1436,7 +1195,7 @@ A single JSON object will be returned.
 ## Show Model Information
-```
+```shell
 POST /api/show
 ```
@@ -1453,13 +1212,13 @@ Show information about a model including details, modelfile, template, parameter
 ```shell
 curl http://localhost:11434/api/show -d '{
-  "model": "llava"
+  "model": "llama3.2"
 }'
 ```
 #### Response
-```json5
+```json
 {
  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
  "parameters": "num_keep                       24\nstop                           \"<|start_header_id|>\"\nstop                           \"<|end_header_id|>\"\nstop                           \"<|eot_id|>\"",
@@ -1496,17 +1255,13 @@ curl http://localhost:11434/api/show -d '{
    "tokenizer.ggml.pre": "llama-bpe",
    "tokenizer.ggml.token_type": [],        // populates if `verbose=true`
    "tokenizer.ggml.tokens": []             // populates if `verbose=true`
-  },
+  }
  "capabilities": [
    "completion",
    "vision"
  ],
 }
 ```
 ## Copy a Model
-```
+```shell
 POST /api/copy
 ```
@@ -1529,7 +1284,7 @@ Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't e
 ## Delete a Model
-```
+```shell
 DELETE /api/delete
 ```
@@ -1555,7 +1310,7 @@ Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't
 ## Pull a Model
-```
+```shell
 POST /api/pull
 ```
@@ -1593,7 +1348,7 @@ Then there is a series of downloading responses. Until any of the download is co
 ```json
 {
-  "status": "pulling digestname",
+  "status": "downloading digestname",
  "digest": "digestname",
  "total": 2142590208,
  "completed": 241970
@@ -1627,7 +1382,7 @@ if `stream` is set to false, then the response is a single JSON object:
 ## Push a Model
-```
+```shell
 POST /api/push
 ```
@@ -1692,7 +1447,7 @@ If `stream` is set to `false`, then the response is a single JSON object:
 ## Generate Embeddings
-```
+```shell
 POST /api/embed
 ```
@@ -1708,7 +1463,6 @@ Advanced parameters:
 - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `dimensions`: number of dimensions for the embedding
 ### Examples
@@ -1761,7 +1515,7 @@ curl http://localhost:11434/api/embed -d '{
 ```
 ## List Running Models
-```
+```shell
 GET /api/ps
 ```
@@ -1808,7 +1562,7 @@ A single JSON object will be returned.
 > Note: this endpoint has been superseded by `/api/embed`
-```
+```shell
 POST /api/embeddings
 ```
@@ -1848,7 +1602,7 @@ curl http://localhost:11434/api/embeddings -d '{
 ## Version
-```
+```shell
 GET /api/version
 ```
--- a/docs/cloud.md
+++ b/docs/cloud.md
@@ -1,40 +0,0 @@
 # Cloud
 > Ollama's cloud is currently in preview. For full documentation, see [Ollama's documentation](https://docs.ollama.com/cloud).
 ## Cloud Models
 [Cloud models](https://ollama.com/cloud) are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn’t fit on a personal computer.
 Ollama currently supports the following cloud models, with more coming soon:
 - `gpt-oss:20b-cloud`
 - `gpt-oss:120b-cloud`
 - `deepseek-v3.1:671b-cloud`
 - `qwen3-coder:480b-cloud`
 ### Get started
 To run a cloud model, open the terminal and run:
 ```
 ollama run gpt-oss:120b-cloud
 ```
 To run cloud models with integrations that work with Ollama, first download the cloud model:
 ```
 ollama pull qwen3-coder:480b-cloud
 ```
 Then sign in to Ollama:
 ```
 ollama signin
 ```
 Finally, access the model using the model name `qwen3-coder:480b-cloud` via Ollama's local API or tooling.
 ## Cloud API access
 Cloud models can also be accessed directly on ollama.com's API. For more information, see the [docs](https://docs.ollama.com/cloud).
--- a/docs/development.md
+++ b/docs/development.md
@@ -3,18 +3,14 @@
 Install prerequisites:
 - [Go](https://go.dev/doc/install)
- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://github.com/jmeubank/tdm-gcc/releases/latest) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.
+- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://jmeubank.github.io/tdm-gcc/download/) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.
 Then build and run Ollama from the root directory of the repository:
-```shell
+```
 go run . serve
 ```
 > [!NOTE]
 > Ollama includes native code compiled with CGO.  From time to time these data structures can change and CGO can get out of sync resulting in unexpected crashes.  You can force a full build of the native code by running `go clean -cache` first. 
 ## macOS (Apple Silicon)
 macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.
@@ -27,14 +23,14 @@ Install prerequisites:
 Then, configure and build the project:
-```shell
+```
 cmake -B build
 cmake --build build
 ```
 Lastly, run Ollama:
-```shell
+```
 go run . serve
 ```
@@ -45,35 +41,36 @@ Install prerequisites:
 - [CMake](https://cmake.org/download/)
 - [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/) including the Native Desktop Workload
 - (Optional) AMD GPU support
-    - [ROCm](https://rocm.docs.amd.com/en/latest/)
+    - [ROCm](https://rocm.github.io/install.html)
    - [Ninja](https://github.com/ninja-build/ninja/releases)
 - (Optional) NVIDIA GPU support
    - [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
 > [!IMPORTANT]
 > Ensure prerequisites are in `PATH` before running CMake.
 > [!IMPORTANT]
 > ROCm is not compatible with Visual Studio CMake generators. Use `-GNinja` when configuring the project.
 > [!IMPORTANT]
 > CUDA is only compatible with Visual Studio CMake generators.
 Then, configure and build the project:
-```shell
+```
 cmake -B build
 cmake --build build --config Release
 ```
 > [!IMPORTANT]
 > Building for ROCm requires additional flags:
 > ```
 > cmake -B build -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
 > cmake --build build --config Release
 > ```
 Lastly, run Ollama:
-```shell
+```
 go run . serve
 ```
 ## Windows (ARM)
-Windows ARM does not support additional acceleration libraries at this time.  Do not use cmake, simply `go run` or `go build`.
+Windows ARM does not support additional acceleration libraries at this time.
 ## Linux
@@ -91,26 +88,26 @@ Install prerequisites:
 Then, configure and build the project:
-```shell
+```
 cmake -B build
 cmake --build build
 ```
 Lastly, run Ollama:
-```shell
+```
 go run . serve
 ```
 ## Docker
-```shell
+```
 docker build .
 ```
 ### ROCm
-```shell
+```
 docker build --build-arg FLAVOR=rocm .
 ```
@@ -118,46 +115,6 @@ docker build --build-arg FLAVOR=rocm .
 To run tests, use `go test`:
-```shell
+```
 go test ./...
 ```
 > NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
 > test failures resulting from your change(s), if any, locally, but CI will
 > break.
 >
 > If you see failures in CI, you can either keep pushing changes to see if the
 > CI build passes, or you can enable the "synctest" package locally to see the
 > failures before pushing.
 >
 > To enable the "synctest" package for testing, run the following command:
 >
 > ```shell
 > GOEXPERIMENT=synctest go test ./...
 > ```
 >
 > If you wish to enable synctest for all go commands, you can set the
 > `GOEXPERIMENT` environment variable in your shell profile or by using:
 >
 > ```shell
 > go env -w GOEXPERIMENT=synctest
 > ```
 >
 > Which will enable the "synctest" package for all go commands without needing
 > to set it for all shell sessions.
 >
 > The synctest package is not required for production builds.
 ## Library detection
 Ollama looks for acceleration libraries in the following paths relative to the `ollama` executable:
 * `./lib/ollama` (Windows)
 * `../lib/ollama` (Linux)
 * `.` (macOS)
 * `build/lib/ollama` (for development)
 If the libraries are not found, Ollama will not run with any acceleration libraries.
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -2,7 +2,7 @@
 ### CPU only
-```shell
+```bash
 docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```
@@ -11,46 +11,42 @@ Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-
 #### Install with Apt
 1.  Configure the repository
-
+```bash
-    ```shell
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
-    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+    | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
-        | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
-    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
-        | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
-        | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
-    sudo apt-get update
+```
    ```
 2.  Install the NVIDIA Container Toolkit packages
-
+```bash
-    ```shell
+sudo apt-get install -y nvidia-container-toolkit
-    sudo apt-get install -y nvidia-container-toolkit
+```
    ```
 #### Install with Yum or Dnf
 1.  Configure the repository
-    ```shell
+```bash
-    curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
-        | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+    | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
-    ```
+```
 2. Install the NVIDIA Container Toolkit packages
-    ```shell
+```bash
-    sudo yum install -y nvidia-container-toolkit
+sudo yum install -y nvidia-container-toolkit
-    ```
+```
 #### Configure Docker to use Nvidia driver
-
+```
 ```shell
 sudo nvidia-ctk runtime configure --runtime=docker
 sudo systemctl restart docker
 ```
 #### Start the container
-```shell
+```bash
 docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```
@@ -61,7 +57,7 @@ docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ol
 To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
-```shell
+```
 docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
 ```
@@ -69,7 +65,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
 Now you can run a model:
-```shell
+```
 docker exec -it ollama ollama run llama3.2
 ```
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,17 +20,11 @@ Please refer to the [GPU docs](./gpu.md).
 ## How can I specify the context window size?
-By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
+By default, Ollama uses a context window size of 2048 tokens.
 This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 ```
 To change this when using `ollama run`, use `/set parameter`:
-```shell
+```
 /set parameter num_ctx 4096
 ```
@@ -46,23 +40,16 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```
 Setting the context length higher may cause the model to not be able to fit onto the GPU, which makes the model run more slowly.
 ## How can I tell if my model was loaded onto the GPU?
 Use the `ollama ps` command to see what models are currently loaded into memory.
 ```shell
 ollama ps
 NAME      	ID          	SIZE 	PROCESSOR	UNTIL
 llama3:70b	bcfb190ca3a7	42 GB	100% GPU 	4 minutes from now
 ```
 > **Output**:
 >
 > ```
 > NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
 > gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
 > ```
 The `Processor` column will show which memory the model was loaded into:
 * `100% GPU` means the model was loaded entirely into the GPU
 * `100% CPU` means the model was loaded entirely in system memory
@@ -79,7 +66,7 @@ If Ollama is run as a macOS application, environment variables should be set usi
 1. For each environment variable, call `launchctl setenv`.
    ```bash
-    launchctl setenv OLLAMA_HOST "0.0.0.0:11434"
+    launchctl setenv OLLAMA_HOST "0.0.0.0"
    ```
 2. Restart Ollama application.
@@ -94,14 +81,14 @@ If Ollama is run as a systemd service, environment variables should be set using
    ```ini
    [Service]
-    Environment="OLLAMA_HOST=0.0.0.0:11434"
+    Environment="OLLAMA_HOST=0.0.0.0"
    ```
 3. Save and exit.
 4. Reload `systemd` and restart Ollama:
-   ```shell
+   ```bash
   systemctl daemon-reload
   systemctl restart ollama
   ```
@@ -150,11 +137,9 @@ docker build -t ollama-with-ca .
 docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
 ```
-## Does Ollama send my prompts and responses back to ollama.com?
+## Does Ollama send my prompts and answers back to ollama.com?
-If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
+No. Ollama runs locally, and conversation data does not leave your machine.
 If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.
 ## How can I expose Ollama on my network?
@@ -197,13 +182,6 @@ cloudflared tunnel --url http://localhost:11434 --http-host-header="localhost:11
 Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
 For browser extensions, you'll need to explicitly allow the extension's origin pattern. Set `OLLAMA_ORIGINS` to include `chrome-extension://*`, `moz-extension://*`, and `safari-web-extension://*` if you wish to allow all browser extensions access, or specific extensions as needed:
 ```
 # Allow all Chrome, Firefox, and Safari extensions
 OLLAMA_ORIGINS=chrome-extension://*,moz-extension://*,safari-web-extension://* ollama serve
 ```
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
 ## Where are models stored?
@@ -243,19 +221,16 @@ properties.
 If you are using the API you can preload a model by sending the Ollama server an empty request. This works with both the `/api/generate` and `/api/chat` API endpoints.
 To preload the mistral model using the generate endpoint, use:
 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "mistral"}'
 ```
 To use the chat completions endpoint, use:
 ```shell
 curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
 ```
 To preload a model using the CLI, use the command:
 ```shell
 ollama run llama3.2 ""
 ```
@@ -275,13 +250,11 @@ If you're using the API, use the `keep_alive` parameter with the `/api/generate`
 * '0' which will unload the model immediately after generating a response
 For example, to preload a model and leave it in memory use:
 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": -1}'
 ```
 To unload the model and free up memory use:
 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": 0}'
 ```
@@ -296,7 +269,7 @@ If too many requests are sent to the server, it will respond with a 503 error in
 ## How does Ollama handle concurrent requests?
-Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.
+Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
 If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded.  As prior models become idle, one or more will be unloaded to make room for the new model.  Queued requests will be processed in order.  When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
@@ -305,7 +278,7 @@ Parallel request processing for a given model results in increasing the context
 The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
 - `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory.  The default is 3 * the number of GPUs or 3 for CPU inference.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default is 1, and will handle 1 request per model at a time.
+- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPU's VRAM.
@@ -337,16 +310,3 @@ The currently available K/V cache quantization types are:
 How much the cache quantization impacts the model's response quality will depend on the model and the task.  Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
 You may need to experiment with different quantization types to find the best balance between memory usage and quality.
 ## How can I stop Ollama from starting when I login to my computer
 Ollama for Windows and macOS registers as a login item during installation.  You can disable this if you prefer not to have Ollama automatically start.  Ollama will respect this setting across upgrades, unless you uninstall the application.
 **Windows**
 - Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
 **MacOS Monterey (v12)**
 - Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
 **MacOS Ventura (v13) and later**
 - Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,28 +1,21 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
+Ollama supports Nvidia GPUs with compute capability 5.0+.
 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
 | Compute Capability | Family              | Cards                                                                                                       |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
-| 12.0               | GeForce RTX 50xx    | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090`                                     |
+| 9.0                | NVIDIA              | `H100`                                                                                                      |
 |                    | NVIDIA Professional | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell`         |
 | 11.0               | Jetson              | `T4000` `T5000` (Requires driver 580 or newer)                                                              |
 | 10.3               | NVIDIA Professional | `B300` `GB300` (Requires driver 580 or newer)                                                               |
 | 10.0               | NVIDIA Professional | `B200` `GB200` (Requires driver 580 or newer)                                                               |
 | 9.0                | NVIDIA              | `H200` `H100` `GH200`                                                                                       |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
 | 8.7                | Jetson              | `Orin Nano` `Orin NX` `AGX Orin`                                                                            |
 | 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050`   |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2`                          |
 | 8.0                | NVIDIA              | `A100` `A30`                                                                                                |
 | 7.5                | GeForce GTX/RTX     | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060`                                    |
 |                    | NVIDIA Professional | `T4` `RTX 5000` `RTX 4000` `RTX 3000` `T2000` `T1200` `T1000` `T600` `T500`                                 |
 |                    | Quadro              | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000`                                                                 |
 | 7.2                | Jetson              | `Xavier NX` `AGX Xavier` (Jetpack 5)                                                                        |
 | 7.0                | NVIDIA              | `TITAN V` `V100` `Quadro GV100`                                                                             |
 | 6.1                | NVIDIA TITAN        | `TITAN Xp` `TITAN X`                                                                                        |
 |                    | GeForce GTX         | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050`                       |
@@ -56,23 +49,20 @@ sudo modprobe nvidia_uvm`
 Ollama supports the following AMD GPUs:
 ### Linux Support
-| Family         | Cards and accelerators                                                                                               |
+| Family         | Cards and accelerators                                                                                                               |
-| -------------- | -------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800`  |
+| AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56`    |
-| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320`           |
+| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
-| AMD Instinct   | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100`                                                   |
+| AMD Instinct   | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50`                                                               |
 ### Windows Support
-With ROCm v6.2, the following GPUs are supported on Windows.
+With ROCm v6.1, the following GPUs are supported on Windows.
 | Family         | Cards and accelerators                                                                                                               |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800`    |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
 ### Known Workarounds
 - The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA
 ### Overrides on Linux
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
@@ -93,6 +83,8 @@ At this time, the known supported GPU types on linux are the following LLVM Targ
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
 |-----------------|---------------------|
 | gfx900 | Radeon RX Vega 56 |
 | gfx906 | Radeon Instinct MI50 |
 | gfx908 | Radeon Instinct MI100 |
 | gfx90a | Radeon Instinct MI210 |
 | gfx940 | Radeon Instinct MI300 |
--- a/docs/import.md
+++ b/docs/import.md
@@ -20,13 +20,13 @@ Make sure that you use the same base model in the `FROM` command as you used to
 Now run `ollama create` from the directory where the `Modelfile` was created:
-```shell
+```bash
 ollama create my-model
 ```
 Lastly, test the model:
-```shell
+```bash
 ollama run my-model
 ```
@@ -53,8 +53,6 @@ FROM /path/to/safetensors/directory
 If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
 If you do not create the Modelfile, ollama will act as if there was a Modelfile with the command `FROM .`.
 Now run the `ollama create` command from the directory where you created the `Modelfile`:
 ```shell
@@ -134,12 +132,22 @@ success
 ### Supported Quantizations
 - `q4_0`
 - `q4_1`
 - `q5_0`
 - `q5_1`
 - `q8_0`
 #### K-means Quantizations
 - `q3_K_S`
 - `q3_K_M`
 - `q3_K_L`
 - `q4_K_S`
 - `q4_K_M`
 - `q5_K_S`
 - `q5_K_M`
 - `q6_K`
 ## Sharing your model on ollama.com
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -11,13 +11,12 @@ curl -fsSL https://ollama.com/install.sh | sh
 ## Manual install
 > [!NOTE]
-> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
+> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
 Download and extract the package:
 ```shell
-curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
 sudo rm -rf /usr/lib/ollama
 sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```
@@ -35,11 +34,7 @@ ollama -v
 ### AMD GPU install
-If you have an AMD GPU, **also** download and extract the additional ROCm package:
+If you have an AMD GPU, also download and extract the additional ROCm package:
 > [!IMPORTANT]
 > The ROCm tgz contains only AMD dependent libraries.  You must extract **both** `ollama-linux-amd64.tgz` and `ollama-linux-amd64-rocm.tgz` into the same location.
 ```shell
 curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
@@ -80,7 +75,7 @@ RestartSec=3
 Environment="PATH=$PATH"
 [Install]
-WantedBy=multi-user.target
+WantedBy=default.target
 ```
 Then start the service:
@@ -117,14 +112,14 @@ sudo systemctl status ollama
 > While AMD has contributed the `amdgpu` driver upstream to the official linux
 > kernel source, the version is older and may not support all ROCm features. We
 > recommend you install the latest driver from
-> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
+> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-> of your Radeon GPU.
+> GPU.
 ## Customizing
 To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
-```shell
+```
 sudo systemctl edit ollama
 ```
@@ -157,7 +152,7 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s
 For example:
 ```shell
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.5.7 sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
 ```
 ## Viewing logs
@@ -191,9 +186,3 @@ sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama
 ```
 Remove installed libraries:
 ```shell
 sudo rm -rf /usr/local/lib/ollama
 ```
--- a/docs/macos.md
+++ b/docs/macos.md
@@ -1,42 +0,0 @@
 # Ollama for macOS
 ## System Requirements
 * MacOS Sonoma (v14) or newer
 * Apple M series (CPU and GPU support) or x86 (CPU only)
 ## Filesystem Requirements
 The preferred method of installation is to mount the `ollama.dmg` and drag-and-drop the Ollama application to the system-wide `Applications` folder.  Upon startup, the Ollama app will verify the `ollama` CLI is present in your PATH, and if not detected, will prompt for permission to create a link in `/usr/local/bin`
 Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
 ### Changing Install Location
 To install the Ollama application somewhere other than `Applications`, place the Ollama application in the desired location, and ensure the CLI `Ollama.app/Contents/Resources/ollama` or a sym-link to the CLI can be found in your path.  Upon first start decline the "Move to Applications?" request.
 ## Troubleshooting
 Ollama on MacOS stores files in a few different locations.
 - `~/.ollama` contains models and configuration
 - `~/.ollama/logs` contains logs
    - *app.log* contains most recent logs from the GUI application
    - *server.log* contains the most recent server logs
 - `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
 ## Uninstall
 To fully remove Ollama from your system, remove the following files and folders:
 ```
 sudo rm -rf /Applications/Ollama.app
 sudo rm /usr/local/bin/ollama
 rm -rf "$HOME/Library/Application Support/Ollama"
 rm -rf "$HOME/Library/Saved Application State/com.electron.ollama.savedState"
 rm -rf ~/Library/Caches/com.electron.ollama/
 rm -rf ~/Library/Caches/ollama
 rm -rf ~/Library/WebKit/com.electron.ollama
 rm -rf ~/.ollama
 ```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -28,7 +28,7 @@ A model file is the blueprint to create and share models with Ollama.
 The format of the `Modelfile`:
-```
+```modelfile
 # comment
 INSTRUCTION arguments
 ```
@@ -49,7 +49,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:
-```
+```modelfile
 FROM llama3.2
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -69,30 +69,24 @@ To use this:
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.
-```shell
+  ```bash
-ollama show --modelfile llama3.2
+  > ollama show --modelfile llama3.2
-```
+  # Modelfile generated by "ollama show"
  # To build a new Modelfile based on this one, replace the FROM line with:
  # FROM llama3.2:latest
  FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
  TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
-> **Output**:
+  {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
 >
 > ```
 > # Modelfile generated by "ollama show"
 > # To build a new Modelfile based on this one, replace the FROM line with:
 > # FROM llama3.2:latest
 > FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
 > TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
 >
 > {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
 >
 > {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
 >
 > {{ .Response }}<|eot_id|>"""
 > PARAMETER stop "<|start_header_id|>"
 > PARAMETER stop "<|end_header_id|>"
 > PARAMETER stop "<|eot_id|>"
 > PARAMETER stop "<|reserved_special_token"
 > ```
  {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
  {{ .Response }}<|eot_id|>"""
  PARAMETER stop "<|start_header_id|>"
  PARAMETER stop "<|end_header_id|>"
  PARAMETER stop "<|eot_id|>"
  PARAMETER stop "<|reserved_special_token"
  ```
 ## Instructions
@@ -100,13 +94,13 @@ ollama show --modelfile llama3.2
 The `FROM` instruction defines the base model to use when creating a model.
-```
+```modelfile
 FROM <model name>:<tag>
 ```
 #### Build from existing model
-```
+```modelfile
 FROM llama3.2
 ```
@@ -117,7 +111,7 @@ Additional models can be found at:
 #### Build from a Safetensors model
-```
+```modelfile
 FROM <model directory>
 ```
@@ -131,7 +125,7 @@ Currently supported model architectures:
 #### Build from a GGUF file
-```
+```modelfile
 FROM ./ollama-model.gguf
 ```
@@ -142,7 +136,7 @@ The GGUF file location should be specified as an absolute path or relative to th
 The `PARAMETER` instruction defines a parameter that can be set when the model is run.
-```
+```modelfile
 PARAMETER <parameter> <parametervalue>
 ```
@@ -150,7 +144,10 @@ PARAMETER <parameter> <parametervalue>
 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 4096)                                                                                                                                                                    | int        | num_ctx 4096         |
+| mirostat       | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)                                                                                                                                         | int        | mirostat 0           |
 | mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)                        | float      | mirostat_eta 0.1     |
 | mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                                         | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |
@@ -186,7 +183,7 @@ TEMPLATE """{{ if .System }}<|im_start|>system
 The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.
-```
+```modelfile
 SYSTEM """<system message>"""
 ```
@@ -196,7 +193,7 @@ The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply
 #### Safetensor adapter
-```
+```modelfile
 ADAPTER <path to safetensor adapter>
 ```
@@ -207,7 +204,7 @@ Currently supported Safetensor adapters:
 #### GGUF adapter
-```
+```modelfile
 ADAPTER ./ollama-lora.gguf
 ```
@@ -215,7 +212,7 @@ ADAPTER ./ollama-lora.gguf
 The `LICENSE` instruction allows you to specify the legal license under which the model used with this Modelfile is shared or distributed.
-```
+```modelfile
 LICENSE """
 <license text>
 """
@@ -225,7 +222,7 @@ LICENSE """
 The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
-```
+```modelfile
 MESSAGE <role> <message>
 ```
@@ -240,7 +237,7 @@ MESSAGE <role> <message>
 #### Example conversation
-```
+```modelfile
 MESSAGE user Is Toronto in Canada?
 MESSAGE assistant yes
 MESSAGE user Is Sacramento in Canada?
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -1,7 +1,6 @@
 # OpenAI compatibility
-> [!NOTE]
+> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
 > OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
 Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
@@ -60,10 +59,8 @@ embeddings = client.embeddings.create(
    input=["why is the sky blue?", "why is the grass green?"],
 )
 ```
 #### Structured outputs
-
+```py
 ```python
 from pydantic import BaseModel
 from openai import OpenAI
@@ -72,7 +69,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
 # Define the schema for the response
 class FriendInfo(BaseModel):
    name: str
-    age: int
+    age: int 
    is_available: bool
 class FriendList(BaseModel):
@@ -147,7 +144,7 @@ const embedding = await openai.embeddings.create({
 ### `curl`
-```shell
+``` shell
 curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
@@ -322,7 +319,7 @@ ollama pull llama3.2
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
-```shell
+```
 ollama cp llama3.2 gpt-3.5-turbo
 ```
@@ -346,7 +343,7 @@ curl http://localhost:11434/v1/chat/completions \
 The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
-```
+```modelfile
 FROM <some model>
 PARAMETER num_ctx <context size>
 ```
--- a/docs/template.md
+++ b/docs/template.md
@@ -12,7 +12,7 @@ A basic Go template consists of three main parts:
 Here's an example of a simple chat template:
-```go
+```gotmpl
 {{- range .Messages }}
 {{ .Role }}: {{ .Content }}
 {{- end }}
@@ -162,6 +162,6 @@ CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://o
 Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
-```go
+```gotmpl
 [SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
 ```
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log
 On **Linux** systems with systemd, the logs can be found with this command:
 ```shell
-journalctl -u ollama --no-pager --follow --pager-end
+journalctl -u ollama --no-pager
 ```
 When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
@@ -17,18 +17,17 @@ When you run Ollama in a **container**, the logs go to stdout/stderr in the cont
 ```shell
 docker logs <container-name>
 ```
 (Use `docker ps` to find the container name)
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.
 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and typing in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
+- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log` 
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration are stored
 - `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
 To enable additional debug logging to help troubleshoot problems, first **Quit the running app from the tray menu** then in a powershell terminal
 ```powershell
 $env:OLLAMA_DEBUG="1"
 & "ollama app.exe"
@@ -38,27 +37,38 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
 ## LLM libraries
-Ollama includes multiple LLM libraries compiled for different GPU libraries and versions. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library.
+Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can work around this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx`, and the slowest but most compatible is `cpu`. Rosetta emulation under macOS will work with the `cpu` library.
 In the server log, you will see a message that looks something like this (varies from release to release):
 ```
 Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
 ```
 **Experimental LLM Library Override**
-You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to limit autodetection, so for example, if you have both CUDA and AMD GPUs, but want to force the CUDA v13 only, use:
+You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass autodetection, so for example, if you have a CUDA card, but want to force the CPU LLM library with AVX2 vector support, use:
-```shell
+```
-OLLAMA_LLM_LIBRARY="cuda_v13" ollama serve
+OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
 ```
 You can see what features your CPU has with the following.
 ```
 cat /proc/cpuinfo| grep flags | head -1
 ```
 ## Installing older or pre-release versions on Linux
 If you run into problems on Linux and want to install an older version, or you'd like to try out a pre-release before it's officially released, you can tell the install script which version to install.
-```shell
+```sh
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.5.7 sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
 ```
-## Linux docker
+## Linux tmp noexec 
-If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
+If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting `OLLAMA_TMPDIR` to a location writable by the user Ollama runs as. For example, `OLLAMA_TMPDIR=/usr/share/ollama/`.
 ## NVIDIA GPU Discovery
@@ -80,15 +90,14 @@ If none of those resolve the problem, gather additional information and file an
 - Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
 You may get more details for initialization failures by enabling debug prints in the uvm driver.  You should only use this temporarily while troubleshooting:
 - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1`
 ## AMD GPU Discovery
 On Linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` 
 If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Bruce MacDonald	fa74ae7214	use range	2025-01-31 14:55:09 -08:00
Bruce MacDonald	aff6d84e17	model: benchmark bpe text processing	2025-01-31 14:44:20 -08:00
Michael Yang	b21482e4a9	fix linter	2025-01-29 15:08:37 -08:00
Michael Yang	6a4120143f	next	2025-01-29 15:05:24 -08:00