Compare commits

Comparing parth/cons...v0.5.8-rc1 (50 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | e8d4eb3e68 |  |
|  | ae7e368f75 |  |
|  | 31acd1ebf9 |  |
|  | 9a4757ae66 |  |
|  | 7814019708 |  |
|  | b698f9a0d8 |  |
|  | 32285a6d19 |  |
|  | 1c198977ec |  |
|  | 330b6c50b0 |  |
|  | 928911bc68 |  |
|  | 5b446cc815 |  |
|  | 451c1596af |  |
|  | 932bded12f |  |
|  | 070ad913ac |  |
|  | 8d8b9f83ae |  |
|  | f00d359a67 |  |
|  | 291def6adb |  |
|  | cd3fbf1c49 |  |
|  | c852b8e021 |  |
|  | d8932c55e7 |  |
|  | 63f0269f7f |  |
|  | 4759ecae19 |  |
|  | 65b7ecac7b |  |
|  | f9d2d89135 |  |
|  | 669dc31cf3 |  |
|  | d4d338c224 |  |
|  | bfdeffc375 |  |
|  | e806184023 |  |
|  | 50566113ac |  |
|  | ad22ace439 |  |
|  | f4321a421c |  |
|  | 475333d533 |  |
|  | 39fd89308c |  |
|  | 548a9f56a6 |  |
|  | 3f0cb36bdb |  |
|  | bea1f1fac6 |  |
|  | 5d75d837ef |  |
|  | 711648c9bb |  |
|  | dcfb7a105c |  |
|  | 2ef3c803a1 |  |
|  | 453e4d090b |  |
|  | ca2f9843c8 |  |
|  | 294b6f5a22 |  |
|  | 7bb356c680 |  |
|  | 021817e59a |  |
|  | a420a453b4 |  |
|  | 42cf4db601 |  |
|  | 93a8daf285 |  |
|  | a041b4df7c |  |
|  | 2539f2dbf9 |  |
```diff
@@ -3,7 +3,9 @@ ollama
 app
 macapp
 dist
+build
 .env
 .cache
 test_data
-llama/build
+.git
+
```
**.gitattributes** (vendored) — 4 lines changed

```diff
@@ -15,6 +15,10 @@ ml/backend/**/*.cu linguist-vendored
 ml/backend/**/*.cuh linguist-vendored
 ml/backend/**/*.m linguist-vendored
 ml/backend/**/*.metal linguist-vendored
+ml/backend/**/CMakeLists.txt linguist-vendored
+
+llama/build-info.cpp linguist-generated
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
 
 * text=auto
 *.go text eol=lf
```
**.github/ISSUE_TEMPLATE/10_bug_report.yml** (vendored) — 8 lines changed

```diff
@@ -9,6 +9,14 @@ body:
       description: What happened? What did you expect to happen?
     validations:
       required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
+      render: shell
+    validations:
+      required: false
   - type: dropdown
     id: os
     attributes:
```
**.github/workflows/release.yaml** (vendored) — 1021 lines changed

File diff suppressed because it is too large.
**.github/workflows/test.yaml** (vendored) — 98 lines changed

```diff
@@ -40,28 +40,106 @@ jobs:
 
   linux:
     needs: [changes]
-    if: ${{ needs.changes.outputs.changed == 'True' }}
+    if: needs.changes.outputs.changed == 'True'
     strategy:
       matrix:
         include:
-          - container: nvidia/cuda:11.8.0-devel-ubuntu22.04
-            preset: CUDA
-          - container: rocm/dev-ubuntu-22.04:6.1.2
-            preset: ROCm
+          - preset: CPU
+          - preset: CUDA
+            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
+          - preset: ROCm
+            container: rocm/dev-ubuntu-22.04:6.1.2
             extra-packages: rocm-libs
-    runs-on: ubuntu-latest
+            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm'
+    runs-on: linux
     container: ${{ matrix.container }}
     steps:
       - uses: actions/checkout@v4
       - run: |
-          apt-get update
-          apt-get install -y cmake pkg-config ${{ matrix.extra-packages }}
+          [ -n "${{ matrix.container }}" ] || sudo=sudo
+          $sudo apt-get update
+          $sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }}
         env:
           DEBIAN_FRONTEND: noninteractive
+      - uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/ccache
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
       - run: |
-          cmake --preset ${{ matrix.preset }}
+          cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
           cmake --build --preset ${{ matrix.preset }} --parallel
 
+  windows:
+    needs: [changes]
+    if: needs.changes.outputs.changed == 'True'
+    strategy:
+      matrix:
+        include:
+          - preset: CPU
+          - preset: CUDA
+            install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
+          - preset: ROCm
+            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+            flags: '-DAMDGPU_TARGETS=gfx1010'
+    runs-on: windows
+    steps:
+      - run: |
+          choco install -y --no-progress ccache ninja
+          ccache -o cache_dir=${{ github.workspace }}\.ccache
+      - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm'
+        id: cache-install
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
+            C:\Program Files\AMD\ROCm
+          key: ${{ matrix.install }}
+      - if: matrix.preset == 'CUDA'
+        name: Install CUDA ${{ matrix.cuda-version }}
+        run: |
+          $ErrorActionPreference = "Stop"
+          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
+            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.8", "nvcc_11.8", "cublas_11.8", "cublas_dev_11.8")) -NoNewWindow -Wait
+          }
+
+          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - if: matrix.preset == 'ROCm'
+        name: Install ROCm ${{ matrix.rocm-version }}
+        run: |
+          $ErrorActionPreference = "Stop"
+          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
+            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
+            Start-Process -FilePath .\install.exe -ArgumentList '-install' -NoNewWindow -Wait
+          }
+
+          $hipPath = (Resolve-Path "C:\Program Files\AMD\ROCm\*").path
+          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
+            C:\Program Files\AMD\ROCm
+          key: ${{ matrix.install }}
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v4
+        with:
+          path: ${{ github.workspace }}\.ccache
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
+      - run: |
+          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
+          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
+          cmake --build --parallel --preset "${{ matrix.preset }}"
+        env:
+          CMAKE_GENERATOR: Ninja
+
   test:
     strategy:
       matrix:
@@ -85,5 +163,5 @@ jobs:
       - uses: actions/checkout@v4
       - name: Verify patches apply cleanly and do not change files
         run: |
-          make -f Makefile2 clean checkout sync
+          make -f Makefile.sync clean sync
           git diff --compact-summary --exit-code
```
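A note on the new install step above: the linux job now runs both inside GPU containers (where the user is root) and, for the CPU preset, directly on the runner, so the workflow only prefixes `sudo` when no container is set. A standalone sketch of that pattern — here `$CONTAINER` is a hypothetical stand-in for the workflow's `${{ matrix.container }}` expression:

```sh
# Hypothetical stand-in for ${{ matrix.container }}: empty when no container is used.
CONTAINER=""

# Inside a container we are already root, so $sudo stays empty;
# on the bare runner it expands to "sudo".
[ -n "$CONTAINER" ] || sudo=sudo
$sudo apt-get update
$sudo apt-get install -y cmake ccache
```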
**.gitignore** (vendored) — 5 lines changed

```diff
@@ -4,12 +4,13 @@
 .venv
 .swp
 dist
+build
 ollama
 .cache
 *.exe
 .idea
 test_data
 *.crt
-llama/build
 __debug_bin*
+llama/build
 llama/vendor
```
```diff
@@ -19,11 +19,30 @@ set(GGML_CCACHE ON)
 set(GGML_BACKEND_DL ON)
 set(GGML_BACKEND_SHARED ON)
 set(GGML_SCHED_MAX_COPIES 4)
-set(GGML_CPU_ALL_VARIANTS ON)
-set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
-set(GGML_LLAMAFILE ON)
 
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(GGML_LLAMAFILE ON)
+set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
+set(GGML_CUDA_GRAPHS ON)
+
+if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+    OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
+    set(GGML_CPU_ALL_VARIANTS ON)
+endif()
+
+if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
+    set(CMAKE_BUILD_RPATH "@loader_path")
+    set(CMAKE_INSTALL_RPATH "@loader_path")
+endif()
+
+set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
+set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
@@ -34,12 +53,66 @@ set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
 
+get_target_property(CPU_VARIANTS ggml-cpu MANUALLY_ADDED_DEPENDENCIES)
+if(NOT CPU_VARIANTS)
+    set(CPU_VARIANTS "ggml-cpu")
+endif()
+
+install(TARGETS ggml-base ${CPU_VARIANTS}
+    RUNTIME_DEPENDENCIES
+        PRE_EXCLUDE_REGEXES ".*"
+    RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
+    LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
+    FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
+)
+
 check_language(CUDA)
 if(CMAKE_CUDA_COMPILER)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES "native")
+    endif()
+
+    find_package(CUDAToolkit)
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
+    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
+    install(TARGETS ggml-cuda
+        RUNTIME_DEPENDENCIES
+            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
+            PRE_INCLUDE_REGEXES cublas cublasLt cudart
+            PRE_EXCLUDE_REGEXES ".*"
+        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+    )
 endif()
 
 check_language(HIP)
 if(CMAKE_HIP_COMPILER)
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
+    set(HIP_PLATFORM "amd")
+
+    find_package(hip REQUIRED)
+    if(NOT AMDGPU_TARGETS)
+        list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$")
+    endif()
+
+    if(AMDGPU_TARGETS)
+        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
+
+        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
+        install(TARGETS ggml-hip
+            RUNTIME_DEPENDENCIES
+                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
+                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
+                PRE_EXCLUDE_REGEXES ".*"
+                POST_EXCLUDE_REGEXES "system32"
+            RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+        )
+
+        foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
+            if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
+                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
+                break()
+            endif()
+        endforeach()
+    endif()
 endif()
```
```diff
@@ -4,10 +4,15 @@
     {
       "name": "Default",
       "binaryDir": "${sourceDir}/build",
+      "installDir": "${sourceDir}/dist",
       "cacheVariables": {
         "CMAKE_BUILD_TYPE": "Release"
       }
     },
+    {
+      "name": "CPU",
+      "inherits": [ "Default" ]
+    },
     {
       "name": "CUDA",
       "inherits": [ "Default" ]
@@ -42,20 +47,29 @@
     },
     {
       "name": "ROCm",
-      "inherits": [ "Default" ]
+      "inherits": [ "Default" ],
+      "cacheVariables": {
+        "CMAKE_HIP_PLATFORM": "amd"
+      }
     },
     {
       "name": "ROCm 6",
       "inherits": [ "ROCm" ],
       "cacheVariables": {
-        "CMAKE_HIP_ARCHITECTURES": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
       }
     }
   ],
   "buildPresets": [
     {
       "name": "Default",
-      "configurePreset": "Default"
+      "configurePreset": "Default",
+      "configuration": "Release"
+    },
+    {
+      "name": "CPU",
+      "configurePreset": "Default",
+      "targets": [ "ggml-cpu" ]
     },
     {
       "name": "CUDA",
```
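The Dockerfile and CI changes elsewhere in this comparison drive builds through these presets: configure with a named preset, build it, then install a single component. A minimal sketch using invocations that appear verbatim in the new Dockerfile below:

```sh
# Configure and build the "ROCm 6" preset defined in CMakePresets.json.
cmake --preset 'ROCm 6'
cmake --build --parallel --preset 'ROCm 6'

# Install only the HIP component; CPU and CUDA builds use their own component names.
cmake --install build --component HIP --strip --parallel 8
```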
**Dockerfile** — 281 lines changed

```diff
@@ -1,201 +1,128 @@
-ARG GOLANG_VERSION=1.22.8
-ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_VERSION_12=12.4.0
-ARG ROCM_VERSION=6.1.2
-ARG JETPACK_6=r36.2.0
-ARG JETPACK_5=r35.4.1
+# vim: filetype=dockerfile
 
-### To create a local image for building linux binaries on mac or windows with efficient incremental builds
-#
-# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
-# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
-#
-### Then incremental builds will be much faster in this container
-#
-# make -j 10 dist
-#
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-# TODO intel oneapi goes here...
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
+ARG FLAVOR=${TARGETARCH}
 
-### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
-# Note: this does not contain jetson variants
-#
-# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 .
-# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
-#
-FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
-    dnf config-manager --set-enabled appstream && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH arm64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
+ARG ROCMVERSION=6.1.2
+ARG JETPACK5VERSION=r35.4.1
+ARG JETPACK6VERSION=r36.2.0
+ARG CMAKEVERSION=3.31.2
 
-FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG OLLAMA_FAST_BUILD
-ARG VERSION
-ARG CUSTOM_CPU_FLAGS
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCMVERSION}-complete AS base-amd64
+RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
+    && yum install -y yum-utils devtoolset-10-gcc devtoolset-10-gcc-c++ \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo \
+    && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/devtoolset-11/root/usr/bin:$PATH
+
+FROM --platform=linux/arm64 rockylinux:8 AS base-arm64
+# install epel-release for ccache
+RUN yum install -y yum-utils epel-release \
+    && yum install -y clang ccache \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
+ENV CC=clang CXX=clang++
+
+FROM base-${TARGETARCH} AS base
+ARG CMAKEVERSION
+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+ENV LDFLAGS=-s
+
+FROM base AS cpu
+# amd64 uses gcc which requires devtoolset-11 for AVX extensions while arm64 uses clang
+RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi
+ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
-    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(nproc) dist ; \
-    else \
-        make -j 5 dist ; \
-    fi
-RUN cd dist/linux-$GOARCH && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
+    cmake --preset 'CPU' \
+        && cmake --build --parallel --preset 'CPU' \
+        && cmake --install build --component CPU --strip --parallel 8
 
-# Jetsons need to be built in discrete stages
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
-ARG GOLANG_VERSION
-RUN apt-get update && apt-get install -y git curl ccache && \
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
-    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-WORKDIR /go/src/github.com/ollama/ollama/
-COPY . .
-ARG CGO_CFLAGS
-ENV GOARCH arm64
-ARG VERSION
+FROM base AS cuda-11
+ARG CUDA11VERSION=11.3
+RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist_cuda_v11 \
-        CUDA_ARCHITECTURES="72;87" \
-        GPU_RUNNER_VARIANT=_jetpack5 \
-        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
-        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+    cmake --preset 'CUDA 11' \
+        && cmake --build --parallel --preset 'CUDA 11' \
+        && cmake --install build --component CUDA --strip --parallel 8
 
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
-ARG GOLANG_VERSION
-RUN apt-get update && apt-get install -y git curl ccache && \
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
-    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-WORKDIR /go/src/github.com/ollama/ollama/
-COPY . .
-ARG CGO_CFLAGS
-ENV GOARCH arm64
-ARG VERSION
+FROM base AS cuda-12
+ARG CUDA12VERSION=12.4
+RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
+ENV PATH=/usr/local/cuda-12/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist_cuda_v12 \
-        CUDA_ARCHITECTURES="87" \
-        GPU_RUNNER_VARIANT=_jetpack6 \
-        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
-        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
+    cmake --preset 'CUDA 12' \
+        && cmake --build --parallel --preset 'CUDA 12' \
+        && cmake --install build --component CUDA --strip --parallel 8
 
-FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_FAST_BUILD
-ARG VERSION
+FROM base AS rocm-6
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-RUN cd dist/linux-$GOARCH && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN cd dist/linux-$GOARCH-jetpack5 && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
-RUN cd dist/linux-$GOARCH-jetpack6 && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
+    cmake --preset 'ROCm 6' \
+        && cmake --build --parallel --preset 'ROCm 6' \
+        && cmake --install build --component HIP --strip --parallel 8
 
-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH AS dist
-
-# For amd64 container images, filter out cuda/rocm to minimize size
-FROM build-amd64 AS runners-cuda-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
-    ./dist/linux-amd64/lib/ollama/runners/rocm*
-
-FROM build-amd64 AS runners-rocm-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
-    ./dist/linux-amd64/lib/ollama/libcu*.so* \
-    ./dist/linux-amd64/lib/ollama/runners/cuda*
-
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
-
-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-# across releases
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
-
-FROM runtime-$TARGETARCH
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
+ARG CMAKEVERSION
+RUN apt-get update && apt-get install -y curl ccache \
+    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'JetPack 5' \
+        && cmake --build --parallel --preset 'JetPack 5' \
+        && cmake --install build --component CUDA --strip --parallel 8
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
+ARG CMAKEVERSION
+RUN apt-get update && apt-get install -y curl ccache \
+    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'JetPack 6' \
+        && cmake --build --parallel --preset 'JetPack 6' \
+        && cmake --install build --component CUDA --strip --parallel 8
+
+FROM base AS build
+ARG GOVERSION=1.23.4
+RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
+ENV PATH=/usr/local/go/bin:$PATH
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS="'-ldflags=-w -s'"
+ENV CGO_ENABLED=1
+RUN --mount=type=cache,target=/root/.cache/go-build \
+    go build -trimpath -buildmode=pie -o /bin/ollama .
+
+FROM --platform=linux/amd64 scratch AS amd64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+
+FROM --platform=linux/arm64 scratch AS arm64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 lib/ollama/cuda_jetpack6
+
+FROM --platform=linux/arm64 scratch AS rocm
+COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
+
+FROM ${FLAVOR} AS archive
+COPY --from=cpu dist/lib/ollama /lib/ollama
+COPY --from=build /bin/ollama /bin/ollama
+
+FROM ubuntu:20.04
+RUN apt-get update \
+    && apt-get install -y ca-certificates \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=archive /bin /usr/bin
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+COPY --from=archive /lib/ollama /usr/lib/ollama
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_VISIBLE_DEVICES=all
+ENV OLLAMA_HOST=0.0.0.0:11434
+EXPOSE 11434
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
```
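The rewritten Dockerfile routes everything through `FROM ${FLAVOR} AS archive`, where `FLAVOR` defaults to `TARGETARCH`; overriding it selects the `rocm` payload stage instead. A hedged sketch of how such a build might be invoked — the image tags here are illustrative, not from this diff:

```sh
# Default build: FLAVOR falls back to TARGETARCH, yielding the CPU/CUDA image.
docker build --platform linux/amd64 -t ollama:custom .

# ROCm build: override FLAVOR so the archive stage pulls from the rocm stage.
docker build --platform linux/amd64 --build-arg FLAVOR=rocm -t ollama:custom-rocm .
```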
**Dockerfile2** — 66 lines changed (deleted)

```diff
@@ -1,66 +0,0 @@
-ARG CUDA_11_VERSION=11.3
-ARG CUDA_12_VERSION=12.4
-ARG ROCM_VERSION=6.1.2
-ARG JETPACK_5_VERSION=r35.4.1
-ARG JETPACK_6_VERSION=r36.2.0
-ARG CMAKE_VERSION=3.31.2
-
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base
-ARG CMAKE_VERSION
-RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1
-RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
-    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
-
-# FROM --platform=linux/arm64 rockylinux:8 AS base
-# ARG CMAKE_VERSION
-# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
-# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
-
-FROM base AS amd64
-ARG CUDA_11_VERSION
-ARG CUDA_12_VERSION
-RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \
-    && yum install -y cuda-toolkit-${CUDA_12_VERSION//./-}
-COPY CMakeLists.txt CMakeLists.txt
-COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
-
-FROM --platform=linux/amd64 amd64 AS cuda_11
-ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH
-RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-RUN cmake --build build --target ggml-cuda -j
-
-FROM --platform=linux/amd64 amd64 AS cuda_12
-ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH
-RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-RUN cmake --build build --target ggml-cuda -j
-
-FROM --platform=linux/amd64 amd64 AS rocm
-RUN cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-RUN cmake --build build --target ggml-hip -j
-
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5
-ARG CMAKE_VERSION
-RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
-COPY CMakeLists.txt .
-COPY ml/backend/ggml/ggml .
-RUN cmake -S . -B build \
-    -DCMAKE_CUDA_ARCHITECTURES="72;87"
-RUN cmake --build build --target ggml-cuda
-
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6
-ARG CMAKE_VERSION
-RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
-COPY CMakeLists.txt .
-COPY ml/backend/ggml/ggml .
-RUN cmake -S . -B build \
-    -DCMAKE_CUDA_ARCHITECTURES="87"
-RUN cmake --build build --target ggml-cuda
-
-FROM --platform=linux/amd64 golang:1.23
-COPY --from=cuda_11 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-11.so
-COPY --from=cuda_12 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-12.so
-COPY --from=rocm build/ml/backend/ggml/ggml/src/ggml-hip/libggml-hip.so libggml-hip.so
-
-# FROM --platform=linux/arm64 golang:1.23
-# COPY --from=jetpack_5 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-5.so
-# COPY --from=jetpack_6 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-6.so
```
**Makefile.sync** — 60 lines (new file)

```diff
@@ -0,0 +1,60 @@
+UPSTREAM=https://github.com/ggerganov/llama.cpp.git
+WORKDIR=llama/vendor
+FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
+
+.PHONY: help
+help:
+	@echo "Available targets:"
+	@echo "    sync                 Sync with upstream repositories"
+	@echo "    checkout             Checkout upstream repository"
+	@echo "    apply-patches        Apply patches to local repository"
+	@echo "    format-patches       Format patches from local repository"
+	@echo "    clean                Clean local repository"
+	@echo
+	@echo "Example:"
+	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"
+
+.PHONY: sync
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
+
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
+
+.PHONY: llama/llama.cpp
+llama/llama.cpp: llama/vendor/ apply-patches
+	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
+
+.PHONY: ml/backend/ggml/ggml apply-patches
+ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
+	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
+
+PATCHES=$(wildcard llama/patches/*.patch)
+
+.PHONY: apply-patches
+.NOTPARALLEL:
+apply-patches: $(addsuffix ed, $(PATCHES))
+
+%.patched: %.patch
+	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
+
+.PHONY: checkout
+checkout: $(WORKDIR)
+	git -C $(WORKDIR) fetch
+	git -C $(WORKDIR) checkout -f $(FETCH_HEAD)
+
+$(WORKDIR):
+	git clone $(UPSTREAM) $(WORKDIR)
+
+.PHONE: format-patches
+format-patches: llama/patches
+	git -C $(WORKDIR) format-patch \
+		--no-signature \
+		--no-numbered \
+		--zero-commit \
+		-o $(realpath $<) \
+		$(FETCH_HEAD)
+
+.PHONE: clean
+clean: checkout
+	$(RM) $(addsuffix ed, $(PATCHES))
```
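Makefile.sync replaces Makefile2 (deleted below) as the driver for vendoring llama.cpp, renaming `apply_patches`/`format_patches` to `apply-patches`/`format-patches` and adding `llama/build-info.cpp` generation. The CI job above exercises it exactly as the help text suggests:

```sh
# Reset the vendored checkout, then re-sync llama.cpp and ggml with patches applied.
make -f Makefile.sync clean sync

# Verify the sync did not change any tracked files (as the test workflow does).
git diff --compact-summary --exit-code
```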
**Makefile2** — 46 lines changed (deleted)

```diff
@@ -1,46 +0,0 @@
-UPSTREAM=https://github.com/ggerganov/llama.cpp.git
-WORKDIR=llama/vendor
-FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
-
-all: sync
-
-.PHONY: sync
-sync: llama/llama.cpp ml/backend/ggml/ggml
-
-.PHONY: llama/llama.cpp
-llama/llama.cpp: llama/vendor/ apply_patches
-	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
-
-.PHONY: ml/backend/ggml/ggml apply_patches
-ml/backend/ggml/ggml: llama/vendor/ggml/ apply_patches
-	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
-
-PATCHES=$(wildcard llama/patches/*.patch)
-
-.PHONY: apply_patches
-.NOTPARALLEL:
-apply_patches: $(addsuffix ed, $(PATCHES))
-
-%.patched: %.patch
-	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
-
-.PHONY: checkout
-checkout: $(WORKDIR)
-	git -C $(WORKDIR) fetch
-	git -C $(WORKDIR) checkout -f $(FETCH_HEAD)
-
-$(WORKDIR):
-	git clone $(UPSTREAM) $(WORKDIR)
-
-.PHONE: format_patches
-format_patches: llama/patches
-	git -C $(WORKDIR) format-patch \
-		--no-signature \
-		--no-numbered \
-		--zero-commit \
-		-o $(realpath $<) \
-		$(FETCH_HEAD)
-
-.PHONE: clean
-clean: checkout
-	$(RM) $(addsuffix ed, $(PATCHES))
```
**README.md** — 11 lines changed

```diff
@@ -369,6 +369,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
 - [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
 - [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
+- [Ollama Chat WebUI for Docker](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
+- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
+- [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
+- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
+- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 
 ### Cloud
 
@@ -481,6 +486,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
+- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
 
 ### Mobile
 
@@ -531,6 +537,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
+- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 
 ### Supported backends
 
@@ -539,4 +546,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Observability
 
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
 - [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
+- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
+- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
```
**cache/cache.go** (vendored) — 63 lines changed (deleted)

```diff
@@ -1,63 +0,0 @@
-package cache
-
-import (
-	"github.com/ollama/ollama/ml"
-)
-
-type Options struct {
-	Position int
-}
-
-type Cache interface {
-	Sub(i int) Cache
-	Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
-}
-
-type Simple struct {
-	DType    ml.DType
-	Capacity int
-
-	keys, values []ml.Tensor
-}
-
-func (c *Simple) Sub(i int) Cache {
-	if i >= len(c.keys) {
-		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
-		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
-	}
-
-	return &Simple{
-		keys:     c.keys[i : i+1],
-		values:   c.values[i : i+1],
-		Capacity: c.Capacity,
-		DType:    c.DType,
-	}
-}
-
-func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
-	if c.keys[0] == nil || c.values[0] == nil {
-		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
-		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
-	}
-
-	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
-	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
-
-	n := min(c.Capacity, int(key.Dim(2))+opts.Position)
-
-	key = c.keys[0].View(ctx, 0,
-		int(key.Dim(0)), int(key.Stride(1)),
-		int(key.Dim(1)), int(key.Stride(2)),
-		n,
-	)
-
-	value = c.values[0].View(ctx, 0,
-		int(value.Dim(0)), int(value.Stride(1)),
-		int(value.Dim(1)), int(value.Stride(2)),
-		n,
-	)
-
-	// TODO shift context if necessary
-
-	return key, value
-}
```
```diff
@@ -59,7 +59,7 @@ func getModelfileName(cmd *cobra.Command) (string, error) {
 
 	_, err = os.Stat(absName)
 	if err != nil {
-		return filename, err
+		return "", err
 	}
 
 	return absName, nil
```

```diff
@@ -279,7 +279,7 @@ func TestGetModelfileName(t *testing.T) {
 			name:          "no modelfile specified, no modelfile exists",
 			modelfileName: "",
 			fileExists:    false,
-			expectedName:  "Modelfile",
+			expectedName:  "",
 			expectedErr:   os.ErrNotExist,
 		},
 		{
@@ -293,7 +293,7 @@ func TestGetModelfileName(t *testing.T) {
 			name:          "modelfile specified, no modelfile exists",
 			modelfileName: "crazyfile",
 			fileExists:    false,
-			expectedName:  "crazyfile",
+			expectedName:  "",
 			expectedErr:   os.ErrNotExist,
 		},
 		{
```
```diff
@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"strings"
 
-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )
 
 type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }
 
-func (ModelParameters) KV(t *Tokenizer) ggml.KV {
-	kv := ggml.KV{
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
+	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p AdapterParameters) KV() ggml.KV {
+func (p AdapterParameters) KV() llm.KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() ggml.KV {
 		alpha = p.LoraParameters.Alpha
 	}
 
-	kv := ggml.KV{
+	kv := llm.KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }
 
-func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
-	return ggml.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }
 
-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
-	return ggml.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }
 
 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) ggml.KV
+	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []ggml.Tensor
+	Tensors([]Tensor) []llm.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }
 
 type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {
 
 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(ggml.KV) ggml.KV
+	KV(llm.KV) llm.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []ggml.Tensor
+	Tensors([]Tensor) []llm.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 
-	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }
 
-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
@@ -191,6 +191,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		conv = &qwen2Model{}
 	case "BertModel":
 		conv = &bertModel{}
+	case "CohereForCausalLM":
+		conv = &commandrModel{}
 	default:
 		return errors.New("unsupported architecture")
 	}
```
```diff
@@ -8,7 +8,7 @@ import (
 	"slices"
 	"strings"
 
-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )
 
 type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }
 
-func (p *bertModel) KV(t *Tokenizer) ggml.KV {
+func (p *bertModel) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
 			continue
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
```
76 convert/convert_commandr.go Normal file
@@ -0,0 +1,76 @@
+package convert
+
+import (
+	"cmp"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type commandrModel struct {
+	ModelParameters
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	LayerNormEPS          float32 `json:"layer_norm_eps"`
+	RopeTheta             float32 `json:"rope_theta"`
+	UseQKNorm             bool    `json:"use_qk_norm"`
+	MaxLength             uint32  `json:"model_max_length"`
+	LogitScale            float32 `json:"logit_scale"`
+	NCtx                  uint32  `json:"n_ctx"`
+}
+
+var _ ModelConverter = (*commandrModel)(nil)
+
+func (p *commandrModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "command-r"
+	kv["general.name"] = "command-r"
+	kv["command-r.context_length"] = cmp.Or(p.MaxLength, p.MaxPositionEmbeddings, p.NCtx)
+	kv["command-r.embedding_length"] = p.HiddenSize
+	kv["command-r.block_count"] = p.HiddenLayers
+	kv["command-r.feed_forward_length"] = p.IntermediateSize
+	kv["command-r.attention.head_count"] = p.NumAttentionHeads
+	kv["command-r.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["command-r.attention.layer_norm_epsilon"] = p.LayerNormEPS
+	kv["command-r.rope.freq_base"] = p.RopeTheta
+	kv["command-r.max_position_embeddings"] = cmp.Or(p.MaxLength, p.MaxPositionEmbeddings)
+	kv["command-r.logit_scale"] = p.LogitScale
+	kv["command-r.rope.scaling.type"] = "none"
+
+	return kv
+}
+
+func (p *commandrModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		out = append(out, llm.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *commandrModel) Replacements() []string {
+	return []string{
+		"self_attn.q_norm", "attn_q_norm",
+		"self_attn.k_norm", "attn_k_norm",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.o_proj", "attn_output",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.v_proj", "attn_v",
+		"model.norm", "output_norm",
+		"model.embed_tokens", "token_embd",
+	}
+}
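The context_length line above relies on cmp.Or from the standard library, which returns the first non-zero value among its arguments, so model_max_length wins when present, then max_position_embeddings, then n_ctx. A standalone sketch of that fallback (the concrete numbers are illustrative):

package main

import (
	"cmp"
	"fmt"
)

func main() {
	var maxLength, nCtx uint32 = 0, 4096
	maxPos := uint32(131072)
	// cmp.Or returns its first non-zero argument, so an unset (zero)
	// model_max_length falls through to max_position_embeddings.
	fmt.Println(cmp.Or(maxLength, maxPos, nCtx)) // 131072
}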
@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {

 var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
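The SetRepacker(p.addOne) call above exists because gemma checkpoints store their norm weights as an offset from one; the converter adds the one back while writing. The real addOne goes through the pdevine/tensor package; this is only a plain-slice sketch of the same adjustment:

// addOneSketch mirrors what a repacker like p.addOne has to do for gemma
// norm weights: shift each stored value by +1 before it is written out.
func addOneSketch(data []float32) []float32 {
	out := make([]float32, len(data))
	for i, v := range data {
		out[i] = v + 1
	}
	return out
}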
@@ -1,6 +1,8 @@
 package convert

-import "github.com/ollama/ollama/fs/ggml"
+import (
+	"github.com/ollama/ollama/llm"
+)

 type gemma2Model struct {
 	gemmaModel
@@ -9,7 +11,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
+func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {

 var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
+func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {

 var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor

 	if p.RopeScaling.factors != nil {
-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  "rope_freqs.weight",
 			Kind:  0,
 			Shape: []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
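The rope_freqs.weight tensor above is synthesized by the converter rather than read from the input: its payload is the parsed rope scaling factors, which can serve as a tensor body because they implement io.WriterTo. A rough sketch of that trick; the repository uses its own internal type for this, so ropeFactorsSketch here is hypothetical:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

// ropeFactorsSketch lets a plain []float32 serve as a tensor payload by
// writing itself out in little-endian order, the way GGUF stores F32 data.
type ropeFactorsSketch []float32

func (r ropeFactorsSketch) WriteTo(w io.Writer) (int64, error) {
	if err := binary.Write(w, binary.LittleEndian, []float32(r)); err != nil {
		return 0, err
	}
	return int64(len(r)) * 4, nil
}

var _ io.WriterTo = ropeFactorsSketch(nil)

func main() {
	var buf bytes.Buffer
	n, _ := ropeFactorsSketch{1, 2, 3}.WriteTo(&buf)
	fmt.Println(n, buf.Len()) // 12 12
}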
@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {

 var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
+func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: shape,
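Both LoRA adapters (this one and the gemma2 adapter earlier) repack lora_a/lora_b matrices that arrive transposed from the safetensors export; the shape[0] > shape[1] comparison is the detection heuristic. A toy version of the check, with hypothetical dimensions:

package main

import (
	"fmt"
	"strings"
)

// needsTranspose mirrors the adapters' heuristic: a lora_a matrix whose
// first dimension is the larger one was stored transposed and must be
// repacked. The 4096x16 shape below is made up for illustration.
func needsTranspose(name string, shape []uint64) bool {
	return strings.HasSuffix(name, "weight.lora_a") && shape[0] > shape[1]
}

func main() {
	fmt.Println(needsTranspose("blk.0.attn_q.weight.lora_a", []uint64{4096, 16})) // true
	fmt.Println(needsTranspose("blk.0.attn_q.weight.lora_a", []uint64{16, 4096})) // false
}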
@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 		return true
 	})

-	var out []ggml.Tensor
+	var out []llm.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  n,
 			Kind:  e[0].Kind(),
 			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
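The Shape expression above fuses the per-expert tensors into one stacked tensor by prepending the expert count to the shape of a single expert. An illustration with made-up dimensions, not values taken from this diff:

package main

import "fmt"

func main() {
	// Stacking 8 experts, each of shape [14336, 4096], yields one fused
	// tensor of shape [8, 14336, 4096], matching the
	// append([]uint64{uint64(len(e))}, e[0].Shape()...) expression above.
	expertShape := []uint64{14336, 4096}
	fused := append([]uint64{8}, expertShape...)
	fmt.Println(fused) // [8 14336 4096]
}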
@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {

 var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 	var addRopeFactors sync.Once

-	out := make([]ggml.Tensor, 0, len(ts)+2)
+	out := make([]llm.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, ggml.Tensor{
+				out = append(out, llm.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, ggml.Tensor{
+				}, llm.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 			})
 		}

-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
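The sync.Once above guarantees the two rope-factor tensors are appended exactly once, the first time any blk.0.* tensor is seen, no matter how many tensors share that prefix. A minimal demonstration of the pattern with placeholder tensor names:

package main

import (
	"fmt"
	"strings"
	"sync"
)

func main() {
	var once sync.Once
	names := []string{"blk.0.attn_q.weight", "blk.0.attn_k.weight", "blk.1.attn_q.weight"}
	for _, n := range names {
		if strings.HasPrefix(n, "blk.0.") {
			// The callback fires only on the first matching tensor.
			once.Do(func() { fmt.Println("appending rope factor tensors") })
		}
	}
}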
@@ -1,7 +1,6 @@
 package convert

-import "github.com/ollama/ollama/fs/ggml"
-
+import "github.com/ollama/ollama/llm"

 type qwen2Model struct {
 	ModelParameters
@@ -22,7 +21,7 @@ type qwen2Model struct {

 var _ ModelConverter = (*qwen2Model)(nil)

-func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
+func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen2"
 	kv["qwen2.block_count"] = q.HiddenLayers
@@ -46,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
-		out = append(out, ggml.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -20,7 +20,7 @@ import (

 	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/llm"
 )

 type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape  []int  `json:"shape"`
 }

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	t.Helper()

 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })

-	m, _, err := ggml.Decode(r, math.MaxInt)
+	m, _, err := llm.DecodeGGML(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -75,7 +75,7 @@ func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tens
 		}
 	}

-	for _, tensor := range tensors.Items() {
+	for _, tensor := range tensors.Items {
 		sha256sum := sha256.New()
 		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
 		if _, err := io.Copy(sha256sum, sr); err != nil {
@@ -109,6 +109,7 @@ func TestConvertModel(t *testing.T) {
 		"all-MiniLM-L6-v2",
 		"gemma-2-9b-it",
 		"Qwen2.5-0.5B-Instruct",
+		"c4ai-command-r-v01",
 	}

 	for i := range cases {
@@ -331,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 	}
 	defer r.Close()

-	m, _, err := ggml.Decode(r, math.MaxInt)
+	m, _, err := llm.DecodeGGML(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
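The fixture that follows was produced by exactly this path: every tensor is identified by the sha256 of its byte range inside the converted file, read through an io.NewSectionReader so nothing else is hashed. A cut-down sketch of that step, using a stand-in file and hypothetical offsets rather than real GGUF metadata:

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"os"
)

func main() {
	// Stand-in for the converted output; real offsets come from the decoded
	// tensor metadata (tensors.Offset + tensor.Offset, tensor.Size()).
	f, err := os.CreateTemp("", "tensor-hash")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	f.WriteString("header----TENSORDATA-----tail")

	var offset, size int64 = 10, 10 // hypothetical placement of one tensor
	h := sha256.New()
	if _, err := io.Copy(h, io.NewSectionReader(f, offset, size)); err != nil {
		panic(err)
	}
	fmt.Println(hex.EncodeToString(h.Sum(nil))) // hash of "TENSORDATA" only
}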
344 convert/testdata/c4ai-command-r-v01.json vendored Normal file
@@ -0,0 +1,344 @@
+{
+  "general.architecture": "command-r",
+  "general.name": "command-r",
+  "command-r.attention.head_count": "64",
+  "command-r.attention.head_count_kv": "64",
+  "command-r.attention.layer_norm_epsilon": "1e-05",
+  "command-r.block_count": "40",
+  "command-r.context_length": "131072",
+  "command-r.embedding_length": "8192",
+  "command-r.feed_forward_length": "22528",
+  "command-r.logit_scale": "0.0625",
+  "command-r.rope.freq_base": "8e+06",
+  "command-r.rope.scaling.type": "none",
+  "tokenizer.ggml.add_bos_token": "true",
+  "tokenizer.ggml.add_eos_token": "false",
+  "tokenizer.ggml.bos_token_id": "5",
+  "tokenizer.ggml.eos_token_id": "255001",
+  "tokenizer.ggml.merges": "902a060cac8884a5793d2a857dd2e53a259de46c8d08c4deb243c239671e1350",
+  "tokenizer.ggml.model": "gpt2",
+  "tokenizer.ggml.padding_token_id": "0",
+  "tokenizer.ggml.token_type": "b7a352ccd1c99d4413bcf452c2db707b0526d0e1216616b865560fab80296462",
+  "tokenizer.ggml.tokens": "815ac90ff23565081522d7258f46648c8a0619eb847a9c7c31b238a9b984e4ae",
+  "blk.0.attn_k.weight": "6fcfdb466f9ceb1229404ce4ec4e480751b8d00da12707a11783dad7256cb864",
+  "blk.0.attn_norm.weight": "6063317f731371864049c7704a70772f1eb632194201ebdc2ed0f8e483507c72",
+  "blk.0.attn_output.weight": "920f49716a1e2fc73b6794ec777947f1c122701e63ed302422ac89e90f06e9da",
+  "blk.0.attn_q.weight": "ddbcd7cde197e632564ac58e4f25d9e3a8ca52917329eeb6081eb41a797932ab",
+  "blk.0.attn_v.weight": "318fc02a189d87420f0cbf57f47f11e00c21ec1ed472ce0a2a895b44f7fa0fca",
+  "blk.0.ffn_down.weight": "aa71975b6eb1f4c77b03d2ac4a194cf8d95718efac741bb12f0f3ff79a27f9bc",
+  "blk.0.ffn_gate.weight": "42967702fa0bc738b88dc50007ace26dbe74a5a9e0978124dd093f818241a9e1",
+  "blk.0.ffn_up.weight": "5282c8788b086bd30f46525e7995a17464882a72703fd27165491afdd8bfd4af",
+  "blk.1.attn_k.weight": "cd248882e64fd2c3402c44790ebe12440133dc671b6893fdad0564c461973adc",
+  "blk.1.attn_norm.weight": "ba84e1c8fd30af6ec94208db4078befac8c921aad3acb887812887f3282ea2be",
+  "blk.1.attn_output.weight": "2efa3ef7c5666ccceb05e339b83ad680cc0d2c3ec78203f5da5959f23a80e14f",
+  "blk.1.attn_q.weight": "5106f2e255358a1303c22e8b5f0ec044852bb30a866c52cabefd30017a7a6b7d",
+  "blk.1.attn_v.weight": "a211a634a1a5df1d5f973645438be0461dd922210f9747c6b04e386c7f1ebe95",
+  "blk.1.ffn_down.weight": "37093afe48d32c578ec956c9ed85242cd000d6aa979e60526aafa10c822dbb10",
+  "blk.1.ffn_gate.weight": "469860819e9159caefb1aad0bc66db790f3393f05fd87b08e52256a7ed256543",
+  "blk.1.ffn_up.weight": "736742c97d35d1a011f9cafd3c0ce947ad559bb2fba6da73c816f6bfd0fa9aeb",
+  "blk.2.attn_k.weight": "92c219d92804d832ab404bd6dc7339c90877bb7cf405dd030c121f8b27757739",
+  "blk.2.attn_norm.weight": "61e4466069474b76b6d1e702566420eb669faf3556b00ff7b824784aca13a2d6",
+  "blk.2.attn_output.weight": "d2fb38a2b2171fd91caf037faa585a62225819aa232d86fd4f7f9d2c3c8a45e9",
+  "blk.2.attn_q.weight": "f6faf5cc6844e3daa4f9f68d90f5458c64879de68a7728860e38374e30c3429d",
+  "blk.2.attn_v.weight": "f340ef8f7341d987a6f37c0e9afe0aef5be67be00c0ce5f57612daf73319cce1",
+  "blk.2.ffn_down.weight": "c7be61a701d779860b621b143fb6365b607bf99ec7c0f153b07908ac8120885a",
+  "blk.2.ffn_gate.weight": "b64f0878187bd3392abfa4c3e8ad2f8b4c133903e54246747ff8f3b4639ad83e",
+  "blk.2.ffn_up.weight": "50b11c712652e90ee7428dbb45cffebb80662ac982bc72bd9eafff361b5eb5a8",
+  "blk.3.attn_k.weight": "2b7bcbe9ee5c9c630c8c8d7483887e78b73581016f4cbb6933db2a147a25f431",
+  "blk.3.attn_norm.weight": "0181dac7f4eee7252980323e8032cf339bef2046ce0a16c0fd72af7c98a8a37b",
+  "blk.3.attn_output.weight": "aef8843b636ce231da9e7c9acbee197883cc15df0e2887709324c6a50f16da7b",
+  "blk.3.attn_q.weight": "55404130fa10e81322d33eb378aa0de31a92990ce7730f1338c0ace0406bb1b1",
+  "blk.3.attn_v.weight": "76f7fb8040d82b957d689ce34fea2302a6640ad5bbaa0052ad2b7ebce270c33d",
+  "blk.3.ffn_down.weight": "648628933eff3b357c3729c33c5b1ae51c28e59b9c19acd1601a2ff7c5d5d9a5",
+  "blk.3.ffn_gate.weight": "6a588885d16e98d5f50ebed05af089154f680085ca9c97691e5b489088630a4a",
+  "blk.3.ffn_up.weight": "e12455a1d702f4986e1a663493e3d5102b367af74d45557522002a35d63ecac2",
+  "blk.4.attn_k.weight": "40d943380a8a85e4eab147934bf6e16f23cc8ab753f6636526382c074d182288",
+  "blk.4.attn_norm.weight": "4ab2c098983d4599fe540eef624c4df954adb7473faebda7471ef0ba4134814c",
+  "blk.4.attn_output.weight": "d14b91e40f58bf4a3c8c2eca0b12bb541de406574af39027d56f6c588a147082",
+  "blk.4.attn_q.weight": "e1224960a3562107488589f883fa32414bae41712fa8dbd47c5f3e3a7801452f",
+  "blk.4.attn_v.weight": "063f297bc4aa6e709fc32c4c32e35af7d07d80e83cb939b76adbba858006c03d",
+  "blk.4.ffn_down.weight": "f88a18020c5e1caaa29596895eb348e76ee5bfad27ed57651a86cd8cd1f9b5aa",
+  "blk.4.ffn_gate.weight": "48e7e1eed3fb52e92e61d3557dd0ec002418327090e034ce4322fd68542266f8",
+  "blk.4.ffn_up.weight": "1ca8a7aa17355b6ce0d9ad5539fdad3899fa47fd359c285fbfb31f19f47bf073",
+  "blk.5.attn_k.weight": "2bdf15f8e73d068d972380f25d207004cf0bf3b5bfa46946803ba6fba07d9175",
+  "blk.5.attn_norm.weight": "60448d7cde6e1b6467aa31bdea012e39cdb08c88081cee7d102dca4f93f766ef",
+  "blk.5.attn_output.weight": "f9f687d7c457537f9fca8a4087a59f1c3bebfaf5537b94e42c831a13224f7799",
+  "blk.5.attn_q.weight": "987db7a2ad68657a92625e1980effbb1f79697c2183f2b9f3b3a0570c51b0ab9",
+  "blk.5.attn_v.weight": "cf696891148f3e4783ad1d20f93462ae091eb8651c656bba9b662253b6263e02",
+  "blk.5.ffn_down.weight": "c0662b0bd0929136005fb9d691fdd9b2c33867d9ce9622339a6a456b720b059a",
+  "blk.5.ffn_gate.weight": "200bbdfab615d7a3a84719b6ced7751e3ce52757ef212d96f87798bc1de5e987",
+  "blk.5.ffn_up.weight": "df5d23e7e035fb1b9d163da7ddfdfe38da6a37e86e96534dc02ad20f011b55b3",
+  "blk.6.attn_k.weight": "c0dae2d272a7c5a2fa004bbb8475dbab362fc1f6d008e73d5a4434a9382ac6ba",
+  "blk.6.attn_norm.weight": "51c57ac8b55e04354d5dca6bb9c0cf4177639d3b038e80209e33036209688f64",
+  "blk.6.attn_output.weight": "229d97892c62f85bcdf431675250e01c976ad69ffa450b01fb543bf88f14a2fb",
+  "blk.6.attn_q.weight": "c20e49621821bd46ed156e6823864a5bda4f317750e71ab8dc54e44eb48cf7c2",
+  "blk.6.attn_v.weight": "53ceb1a2ee43fce3c7b5b33c58a9fc5ee7f44dc1c6f29bc9dbefc37582102dc9",
+  "blk.6.ffn_down.weight": "7923c943b7629d560a032d1efa210d1d75c6692140f1be94464ee7ed24f44ed0",
+  "blk.6.ffn_gate.weight": "57593d350361af753a6a39f53b066282634c0fb44f396f6f2966a574b01d8f8c",
+  "blk.6.ffn_up.weight": "327b6a7a387098b8899d3ded04a4d4e7c658ca61b80d4e7b17594be232721602",
+  "blk.7.attn_k.weight": "9ca48b87a10116fd8868e62b76f211d4bb91f166096be9061439ee2e1c3a5c20",
+  "blk.7.attn_norm.weight": "cd56cfcc4e2ad6b96e23ea7b0d32b4caf236107d99a0b22c56760b62e63c8cfd",
+  "blk.7.attn_output.weight": "7352b509a03cae2491ffc060e577d189341a0f861233f18c96f9d275dc4234bf",
+  "blk.7.attn_q.weight": "2b3791c8c008c33ddbe12bedba8191322ceea2dcce5cf0eb7a93d40ad254e672",
+  "blk.7.attn_v.weight": "3ae721d52466487a3d48150581e57f6d64ea1e83ab929f23b28c3d777422eeb6",
+  "blk.7.ffn_down.weight": "3b6fa8ececdb3c34af3a5363863d6f94289c1c95bf47fce3a3ddcf184c5f0848",
+  "blk.7.ffn_gate.weight": "dbd7df6c5ae5eb4adb859f0d36453813a4e289a359a1ba8f72d67fcbf21c3e22",
+  "blk.7.ffn_up.weight": "de68380a334b4c5cfd4c318b0e9854aec59bd79aa0f0c30af3f56414f83482b0",
+  "blk.8.attn_k.weight": "7303c4e4480abc72a7ee271811311199245fb5c2ea27a2bd3b8cad3a53a03c27",
+  "blk.8.attn_norm.weight": "2e3d1921898d1b943ce1a1b6818546c8b471d6d542da24f51a8b514b8c3dd4ef",
+  "blk.8.attn_output.weight": "30421520887b66bf97a18dbcdc283bc8d0b60590b612fd638a319a6eae923227",
+  "blk.8.attn_q.weight": "73e064d5433c9b500068a1c31744dbd53f4ade298fb450a0e8c97f62cf1f8a8d",
+  "blk.8.attn_v.weight": "27e21f8b9a9a8533e8178ca34a72aa1d786393d57302b7806dcdf3e51de511a8",
+  "blk.8.ffn_down.weight": "bf694bd8e00047982108000e7b3dee7b225db8b19abc595e5697b6bbefd92e7c",
+  "blk.8.ffn_gate.weight": "d55fdbf8606d9141b774b0500c58944fd1253b9e69d1f765eaa9a680b9f2ca40",
+  "blk.8.ffn_up.weight": "1ae3f580655e7c8e8dd6c34fa4ac574fdfc5e3f1a8536da0c5442d3a2976f0e7",
+  "blk.9.attn_k.weight": "b18080626012d8aabcf78542d6c7bf31c712bf55a70172fbfe173fcf34481036",
+  "blk.9.attn_norm.weight": "2e3620620dc09998c6d3063a7d5de5433fbbae8c11e5b00d13f145d39140e162",
+  "blk.9.attn_output.weight": "69c3c0e27ef1c0fc933eeb7b612b70909f18cde238873c0d576a2ba9714ef174",
+  "blk.9.attn_q.weight": "68330e5aa28a28873c9a6e67f032186ef651df2df5844e0f27094ba349fbe4ab",
+  "blk.9.attn_v.weight": "3df8d45a102be082d0793a51cb82aa62a43cd0e9d047ba4115ca0f2414b39325",
+  "blk.9.ffn_down.weight": "1d6cc162b73745b135b4f040a0aac3c06d5135a3dc5b2421e7ee2af48662fd7f",
+  "blk.9.ffn_gate.weight": "034a9d40fb1e32b534b45f4bccd65cbe43c4a6a3f5d01132bd245ca0005de5fc",
+  "blk.9.ffn_up.weight": "c838c38d0e1a0ac0da17eb2a66023ed31929f07d8fcfe1cc546df26096c91f0c",
+  "blk.10.attn_k.weight": "a78507cb72f744b86ceaa032596e74e5571c822d0226d334881169addb32cbd5",
+  "blk.10.attn_norm.weight": "35f48d0b28ee0e6b4cad4e983925737562d64824be5b168b3e26df3d6b260cf1",
+  "blk.10.attn_output.weight": "53712db06796de39b131323e7abf9a58551b6d52da6db66a471580386d396252",
+  "blk.10.attn_q.weight": "efe08429ba196026b81cd1c471e1c7418afd9e966659feb3936b674aa0803b58",
+  "blk.10.attn_v.weight": "7ec6055e134f89da0cbe79ec9f13ef2e442ac584b1f03c3e13e7d0cdad0078bd",
+  "blk.10.ffn_down.weight": "37e66af4bcd1f3079e841e892255b8255070655901864ea3a8c602a7f681a640",
+  "blk.10.ffn_gate.weight": "1825282bc34830d371c6edcc3c1e73e6ecc1e10f4aea0122dbb7acc1d6f7b1bc",
+  "blk.10.ffn_up.weight": "819b3b276a4d4c14a35ed6682d5ef18a5e8ed468e5ce3f12e8c75ec18ac20ec4",
+  "blk.11.attn_k.weight": "5327e6a2af82dfff0619a14971f5864a15553c36fead84e1af42c7630f2729c6",
+  "blk.11.attn_norm.weight": "fec363b3c4a43036d2c635fb8aa9e122dd87ee79811839f2f6cd955be3373e7b",
+  "blk.11.attn_output.weight": "ccf7b38f18ee8798b8a6a35018e2df3eb3e007de62876befb68025dd66c79763",
+  "blk.11.attn_q.weight": "da8c4a1c824ffe174e39f126cd72f7ef83c56aff1259d452a1212de80f98f5e9",
+  "blk.11.attn_v.weight": "d17ae6bb77f03982b55d341eb67acb5969e9ad3da5994b96eafc09793dcfe3a0",
+  "blk.11.ffn_down.weight": "a6bac521e2791345f22c57205fa1c2f2f687794dfd24d0e98d50ae0d0eb6088a",
+  "blk.11.ffn_gate.weight": "5ed902c488cb51ba5635f3df08258c5f84f31a679a00211ea5f9d8b824ef6d9d",
+  "blk.11.ffn_up.weight": "ee9f1437eb890d2cf9df2574afa1cecf20aafdd847cd75b152d7eb74419afd34",
+  "blk.12.attn_k.weight": "5a069c06e1019b0f889088e67458f7a11ec77fa190ada6069e46211f62219947",
+  "blk.12.attn_norm.weight": "194d7e5fcc8c49aea62daf1940532419cf3c505afdce6be377286b677db5db8f",
+  "blk.12.attn_output.weight": "6534995fd4d6fecb55e317add4b1723aba4d825e1e9471d0b08813dfdc247176",
+  "blk.12.attn_q.weight": "4ab51ca519b5995581fa34f846276feca3b907ef2b51f192f6cc0b3263c3f5a2",
+  "blk.12.attn_v.weight": "5652ca3fa81ef9a1ac1543d71fc6813f8517f8ec54b25c701f6f98061614830f",
+  "blk.12.ffn_down.weight": "4b2c263f54c88516b8eb273bb8d9615b01c5c8b484dc70358adb91b50b300edd",
+  "blk.12.ffn_gate.weight": "8f50c3c3e3e8568991d6c1b0e74b500cf4f208e7700bbb8e87c3f6a6d359b6b5",
+  "blk.12.ffn_up.weight": "1c1a581fec1fbe959e1427fa513f400100b5e1ee9d83932630be9905fb49c231",
+  "blk.13.attn_k.weight": "efd7a38c46f08d8376d82974f33c644e3a02220e142d63b1704718699a8a884c",
+  "blk.13.attn_norm.weight": "d28fa4f1bd75abbd063b0e622e08f579c89cd0c0c5ce63c1952ec9f944f8ee13",
+  "blk.13.attn_output.weight": "71e0068a639288718bdb70a6cfdefd50bc8b3ec3993347a65129e70001ca5827",
+  "blk.13.attn_q.weight": "b97077adc92cff07a2e07d80ee38f214ad8713571c69cd5c70ebd43dc501ac87",
+  "blk.13.attn_v.weight": "79b3e2749ab4b459c81e96e322b215f1e8af645eb346e176c326bd00cf6ed2fd",
+  "blk.13.ffn_down.weight": "9f8687d11effa1db7cfecf7bec5631734bcf2962aad74a9f519144491e08ec85",
+  "blk.13.ffn_gate.weight": "7d14dfa0543852e7777fe8fff29ca533744cbcf1ebcf10067e5adfc4eb345e65",
+  "blk.13.ffn_up.weight": "852b9527b97fdab211ff3f832a660ee1d93ccb56906144c50f01319a6e8ee615",
+  "blk.14.attn_k.weight": "79e926b20f36f66d58226cb358881f2f68ae7b468787d33cafae5110287a14a0",
+  "blk.14.attn_norm.weight": "97d481b63deb0df6142c2c6cd23043720c62eb609e390f47a7113751c79974ec",
+  "blk.14.attn_output.weight": "aa6e94d7176d5c79fbb89b96e5f13ce75702ce3dd23ee52986446da436a6c3d6",
+  "blk.14.attn_q.weight": "214becb6d1bb460da9fb8ace0f99b9a5afa9edf7aa7acc19606c7401b11d6305",
+  "blk.14.attn_v.weight": "488b0e6d7f1a7a2ed0972aaa6d10ef9c775ee5373460324efcf5b3e3da9311df",
+  "blk.14.ffn_down.weight": "29c7ad16cf9542e30996a1a01ab95b844533b28051f04cc7949c371afb796471",
+  "blk.14.ffn_gate.weight": "b7ef208f2b054803665b377f5a5980c122c026841809cf855c6ba06d1c3a885a",
+  "blk.14.ffn_up.weight": "76a5cc28100748d79c4398ce7b9176aab4d661548b6293a82f99144812e5b70e",
+  "blk.15.attn_k.weight": "a6b8f9e98ab878fa7ebc5d080978ebf2d050acc2ab2fa8ea9188eb10e27702c8",
+  "blk.15.attn_norm.weight": "a26d07a9752d6dccb68e3a8a2a49fd0752cdd0a415e05547819bc37d9ba63d5e",
+  "blk.15.attn_output.weight": "c63616c69048ccbee801e05be4f56d21fda21aa0cc470f41d57c31b4d9283a4d",
+  "blk.15.attn_q.weight": "fd595a67bf96c6ba16eb148a9d02fa52fa3c1d33ed10be28a08f851409fd6e64",
+  "blk.15.attn_v.weight": "1c5c9d33fa07c05d5f4ed0032c6c4aa83d863f0d31c94a66109d239dcd03cea3",
+  "blk.15.ffn_down.weight": "585ea62ab8aff7d7d212ea5c1a03226fda6b68370c890b776834af70c948dcbc",
+  "blk.15.ffn_gate.weight": "a13c63f86f879b03a573d5dd2a25cfd1f4dc73e8132e6454ecc23e538b4cdf6f",
+  "blk.15.ffn_up.weight": "f7112450f57c12fcd511f049e0dc0b541625a107a7901c3261ed9e984299f65c",
+  "blk.16.attn_k.weight": "2d2c8b11dd71fba6d1c106aa1673c113a5448653cca7eab897c8739212ed5003",
+  "blk.16.attn_norm.weight": "95c2ec7be9469690e18a9a1779684acb3e9da44b13e263a0da840305646fbf8a",
+  "blk.16.attn_output.weight": "31a65046e677f54dae654ded4e733479fcc0f7283d83076b7dc7cbcae8528230",
+  "blk.16.attn_q.weight": "bfc6292b9c6d49b7118d08060242a138182eb182d136ba5dfaf469437c16081d",
+  "blk.16.attn_v.weight": "68f81d037340217d87c7853ff4d6edfbc46d9e827ee6d5bff7c3f6238e3a95ad",
+  "blk.16.ffn_down.weight": "bbd6629691950cef4d5113e1c6670e91b216a9b872cb92cee02dfda4d6c4f7b8",
+  "blk.16.ffn_gate.weight": "63cb56f282b7401ed6c76e5bb6fdf1bf68a64f9af0c82c014209b55bcb5191d0",
+  "blk.16.ffn_up.weight": "b54f39a2541063cbfb6f713aa81c3b69a04100e999aa2ebbeec195dc382eceec",
+  "blk.17.attn_k.weight": "3d9ba49799cc56664ec30a002bcad61eb651294212a68c3ddb573eb042aef5a4",
+  "blk.17.attn_norm.weight": "42ee0db4b9d63257bca0012a30b12737ead1caafeb5ed3d93c8f48ffec4b46de",
+  "blk.17.attn_output.weight": "a38fd100f05c9041c592bc739e287de0b10d08ef2bda41a879225bdca9002f71",
+  "blk.17.attn_q.weight": "8a3bee285b0180a9eb35662e449ee4cbe16d992bdd48fb3a94bc4a347728cfa2",
+  "blk.17.attn_v.weight": "d7f8f1b8b863494ed4392a1656775912e9b264ad36016547b12e832a1d6757d6",
+  "blk.17.ffn_down.weight": "bb7ee58f61da8630972e25b621996fbe8ec06f4dc9ab1e268ab5b120c526ca28",
+  "blk.17.ffn_gate.weight": "6b652dbf167fee09a45ebfd78d500ff6548fb2756dbe5343ffec3f7e6207179f",
+  "blk.17.ffn_up.weight": "3b67f727e55e742715de978fab80457781e7a3762bc48f79d13b45dcb8de664c",
+  "blk.18.attn_k.weight": "ff7fe57c57b90c6fcc0aefc39ec24593c3a7d1ea1c23770480075a015450e0f5",
+  "blk.18.attn_norm.weight": "1d40faca082d2633ef0ccf19e121870dd6c7c3e2154607c7f3543fa96e99cb2d",
+  "blk.18.attn_output.weight": "9adfecaaa397a92db4687efd5fcabfa0daef9e6b0493763b7ff5ebc185c43a6c",
+  "blk.18.attn_q.weight": "ad1803eb9b291948639277afe981e666b07167eb3fcae903ba5b73bf86d8f50b",
+  "blk.18.attn_v.weight": "308cf23399adccf27401a4ab60d74dac6fb9d4cd4b9c5940d9145118d1881b34",
+  "blk.18.ffn_down.weight": "7de4ac9a561fb580619b745687dfd7ca8a69ef70471dee978741b80e9ff7bead",
+  "blk.18.ffn_gate.weight": "0c66970f696b33bd5ee8f1f2fbcb41fd78fa5ccabdc927e11a4d5a4089f19c69",
+  "blk.18.ffn_up.weight": "66a42e988e8a1f468fabf976c48e9e4bb045eaac6916ef16555ac101cd674abc",
+  "blk.19.attn_k.weight": "a928ab50390bacbcebe2e4b66922498134ce22d7b93beaa87d6cf4ab52eb7174",
+  "blk.19.attn_norm.weight": "b4a02c55b46c2a96aec9c64a254087cf48e6c1d4b6f31782c77a46fc4daebad1",
+  "blk.19.attn_output.weight": "b768319c641dff1eac5d1f8ceb960c9899c795bf2b24c1d6bf70aa24fda45f77",
+  "blk.19.attn_q.weight": "79ef3f57d187d3954a26362096e1b6c222d76f537dff73e034d6e9999935b8bc",
+  "blk.19.attn_v.weight": "ce13d6b13e24fcb2d5bc6a2662e5bd295b31b12db10a6d0307f86cf29b8d5001",
+  "blk.19.ffn_down.weight": "cf90d7e2137482cfd50934a8223ad774621d08554969da80a9712df5e6227eb0",
+  "blk.19.ffn_gate.weight": "71ce30150f003b6eeb3bf7464e05b6ae615f135110d8e47f0a47fd973e537c0f",
+  "blk.19.ffn_up.weight": "7f92aca0cc29866633feec701ec01a85a8ee2fd4e2b9630173a6cffb1d9d50ee",
+  "blk.20.attn_k.weight": "a2df23159d6fb74ef28e14b61028fe8b00a693a2fc9234a980be74f20b958682",
+  "blk.20.attn_norm.weight": "c6cd5f1b096fc5efa4eb59ca1c8c4bd28730f3dcedd59a63601663eccc6724ed",
+  "blk.20.attn_output.weight": "896a8a166d0f006d4b09867ae4345426303cbc3fb13a18d3d4e1bde00f16dbdf",
+  "blk.20.attn_q.weight": "01eb79588fe61baea0da43e99f4dc5939590e1bafd01e12dadb8326f102bfea2",
+  "blk.20.attn_v.weight": "bd39630fdd5a7c859ac1addaf53e63faf524c3f32f5f4896d86b6e746b1d5c06",
+  "blk.20.ffn_down.weight": "0304a5d39957a0e3f031c4bcc4549a135d396c8d97c8d276fd1c823ce86560c2",
+  "blk.20.ffn_gate.weight": "117b79d595b1dca0c8b37586beaecc4d84411507276212dc286cde7fc36c9bef",
+  "blk.20.ffn_up.weight": "6e799346db145c125f01783539749d3828fcc451cd4f10c5352f047a47e28714",
+  "blk.21.attn_k.weight": "1c37e4c0664147e775bb006b226b9553e3421140cd96288ea755f81731ab80ba",
+  "blk.21.attn_norm.weight": "00ae783a29000ccda5e4bdbff03df0752fb82805dc3f9b987500ebd80714476e",
+  "blk.21.attn_output.weight": "7588b84f9fb19f15095b5265c60b4a4e7ae74bcc47d4607dfa5d0bfab6f136cb",
+  "blk.21.attn_q.weight": "a65f1c0dd06d45bb97532d3e932689c1eecfe7359089b39174a96a149335cbc1",
+  "blk.21.attn_v.weight": "4220b77e7d5e8709b4eef33a679b5dad11f297085ef44c9977f9e54ef08f7a2d",
+  "blk.21.ffn_down.weight": "b8c082a0530d4b5328e67db0df84c5498f2af956de23c639fa0198ffea853950",
+  "blk.21.ffn_gate.weight": "cd1b656ee72d00e9835ef667c19ef89a88de261eb8eb7c0e936e0f9ddf83ef9f",
+  "blk.21.ffn_up.weight": "dc445f73e36ec7a3bd86884186b728f8e0187f32848c3b8b69d4d41f8571bf31",
+  "blk.22.attn_k.weight": "e37cf0b893ec8b9ee8c78dd139b8d9c45cb997a3bc0c3d93a70ca1c3f6af8859",
+  "blk.22.attn_norm.weight": "248a27838d3c46cc03a5c312facc84e2e0e2c990ef8401e93da25918497f88d1",
+  "blk.22.attn_output.weight": "fc191a18f6d18332c66761f7ab28008bfe295dd1f5c8741a2488442f9e00d0f5",
+  "blk.22.attn_q.weight": "4b193a2ab8bc2b085db18f2bf3eeba26e02b537b2cdd738160c8f14b165d0f5a",
+  "blk.22.attn_v.weight": "7a60ce5ccac7e045e55ba1e1e85bd2a0f93f8c781daee96c5223665e22f0c666",
+  "blk.22.ffn_down.weight": "e0a34fb4244e2c7168f3dbaa1904c15d339ec39999cdf27128bbaf619ee0a237",
+  "blk.22.ffn_gate.weight": "8bac872d4b8549c8812f927efa309f1792b524f33601095fff61b826de5a5615",
+  "blk.22.ffn_up.weight": "b67fa2b94dd901b6ec64c0853ce8ca2d86fe9cb1cc6d2f15fbbbe0e691c0c648",
+  "blk.23.attn_k.weight": "2c32e66ad01942b819ac09a197c71579fe66f02226a264fdd72ad1e02c67a27e",
+  "blk.23.attn_norm.weight": "825fdc94deb439cb93c713eeb077c1052b90ed658d6d464fc4ad3d611e911d48",
+  "blk.23.attn_output.weight": "95ca6707a95b8750b0c7c5d379d368f0f2e7ebef631954e7d4d8ec0f41f13a3a",
+  "blk.23.attn_q.weight": "6eccc84faca5fac015d1b26e2854501edcfd292a302228fe14cf99f5eb59a34b",
+  "blk.23.attn_v.weight": "b343ac3d226040f1033ee049668aa1d89b1774bc18431965682e5dbdce78ccdc",
+  "blk.23.ffn_down.weight": "9fc599befea8d3b1e342d564a110074f66d2542df406c4b90b6bdc5828fbb2b2",
+  "blk.23.ffn_gate.weight": "488556c1b0c9f0b20b0c99b4bac2e0f4046b81edb601d7b91e7e5b3bab47d667",
+  "blk.23.ffn_up.weight": "1088e291d7008dd9c7c2dd6830af686a8a84b724d123a016209bd5156d6898f1",
+  "blk.24.attn_k.weight": "a923fbe35e61e009a53927d7828818e0592bb737d6a1106c4b0b5a1efc367e07",
+  "blk.24.attn_norm.weight": "9b51aaaa939cefafdd9b13a7e5b74ac7fa2d603427e55a16a909d6f3f353750a",
+  "blk.24.attn_output.weight": "1beb2baba56f8409466434b037771248c2f620ec5f53e15f44c271d5a2d9ecf4",
+  "blk.24.attn_q.weight": "4b0194fe5bfae0c6bf6131dcf8cb6e2b994f6ea10b27cb03574f0f4f8cc0c950",
+  "blk.24.attn_v.weight": "6ac34b1ab0f66226d85bca1194a7c212cd93d384ecbc8b8395de48aec0970a61",
+  "blk.24.ffn_down.weight": "5508f74cb732a662c2936b32ac5e90742d172b9f961a747b0e5cba0e5906a89d",
+  "blk.24.ffn_gate.weight": "095e39b8584403835f9bb1ac33e0e81f54175575e4800273d281b845bff381e7",
+  "blk.24.ffn_up.weight": "2d43ec21637dda12973de367b0113ee9840b0d815bf6fce042f7c3f270b0b530",
+  "blk.25.attn_k.weight": "9e2aee029f3d2c7f67dfc7926e72c8228fb978382c8e5a4701bbf82c93801419",
+  "blk.25.attn_norm.weight": "220cd7164fb4cdbe22d26058e4153b26c27c7b5ce2bec8e95bf2c0ea08d23103",
+  "blk.25.attn_output.weight": "a17f4a5dc6aa51f03dbd75602d98e9491767c205cdc2c3a5f8667fc54bbf7c64",
+  "blk.25.attn_q.weight": "f60827496835c440c794bf57ce9780704d10a59d8229886bf75ebb18900ba4ef",
+  "blk.25.attn_v.weight": "9cac217e9e9f4f4c85f14ee51165a77c580165bd4a34b202389169bbe61a1ced",
+  "blk.25.ffn_down.weight": "a0f36949b663e80849581dfb71e7babcc73580793bbcb0c80ab26d5a6e000359",
+  "blk.25.ffn_gate.weight": "df4d1be4d50d6afe5ad3ef0d0e0fac76a33e85c963dea769641d612dd53e7d13",
+  "blk.25.ffn_up.weight": "992da76be762632e25ebc5ef4d03728eece1b43f7c4e31827df19ca724aea694",
+  "blk.26.attn_k.weight": "34199ff856ac32a500c754539d070258574192a34ecba87a182897cb59fdff52",
+  "blk.26.attn_norm.weight": "a8e9dfb2dae5d22b5c0aec5f3675991c0e3c3e6a44153db2579136b73f456e00",
+  "blk.26.attn_output.weight": "1c4f257ffb0d7db0f11cfb275e38b4af736917b43ad82de1badce3f1d227da4d",
+  "blk.26.attn_q.weight": "33d55786274c2e718cf61e8fbecf3dfa5ee0c208f0b716d42b061f55459acb3c",
+  "blk.26.attn_v.weight": "684b636939cd4ffcfec5a6238a0790ffa43d853c95783af9b9e8275e74071a7a",
+  "blk.26.ffn_down.weight": "89d0bf066db154e6d312b5433aed1714f6a28b40f4c52e3e1530ee07703303c8",
+  "blk.26.ffn_gate.weight": "393d649bebe5e2940e1b043649f6c860b4b8b9f380f30e9da1744a830f358156",
+  "blk.26.ffn_up.weight": "179edc85ababd9d8440cc6093eecd1004290aa1cb96434b26ecf7585b6cca17b",
+  "blk.27.attn_k.weight": "334841445a7f1e14731b08f56eb0b1f0938c63823d28bc6d078c4c5f05b36f19",
+  "blk.27.attn_norm.weight": "57344471bbda2e9deffdfdb2dd05a07aa47f8761e24de53525588639145bf551",
+  "blk.27.attn_output.weight": "506126af9ee54b535d49f97e36f630e74834f480329f098d6d62e96246d8d65a",
+  "blk.27.attn_q.weight": "dd984df1acb4783849e25ba7ae378bfd385cd9efc540fb798cd5bdd873f0118f",
+  "blk.27.attn_v.weight": "b4b3fe9a4455d34c297ff20a2f537b647cef424741d840a747b265f23d320ac0",
+  "blk.27.ffn_down.weight": "621fdb185ba0d35ba5476dae73d2c81ec1482a0e878d5bfd5c3b29fe837af013",
+  "blk.27.ffn_gate.weight": "e4fbab45f2ec506fa374103251a0bdb7baa6f576080bdd796f3e9db92098e08f",
+  "blk.27.ffn_up.weight": "a0c57e463e988002bbd6a6c6792baa21a65e6f89ae303a2c301951b0ae6e4bbe",
+  "blk.28.attn_k.weight": "bac36cbd52ec5056841663865e1291ddab4b47ef9a2544dd285d4503bfb0e4a0",
+  "blk.28.attn_norm.weight": "5774a9df2bbb2e86d1f70179c7b92d81e1f401160148b3328fb64db6646a5425",
+  "blk.28.attn_output.weight": "e8712622d1569557000c75f26c3f55fad267fd300463c2c2cfe3afbfa1c8f908",
+  "blk.28.attn_q.weight": "11677751fddee52cc739699c02836f7be54d96038be4240be5d4f53d00161608",
+  "blk.28.attn_v.weight": "e5ee459b8958d65e1445997b9aa1e90e2f5d17761ebcf5357313119a45322507",
+  "blk.28.ffn_down.weight": "3934518f9f85292da8475fe38a8edcbfc4e24ac56c351b472d6351f98750871e",
+  "blk.28.ffn_gate.weight": "6ba735d57e98d0847e487f25ffaa25256deaa8abec76f428cb70bd9774279d83",
+  "blk.28.ffn_up.weight": "977fae6e1e5353114fc645dd98429464749758765cbc6e6457593d596e57850c",
+  "blk.29.attn_k.weight": "8122a457307d580ad6f1e0acea09a2f593d97f595ba0d6737f5fea16d2433642",
+  "blk.29.attn_norm.weight": "d626f721e05aa1202439b01027031d4caf1adace61ed37870a277cb6297c77cc",
+  "blk.29.attn_output.weight": "7fb7122ab1b6b1e6615ca746897da27bc52c92cb70d3147183cdde61795b72b3",
+  "blk.29.attn_q.weight": "be43e94ff6b6e391024dc824101efa0ddf4005d5b002ac26cb03765c0c73c2fa",
+  "blk.29.attn_v.weight": "af93c85ebff908f74f9935b81bde0516ca487c84139868a1ce079c3ae20036b1",
+  "blk.29.ffn_down.weight": "39dae12340ed3120bd19c495fe0872b559613641e41fde69d02d8631900b84c0",
+  "blk.29.ffn_gate.weight": "36fd482439840ef197c9f3b8905d86acfcea49bcf018544106ca465d4bf8d5c7",
+  "blk.29.ffn_up.weight": "5243fbdfdc1e2a1dd84b6210a9869d18a014db9088897e345240cdc99990bd5d",
+  "blk.30.attn_k.weight": "948f263616bd3788b2b968baafd69b9c5bd1b77578665f096c4b7e247b4cea42",
+  "blk.30.attn_norm.weight": "e168df981e744874ff303faf2eb470e5f6868c2040ba5f383f6c5148669975e7",
+  "blk.30.attn_output.weight": "4cf0ccca04b792573b756655a24fc89cfb1f272da8305633f0bc66ef14990b93",
+  "blk.30.attn_q.weight": "21e07d6cba6c50d65350289258209717174a13c42be57e8141d69712cbaf32c1",
+  "blk.30.attn_v.weight": "65a8ca29c7237b3182ccf03e2fc94e84f9a53d0e160fb679ab401c853170dd9c",
+  "blk.30.ffn_down.weight": "8b00500a6d00d84058f6658ee1d6f06fb4fcae2f90d4341792259362923b3c13",
+  "blk.30.ffn_gate.weight": "5bc0e19ab7a31b50ac2118ad1b36e31055271a322cd8ff661d47c3ac0210703c",
+  "blk.30.ffn_up.weight": "f37a0561955725bd59ee2d064fa9f4e00a12a1b620b624db3bc3add5330bc321",
+  "blk.31.attn_k.weight": "9a5663edda227f5d87533897146764f8e8a7481b9e71fae197c39204f8463221",
+  "blk.31.attn_norm.weight": "060a4f438a1ee5e220b5b5278ad2f5c085a428bf38c515766781815597c87529",
+  "blk.31.attn_output.weight": "6ada5d3cad9dea4780ffbb43302bb6ccc2f24eddd0fc4f5f84c9ce0fc0c6e5dd",
+  "blk.31.attn_q.weight": "bb5d08c08603907981ad388d5d8b70fcc9b98034ba264b8474c8890cc0297af0",
+  "blk.31.attn_v.weight": "e01b4252ea9c6a889c32b21144b441a347464d04536ef4f6572425be55759796",
+  "blk.31.ffn_down.weight": "8ba4d679c36e93ba65ba03180385ef35ea86b3b7cdf2fded9df59369f1c09630",
+  "blk.31.ffn_gate.weight": "e5b41dc93645f8b5e8eebae3ada3ea43a18f97ce2654228655170b07b463ccb0",
+  "blk.31.ffn_up.weight": "25b88cdddc8b547af294ed107d3d1312e90b983cae87936fa6062ecd8ea02539",
+  "blk.32.attn_k.weight": "4bcf86dc0858c8ca2fbdf6aa76674d43eb698f78979fdc1a38f556a7af1facc4",
+  "blk.32.attn_norm.weight": "cdcc12f3b8b9773c6722736bfb748a2729230b21478cbcc4104859d3148df815",
+  "blk.32.attn_output.weight": "d43f1196822995ed89a9365c97054753a8b30ce20b6e273c8edcc42673a1e141",
+  "blk.32.attn_q.weight": "ebf2972bb3865cbc5be4840113a322089752038344beab2a0122c7cb4fb399b6",
+  "blk.32.attn_v.weight": "714db81704ff34fa137512903c1013acee7877467473e46600728b9240582eb7",
+  "blk.32.ffn_down.weight": "2cde3da1258bb170a79d5d3cdfe10c86a71eb34b77da46b74c5ed71e7f4fe274",
+  "blk.32.ffn_gate.weight": "c7e1ed792532613ff9d4e5834b6536e2e0f47df2303bc0fdaa90aac0c1f4e8db",
+  "blk.32.ffn_up.weight": "d8d6f13fe66a716e28f79101a29817f0c0d6f99969a6f017d51bafd1a16c600c",
+  "blk.33.attn_k.weight": "a0a28f6cbca88da00cab2ca37094d9b0503bf9defdae77b91895b911c408cbb6",
+  "blk.33.attn_norm.weight": "0251200c24cc8445607ace6dc8c5aa0566567997262b7cca53a11ac23cc564b2",
+  "blk.33.attn_output.weight": "b2423205bdf6a1096d43c44d8d12f1a84fcd4e1bb70fcf6dc8542b8b8a71a13c",
+  "blk.33.attn_q.weight": "00b425c3ef71065ce5e0234e702bf38143b4952da78a85f52ab2c2e3073d97ab",
+  "blk.33.attn_v.weight": "035edd2335df816c42c765a5e66b9d9b9e15a822a8dc1863508145499c942c14",
+  "blk.33.ffn_down.weight": "4894a923a3db75bae4496ba3ce5f28796ad31fe33996a066271fb8654964310e",
+  "blk.33.ffn_gate.weight": "8f6c819b8bbfbe3357fae89e1ac5a3d58be85b3b04be3bacf7b62775869046ff",
+  "blk.33.ffn_up.weight": "257c3544b5b544fd5d839665bf5caf107a329b59dbc3751efcaa24ae63c56179",
+  "blk.34.attn_k.weight": "b6cd8bba892e38dac4a2ebc3ba1bce49e71b967fc436fde30c6d76f54a18935f",
+  "blk.34.attn_norm.weight": "2b3c8e60a064cba9955752bbbbdd92c71ba5c2f1bd721097bdbe88b5abc68787",
+  "blk.34.attn_output.weight": "8cc272551c9aaca9db5a660c6927bab94a0243d74a30b2bc165f06bd577714ea",
+  "blk.34.attn_q.weight": "74b561eb4792484e6a94b58fe2583848c3ae28ff2f1bf3d02939a0cfdfa49990",
+  "blk.34.attn_v.weight": "dba19e24ff05154dc5a1f55c023729303a583d13d68732ce22ea74d4410dc8f0",
+  "blk.34.ffn_down.weight": "76eca5dfeb274c35774e0bf9f22ee420ed9085c8e99aa2cd5a236e4918b44c61",
+  "blk.34.ffn_gate.weight": "9af0862d5fcbc24732846488e653db8242a467765c0cdbc00332b3a40256b4a6",
+  "blk.34.ffn_up.weight": "2a03126bf73587eaba99ece2066103d12e47bcd4ce30ff6c17b2f383b81d40df",
+  "blk.35.attn_k.weight": "52513fc0cd4e997a842729af7d21dd09399bce0a339558374738be266d0fa2f0",
+  "blk.35.attn_norm.weight": "e5281fa911964263ccf1630b14762edbd41d0b9472d6ec695fc600fed4892c35",
+  "blk.35.attn_output.weight": "b391d6705d5dc6f48326b5fd16573f679edf64109d86fb729a498819676590ca",
+  "blk.35.attn_q.weight": "d16446921966db9b0e0539626ad22a2511ace780e59379d6a4162d8c5441440b",
+  "blk.35.attn_v.weight": "9d8cdf23ffdb0c5c74106843390b94b24c9f33ef0eb9998d39f78c73390101ea",
+  "blk.35.ffn_down.weight": "938eb6301f7bbf162d7dd965682a5ed11d0a4a530c6fedd7e5469ce80012fc17",
+  "blk.35.ffn_gate.weight": "5ad84f5a0c8edcfea1ecf1a3e3d21d85ceda0c4ad9e3c6ca68885eeff8ed3c2f",
+  "blk.35.ffn_up.weight": "1c4330d9dc71bf4c98812c34356c51f520f47610a534152aa6d29284b758090d",
+  "blk.36.attn_k.weight": "ef720655e5ca2465f13db2dfc4732fb4ef2c9d53acde52f514fd4f301e974081",
+  "blk.36.attn_norm.weight": "88f4b9310b3c8c2644e3029160cd35678c79dfa59280430e03f5c29a6fe84a58",
+  "blk.36.attn_output.weight": "aec6f915fffd7bb72cd783273e871b4f09605950089d45e72059d1316b6c4b01",
+  "blk.36.attn_q.weight": "72f9408a2405d42f8db6ce5fcf1d26a3660b6f225fc60e77d0277109cfcb82ed",
+  "blk.36.attn_v.weight": "0f3b3d851dc44b3893ef53f6cca5b4acc9658bacfe1cc2d13c3d704ddd409b67",
+  "blk.36.ffn_down.weight": "470aec48ce8c5129a6654d9fd26fcae72776f9fc1429a8bb05818072a876475d",
+  "blk.36.ffn_gate.weight": "7f5f296d09cf55679767b5d15de3eff489c456782119f25204be4b1647f18dcf",
+  "blk.36.ffn_up.weight": "b7ef74a1f7ffb4982711d93f1787be3a70edc3d2358d5203c41d8900508037d4",
+  "blk.37.attn_k.weight": "c4ffa5412e4ff2dcfe1aed991c1f54169fd171a4c7638e4b9f21a1ca64c5e1d6",
+  "blk.37.attn_norm.weight": "4eb6c888d841cccfacf5b963f8611120f6ff24b84af0b5714fd9ab36dcda422f",
+  "blk.37.attn_output.weight": "db2a7bbf9682f9f6eea672dae8e150738f1bf74dbc80edc7022017a3f040c8ac",
+  "blk.37.attn_q.weight": "e38c0462aff139afcbab289189823527e453abc9e541154adde5e7af88cacf0b",
+  "blk.37.attn_v.weight": "952eb2492ed452a72f96bcc12d4b2affad9dfdf46ee39ce4a5d7b57a5dc301e5",
+  "blk.37.ffn_down.weight": "25f23a8fbc44febf6dc4848fd7fe03a580e2822bd3b3b5a51f4990826bfe3e4e",
+  "blk.37.ffn_gate.weight": "707da5eb40118b035305d3262444382351f170a20a537386a70e90c5a83a7817",
+  "blk.37.ffn_up.weight": "d2d2ba5cfc4ef47338dd7384219e22bf030a5a2209e0354d88f5bbaaafd20e87",
+  "blk.38.attn_k.weight": "abc4bb189dedf7ce661e79028427623a4f91ac091c2cd60e31b58bc62b1cda71",
+  "blk.38.attn_norm.weight": "9f4803a7d03fd40fcb83d85f84eb1d5682ea4e5bb084f210c02850675d804c3d",
+  "blk.38.attn_output.weight": "77cb66007f1a41df7135d0e7f900ceb499c2f667dfc3f1a6ac01a3203bbd3ccf",
+  "blk.38.attn_q.weight": "d94a8b26cd375bf2bcaa76597e314aa8268ee50a479d00931e5e0e021feadb5d",
+  "blk.38.attn_v.weight": "660c907888bc5016dc69b7d35fe6f55c7ded697c93be0e2d332a2f17aff88758",
+  "blk.38.ffn_down.weight": "6f06173bae5b00ffaf88ef383619a8b9c6a8d0d5c6494695d17f6c1de1a68a13",
+  "blk.38.ffn_gate.weight": "89f99be149d03f116527bfcabe073c50001c874de40fb6e817f6619027f3cd05",
+  "blk.38.ffn_up.weight": "8d57557c8d5e2d2688b73f01dddf1ce8d5194990cda6358153320aea88aac7f8",
+  "blk.39.attn_k.weight": "21be09c988b46c8393e6c2ec9230f3b5136eb7607dd1953ba92d0811c2f0dd75",
+  "blk.39.attn_norm.weight": "ba7c1912dd1c4e2d16917201f62396fd0600e4a451137eaddff255548c209abd",
+  "blk.39.attn_output.weight": "acfaf4abb3fd27fd899b5563c3877f176b597d8f6cdb2f2fd3f3a0bd4da15ed6",
+  "blk.39.attn_q.weight": "e8adbc140d4c8f0db2a27ca584c5531d5b1e080555fe627e34d80d0814a92bed",
+  "blk.39.attn_v.weight": "92f96b0e1f724e73a0f90a76c145654418844c04a6d4b14c05eb5af8a62bf8dc",
+  "blk.39.ffn_down.weight": "4d9ee7c65fc16fe95d10c47b79ac6a525741947600a64b5fcea5d300a82c50de",
+  "blk.39.ffn_gate.weight": "7e18507989f39b32191133d2657c2ee3b74f42f070579204d727eb72215793d1",
+  "blk.39.ffn_up.weight": "22cda752269c9757ba918abede1df95bb0f83a5c772dea13c8deea3d5f2723d9",
+  "output_norm.weight": "2858cf0e39d32caf52b7861378ace076000241e147f10b9eb21d8a5cd149e3cb"
+}
@@ -9,8 +9,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
-
-	"github.com/ollama/ollama/envconfig"
 )

 // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -41,13 +39,10 @@ func commonAMDValidateLibDir() (string, error) {
 	// Favor our bundled version

 	// Installer payload location if we're running the installed binary
-	exe, err := os.Executable()
-	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if rocmLibUsable(rocmTargetDir) {
-			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
-			return rocmTargetDir, nil
-		}
+	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
+	if rocmLibUsable(rocmTargetDir) {
+		slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
+		return rocmTargetDir, nil
 	}

 	// Prefer explicit HIP env var
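After this change the common path probes a single LibOllamaPath/rocm directory instead of walking paths relative to the executable. rocmLibUsable, referenced above, reduces to a glob-existence test over expected library names; a sketch of that kind of check, where the pattern is an assumption rather than the exact list the real function uses:

package main

import (
	"fmt"
	"path/filepath"
)

// usable reports whether dir contains anything matching the given glob,
// e.g. "libhipblas.so*" on Linux. The pattern here is illustrative only.
func usable(dir, pattern string) bool {
	matches, err := filepath.Glob(filepath.Join(dir, pattern))
	return err == nil && len(matches) > 0
}

func main() {
	fmt.Println(usable("/usr/lib/ollama/rocm", "libhipblas.so*"))
}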
@@ -77,8 +77,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {

 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	var supported []string
-	depPaths := LibraryDirs()
-	libDir := ""
+	var libDir string

 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
@@ -353,9 +352,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			})
 			return nil, err
 		}
-		depPaths = append(depPaths, libDir)
 	}
-	gpuInfo.DependencyPath = depPaths
+	gpuInfo.DependencyPath = []string{libDir}

 	if gfxOverride == "" {
 		// Only load supported list once
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"path/filepath"
 	"slices"
 	"strconv"
@@ -50,14 +49,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Info(err.Error())
 		return nil, err
 	}
-	depPaths := LibraryDirs()
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
 		err = fmt.Errorf("unable to verify rocm library: %w", err)
 		slog.Warn(err.Error())
 		return nil, err
 	}
-	depPaths = append(depPaths, libDir)

 	var supported []string
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
@@ -113,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			UnreliableFreeMemory: true,

 			ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-			DependencyPath: depPaths,
+			DependencyPath: []string{libDir},
 			MinimumMemory:  rocmMinimumMemory,
 			Name:           name,
 			Compute:        gfx,
@@ -164,9 +162,7 @@ func AMDValidateLibDir() (string, error) {
 	}

 	// Installer payload (if we're running from some other location)
-	localAppData := os.Getenv("LOCALAPPDATA")
-	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
+	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
@@ -23,7 +23,6 @@ import (

 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 type cudaHandles struct {
@@ -101,15 +100,7 @@ func initCudaHandles() *cudaHandles {

 	// Aligned with driver, we can't carry as payloads
 	nvcudaMgmtPatterns := NvcudaGlobs
-	if runtime.GOOS == "windows" {
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
-	}
-	libDirs := LibraryDirs()
-	for _, d := range libDirs {
-		cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName))
-	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

 	if len(NvmlGlobs) > 0 {
@@ -240,7 +231,7 @@ func GetGPUInfo() GpuInfoList {
 	if err != nil {
 		slog.Warn("error looking up system memory", "error", err)
 	}
-	depPaths := LibraryDirs()
+
 	details, err := GetCPUDetails()
 	if err != nil {
 		slog.Warn("failed to lookup CPU details", "error", err)
@@ -248,11 +239,9 @@ func GetGPUInfo() GpuInfoList {
 	cpus = []CPUInfo{
 		{
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 				Library: "cpu",
-				Variant:        runners.GetCPUCapability().String(),
-				ID:             "0",
-				DependencyPath: depPaths,
+				ID:      "0",
 			},
 			CPUs: details,
 		},
@@ -294,17 +283,13 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.DriverMajor = driverMajor
 			gpuInfo.DriverMinor = driverMinor
 			variant := cudaVariant(gpuInfo)
-			if depPaths != nil {
-				gpuInfo.DependencyPath = depPaths
-				// Check for variant specific directory
-				if variant != "" {
-					for _, d := range depPaths {
-						if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil {
-							// Put the variant directory first in the search path to avoid runtime linking to the wrong library
-							gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...)
-							break
-						}
-					}
+
+			// Start with our bundled libraries
+			if variant != "" {
+				variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
+				if _, err := os.Stat(variantPath); err == nil {
+					// Put the variant directory first in the search path to avoid runtime linking to the wrong library
+					gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
 				}
 			}
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
@@ -376,7 +361,7 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DependencyPath = depPaths
+			gpuInfo.DependencyPath = []string{LibOllamaPath}
 			oneapiGPUs = append(oneapiGPUs, gpuInfo)
 		}
 	}
@@ -512,33 +497,30 @@ func GetGPUInfo() GpuInfoList {
 }

 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
-	var ldPaths []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

-	// Start with our bundled libraries
-	patterns := []string{}
-	for _, d := range LibraryDirs() {
-		patterns = append(patterns, filepath.Join(d, baseLibName))
-	}
+	// search our bundled libraries first
+	patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}

+	var ldPaths []string
 	switch runtime.GOOS {
 	case "windows":
-		ldPaths = strings.Split(os.Getenv("PATH"), ";")
+		ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator))
 	case "linux":
-		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
-	default:
-		return gpuLibPaths
+		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator))
 	}

-	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
-	for _, ldPath := range ldPaths {
-		d, err := filepath.Abs(ldPath)
+	// then search the system's LD_LIBRARY_PATH
+	for _, p := range ldPaths {
+		p, err := filepath.Abs(p)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName))
+		patterns = append(patterns, filepath.Join(p, baseLibName))
 	}

+	// finally, search the default patterns provided by the caller
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
 	for _, pattern := range patterns {
@@ -715,23 +697,6 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	}
 }

-func LibraryDirs() []string {
-	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
-	// This can be simplified once we no longer carry runners as payloads
-	exe, err := os.Executable()
-	if err != nil {
-		slog.Warn("failed to lookup executable path", "error", err)
-		return nil
-	}
-	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
-	if _, err := os.Stat(lib); err != nil {
-		return nil
-	}
-	return []string{lib}
-}
-
 func GetSystemInfo() SystemInfo {
 	gpus := GetGPUInfo()
 	gpuMutex.Lock()
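After this change, `FindGPULibs` builds one ordered glob list: the bundled `LibOllamaPath` first, then each entry of the loader path, then caller-supplied defaults. A standalone sketch of the same search strategy (illustrative names only, not the repo's exact code):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// findLibSketch mirrors the search order in the hunk above: a bundled
// directory first, then each entry of the loader path, then any
// caller-supplied fallback globs.
func findLibSketch(bundledDir, baseLibName string, defaults []string) []string {
	patterns := []string{filepath.Join(bundledDir, baseLibName)}
	for _, p := range strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator)) {
		if p, err := filepath.Abs(p); err == nil {
			patterns = append(patterns, filepath.Join(p, baseLibName))
		}
	}
	patterns = append(patterns, defaults...)

	var found []string
	for _, pattern := range patterns {
		if matches, err := filepath.Glob(pattern); err == nil {
			found = append(found, matches...)
		}
	}
	return found
}

func main() {
	fmt.Println(findLibSketch("/usr/local/lib/ollama", "libcudart.so*", nil))
}
```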
@@ -15,7 +15,6 @@ import (
 	"syscall"

 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 const (
@@ -28,7 +27,6 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: runners.GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
@@ -51,7 +49,6 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: runners.GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
discover/path.go (new file, 57 lines)
@@ -0,0 +1,57 @@
+package discover
+
+import (
+	"os"
+	"path/filepath"
+	"runtime"
+)
+
+// LibPath is a path to lookup dynamic libraries
+// in development it's usually 'build/lib/ollama'
+// in distribution builds it's 'lib/ollama' on Windows
+// '../lib/ollama' on Linux and the executable's directory on macOS
+// note: distribution builds, additional GPU-specific libraries are
+// found in subdirectories of the returned path, such as
+// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+var LibOllamaPath string = func() string {
+	exe, err := os.Executable()
+	if err != nil {
+		return ""
+	}
+
+	exe, err = filepath.EvalSymlinks(exe)
+	if err != nil {
+		return ""
+	}
+
+	var libPath string
+	switch runtime.GOOS {
+	case "windows":
+		libPath = filepath.Join(filepath.Dir(exe), "lib", "ollama")
+	case "linux":
+		libPath = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
+	case "darwin":
+		libPath = filepath.Dir(exe)
+	}
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		return ""
+	}
+
+	paths := []string{
+		libPath,
+
+		// build paths for development
+		filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"),
+		filepath.Join(cwd, "build", "lib", "ollama"),
+	}
+
+	for _, p := range paths {
+		if _, err := os.Stat(p); err == nil {
+			return p
+		}
+	}
+
+	return filepath.Dir(exe)
+}()
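Callers treat `LibOllamaPath` as the root for GPU-specific subdirectories. A small usage sketch — the `cuda_v12` subdirectory name is an example; which variants exist depends on how a given distribution was built, and this only compiles inside the repo module:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/ollama/ollama/discover"
)

func main() {
	// The variant subdirectory ("cuda_v12" here) is illustrative; which
	// ones exist depends on how the distribution was built.
	variantDir := filepath.Join(discover.LibOllamaPath, "cuda_v12")
	if _, err := os.Stat(variantDir); err == nil {
		fmt.Println("bundled CUDA v12 libraries at", variantDir)
	} else {
		fmt.Println("no cuda_v12 subdirectory under", discover.LibOllamaPath)
	}
}
```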
@@ -5,7 +5,6 @@ import (
 	"log/slog"

 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 type memInfo struct {
@@ -107,7 +106,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != runners.CPUCapabilityNone.String() {
+		if info.Variant != "" {
 			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
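`ByLibrary` groups devices by a `library[_variant]` key; with the runners package gone, an empty `Variant` now simply means no suffix. A minimal sketch of that grouping rule, using simplified types rather than the repo's structs:

```go
package main

import "fmt"

// device is a simplified stand-in for the repo's GpuInfo: just the two
// fields that the grouping key is built from.
type device struct {
	Library string
	Variant string
}

// deviceKey reproduces the key construction from the hunk above:
// library name plus an optional "_variant" suffix.
func deviceKey(d device) string {
	key := d.Library
	if d.Variant != "" {
		key += "_" + d.Variant
	}
	return key
}

func main() {
	devs := []device{{"cuda", "v12"}, {"cuda", "v11"}, {"cpu", ""}}
	groups := map[string][]device{}
	for _, d := range devs {
		k := deviceKey(d)
		groups[k] = append(groups[k], d)
	}
	fmt.Println(groups) // map[cpu:[{cpu }] cuda_v11:[{cuda v11}] cuda_v12:[{cuda v12}]]
}
```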
@@ -2,7 +2,7 @@

 ### Getting Started
 * [Quickstart](../README.md#quickstart)
-* [Examples](../examples)
+* [Examples](./examples.md)
 * [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
docs/api.md (12 changed lines)
@@ -306,7 +306,7 @@ curl http://localhost:11434/api/generate -d '{

 #### Response

-```
+```json
 {
   "model": "llava",
   "created_at": "2023-11-03T15:36:02.583064Z",
@@ -495,14 +495,14 @@ Generate the next message in a chat with a provided model. This is a streaming e

 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
-- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false`
+- `tools`: list of tools in JSON for the model to use if supported

 The `message` object has the following fields:

 - `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
-- `tool_calls` (optional): a list of tools the model wants to use
+- `tool_calls` (optional): a list of tools in JSON that the model wants to use

 Advanced parameters (optional):

@@ -795,7 +795,7 @@ curl http://localhost:11434/api/chat -d '{

 ##### Request

-```
+```shell
 curl http://localhost:11434/api/chat -d '{
   "model": "llama3.2",
   "messages": [
@@ -870,7 +870,7 @@ If the messages array is empty, the model will be loaded into memory.

 ##### Request

-```
+```shell
 curl http://localhost:11434/api/chat -d '{
   "model": "llama3.2",
   "messages": []
@@ -897,7 +897,7 @@ If the messages array is empty and the `keep_alive` parameter is set to `0`, a m

 ##### Request

-```
+```shell
 curl http://localhost:11434/api/chat -d '{
   "model": "llama3.2",
   "messages": [],
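The reworded `tools` bullet drops the old requirement that `stream` be `false`. A hedged Go sketch of posting a chat request with a `tools` array to a local server — the field names follow the documented API, while the weather tool itself is a made-up example:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Illustrative payload for POST /api/chat with a single made-up tool.
	payload := map[string]any{
		"model": "llama3.2",
		"messages": []map[string]string{
			{"role": "user", "content": "What is the weather in Toronto?"},
		},
		"tools": []map[string]any{{
			"type": "function",
			"function": map[string]any{
				"name":        "get_current_weather",
				"description": "Get the current weather for a city",
				"parameters": map[string]any{
					"type":       "object",
					"properties": map[string]any{"city": map[string]string{"type": "string"}},
					"required":   []string{"city"},
				},
			},
		}},
		"stream": false,
	}
	body, _ := json.Marshal(payload)
	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```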
@@ -1,165 +1,131 @@
 # Development

-Install required tools:
+Install prerequisites:

-- go version 1.22 or higher
-- OS specific C/C++ compiler (see below)
-- GNU Make
+- [Go](https://go.dev/doc/install)
+- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://jmeubank.github.io/tdm-gcc/download/) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.

-## Overview
-
-Ollama uses a mix of Go and C/C++ code to interface with GPUs. The C/C++ code is compiled with both CGO and GPU library specific compilers. A set of GNU Makefiles are used to compile the project. GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary. The default make target will build the runners and primary Go Ollama application that will run within the repo directory. Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build. You can adjust the job count based on your CPU Core count to reduce build times. If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location. To learn more about the other make targets use `make help`
-
-Once you have built the GPU/CPU runners, you can compile the main application with `go build .`
-
-### MacOS
-
-[Download Go](https://go.dev/dl/)
-
-```bash
-make -j 5
-```
-
-Now you can run `ollama`:
-
-```bash
-./ollama
-```
-
-#### Xcode 15 warnings
-
-If you are using Xcode newer than version 14, you may see a warning during `go build` about `ld: warning: ignoring duplicate libraries: '-lobjc'` due to Golang issue https://github.com/golang/go/issues/67799 which can be safely ignored. You can suppress the warning with `export CGO_LDFLAGS="-Wl,-no_warn_duplicate_libraries"`
-
-### Linux
-
-#### Linux CUDA (NVIDIA)
-
-_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages.
-
-Typically the makefile will auto-detect CUDA, however, if your Linux distro
-or installation approach uses alternative paths, you can specify the location by
-overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
-a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES=50;60;70`)
+Then build and run Ollama from the root directory of the repository:

 ```
-make -j 5
+go run . serve
 ```

-If both v11 and v12 tookkits are detected, runners for both major versions will be built by default. You can build just v12 with `make cuda_v12`
-
-#### Older Linux CUDA (NVIDIA)
-
-To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11). When you build Ollama, you will need to set two make variable to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`. To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
-
-#### Linux ROCm (AMD)
-
-_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
-
-Typically the build scripts will auto-detect ROCm, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `HIP_PATH` to the location of the ROCm
-install (typically `/opt/rocm`). You can also customize
-the AMD GPU targets by setting HIP_ARCHS (e.g. `HIP_ARCHS=gfx1101;gfx1102`)
+## macOS (Apple Silicon)
+
+macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.
+
+## macOS (Intel)
+
+Install prerequisites:
+
+- [CMake](https://cmake.org/download/) or `brew install cmake`
+
+Then, configure and build the project:

 ```
-make -j 5
+cmake -B build
+cmake --build build
 ```

-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
-#### Containerized Linux Build
-
-If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist` and by default the script builds both arm64 and amd64 binaries. If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
-
-### Windows
-
-The following tools are required as a minimal development environment to build CPU inference support.
-
-- Go version 1.22 or higher
-  - https://go.dev/dl/
-- Git
-  - https://git-scm.com/download/win
-- clang with gcc compat and Make. There are multiple options on how to go about installing these tools on Windows. We have verified the following, but others may work as well:
-  - [MSYS2](https://www.msys2.org/)
-    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
-  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
-
-> [!NOTE]
-> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.
+Lastly, run Ollama:

 ```
-make -j 5
+go run . serve
 ```

-#### GPU Support
-
-The GPU tools require the Microsoft native build tools. To build either CUDA or ROCm, you must first install MSVC via Visual Studio:
-
-- Make sure to select `Desktop development with C++` as a Workload during the Visual Studio install
-- You must complete the Visual Studio install and run it once **BEFORE** installing CUDA or ROCm for the tools to properly register
-- Add the location of the **64 bit (x64)** compiler (`cl.exe`) to your `PATH`
-- Note: the default Developer Shell may configure the 32 bit (x86) compiler which will lead to build failures. Ollama requires a 64 bit toolchain.
+## Windows
+
+Install prerequisites:
+
+- [CMake](https://cmake.org/download/)
+- [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/) including the Native Desktop Workload
+- (Optional) AMD GPU support
+  - [ROCm](https://rocm.github.io/install.html)
+  - [Ninja](https://github.com/ninja-build/ninja/releases)
+- (Optional) NVIDIA GPU support
+  - [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)

-#### Windows CUDA (NVIDIA)
-
-In addition to the common Windows development tools and MSVC described above:
-
-- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
+> [!IMPORTANT]
+> Ensure prerequisites are in `PATH` before running CMake.
+
+> [!IMPORTANT]
+> ROCm is not compatible with Visual Studio CMake generators. Use `-GNinja` when configuring the project.
+
+> [!IMPORTANT]
+> CUDA is only compatible with Visual Studio CMake generators.

-#### Windows ROCm (AMD Radeon)
-
-In addition to the common Windows development tools and MSVC described above:
-
-- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
+Then, configure and build the project:

-#### Windows arm64
-
-The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
-
-```powershell
-import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
-Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
-```
-
-You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
-
-Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:

 ```
-pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
+cmake -B build
+cmake --build build --config Release
 ```

-You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
-
-## Advanced CPU Vector Settings
-
-On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load. If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled. This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility. Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
-
-To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build.
-
-To build without any vector flags:
+Lastly, run Ollama:

 ```
-make CUSTOM_CPU_FLAGS=""
+go run . serve
 ```

-To build with both AVX and AVX2:
-```
-make CUSTOM_CPU_FLAGS=avx,avx2
-```
-
-To build with AVX512 features turned on:
+## Windows (ARM)
+
+Windows ARM does not support additional acceleration libraries at this time.
+
+## Linux
+
+Install prerequisites:
+
+- [CMake](https://cmake.org/download/) or `sudo apt install cmake` or `sudo dnf install cmake`
+- (Optional) AMD GPU support
+  - [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
+- (Optional) NVIDIA GPU support
+  - [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
+
+> [!IMPORTANT]
+> Ensure prerequisites are in `PATH` before running CMake.
+
+Then, configure and build the project:

 ```
-make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
+cmake -B build
+cmake --build build
 ```

-> [!NOTE]
-> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags
+Lastly, run Ollama:
+
+```
+go run . serve
+```
+
+## Docker
+
+```
+docker build .
+```
+
+### ROCm
+
+```
+docker build --build-arg FLAVOR=rocm .
+```
+
+## Running tests
+
+To run tests, use `go test`:
+
+```
+go test ./...
+```
+
+## Library detection
+
+Ollama looks for acceleration libraries in the following paths relative to the `ollama` executable:
+
+* `./lib/ollama` (Windows)
+* `../lib/ollama` (Linux)
+* `.` (macOS)
+* `build/lib/ollama` (for development)
+
+If the libraries are not found, Ollama will not run with any acceleration libraries.
@@ -38,7 +38,7 @@ Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
 You can discover the UUID of your GPUs by running `nvidia-smi -L` If you want to
 ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1")

-### Laptop Suspend Resume
+### Linux Suspend Resume

 On linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
 your NVIDIA GPU, and fallback to running on the CPU. You can workaround this
@@ -152,7 +152,7 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s
 For example:

 ```shell
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.5.7 sh
 ```

 ## Viewing logs
@@ -186,3 +186,9 @@ sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama
 ```
+
+Remove installed libraries:
+
+```shell
+sudo rm -rf /usr/local/lib/ollama
+```
@@ -67,8 +67,6 @@ To use this:
 3. `ollama run choose-a-model-name`
 4. Start using the model!

-More examples are available in the [examples directory](../examples).
-
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.

 ```bash
@@ -155,7 +153,6 @@ PARAMETER <parameter> <parametervalue>
 | temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
 | seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
 | stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
-| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
 | num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 |
 | top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
 | top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
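These parameters can also be set per-request through the API's `options` field rather than in a Modelfile. A hedged Go sketch — the option names follow the parameter table above, the values are arbitrary:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Options mirror the Modelfile PARAMETER names from the table above.
	payload := map[string]any{
		"model":  "llama3.2",
		"prompt": "Why is the sky blue?",
		"stream": false,
		"options": map[string]any{
			"temperature": 0.7,
			"seed":        42,
			"top_k":       40,
			"top_p":       0.9,
		},
	}
	body, _ := json.Marshal(payload)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}
```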
@@ -288,12 +288,3 @@ func Values() map[string]string {
 func Var(key string) string {
 	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
 }
-
-// On windows, we keep the binary at the top directory, but
-// other platforms use a "bin" directory, so this returns ".."
-func LibRelativeToExe() string {
-	if runtime.GOOS == "windows" {
-		return "."
-	}
-	return ".."
-}
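For context, the `Var` helper kept above trims whitespace and surrounding quotes from environment values before any parsing. A quick usage sketch, reimplementing the one-liner standalone for illustration:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// varSketch mirrors envconfig.Var: read an env var, drop surrounding
// whitespace, then strip any single or double quotes.
func varSketch(key string) string {
	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
}

func main() {
	os.Setenv("OLLAMA_EXAMPLE", `  "some value"  `)
	fmt.Printf("%q\n", varSketch("OLLAMA_EXAMPLE")) // "some value"
}
```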
@@ -40,8 +40,6 @@ func HumanBytes(b int64) string {
 	}

 	switch {
-	case value >= 100:
-		return fmt.Sprintf("%d %s", int(value), unit)
 	case value >= 10:
 		return fmt.Sprintf("%d %s", int(value), unit)
 	case value != math.Trunc(value):
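The removed `value >= 100` arm was redundant: the `value >= 10` arm produces the identical integer formatting. A quick sketch of the simplified rounding rule — a standalone reimplementation for illustration, not the repo's full `HumanBytes`:

```go
package main

import (
	"fmt"
	"math"
)

// humanish demonstrates the post-change rule: values of two or more
// digits print as integers, fractional single-digit values keep one
// decimal place, and whole single-digit values print as integers.
func humanish(value float64, unit string) string {
	switch {
	case value >= 10:
		return fmt.Sprintf("%d %s", int(value), unit)
	case value != math.Trunc(value):
		return fmt.Sprintf("%.1f %s", value, unit)
	default:
		return fmt.Sprintf("%d %s", int(value), unit)
	}
}

func main() {
	fmt.Println(humanish(999, "MB")) // "999 MB" — covered by the >= 10 case
	fmt.Println(humanish(1.5, "KB")) // "1.5 KB"
	fmt.Println(humanish(1, "B"))    // "1 B"
}
```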
format/bytes_test.go (new file, 91 lines)
@@ -0,0 +1,91 @@
+package format
+
+import (
+	"testing"
+)
+
+func TestHumanBytes(t *testing.T) {
+	type testCase struct {
+		input    int64
+		expected string
+	}
+
+	tests := []testCase{
+		// Test bytes (B)
+		{0, "0 B"},
+		{1, "1 B"},
+		{999, "999 B"},
+
+		// Test kilobytes (KB)
+		{1000, "1 KB"},
+		{1500, "1.5 KB"},
+		{999999, "999 KB"},
+
+		// Test megabytes (MB)
+		{1000000, "1 MB"},
+		{1500000, "1.5 MB"},
+		{999999999, "999 MB"},
+
+		// Test gigabytes (GB)
+		{1000000000, "1 GB"},
+		{1500000000, "1.5 GB"},
+		{999999999999, "999 GB"},
+
+		// Test terabytes (TB)
+		{1000000000000, "1 TB"},
+		{1500000000000, "1.5 TB"},
+		{1999999999999, "2.0 TB"},
+
+		// Test fractional values
+		{1234, "1.2 KB"},
+		{1234567, "1.2 MB"},
+		{1234567890, "1.2 GB"},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.expected, func(t *testing.T) {
+			result := HumanBytes(tc.input)
+			if result != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, result)
+			}
+		})
+	}
+}
+
+func TestHumanBytes2(t *testing.T) {
+	type testCase struct {
+		input    uint64
+		expected string
+	}
+
+	tests := []testCase{
+		// Test bytes (B)
+		{0, "0 B"},
+		{1, "1 B"},
+		{1023, "1023 B"},
+
+		// Test kibibytes (KiB)
+		{1024, "1.0 KiB"},
+		{1536, "1.5 KiB"},
+		{1048575, "1024.0 KiB"},
+
+		// Test mebibytes (MiB)
+		{1048576, "1.0 MiB"},
+		{1572864, "1.5 MiB"},
+		{1073741823, "1024.0 MiB"},
+
+		// Test gibibytes (GiB)
+		{1073741824, "1.0 GiB"},
+		{1610612736, "1.5 GiB"},
+		{2147483648, "2.0 GiB"},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.expected, func(t *testing.T) {
+			result := HumanBytes2(tc.input)
+			if result != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, result)
+			}
+		})
+	}
+}
go.mod (3 changed lines)
@@ -24,7 +24,6 @@ require (
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	golang.org/x/image v0.22.0
-	golang.org/x/tools v0.28.0
 	gonum.org/v1/gonum v0.15.0
 )

@@ -72,7 +71,7 @@ require (
 	golang.org/x/arch v0.8.0 // indirect
 	golang.org/x/crypto v0.31.0
 	golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
-	golang.org/x/net v0.32.0 // indirect
+	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.28.0
 	golang.org/x/term v0.27.0
 	golang.org/x/text v0.21.0
go.sum (6 changed lines)
@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
-golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
+golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -309,8 +309,6 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
 golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
 golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
-golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
-golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@@ -1,22 +0,0 @@
-//go:build go1.24
-
-package grammar
-
-import "testing"
-
-func BenchmarkFromSchema(b *testing.B) {
-	for tt := range testCases(b) {
-		b.Run("", func(b *testing.B) {
-			s := []byte(tt.schema)
-
-			b.ReportAllocs()
-			for b.Loop() {
-				_, err := FromSchema(nil, s)
-				if err != nil {
-					b.Fatalf("GrammarFromSchema: %v", err)
-				}
-			}
-		})
-		return
-	}
-}
@@ -1,227 +0,0 @@
-package grammar
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"iter"
-	"strconv"
-
-	"github.com/ollama/ollama/grammar/jsonschema"
-)
-
-const jsonTerms = `
-# Unicode
-#
-# Unicode characters can be specified directly in the grammar, for example
-# hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit
-# (\UXXXXXXXX).
-unicode ::= \x{hex}{2} | \u{hex}{4} | \U{hex}{8}
-
-# JSON grammar from RFC 7159
-null ::= "null"
-object ::= "{" (kv ("," kv)*)? "}"
-array ::= "[" (value ("," value)*)? "]"
-kv ::= string ":" value
-integer ::= "0" | [1-9] [0-9]*
-number ::= "-"? integer frac? exp?
-frac ::= "." [0-9]+
-exp ::= ("e" | "E") ("+" | "-") [0-9]+
-string ::= "\"" char* "\""
-escape ::= ["/" | "b" | "f" | "n" | "r" | "t" | unicode]
-char ::= [^"\\] | escape
-space ::= (" " | "\t" | "\n" | "\r")*
-hex ::= [0-9] | [a-f] | [A-F]
-boolean ::= "true" | "false"
-value ::= object | array | string | number | boolean | "null"
-
-# User-defined
-`
-
-// FromSchema generates a grammar from a JSON schema.
-func FromSchema(buf []byte, jsonSchema []byte) ([]byte, error) {
-	var s *jsonschema.Schema
-	if err := json.Unmarshal(jsonSchema, &s); err != nil {
-		return nil, err
-	}
-
-	var g builder
-
-	// "root" is the only rule that is guaranteed to exist, so we start
-	// with its length for padding, and then adjust it as we go.
-	g.pad = len("root")
-	for id := range dependencies("root", s) {
-		g.pad = max(g.pad, len(id))
-	}
-
-	g.b.WriteString(jsonTerms)
-
-	ids := make(map[*jsonschema.Schema]string)
-	for id, s := range dependencies("root", s) {
-		ids[s] = id
-		g.define(id)
-		if err := fromSchema(&g, ids, s); err != nil {
-			return nil, err
-		}
-	}
-	g.define("root")
-	if err := fromSchema(&g, ids, s); err != nil {
-		return nil, err
-	}
-	g.define("") // finalize the last rule
-	return g.b.Bytes(), nil
-}
-
-func fromSchema(g *builder, ids map[*jsonschema.Schema]string, s *jsonschema.Schema) error {
-	switch typ := s.EffectiveType(); typ {
-	case "array":
-		if len(s.PrefixItems) == 0 && s.Items == nil {
-			g.u("array")
-		} else {
-			g.q("[")
-			for i, s := range s.PrefixItems {
-				if i > 0 {
-					g.q(",")
-				}
-				g.u(ids[s])
-			}
-			if s.Items != nil {
-				g.u("(")
-				if len(s.PrefixItems) > 0 {
-					g.q(",")
-				}
-				g.u(ids[s.Items])
-				g.u(")*")
-			}
-			g.q("]")
-		}
-	case "object":
-		if len(s.Properties) == 0 {
-			g.u("object")
-		} else {
-			g.q("{")
-			for i, p := range s.Properties {
-				name := ids[p]
-				if i > 0 {
-					g.q(",")
-				}
-				g.q(p.Name)
-				g.q(":")
-				g.u(name)
-			}
-			g.q("}")
-		}
-	case "number":
-		buildConstrainedNumber(g, s)
-	case "string":
-		if len(s.Enum) == 0 {
-			g.u("string")
-		} else {
-			g.u("(")
-			for i, e := range s.Enum {
-				if i > 0 {
-					g.q("|")
-				}
-				g.q(string(e))
-			}
-			g.u(")")
-		}
-	case "boolean", "value", "null", "integer":
-		g.u(typ)
-	default:
-		return fmt.Errorf("%s: unsupported type %q", s.Name, typ)
-	}
-	return nil
-}
-
-// dependencies returns a sequence of all child dependencies of the schema in
-// post-order.
-//
-// The first value is the id/pointer to the dependency, and the second value
-// is the schema.
-func dependencies(id string, s *jsonschema.Schema) iter.Seq2[string, *jsonschema.Schema] {
-	return func(yield func(string, *jsonschema.Schema) bool) {
-		for i, p := range s.Properties {
-			id := fmt.Sprintf("%s_%d", id, i)
-			for did, d := range dependencies(id, p) {
-				if !yield(did, d) {
-					return
-				}
-			}
-			if !yield(id, p) {
-				return
-			}
-		}
-		for i, p := range s.PrefixItems {
-			id := fmt.Sprintf("tuple_%d", i)
-			for did, d := range dependencies(id, p) {
-				id := fmt.Sprintf("%s_%s", id, did)
-				if !yield(id, d) {
-					return
-				}
-			}
-			if !yield(id, p) {
-				return
-			}
-		}
-		if s.Items != nil {
-			id := fmt.Sprintf("%s_tuple_%d", id, len(s.PrefixItems))
-			for did, d := range dependencies(id, s.Items) {
-				if !yield(did, d) {
-					return
-				}
-			}
-			if !yield(id, s.Items) {
-				return
-			}
-		}
-	}
-}
-
-type builder struct {
-	b     bytes.Buffer
-	pad   int
-	rules int
-	items int
-}
-
-// define terminates the current rule, if any, and then either starts a new
-// rule or does nothing else if the name is empty.
-func (b *builder) define(name string) {
-	if b.rules > 0 {
-		b.b.WriteString(";\n")
-	}
-	if name == "" {
-		return
-	}
-	fmt.Fprintf(&b.b, "% -*s", b.pad, name)
-	b.b.WriteString(" ::=")
-	b.rules++
-	b.items = 0
-}
-
-// quote appends a terminal to the current rule.
-func (b *builder) q(s string) {
-	if b.items > 0 {
-		b.b.WriteString(" ")
-	}
-	b.b.WriteString(" ")
-	b.b.WriteString(strconv.Quote(s))
-}
-
-// u appends a non-terminal to the current rule.
-func (b *builder) u(s string) {
-	if b.items > 0 {
-		b.b.WriteString(" ")
-	}
-	b.b.WriteString(" ")
-	b.b.WriteString(s)
-}
-
-func buildConstrainedNumber(b *builder, s *jsonschema.Schema) {
-	if s.Minimum == 0 && s.Maximum == 0 {
-		b.u("TODO")
-	} else {
-		b.u("number")
-	}
-}
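The deleted `FromSchema` API converted a JSON schema into a GBNF-style grammar rooted at `root`. A hedged usage sketch of how the removed function was called — this import refers to the package deleted in this diff, so the example only illustrates the pre-removal API and will not compile against the new tree:

```go
package main

import (
	"fmt"

	// This package is deleted by this diff; the sketch shows how
	// FromSchema was invoked before the removal.
	"github.com/ollama/ollama/grammar"
)

func main() {
	schema := []byte(`{
		"type": "object",
		"properties": {
			"name": {"type": "string"},
			"age":  {"type": "integer"}
		}
	}`)

	// FromSchema appended the generated grammar to the supplied buffer
	// (nil here) and returned the combined bytes.
	g, err := grammar.FromSchema(nil, schema)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(g))
}
```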
@@ -1,75 +0,0 @@
-package grammar
-
-import (
-	"bufio"
-	"cmp"
-	"iter"
-	"strings"
-	"testing"
-
-	_ "embed"
-
-	"github.com/ollama/ollama/grammar/internal/diff"
-)
-
-func TestFromSchema(t *testing.T) {
-	for tt := range testCases(t) {
-		t.Run(tt.name, func(t *testing.T) {
-			g, err := FromSchema(nil, []byte(tt.schema))
-			if err != nil {
-				t.Fatalf("FromSchema: %v", err)
-			}
-			got := string(g)
-			got = strings.TrimPrefix(got, jsonTerms)
-			if got != tt.want {
-				t.Logf("schema:\n%s", tt.schema)
-				t.Fatal(string(diff.Diff("got", []byte(got), "want", []byte(tt.want))))
-			}
-		})
-	}
-}
-
-type testCase struct {
-	name   string
-	schema string
-	want   string
-}
-
-//go:embed testdata/schemas.txt
-var tests string
-
-func testCases(t testing.TB) iter.Seq[testCase] {
-	t.Helper()
-	return func(yield func(testCase) bool) {
-		t.Helper()
-		sc := bufio.NewScanner(strings.NewReader(tests))
-		name := ""
-		for sc.Scan() {
-			line := strings.TrimSpace(sc.Text())
-			if line == "" {
-				name = ""
-				continue
-			}
-			if line[0] == '#' {
-				name = cmp.Or(name, strings.TrimSpace(line[1:]))
-				continue
-			}
-			s := sc.Text()
-			g := ""
-			for sc.Scan() {
-				line = strings.TrimSpace(sc.Text())
-				if line == "" || line[0] == '#' {
-					break
-				}
-				g += sc.Text() + "\n"
-			}
-			if !yield(testCase{name, s, g}) {
-				return
-			}
-			name = strings.TrimSpace(strings.TrimPrefix(line, "#"))
-		}
-		if err := sc.Err(); err != nil {
-			t.Fatalf("error reading tests: %v", err)
-		}
-	}
-}
@@ -1,261 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package diff

import (
	"bytes"
	"fmt"
	"sort"
	"strings"
)

// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes.
type pair struct{ x, y int }

// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions. An anchored diff is usually clearer
// than a standard diff, because the algorithm does not try to
// reuse unrelated blank lines or closing braces.
// The algorithm also guarantees to run in O(n log n) time
// instead of the standard O(n²) time.
//
// Some systems call this approach a “patience diff,” named for
// the “patience sorting” algorithm, itself named for a solitaire card game.
// We avoid that name for two reasons. First, the name has been used
// for a few different variants of the algorithm, so it is imprecise.
// Second, the name is frequently interpreted as meaning that you have
// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
// when in fact the algorithm is faster than the standard one.
func Diff(oldName string, old []byte, newName string, new []byte) []byte {
	if bytes.Equal(old, new) {
		return nil
	}
	x := lines(old)
	y := lines(new)

	// Print diff header.
	var out bytes.Buffer
	fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
	fmt.Fprintf(&out, "--- %s\n", oldName)
	fmt.Fprintf(&out, "+++ %s\n", newName)

	// Loop over matches to consider,
	// expanding each match to include surrounding lines,
	// and then printing diff chunks.
	// To avoid setup/teardown cases outside the loop,
	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
	// in the sequence of matches.
	var (
		done  pair     // printed up to x[:done.x] and y[:done.y]
		chunk pair     // start lines of current chunk
		count pair     // number of lines from each side in current chunk
		ctext []string // lines for current chunk
	)
	for _, m := range tgs(x, y) {
		if m.x < done.x {
			// Already handled scanning forward from earlier match.
			continue
		}

		// Expand matching lines as far as possible,
		// establishing that x[start.x:end.x] == y[start.y:end.y].
		// Note that on the first (or last) iteration we may (or definitely do)
		// have an empty match: start.x==end.x and start.y==end.y.
		start := m
		for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
			start.x--
			start.y--
		}
		end := m
		for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
			end.x++
			end.y++
		}

		// Emit the mismatched lines before start into this chunk.
		// (No effect on first sentinel iteration, when start = {0,0}.)
		for _, s := range x[done.x:start.x] {
			ctext = append(ctext, "-"+s)
			count.x++
		}
		for _, s := range y[done.y:start.y] {
			ctext = append(ctext, "+"+s)
			count.y++
		}

		// If we're not at EOF and have too few common lines,
		// the chunk includes all the common lines and continues.
		const C = 3 // number of context lines
		if (end.x < len(x) || end.y < len(y)) &&
			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
			for _, s := range x[start.x:end.x] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = end
			continue
		}

		// End chunk with common lines for context.
		if len(ctext) > 0 {
			n := end.x - start.x
			if n > C {
				n = C
			}
			for _, s := range x[start.x : start.x+n] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = pair{start.x + n, start.y + n}

			// Format and emit chunk.
			// Convert line numbers to 1-indexed.
			// Special case: empty file shows up as 0,0 not 1,0.
			if count.x > 0 {
				chunk.x++
			}
			if count.y > 0 {
				chunk.y++
			}
			fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
			for _, s := range ctext {
				out.WriteString(s)
			}
			count.x = 0
			count.y = 0
			ctext = ctext[:0]
		}

		// If we reached EOF, we're done.
		if end.x >= len(x) && end.y >= len(y) {
			break
		}

		// Otherwise start a new chunk.
		chunk = pair{end.x - C, end.y - C}
		for _, s := range x[chunk.x:end.x] {
			ctext = append(ctext, " "+s)
			count.x++
			count.y++
		}
		done = end
	}

	return out.Bytes()
}

// lines returns the lines in the file x, including newlines.
// If the file does not end in a newline, one is supplied
// along with a warning about the missing newline.
func lines(x []byte) []string {
	l := strings.SplitAfter(string(x), "\n")
	if l[len(l)-1] == "" {
		l = l[:len(l)-1]
	} else {
		// Treat last line as having a message about the missing newline attached,
		// using the same text as BSD/GNU diff (including the leading backslash).
		l[len(l)-1] += "\n\\ No newline at end of file\n"
	}
	return l
}

// tgs returns the pairs of indexes of the longest common subsequence
// of unique lines in x and y, where a unique line is one that appears
// once in x and once in y.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
func tgs(x, y []string) []pair {
	// Count the number of times each string appears in a and b.
	// We only care about 0, 1, many, counted as 0, -1, -2
	// for the x side and 0, -4, -8 for the y side.
	// Using negative numbers now lets us distinguish positive line numbers later.
	m := make(map[string]int)
	for _, s := range x {
		if c := m[s]; c > -2 {
			m[s] = c - 1
		}
	}
	for _, s := range y {
		if c := m[s]; c > -8 {
			m[s] = c - 4
		}
	}

	// Now unique strings can be identified by m[s] = -1+-4.
	//
	// Gather the indexes of those strings in x and y, building:
	//	xi[i] = increasing indexes of unique strings in x.
	//	yi[i] = increasing indexes of unique strings in y.
	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
	var xi, yi, inv []int
	for i, s := range y {
		if m[s] == -1+-4 {
			m[s] = len(yi)
			yi = append(yi, i)
		}
	}
	for i, s := range x {
		if j, ok := m[s]; ok && j >= 0 {
			xi = append(xi, i)
			inv = append(inv, j)
		}
	}

	// Apply Algorithm A from Szymanski's paper.
	// In those terms, A = J = inv and B = [0, n).
	// We add sentinel pairs {0,0}, and {len(x),len(y)}
	// to the returned sequence, to help the processing loop.
	J := inv
	n := len(xi)
	T := make([]int, n)
	L := make([]int, n)
	for i := range T {
		T[i] = n + 1
	}
	for i := range n {
		k := sort.Search(n, func(k int) bool {
			return T[k] >= J[i]
		})
		T[k] = J[i]
		L[i] = k + 1
	}
	k := 0
	for _, v := range L {
		if k < v {
			k = v
		}
	}
	seq := make([]pair, 2+k)
	seq[1+k] = pair{len(x), len(y)} // sentinel at end
	lastj := n
	for i := n - 1; i >= 0; i-- {
		if L[i] == k && J[i] < lastj {
			seq[k] = pair{xi[i], yi[J[i]]}
			k--
		}
	}
	seq[0] = pair{0, 0} // sentinel at start
	return seq
}
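The deleted `Diff` above is the Go team's anchored-diff implementation; its contract is easiest to see from a small driver. A minimal sketch (the import path mirrors this repo's layout but is an assumption, and as an `internal` package it resolves only from within the module):

```go
package diff_test

import (
	"fmt"

	"github.com/ollama/ollama/grammar/internal/diff" // assumed path; importable only inside the module
)

func Example() {
	oldText := []byte("a\nb\nc\n")
	newText := []byte("a\nB\nc\n")
	// Diff returns nil for identical inputs; otherwise a unified diff
	// with "diff"/"---"/"+++" headers followed by @@ hunks.
	fmt.Printf("%s", diff.Diff("old", oldText, "new", newText))
}
```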
@@ -1,44 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package diff

import (
	"bytes"
	"path/filepath"
	"testing"

	"golang.org/x/tools/txtar"
)

func clean(text []byte) []byte {
	text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n"))
	text = bytes.TrimSuffix(text, []byte("^D\n"))
	return text
}

func Test(t *testing.T) {
	files, _ := filepath.Glob("testdata/*.txt")
	if len(files) == 0 {
		t.Fatalf("no testdata")
	}

	for _, file := range files {
		t.Run(filepath.Base(file), func(t *testing.T) {
			a, err := txtar.ParseFile(file)
			if err != nil {
				t.Fatal(err)
			}
			if len(a.Files) != 3 || a.Files[2].Name != "diff" {
				t.Fatalf("%s: want three files, third named \"diff\"", file)
			}
			diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data))
			want := clean(a.Files[2].Data)
			if !bytes.Equal(diffs, want) {
				t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file,
					diffs, want, Diff("have", diffs, "want", want))
			}
		})
	}
}
13 grammar/internal/diff/testdata/allnew.txt vendored
@@ -1,13 +0,0 @@
-- old --
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -0,0 +1,3 @@
+a
+b
+c
13 grammar/internal/diff/testdata/allold.txt vendored
@@ -1,13 +0,0 @@
-- old --
a
b
c
-- new --
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +0,0 @@
-a
-b
-c
35 grammar/internal/diff/testdata/basic.txt vendored
@@ -1,35 +0,0 @@
Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf

-- old --
a
b
c
d
e
f
g
-- new --
w
a
b
x
y
z
e
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,7 @@
+w
 a
 b
-c
-d
+x
+y
+z
 e
-f
-g
40 grammar/internal/diff/testdata/dups.txt vendored
@@ -1,40 +0,0 @@
-- old --
a

b

c

d

e

f
-- new --
a

B

C

d

e

f
-- diff --
diff old new
--- old
+++ new
@@ -1,8 +1,8 @@
 a
 $
-b
-
-c
+B
+
+C
 $
 d
 $
38 grammar/internal/diff/testdata/end.txt vendored
@@ -1,38 +0,0 @@
-- old --
1
2
3
4
5
6
7
eight
nine
ten
eleven
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -5,7 +5,6 @@
 5
 6
 7
-eight
-nine
-ten
-eleven
+8
+9
+10
9 grammar/internal/diff/testdata/eof.txt vendored
@@ -1,9 +0,0 @@
-- old --
a
b
c^D
-- new --
a
b
c^D
-- diff --
18 grammar/internal/diff/testdata/eof1.txt vendored
@@ -1,18 +0,0 @@
-- old --
a
b
c
-- new --
a
b
c^D
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
 a
 b
-c
+c
\ No newline at end of file
18 grammar/internal/diff/testdata/eof2.txt vendored
@@ -1,18 +0,0 @@
-- old --
a
b
c^D
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
 a
 b
-c
\ No newline at end of file
+c
62 grammar/internal/diff/testdata/long.txt vendored
@@ -1,62 +0,0 @@
-- old --
1
2
3
4
5
6
7
8
9
10
11
12
13
14
14½
15
16
17
18
19
20
-- new --
1
2
3
4
5
6
8
9
10
11
12
13
14
17
18
19
20
-- diff --
diff old new
--- old
+++ new
@@ -4,7 +4,6 @@
 4
 5
 6
-7
 8
 9
 10
@@ -12,9 +11,6 @@
 12
 13
 14
-14½
-15
-16
 17
 18
 19
5 grammar/internal/diff/testdata/same.txt vendored
@@ -1,5 +0,0 @@
-- old --
hello world
-- new --
hello world
-- diff --
34 grammar/internal/diff/testdata/start.txt vendored
@@ -1,34 +0,0 @@
-- old --
e
pi
4
5
6
7
8
9
10
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -1,5 +1,6 @@
-e
-pi
+1
+2
+3
 4
 5
 6
40 grammar/internal/diff/testdata/triv.txt vendored
@@ -1,40 +0,0 @@
Another example from Hunt and McIlroy,
“An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf

Anchored diff gives up on finding anything,
since there are no unique lines.

-- old --
a
b
c
a
b
b
a
-- new --
c
a
b
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,6 @@
-a
-b
-c
-a
-b
-b
-a
+c
+a
+b
+a
+b
+c
@@ -1,171 +0,0 @@
package jsonschema

import (
	"bytes"
	"encoding/json"
	"errors"
)

// Schema holds a JSON schema.
type Schema struct {
	// Name is the name of the property. For the parent/root property, this
	// is "root". For child properties, this is the name of the property.
	Name string `json:"-"`

	// Type is the type of the property.
	//
	// TODO: Union types (e.g. make this a []string).
	Type string

	// PrefixItems is a list of schemas for each item in a tuple. By
	// default, the tuple is "closed." unless Items is set to true or a
	// valid Schema.
	PrefixItems []*Schema

	// Items is the schema for each item in a list.
	//
	// If it is missing, or its JSON value is "null" or "false", it is nil.
	// If the JSON value is "true", it is set to the empty Schema. If the
	// JSON value is an object, it will be decoded as a Schema.
	Items *Schema

	// MinItems specifies the minimum number of items allowed in a list.
	MinItems int

	// MaxItems specifies the maximum number of items allowed in a list.
	MaxItems int

	// Properties is the schema for each property of an object.
	Properties []*Schema

	// Format is the format of the property. This is used to validate the
	// property against a specific format.
	//
	// It is the callers responsibility to validate the property against
	// the format.
	Format string

	// Minimum specifies the minimum value for numeric properties.
	Minimum float64

	// Maximum specifies the maximum value for numeric properties.
	Maximum float64

	// Enum is a list of valid values for the property.
	Enum []json.RawMessage
}

func (s *Schema) UnmarshalJSON(data []byte) error {
	type S Schema
	w := struct {
		Properties props
		Items      items
		*S
	}{
		S: (*S)(s),
	}
	if err := json.Unmarshal(data, &w); err != nil {
		return err
	}
	if w.Items.set {
		s.Items = &w.Items.Schema
	}
	s.Properties = w.Properties
	return nil
}

type items struct {
	Schema
	set bool
}

func (s *items) UnmarshalJSON(data []byte) error {
	switch b := data[0]; b {
	case 't':
		*s = items{set: true}
	case '{':
		type I items
		if err := json.Unmarshal(data, (*I)(s)); err != nil {
			return err
		}
		s.set = true
	case 'n', 'f':
	default:
		return errors.New("invalid Items")
	}
	return nil
}

// EffectiveType returns the effective type of the schema. If the Type field is
// not empty, it is returned; otherwise:
//
//   - If the schema has both Properties and Items, it returns an empty string.
//   - If the schema has Properties, it returns "object".
//   - If the schema has Items, it returns "array".
//   - If the schema has neither Properties nor Items, it returns "value".
//
// The returned string is never empty.
func (d *Schema) EffectiveType() string {
	if d.Type == "" {
		if len(d.Properties) > 0 {
			return "object"
		}
		if len(d.PrefixItems) > 0 || d.Items != nil {
			return "array"
		}
		return "value"
	}
	return d.Type
}

// props is an ordered list of properties. The order of the properties
// is the order in which they were defined in the schema.
type props []*Schema

var _ json.Unmarshaler = (*props)(nil)

func (v *props) UnmarshalJSON(data []byte) error {
	if len(data) == 0 {
		return nil
	}
	if data[0] != '{' {
		return errors.New("expected object")
	}

	d := json.NewDecoder(bytes.NewReader(data))

	// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
	// llama.cpp, ignore unknown fields, which could be lead to unexpected
	// behavior for clients of this package, since they may not be aware
	// that "additionalFields", "itemsPrefix", etc, are being ignored.
	//
	// For now, just do what llama.cpp does.

	t, err := d.Token()
	if err != nil {
		return err
	}
	if t != json.Delim('{') {
		return errors.New("expected object")
	}
	for d.More() {
		// Use the first token (map key) as the property name, then
		// decode the rest of the object fields into a Schema and
		// append.
		t, err := d.Token()
		if err != nil {
			return err
		}
		if t == json.Delim('}') {
			return nil
		}
		s := &Schema{
			Name: t.(string),
		}
		if err := d.Decode(s); err != nil {
			return err
		}
		*v = append(*v, s)
	}
	return nil
}
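Worth noting about the deleted decoder: `props` preserves JSON property order, which map-based decoding would lose. A minimal sketch of how the type was exercised (the names match the code above; the `main` scaffolding is illustrative only):

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

func main() {
	// Property order in the input survives into s.Properties because
	// props implements json.Unmarshaler and appends keys as decoded.
	data := []byte(`{"properties": {"b": {"type": "string"}, "a": {"type": "number"}}}`)
	var s *Schema
	if err := json.Unmarshal(data, &s); err != nil {
		log.Fatal(err)
	}
	for _, p := range s.Properties {
		fmt.Println(p.Name, p.EffectiveType()) // "b string", then "a number"
	}
}
```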
@@ -1,104 +0,0 @@
package jsonschema

import (
	"encoding/json"
	"reflect"
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
)

const testSchemaBasic = `
{
	"properties": {
		"tupleClosedEmpty": { "prefixItems": [] },
		"tupleClosedMissing": { "prefixItems": [{}] },
		"tupleClosedNull": { "prefixItems": [{}], "items": null },
		"tupleClosedFalse": { "prefixItems": [{}], "items": false },

		"tupleOpenTrue": { "prefixItems": [{}], "items": true },
		"tupleOpenEmpty": { "prefixItems": [{}], "items": {} },
		"tupleOpenTyped": { "prefixItems": [{}], "items": {"type": "boolean"} },
		"tupleOpenMax": { "prefixItems": [{}], "items": true, "maxItems": 3},

		"array": { "items": {"type": "number"} },

		"null": { "type": "null" },
		"string": { "type": "string" },
		"boolean": { "type": "boolean" }
	}
}
`

func TestSchemaUnmarshal(t *testing.T) {
	var got *Schema
	if err := json.Unmarshal([]byte(testSchemaBasic), &got); err != nil {
		t.Fatalf("Unmarshal: %v", err)
	}
	want := &Schema{
		Properties: []*Schema{
			{Name: "tupleClosedEmpty", PrefixItems: []*Schema{}, Items: nil},
			{Name: "tupleClosedMissing", PrefixItems: []*Schema{{}}, Items: nil},
			{Name: "tupleClosedNull", PrefixItems: []*Schema{{}}, Items: nil},
			{Name: "tupleClosedFalse", PrefixItems: []*Schema{{}}, Items: nil},

			{Name: "tupleOpenTrue", PrefixItems: []*Schema{{}}, Items: &Schema{}},
			{Name: "tupleOpenEmpty", PrefixItems: []*Schema{{}}, Items: &Schema{}},
			{Name: "tupleOpenTyped", PrefixItems: []*Schema{{}}, Items: &Schema{Type: "boolean"}},
			{Name: "tupleOpenMax", PrefixItems: []*Schema{{}}, Items: &Schema{}, MaxItems: 3},

			{Name: "array", Items: &Schema{Type: "number"}},

			{Name: "null", Type: "null"},
			{Name: "string", Type: "string"},
			{Name: "boolean", Type: "boolean"},
		},
	}

	if diff := cmp.Diff(want, got); diff != "" {
		t.Errorf("(-want, +got)\n%s", diff)
	}
}

func TestEffectiveType(t *testing.T) {
	const schema = `
	{"properties": {
		"o": {"type": "object"},
		"a": {"type": "array"},
		"n": {"type": "number"},
		"s": {"type": "string"},
		"z": {"type": "null"},
		"b": {"type": "boolean"},

		"t0": {"prefixItems": [{}], "items": {"type": "number"}},
		"t1": {"items": {"type": "number"}, "maxItems": 3},

		"v": {"maxItems": 3}
	}}
	`

	var s *Schema
	if err := json.Unmarshal([]byte(schema), &s); err != nil {
		t.Fatalf("json.Unmarshal: %v", err)
	}

	var got []string
	for _, p := range s.Properties {
		got = append(got, p.EffectiveType())
	}

	want := strings.Fields(`
		object
		array
		number
		string
		null
		boolean
		array
		array
		value
	`)
	if !reflect.DeepEqual(want, got) {
		t.Errorf("\ngot:\n\t%v\nwant:\n\t%v", got, want)
	}
}
76 grammar/testdata/schemas.txt vendored
@@ -1,76 +0,0 @@
# This file holds tests for JSON schema to EBNF grammar conversions.
#
# The format is a JSON schema, followed by the expected EBNF grammar. Each test
# MAY be preceded by a comment that describes the test (e.g. the test name), followed by
# the JSON schema and the expected EBNF grammar. If no comment is present, the test
# name the tests number in the file (e.g. "#0", "#1", etc.)
#
# Blank lines signify the end or start of a new test. Comments can be added
# anywhere in the file, but they must be preceded by a '#' character and start at
# the beginning of the line.

# default
{}
root ::= value;

{"properties": {}}
root ::= value;

# array
{"properties": {"a": {"type": "array", "items": {"type": "string"}}}}
root_0_tuple_0 ::= string;
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root ::= "{" "a" ":" root_0 "}";

# array with nested array
{"type": "array", "items": {"type": "array", "items": {"type": "string"}}}
root_tuple_0_tuple_0 ::= string;
root_tuple_0 ::= "[" ( root_tuple_0_tuple_0 )* "]";
root ::= "[" ( root_tuple_0 )* "]";

# object
{"properties": {"e": {}}}
root_0 ::= value;
root ::= "{" "e" ":" root_0 "}";

# object with nested object
{"properties": {"o": {"type": "object", "properties": {"e": {}}}}}
root_0_0 ::= value;
root_0 ::= "{" "e" ":" root_0_0 "}";
root ::= "{" "o" ":" root_0 "}";

# boolean
{"type": "boolean"}
root ::= boolean;

# number
{"properties": {"n": {"type": "number", "minimum": 123, "maximum": 4567}}}
root_0 ::= number;
root ::= "{" "n" ":" root_0 "}";

# string
{"type": "string"}
root ::= string;

# string with enum
{"type": "string", "enum": ["a", "b", "c"]}
root ::= ( "\"a\"" "|" "\"b\"" "|" "\"c\"" );

# spaces in key
{"properties": {"a b": {}}}
root_0 ::= value;
root ::= "{" "a b" ":" root_0 "}";

# issue7978
{ "type": "object", "properties": { "steps": { "type": "array", "items": { "type": "object", "properties": { "explanation": { "type": "string" }, "output": { "type": "string" } }, "required": [ "explanation", "output" ], "additionalProperties": false } }, "final_answer": { "type": "string" } }, "required": [ "steps", "final_answer" ], "additionalProperties": false }
root_0_tuple_0_0 ::= string;
root_0_tuple_0_1 ::= string;
root_0_tuple_0 ::= "{" "explanation" ":" root_0_tuple_0_0 "," "output" ":" root_0_tuple_0_1 "}";
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root_1 ::= string;
root ::= "{" "steps" ":" root_0 "," "final_answer" ":" root_1 "}";

# !! # special characters in key
# !! {"properties": {"a!b": {}}}
# !! !invalid character '!' in key
# !!
126 llama/README.md
@@ -1,157 +1,53 @@
 # `llama`
 
-This package integrates the [llama.cpp](https://github.com/ggerganov/llama.cpp) library as a Go package and makes it easy to build it with tags for different CPU and GPU processors.
-
-Supported:
-
-- [x] CPU
-- [x] avx, avx2
-- [x] macOS Metal
-- [x] Windows CUDA
-- [x] Windows ROCm
-- [x] Linux CUDA
-- [x] Linux ROCm
-- [x] Llava
-
-Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these shared libraries are created:
-
-- `ggml_cuda.dll` on Windows or `ggml_cuda.so` on Linux
-- `ggml_hipblas.dll` on Windows or `ggml_hipblas.so` on Linux
-
-> Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a a crash. In a future change the same runtime should be used in both cases to avoid crashes.
-
-## Building
-
-```
-go build .
-```
-
-### AVX
-
-```shell
-go build -tags avx .
-```
-
-### AVX2
-
-```shell
-# go doesn't recognize `-mfma` as a valid compiler flag
-# see https://github.com/golang/go/issues/17895
-go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
-go build -tags=avx,avx2 .
-```
-
-## Linux
-
-### CUDA
-
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
-
-```shell
-make ggml_cuda.so
-go build -tags avx,cuda .
-```
-
-### ROCm
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
-
-```shell
-make ggml_hipblas.so
-go build -tags avx,rocm .
-```
-
-## Windows
-
-Download [w64devkit](https://github.com/skeeto/w64devkit/releases/latest) for a simple MinGW development environment.
-
-### CUDA
-
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build the cuda code:
-
-```shell
-make ggml_cuda.dll
-go build -tags avx,cuda .
-```
-
-### ROCm
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
-
-```shell
-make ggml_hipblas.dll
-go build -tags avx,rocm .
-```
-
-## Building runners
-
-```shell
-# build all runners for this platform
-make -j
-```
+This package provides Go bindings to [llama.cpp](https://github.com/ggerganov/llama.cpp).
 
 ## Vendoring
 
-Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
+Ollama vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/llama.cpp/tree/master/ggml/src). While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit.
 
 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
 
 ```
-make apply-patches
+make -f Makefile.sync apply-patches
 ```
 
 ### Updating Base Commit
 
 **Pin to new base commit**
 
-To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
-
-#### Applying patches
+To change the base commit, update `FETCH_HEAD` in Makefile.sync.
 
 When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
 
 Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
 
 ```
-make apply-patches
+make -f Makefile.sync apply-patches
 ```
 
-If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+
+Once all patches are applied, commit the changes to the tracking repository.
 
 ```
-make create-patches sync
+make -f Makefile.sync format-patches sync
 ```
 
-Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
-
 ### Generating Patches
 
 When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
 
 ```
-make apply-patches
+make -f Makefile.sync clean apply-patches
 ```
 
-Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
-
-```
-make sync
-make -j 8
-go build .
-```
-
-> [!IMPORTANT]
-> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
-
 Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
 
 ```
-make create-patches
+make -f Makefile.sync format-patches
 ```
 
-> [!IMPORTANT]
-> Once you have completed this step, it is safe to run `apply-patches` since your change is preserved in the patches.
-
 In your `./vendor/` directory, create a branch, and cherry-pick the new commit to that branch, then submit a PR upstream to llama.cpp.
 
 Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches.
2 llama/build-info.cpp generated vendored
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "ba1cb19cdd0d92e012e0f6e009e0620f854b6afd";
+char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
4 llama/build-info.cpp.in Normal file
@@ -0,0 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "@FETCH_HEAD@";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
36 llama/llama.cpp/examples/llava/clip.cpp vendored
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-#ifdef GGML_USE_CUDA
-    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_CANN
-    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_VULKAN
-    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_SYCL
-    new_clip->backend = ggml_backend_sycl_init(0);
-    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-#endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
+    ggml_backend_t backend = ggml_backend_init_best();
+    if (backend == nullptr) {
+        LOG_ERR("%s: failed to initialize backend\n", __func__);
+        clip_free(new_clip);
+        gguf_free(ctx);
+        return nullptr;
     }
+    LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+    new_clip->backend = backend;
 
     // model size and capabilities
     {
@@ -3,5 +3,6 @@ package llama
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/../include
 // #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include
+// #cgo windows CPPFLAGS: -D_WIN32_WINNT=0x0602
 import "C"
 import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
@@ -199,21 +199,25 @@ func (c *Context) KvCacheDefrag() {
 
 // Get the embeddings for a sequence id
 func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
-	embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
-	if embeddings == nil {
+	e := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
+	if e == nil {
 		return nil
 	}
 
-	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
+	embeddings := make([]float32, c.Model().NEmbd())
+	_ = copy(embeddings, unsafe.Slice((*float32)(e), c.Model().NEmbd()))
+	return embeddings
 }
 
 func (c *Context) GetEmbeddingsIth(i int) []float32 {
-	embeddings := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
-	if embeddings == nil {
+	e := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
+	if e == nil {
 		return nil
 	}
 
-	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
+	embeddings := make([]float32, c.Model().NEmbd())
+	_ = copy(embeddings, unsafe.Slice((*float32)(e), c.Model().NEmbd()))
+	return embeddings
 }
 
 type ModelParams struct {
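The embeddings change above swaps a returned `unsafe.Slice` view of llama.cpp-owned memory for a copy into a Go-allocated slice; the view could be invalidated once the C side reuses its buffer on a later decode. A minimal sketch of the pattern (the helper name is illustrative, not from the repo):

```go
package llama

import "unsafe"

// copyFloats shows the pattern adopted above: instead of returning
// unsafe.Slice(p, n), which aliases a buffer owned by the C side and may
// be overwritten by a subsequent llama.cpp call, copy the values into a
// slice owned by the Go runtime so callers can retain it safely.
func copyFloats(p unsafe.Pointer, n int) []float32 {
	out := make([]float32, n)
	copy(out, unsafe.Slice((*float32)(p), n))
	return out
}
```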
31 llama/mllama.cpp vendored
@@ -558,30 +558,15 @@ struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1)
 
     mllama_ctx *new_mllama = new mllama_ctx{};
 
-#ifdef GGML_USE_CUDA
-    new_mllama->backend = ggml_backend_cuda_init(0);
-    LOG("vision using CUDA backend");
-#endif
-
-#ifdef GGML_USE_METAL
-    new_mllama->backend = ggml_backend_metal_init();
-    LOG("vision using Metal backend");
-#endif
-
-#ifdef GGML_USE_CANN
-    new_mllama->backend = ggml_backend_cann_init(0);
-    LOG("vision using CANN backend");
-#endif
-
-#ifdef GGML_USE_VULKAN
-    new_mllama->backend = ggml_backend_vk_init(0);
-    LOG("vision using Vulkan backend");
-#endif
-
-    if (!new_mllama->backend) {
-        new_mllama->backend = ggml_backend_cpu_init();
-        LOG("vision using CPU backend");
+    ggml_backend_t backend = ggml_backend_init_best();
+    if (backend == nullptr) {
+        LOG("%s: failed to initialize backend\n", __func__);
+        mllama_free(new_mllama);
+        gguf_free(ctx);
+        return nullptr;
     }
+    LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+    new_mllama->backend = backend;
 
     // load tensors
     {
@@ -1,14 +1,14 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
 Date: Sat, 4 Jan 2025 22:52:48 -0800
-Subject: [PATCH] re-enable gpu for clip
+Subject: [PATCH] use dynamic backend loading for clip
 
 ---
- examples/llava/clip.cpp | 86 ++++++++++++++++++++---------------------
- 1 file changed, 43 insertions(+), 43 deletions(-)
+ examples/llava/clip.cpp | 74 +++++++++++++++--------------------------
+ 1 file changed, 27 insertions(+), 47 deletions(-)
 
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index b3c1829f..718052e1 100644
+index b3c1829f..86b91d5c 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
 @@ -8,25 +8,25 @@
@@ -56,7 +56,7 @@ index b3c1829f..718052e1 100644
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
-@@ -1235,30 +1235,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 }
 }
 
@@ -84,30 +84,19 @@ index b3c1829f..718052e1 100644
 -// new_clip->backend = ggml_backend_sycl_init(0);
 -// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
 -//#endif
-+#ifdef GGML_USE_CUDA
-+    new_clip->backend = ggml_backend_cuda_init(0);
-+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_METAL
-+    new_clip->backend = ggml_backend_metal_init();
-+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_CANN
-+    new_clip->backend = ggml_backend_cann_init(0);
-+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_VULKAN
-+    new_clip->backend = ggml_backend_vk_init(0);
-+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_SYCL
-+    new_clip->backend = ggml_backend_sycl_init(0);
-+    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-+#endif
+-
+-    if (!new_clip->backend) {
+-        new_clip->backend = ggml_backend_cpu_init();
+-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
++    ggml_backend_t backend = ggml_backend_init_best();
++    if (backend == nullptr) {
++        LOG_ERR("%s: failed to initialize backend\n", __func__);
++        clip_free(new_clip);
++        gguf_free(ctx);
++        return nullptr;
+ }
++    LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
++    new_clip->backend = backend;
 
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
+    // model size and capabilities
+    {
@@ -0,0 +1,29 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 15:59:04 -0800
Subject: [PATCH] add phony target ggml-cpu for all cpu variants

---
 ggml/src/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
   endforeach()

   ggml_add_cpu_backend_variant_impl(${tag_name})
+  add_dependencies(ggml-cpu ggml-cpu-${tag_name})
 endfunction()

 ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
   if (NOT GGML_BACKEND_DL)
     message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
   endif()
+  add_custom_target(ggml-cpu)
   ggml_add_cpu_backend_variant(sandybridge AVX)
   ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
   ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
@@ -443,7 +443,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		s.lc.Synchronize()
 	}
 
-	var totalSamplingTime time.Duration
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -478,12 +477,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}
 
 		// sample a token
-		samplingStart := time.Now()
 		token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
 		seq.samplingCtx.Accept(token, true)
-		samplingTime := time.Since(samplingStart)
-		totalSamplingTime += samplingTime
-		slog.Info("sampling time", "time", samplingTime)
 		piece := s.model.TokenToPiece(token)
 
 		seq.numPredicted++
@@ -640,7 +635,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	samplingParams.Seed = uint32(req.Seed)
 	samplingParams.Grammar = req.Grammar
 
-	start := time.Now()
 	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
 		numPredict: req.NumPredict,
 		stop:       req.Stop,
@@ -648,7 +642,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		samplingParams: &samplingParams,
 		embedding:      false,
 	})
-	slog.Info("new sequence created", "duration", time.Since(start))
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
 		return
@@ -1,4 +1,4 @@
-package ggml
+package llm
 
 import "fmt"
 
@@ -32,9 +32,10 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
+	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ4_XS
 	fileTypeIQ2_M
+	fileTypeIQ4_XS
 	fileTypeIQ1_M
 	fileTypeBF16
 
@@ -93,6 +94,8 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
+	case "IQ3_M":
+		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
 	case "IQ4_XS":
@@ -160,6 +163,8 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
+	case fileTypeIQ3_M:
+		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
149
llm/ggla.go
Normal file
149
llm/ggla.go
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/binary"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
type containerGGLA struct {
|
+	version uint32
+}
+
+func (c *containerGGLA) Name() string {
+	return "ggla"
+}
+
+func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
+	if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
+		return nil, err
+	}
+
+	switch c.version {
+	case 1:
+	default:
+		return nil, errors.New("invalid version")
+	}
+
+	model := newGGLA(c)
+	err := model.decode(rs)
+	return model, err
+}
+
+type ggla struct {
+	*containerGGLA
+
+	kv      KV
+	tensors []*Tensor
+
+	tensorOffset uint64
+}
+
+func newGGLA(container *containerGGLA) *ggla {
+	return &ggla{
+		containerGGLA: container,
+		kv:            make(KV),
+	}
+}
+
+func (llm *ggla) KV() KV {
+	return llm.kv
+}
+
+func (llm *ggla) Tensors() *Tensors {
+	return &Tensors{
+		Items:  llm.tensors,
+		Offset: llm.tensorOffset,
+	}
+}
+
+func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
+	var r uint32
+	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
+		return err
+	}
+	llm.kv["r"] = r
+
+	var alpha uint32
+	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
+		return err
+	}
+	llm.kv["alpha"] = alpha
+
+	offset, err := rs.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return err
+	}
+
+	llm.tensorOffset = uint64(offset)
+
+	for {
+		var dims uint32
+		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
+			if errors.Is(err, io.EOF) {
+				return nil
+			}
+			return err
+		}
+
+		defer func() {
+			if errors.Is(retErr, io.EOF) {
+				retErr = io.ErrUnexpectedEOF
+			}
+		}()
+
+		var namesize uint32
+		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
+			return err
+		}
+
+		var t Tensor
+		if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
+			return err
+		}
+
+		t.Shape = make([]uint64, dims)
+		for i := 0; uint32(i) < dims; i++ {
+			var shape32 uint32
+			if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
+				return err
+			}
+
+			t.Shape[i] = uint64(shape32)
+		}
+
+		// ggla tensor shape is reversed
+		// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
+		slices.Reverse(t.Shape)
+
+		name := make([]byte, namesize)
+		if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
+			return err
+		}
+
+		t.Name = string(name)
+
+		offset, err := rs.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return err
+		}
+
+		if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
+			return err
+		}
+
+		offset, err = rs.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return err
+		}
+
+		t.Offset = uint64(offset)
+
+		if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
+			return err
+		}
+
+		llm.tensors = append(llm.tensors, &t)
+	}
+}
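The `(offset+31)&-32-offset` seek above skips padding so each tensor's data starts on a 32-byte boundary. A standalone sketch of that round-up arithmetic, with illustrative names that are not from the codebase:

package main

import "fmt"

// align32 rounds n up to the next multiple of 32, matching the
// (offset+31)&-32 expression used when skipping tensor padding.
// In two's complement, &-32 is the same as clearing the low five
// bits, written here with Go's AND NOT operator.
func align32(n int64) int64 {
	return (n + 31) &^ 31
}

func main() {
	for _, n := range []int64{0, 1, 31, 32, 33, 100} {
		fmt.Println(n, "->", align32(n)) // 0, 32, 32, 32, 64, 128
	}
}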
@@ -1,15 +1,15 @@
-package ggml
+package llm

 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
-	"log/slog"
 	"slices"
 	"strings"
+	"sync"

-	"github.com/ollama/ollama/fs/util/bufioutil"
+	"github.com/ollama/ollama/util/bufioutil"
 )

 type GGML struct {
@@ -19,168 +19,145 @@ type GGML struct {

 type model interface {
 	KV() KV
-	Tensors() Tensors
+	Tensors() *Tensors
 }

 type KV map[string]any

+func (kv KV) u64(key string) uint64 {
+	switch v := kv[key].(type) {
+	case uint64:
+		return v
+	case uint32:
+		return uint64(v)
+	case float64:
+		return uint64(v)
+	default:
+		return 0
+	}
+}
+
 func (kv KV) Architecture() string {
-	return kv.String("general.architecture", "unknown")
+	if s, ok := kv["general.architecture"].(string); ok {
+		return s
+	}
+
+	return "unknown"
 }

 func (kv KV) Kind() string {
-	return kv.String("general.type", "unknown")
+	if s, ok := kv["general.type"].(string); ok {
+		return s
+	}
+
+	return "unknown"
 }

 func (kv KV) ParameterCount() uint64 {
-	return keyValue[uint64](kv, "general.parameter_count")
+	return kv.u64("general.parameter_count")
 }

 func (kv KV) FileType() fileType {
-	if t := kv.Uint("general.file_type"); t > 0 {
-		return fileType(t)
+	if u64 := kv.u64("general.file_type"); u64 > 0 {
+		return fileType(uint32(u64))
 	}

 	return fileTypeUnknown
 }

 func (kv KV) BlockCount() uint64 {
-	return uint64(kv.Uint("block_count"))
-}
-
-func (kv KV) EmbeddingLength() uint64 {
-	return uint64(kv.Uint("embedding_length"))
+	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
 }

 func (kv KV) HeadCount() uint64 {
-	return uint64(kv.Uint("attention.head_count"))
+	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
 }

 func (kv KV) HeadCountKV() uint64 {
-	return uint64(kv.Uint("attention.head_count_kv", 1))
+	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
+		return headCountKV
+	}
+
+	return 1
 }

 func (kv KV) EmbeddingHeadCount() uint64 {
 	if heads := kv.HeadCount(); heads > 0 {
-		return kv.EmbeddingLength() / heads
+		return kv.EmbeddingLength() / kv.HeadCount()
 	}

 	return 0
 }

 func (kv KV) EmbeddingHeadCountK() uint64 {
-	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
+	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
+		return k
+	}
+
+	return kv.EmbeddingHeadCount()
 }

 func (kv KV) EmbeddingHeadCountV() uint64 {
-	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
+	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
+		return v
+	}
+
+	return kv.EmbeddingHeadCount()
 }

 func (kv KV) GQA() uint64 {
 	return kv.HeadCount() / kv.HeadCountKV()
 }

+func (kv KV) EmbeddingLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
+}
+
 func (kv KV) ContextLength() uint64 {
-	return uint64(kv.Uint("context_length"))
+	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }

 func (kv KV) ChatTemplate() string {
-	return kv.String("tokenizer.chat_template")
-}
-
-func (kv KV) String(key string, defaultValue ...string) string {
-	return keyValue(kv, key, append(defaultValue, "")...)
-}
-
-func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
-	return keyValue(kv, key, append(defaultValue, 0)...)
-}
-
-func (kv KV) Float(key string, defaultValue ...float32) float32 {
-	return keyValue(kv, key, append(defaultValue, 0)...)
-}
-
-func (kv KV) Strings(key string, defaultValue ...[]string) []string {
-	r := keyValue(kv, key, &array{})
-	s := make([]string, r.size)
-	for i := range r.size {
-		s[i] = r.values[i].(string)
-	}
-
+	s, _ := kv["tokenizer.chat_template"].(string)
 	return s
 }

-func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
-	r := keyValue(kv, key, &array{})
-	s := make([]uint32, r.size)
-	for i := range r.size {
-		s[i] = uint32(r.values[i].(int32))
-	}
-
-	return s
-}
-
-func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
-	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
-		key = kv.Architecture() + "." + key
-	}
-
-	if val, ok := kv[key]; ok {
-		return val.(T)
-	}
-
-	slog.Warn("key not found", "key", key, "default", defaultValue[0])
-	return defaultValue[0]
-}
-
 type Tensors struct {
-	items  []*Tensor
+	Items  []*Tensor
 	Offset uint64
+
+	layers     map[string]Layer
+	layersOnce sync.Once
 }

-func (s Tensors) Items(prefix ...string) []*Tensor {
-	if len(prefix) == 0 {
-		return s.items
-	}
-
-	var items []*Tensor
-	for _, t := range s.items {
-		if strings.HasPrefix(t.Name, prefix[0]) {
-			items = append(items, t)
-		}
-	}
-
-	return items
-}
-
-func (ts Tensors) Layers() map[string]Layer {
-	layers := make(map[string]Layer)
-	for _, t := range ts.items {
-		parts := strings.Split(t.Name, ".")
-		if i := slices.Index(parts, "blk"); i > 0 {
-			parts = append([]string{
-				strings.Join(parts[:i], "."),
-				strings.Join(parts[i:i+2], "."),
-			}, parts[i+2:]...)
-		} else if i == 0 {
-			parts = append([]string{
-				strings.Join(parts[i:i+2], "."),
-			}, parts[i+2:]...)
-		}
-
-		if _, ok := layers[parts[0]]; !ok {
-			layers[parts[0]] = make(Layer)
-		}
-
-		layers[parts[0]][strings.Join(parts[1:], ".")] = t
-	}
-
-	return layers
+func (ts *Tensors) Layers() map[string]Layer {
+	ts.layersOnce.Do(func() {
+		ts.layers = make(map[string]Layer)
+		for _, t := range ts.Items {
+			parts := strings.Split(t.Name, ".")
+			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
+				if len(parts) > index+2 {
+					// blk and mm should have a number after them, join it
+					parts = append(
+						[]string{strings.Join(parts[:index+2], ".")},
+						parts[index+2:]...)
+				}
+			}
+
+			if _, ok := ts.layers[parts[0]]; !ok {
+				ts.layers[parts[0]] = make(Layer)
+			}
+
+			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
+		}
+	})
+
+	return ts.layers
 }

 type Layer map[string]*Tensor

-func (l Layer) Size() (size uint64) {
+func (l Layer) size() (size uint64) {
 	for _, t := range l {
 		size += t.Size()
 	}
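The `u64` helper added above exists because decoded GGUF metadata surfaces as different concrete Go types depending on how the value was written, and numbers that round-trip through JSON come back as `float64`. A minimal sketch of the same normalization; the map contents here are illustrative:

package main

import "fmt"

func main() {
	// kv mimics the decoded metadata map; the key is illustrative.
	kv := map[string]any{"llama.block_count": uint32(32)}

	var blocks uint64
	switch v := kv["llama.block_count"].(type) {
	case uint64:
		blocks = v
	case uint32:
		blocks = uint64(v)
	case float64: // e.g. a value that round-tripped through JSON
		blocks = uint64(v)
	}
	fmt.Println(blocks) // 32
}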
@@ -278,6 +255,8 @@ func (t Tensor) typeSize() uint64 {
 		return 8
 	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
+	case 30: // BF16
+		return 2
 	default:
 		return 0
 	}
@@ -316,7 +295,7 @@ const (

 var ErrUnsupportedFormat = errors.New("unsupported model format")

-func DetectContentType(b []byte) string {
+func DetectGGMLType(b []byte) string {
 	switch binary.LittleEndian.Uint32(b[:4]) {
 	case FILE_MAGIC_GGML:
 		return "ggml"
@@ -333,12 +312,12 @@ func DetectContentType(b []byte) string {
 	}
 }

-// Decode decodes a GGML model from the given reader.
+// DecodeGGML decodes a GGML model from the given reader.
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	if maxArraySize == 0 {
 		maxArraySize = 1024
 	}
@@ -352,6 +331,10 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {

 	var c container
 	switch magic {
+	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
+		return nil, 0, ErrUnsupportedFormat
+	case FILE_MAGIC_GGLA:
+		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
 		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
@@ -547,20 +530,21 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }

 // SupportsKVCacheType checks if the requested cache type is supported
-func (llm GGML) SupportsKVCacheType(cacheType string) bool {
-	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
+func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
+	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
+	return slices.Contains(validKVCacheTypes, cacheType)
 }

 // SupportsFlashAttention checks if the model supports flash attention
-func (llm GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
+func (ggml GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}

 	// Check head counts match and are non-zero
-	headCountK := llm.KV().EmbeddingHeadCountK()
-	headCountV := llm.KV().EmbeddingHeadCountV()
+	headCountK := ggml.KV().EmbeddingHeadCountK()
+	headCountV := ggml.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
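Both `DetectGGMLType` and `DecodeGGML` dispatch on the first four bytes of the file, read as a little-endian `uint32`. A hedged sketch of that check; the file path is illustrative, and the constant is the ASCII bytes "GGUF" interpreted little-endian:

package main

import (
	"encoding/binary"
	"fmt"
	"log"
	"os"
)

func main() {
	f, err := os.Open("model.gguf") // illustrative path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	var magic uint32
	if err := binary.Read(f, binary.LittleEndian, &magic); err != nil {
		log.Fatal(err)
	}
	if magic == 0x46554747 { // "GGUF" in little-endian byte order
		fmt.Println("gguf")
	}
}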
llm/ggml_test.go (new file, 1 line)
@@ -0,0 +1 @@
+package llm
@@ -1,4 +1,4 @@
-package ggml
+package llm

 import (
 	"bytes"
@@ -8,9 +8,10 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"maps"
 	"slices"
 	"strings"
+
+	"golang.org/x/exp/maps"
 )

 type containerGGUF struct {
@@ -109,9 +110,9 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }

-func (llm *gguf) Tensors() Tensors {
-	return Tensors{
-		items:  llm.tensors,
+func (llm *gguf) Tensors() *Tensors {
+	return &Tensors{
+		Items:  llm.tensors,
 		Offset: llm.tensorOffset,
 	}
 }
@@ -522,7 +523,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 		return err
 	}

-	keys := slices.Collect(maps.Keys(kv))
+	keys := maps.Keys(kv)
 	slices.Sort(keys)

 	for _, key := range keys {
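The `maps.Keys` change above swaps packages rather than behavior: `golang.org/x/exp/maps.Keys` returns a `[]K` directly, while the standard library's `maps.Keys` (Go 1.23+) returns an iterator that has to be collected first. A minimal sketch of the stdlib form:

package main

import (
	"fmt"
	"maps"
	"slices"
)

func main() {
	kv := map[string]int{"b": 2, "a": 1}

	// maps.Keys yields an iter.Seq[string]; collect it into a slice.
	keys := slices.Collect(maps.Keys(kv))
	slices.Sort(keys) // map iteration order is randomized, so sort
	fmt.Println(keys) // [a b]
}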
@@ -11,19 +11,18 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/fs/ggml"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		estimate := EstimateGPULayers(gpus, f, projectors, opts)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
 		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
-			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
+			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
 			}
 		} else {
@@ -71,7 +70,7 @@ type MemoryEstimate struct {

 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64

@@ -116,31 +115,33 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

-	layers := f.Tensors().Layers()
+	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.Size()
+		layerSize = blk0.size()
 	} else {
 		slog.Warn("model missing blk.0 layer size")
 	}

-	var kvct string
-	if envconfig.FlashAttention() &&
+	fa := envconfig.FlashAttention() &&
 		discover.GetGPUInfo().FlashAttentionSupported() &&
-		f.SupportsFlashAttention() {
+		ggml.SupportsFlashAttention()
+
+	var kvct string
+	if fa {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && f.SupportsKVCacheType(requested) {
+		if requested != "" && ggml.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}

-	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)

 	// KV is proportional to the number of layers
-	layerSize += kv / f.KV().BlockCount()
+	layerSize += kv / ggml.KV().BlockCount()

 	if graphPartialOffload == 0 {
-		graphPartialOffload = f.KV().GQA() * kv / 6
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
@@ -155,12 +156,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	if layer, ok := layers["output_norm"]; ok {
-		memoryLayerOutput += layer.Size()
+		memoryLayerOutput += layer.size()
 	}
 	if layer, ok := layers["output"]; ok {
-		memoryLayerOutput += layer.Size()
+		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
-		memoryLayerOutput += layer.Size()
+		memoryLayerOutput += layer.size()
 	}

 	// Output layer handled at the end if we have space
@@ -210,11 +211,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// For all the layers, find where they can fit on the GPU(s)
-	for i := range int(f.KV().BlockCount()) {
+	for i := range int(ggml.KV().BlockCount()) {
 		// Some models have inconsistent layer sizes
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			layerSize = blk.Size()
-			layerSize += kv / f.KV().BlockCount()
+			layerSize = blk.size()
+			layerSize += kv / ggml.KV().BlockCount()
 		}
 		memoryWeights += layerSize

@@ -237,10 +238,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 			}
 		}
 	}
-	if layerCount >= int(f.KV().BlockCount()) {
+	if layerCount >= int(ggml.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
-		for i := layerCount; i < int(f.KV().BlockCount()); i++ {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
 			overflow += layerSize
 		}
 	}
@@ -258,7 +259,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		}
 	}

-	if layerCount < int(f.KV().BlockCount())+1 {
+	if layerCount < int(ggml.KV().BlockCount())+1 {
 		fullyLoaded = false
 		overflow += memoryLayerOutput
 	}
@@ -310,7 +311,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

 		inferenceLibrary:    gpus[0].Library,
 		layersRequested:     opts.NumGPU,
-		layersModel:         int(f.KV().BlockCount()) + 1,
+		layersModel:         int(ggml.KV().BlockCount()) + 1,
 		availableList:       availableList,
 		kv:                  kv,
 		allocationsList:     allocationsList,
@@ -338,9 +339,22 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	return estimate
 }

-func (m MemoryEstimate) LogValue() slog.Value {
-	attrs := []slog.Attr{
-		slog.String("library", m.inferenceLibrary),
+func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
+
+	log := slog.With()
+	if m.projectorWeights > 0 {
+		log = log.With(
+			slog.Group(
+				"projector",
+				"weights", format.HumanBytes2(m.projectorWeights),
+				"graph", format.HumanBytes2(m.projectorGraph),
+			),
+		)
+	}
+
+	log.Info(
+		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
 			// requested number of layers to offload
@@ -356,7 +370,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
-			"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -385,17 +399,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
 				"partial", format.HumanBytes2(m.graphPartialOffload),
 			),
 		),
-	}
-
-	if m.projectorWeights > 0 {
-		attrs = append(attrs, slog.Group(
-			"projector",
-			"weights", format.HumanBytes2(m.projectorWeights),
-			"graph", format.HumanBytes2(m.projectorGraph),
-		))
-	}
-
-	return slog.GroupValue(attrs...)
+	)
 }

 func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
@@ -405,13 +409,13 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 	}
 	defer file.Close()

-	ggml, _, err := ggml.Decode(file, 0)
+	ggml, _, err := DecodeGGML(file, 0)
 	if err != nil {
 		return 0, 0
 	}

 	for _, layer := range ggml.Tensors().Layers() {
-		weights += layer.Size()
+		weights += layer.size()
 	}

 	switch arch := ggml.KV().Architecture(); arch {
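For orientation, `EstimateGPULayers` walks the model's repeating blocks and assigns layers to GPUs until free VRAM (minus a reserved overhead) runs out. A deliberately simplified sketch of that packing idea, not the actual estimator; all names and the equal-layer-size assumption are illustrative:

package main

import "fmt"

// fitLayers counts how many equally sized layers fit in free VRAM
// after reserving a fixed overhead; +1 accounts for the output layer.
func fitLayers(freeVRAM, overhead, layerSize uint64, blockCount int) int {
	if freeVRAM <= overhead || layerSize == 0 {
		return 0
	}
	n := int((freeVRAM - overhead) / layerSize)
	if n > blockCount+1 {
		n = blockCount + 1
	}
	return n
}

func main() {
	// 8 GiB free, 1 GiB overhead, 512 MiB per layer, 32 blocks.
	fmt.Println(fitLayers(8<<30, 1<<30, 512<<20, 32)) // 14
}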
@@ -11,7 +11,6 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
-	"github.com/ollama/ollama/fs/ggml"
 )

 func TestEstimateGPULayers(t *testing.T) {
@@ -24,7 +23,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	defer f.Close()
 	inputLayerCount := 5

-	tensors := []ggml.Tensor{
+	tensors := []Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
@@ -33,7 +32,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}
 	assert.Len(t, tensors, inputLayerCount+1)
-	err = ggml.WriteGGUF(f, ggml.KV{
+	err = WriteGGUF(f, KV{
 		"general.architecture":   "llama",
 		"llama.context_length":   uint32(32),
 		"llama.embedding_length": uint32(4096),
llm/server.go (345 changes)
@@ -28,8 +28,6 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/grammar"
 	"github.com/ollama/ollama/llama"
 )

@@ -73,7 +71,7 @@ type llmServer struct {
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -84,17 +82,21 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	}
 	defer f.Close()

-	ggml, _, err := ggml.Decode(f, maxArraySize)
+	ggml, _, err := DecodeGGML(f, maxArraySize)
 	return ggml, err
 }

 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+	var systemTotalMemory uint64
+	var systemFreeMemory uint64
+	var systemSwapFreeMemory uint64
+
 	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory := systemInfo.System.TotalMemory
-	systemFreeMemory := systemInfo.System.FreeMemory
-	systemSwapFreeMemory := systemInfo.System.FreeSwap
+	systemTotalMemory = systemInfo.System.TotalMemory
+	systemFreeMemory = systemInfo.System.FreeMemory
+	systemSwapFreeMemory = systemInfo.System.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@@ -102,12 +104,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		gpus = discover.GetCPUInfo()
 	}

-	var estimate MemoryEstimate
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		estimate = EstimateGPULayers(gpus, f, projectors, opts)
-	} else {
-		estimate = EstimateGPULayers(gpus, f, projectors, opts)
-
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
@@ -132,7 +130,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		}
 	}

-	slog.Info("offload", "", estimate)
+	estimate.log()

 	params := []string{
 		"--model", model,
@@ -176,7 +174,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		fa = false
 	}

-	if fa && !f.SupportsFlashAttention() {
+	if fa && !ggml.SupportsFlashAttention() {
 		slog.Warn("flash attention enabled but not supported by model")
 		fa = false
 	}
@@ -189,7 +187,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt

 	// Flash Attention also supports kv cache quantization
 	// Enable if the requested and kv cache type is supported by the model
-	if kvct != "" && f.SupportsKVCacheType(kvct) {
+	if kvct != "" && ggml.SupportsKVCacheType(kvct) {
 		params = append(params, "--kv-cache-type", kvct)
 	} else {
 		slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -202,7 +200,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	for _, g := range gpus {
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
-			uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
+			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
 			opts.UseMMap = new(bool)
 			*opts.UseMMap = false
 		}
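The `slog.Info("offload", "", estimate)` call removed above relied on slog's `LogValuer` mechanism, where the value formats itself when logged; the replacement calls a plain `log()` method instead. A minimal sketch of the `LogValuer` side, with an illustrative type:

package main

import "log/slog"

type estimate struct{ layers int }

// LogValue lets a value render itself when passed to slog; slog calls
// it automatically when resolving the attribute's value.
func (e estimate) LogValue() slog.Value {
	return slog.GroupValue(slog.Int("layers", e.layers))
}

func main() {
	slog.Info("offload", "estimate", estimate{layers: 14})
}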
@@ -234,149 +232,205 @@
 		params = append(params, "--multiuser-cache")
 	}

-	exe, err := os.Executable()
-	if err != nil {
-		return nil, err
-	}
-
-	// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
-	port := 0
-	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-		var l *net.TCPListener
-		if l, err = net.ListenTCP("tcp", a); err == nil {
-			port = l.Addr().(*net.TCPAddr).Port
-			l.Close()
-		}
-	}
-	if port == 0 {
-		slog.Debug("ResolveTCPAddr failed ", "error", err)
-		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-	}
-	finalParams := []string{"runner"}
-	finalParams = append(finalParams, params...)
-	finalParams = append(finalParams, "--port", strconv.Itoa(port))
-
-	pathEnv := "LD_LIBRARY_PATH"
-	if runtime.GOOS == "windows" {
-		pathEnv = "PATH"
-	}
-	// Start with the server directory for the LD_LIBRARY_PATH/PATH
-	libraryPaths := []string{filepath.Dir(exe)}
-
-	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-		// favor our bundled library dependencies over system libraries
-		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
-	}
-
-	// Note: we always put the dependency path first
-	// since this was the exact version we compiled/linked against
-	if gpus[0].DependencyPath != nil {
-		// assume gpus from the same library have the same dependency path
-		libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
-	}
-
-	// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
-	s := &llmServer{
-		port:        port,
-		cmd:         exec.Command(exe, finalParams...),
-		status:      NewStatusWriter(os.Stderr),
-		options:     opts,
-		modelPath:   model,
-		estimate:    estimate,
-		numParallel: numParallel,
-		sem:         semaphore.NewWeighted(int64(numParallel)),
-		totalLayers: f.KV().BlockCount() + 1,
-		gpus:        gpus,
-		done:        make(chan error, 1),
-	}
-
-	s.cmd.Env = os.Environ()
-	s.cmd.Stdout = os.Stdout
-	s.cmd.Stderr = s.status
-	s.cmd.SysProcAttr = LlamaServerSysProcAttr
-
-	envWorkarounds := [][2]string{}
-	for _, gpu := range gpus {
-		envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-	}
-	visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
-	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-
-	// Update or add the path and visible devices variable with our adjusted version
-	pathNeeded := true
-	devicesNeeded := visibleDevicesEnv != ""
-	for i := range s.cmd.Env {
-		cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-		if strings.EqualFold(cmp[0], pathEnv) {
-			s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-			pathNeeded = false
-		} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
-			s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
-			devicesNeeded = false
-		} else if len(envWorkarounds) != 0 {
-			for _, kv := range envWorkarounds {
-				if strings.EqualFold(cmp[0], kv[0]) {
-					s.cmd.Env[i] = kv[0] + "=" + kv[1]
-				}
-			}
-		}
-	}
-	if pathNeeded {
-		s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-	}
-	if devicesNeeded {
-		s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
-	}
-
-	slog.Info("starting llama server", "cmd", s.cmd.String())
-	if envconfig.Debug() {
-		filteredEnv := []string{}
-		for _, ev := range s.cmd.Env {
-			if strings.HasPrefix(ev, "CUDA_") ||
-				strings.HasPrefix(ev, "ROCR_") ||
-				strings.HasPrefix(ev, "ROCM_") ||
-				strings.HasPrefix(ev, "HIP_") ||
-				strings.HasPrefix(ev, "GPU_") ||
-				strings.HasPrefix(ev, "HSA_") ||
-				strings.HasPrefix(ev, "GGML_") ||
-				strings.HasPrefix(ev, "PATH=") ||
-				strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
-				filteredEnv = append(filteredEnv, ev)
-			}
-		}
-		// Log at debug as the environment is inherited and might contain sensitive information
-		slog.Debug("subprocess", "environment", filteredEnv)
-	}
-
-	if err = s.cmd.Start(); err != nil {
-		// Detect permission denied and augment the message about noexec
-		if errors.Is(err, os.ErrPermission) {
-			return nil, fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, exe)
-		}
-		msg := ""
-		if s.status != nil && s.status.LastErrMsg != "" {
-			msg = s.status.LastErrMsg
-		}
-		return nil, fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-	}
-
-	// reap subprocess when it exits
-	go func() {
-		err := s.cmd.Wait()
-		// Favor a more detailed message over the process exit status
-		if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-			slog.Debug("llama runner terminated", "error", err)
-			if strings.Contains(s.status.LastErrMsg, "unknown model") {
-				s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
-			}
-			s.done <- errors.New(s.status.LastErrMsg)
-		} else {
-			s.done <- err
-		}
-	}()
-
-	return s, nil
+	libs := make(map[string]string)
+	if entries, err := os.ReadDir(discover.LibOllamaPath); err == nil {
+		for _, entry := range entries {
+			libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
+		}
+	}
+
+	lib := gpus[0].RunnerName()
+	requested := envconfig.LLMLibrary()
+	if libs[requested] != "" {
+		slog.Info("using requested gpu library", "requested", requested)
+		lib = requested
+	}
+
+	var compatible []string
+	for k := range libs {
+		// exact match first
+		if k == lib {
+			compatible = append([]string{k}, compatible...)
+			continue
+		}
+
+		// then match the family (e.g. 'cuda')
+		if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
+			compatible = append(compatible, k)
+		}
+	}
+	slog.Debug("compatible gpu libraries", "compatible", compatible)
+
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
+	// without any LD_LIBRARY_PATH flags
+	for {
+		port := 0
+		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+			var l *net.TCPListener
+			if l, err = net.ListenTCP("tcp", a); err == nil {
+				port = l.Addr().(*net.TCPAddr).Port
+				l.Close()
+			}
+		}
+		if port == 0 {
+			slog.Debug("ResolveTCPAddr failed, using random port")
+			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+		}
+		finalParams := []string{"runner"}
+		finalParams = append(finalParams, params...)
+		finalParams = append(finalParams, "--port", strconv.Itoa(port))
+
+		var pathEnv string
+		switch runtime.GOOS {
+		case "windows":
+			pathEnv = "PATH"
+		case "darwin":
+			pathEnv = "DYLD_LIBRARY_PATH"
+		default:
+			pathEnv = "LD_LIBRARY_PATH"
+		}
+
+		var libraryPaths []string
+		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+		}
+
+		if len(compatible) > 0 {
+			c := compatible[0]
+			if libpath, ok := libs[c]; ok {
+				slog.Debug("adding gpu library", "path", libpath)
+				libraryPaths = append(libraryPaths, libpath)
+			}
+		}
+
+		// Note: we always put the dependency path first
+		// since this was the exact version we compiled/linked against
+		if gpus[0].DependencyPath != nil {
+			slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
+			// assume gpus from the same library have the same dependency path
+			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
+		}
+
+		// finally, add the root library path
+		libraryPaths = append(libraryPaths, discover.LibOllamaPath)
+
+		exe, err := os.Executable()
+		if err != nil {
+			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
+		}
+
+		exe, err = filepath.EvalSymlinks(exe)
+		if err != nil {
+			return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
+		}
+
+		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
+		s := &llmServer{
+			port:        port,
+			cmd:         exec.Command(exe, finalParams...),
+			status:      NewStatusWriter(os.Stderr),
+			options:     opts,
+			modelPath:   model,
+			estimate:    estimate,
+			numParallel: numParallel,
+			sem:         semaphore.NewWeighted(int64(numParallel)),
+			totalLayers: ggml.KV().BlockCount() + 1,
+			gpus:        gpus,
+			done:        make(chan error, 1),
+		}
+
+		s.cmd.Env = os.Environ()
+		s.cmd.Stdout = os.Stdout
+		s.cmd.Stderr = s.status
+		s.cmd.SysProcAttr = LlamaServerSysProcAttr
+
+		envWorkarounds := [][2]string{}
+		for _, gpu := range gpus {
+			envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
+		}
+		visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
+		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+
+		// Update or add the path and visible devices variable with our adjusted version
+		pathNeeded := true
+		devicesNeeded := visibleDevicesEnv != ""
+		for i := range s.cmd.Env {
+			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
+			if strings.EqualFold(cmp[0], pathEnv) {
+				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
+				pathNeeded = false
+			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
+				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
+				devicesNeeded = false
+			} else if len(envWorkarounds) != 0 {
+				for _, kv := range envWorkarounds {
+					if strings.EqualFold(cmp[0], kv[0]) {
+						s.cmd.Env[i] = kv[0] + "=" + kv[1]
+					}
+				}
+			}
+		}
+		if pathNeeded {
+			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
+		}
+		if devicesNeeded {
+			s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
+		}
+
+		slog.Info("starting llama server", "cmd", s.cmd.String())
+		if envconfig.Debug() {
+			filteredEnv := []string{}
+			for _, ev := range s.cmd.Env {
+				if strings.HasPrefix(ev, "CUDA_") ||
+					strings.HasPrefix(ev, "ROCR_") ||
+					strings.HasPrefix(ev, "ROCM_") ||
+					strings.HasPrefix(ev, "HIP_") ||
+					strings.HasPrefix(ev, "GPU_") ||
+					strings.HasPrefix(ev, "HSA_") ||
+					strings.HasPrefix(ev, "GGML_") ||
+					strings.HasPrefix(ev, "PATH=") ||
+					strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
+					strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
+					filteredEnv = append(filteredEnv, ev)
+				}
+			}
+			// Log at debug as the environment is inherited and might contain sensitive information
+			slog.Debug("subprocess", "environment", filteredEnv)
+		}
+
+		if err = s.cmd.Start(); err != nil {
+			var msg string
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			err := fmt.Errorf("error starting runner: %v %s", err, msg)
+			if len(compatible) == 0 {
+				return nil, err
+			}
+
+			slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
+			compatible = compatible[1:]
+			continue
+		}
+
+		// reap subprocess when it exits
+		go func() {
+			err := s.cmd.Wait()
+			// Favor a more detailed message over the process exit status
+			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+				slog.Error("llama runner terminated", "error", err)
+				if strings.Contains(s.status.LastErrMsg, "unknown model") {
+					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+				}
+				s.done <- errors.New(s.status.LastErrMsg)
+			} else {
+				s.done <- err
+			}
+		}()
+
+		return s, nil
+	}
 }

 type ServerStatus int
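The rewritten startup path above retries across compatible GPU libraries: when the runner fails to start, it drops the head of `compatible`, loops, and eventually tries with no bundled library at all before giving up. A minimal sketch of that fallback pattern; the function and candidate names are illustrative:

package main

import (
	"errors"
	"fmt"
)

// startWithFallback tries the preferred candidate first, drops it on
// failure, and finally runs with an empty library before giving up.
// start is a stand-in for launching the runner subprocess.
func startWithFallback(candidates []string, start func(string) error) error {
	for {
		var lib string
		if len(candidates) > 0 {
			lib = candidates[0]
		}
		if err := start(lib); err != nil {
			if len(candidates) == 0 {
				return err
			}
			candidates = candidates[1:]
			continue
		}
		return nil
	}
}

func main() {
	err := startWithFallback([]string{"cuda_v12", "cuda_v11"}, func(lib string) error {
		if lib == "cuda_v12" {
			return errors.New("driver too old") // simulated failure
		}
		return nil
	})
	fmt.Println(err) // <nil>: cuda_v11 succeeded
}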
@@ -661,9 +715,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		}

 		// User provided a JSON schema
-		g, err := grammar.FromSchema(nil, req.Format)
-		if err != nil {
-			return fmt.Errorf("invalid JSON schema in format: %w", err)
+		g := llama.SchemaToGrammar(req.Format)
+		if g == nil {
+			return fmt.Errorf("invalid JSON schema in format")
 		}
 		request["grammar"] = string(g)
 	}
@@ -683,6 +737,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
 		req.Options.NumPredict = 10 * s.options.NumCtx
 	}
+
 	// Make sure the server is ready
 	status, err := s.getServerStatusRetry(ctx)
 	if err != nil {
@@ -18,8 +18,8 @@ const config: ForgeConfig = {
     asar: true,
     icon: './assets/icon.icns',
     extraResource: [
-      '../dist/ollama',
-      '../dist/darwin-amd64/lib',
+      path.join(__dirname, '../dist/darwin/ollama'),
+      ...fs.readdirSync(path.join(__dirname, '../dist/darwin-amd64/lib/ollama')).map(f => path.join(__dirname, '../dist/darwin-amd64/lib/ollama', f)),
       path.join(__dirname, './assets/iconTemplate.png'),
       path.join(__dirname, './assets/iconTemplate@2x.png'),
       path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -43,7 +43,7 @@ const config: ForgeConfig = {
       }
       : {}),
     osxUniversal: {
-      x64ArchFiles: '**/ollama*',
+      x64ArchFiles: '*',
     },
   },
   rebuildConfig: {},
ml/backend.go (deleted, 191 lines)
@@ -1,191 +0,0 @@
-package ml
-
-import (
-	"bytes"
-	"encoding/binary"
-	"fmt"
-	"os"
-	"strings"
-)
-
-type Config interface {
-	Architecture() string
-	String(string, ...string) string
-	Uint(string, ...uint32) uint32
-	Float(string, ...float32) float32
-
-	Strings(string, ...[]string) []string
-	Uints(string, ...[]uint32) []uint32
-}
-
-type Backend interface {
-	Config() Config
-	Get(name string) Tensor
-	NewContext() Context
-}
-
-var backends = make(map[string]func(*os.File) (Backend, error))
-
-func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
-	if _, ok := backends[name]; ok {
-		panic("backend: backend already registered")
-	}
-
-	backends[name] = f
-}
-
-func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
-		return backend(f)
-	}
-
-	return nil, fmt.Errorf("unsupported backend")
-}
-
-type Context interface {
-	Zeros(dtype DType, shape ...int) Tensor
-	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
-	FromIntSlice(s []int32, shape ...int) (Tensor, error)
-
-	Forward(Tensor)
-	Compute(Tensor) Tensor
-	Close() error
-}
-
-type Tensor interface {
-	Dim(n int) int64
-	Stride(n int) int64
-
-	Shape() []int64
-	DType() DType
-
-	Bytes() []byte
-	Floats() []float32
-
-	Add(ctx Context, t2 Tensor) Tensor
-	Mul(ctx Context, t2 Tensor) Tensor
-	Mulmat(ctx Context, t2 Tensor) Tensor
-
-	Softmax(ctx Context) Tensor
-	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
-	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
-	Scale(ctx Context, s float64) Tensor
-
-	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
-
-	Tanh(ctx Context) Tensor
-	GELU(ctx Context) Tensor
-	SILU(ctx Context) Tensor
-
-	Reshape(ctx Context, shape ...int64) Tensor
-	View(ctx Context, offset int, shape ...int) Tensor
-	Permute(ctx Context, shape ...int) Tensor
-	Contiguous(ctx Context) Tensor
-
-	Pad(ctx Context, shape ...int64) Tensor
-	Unpad(ctx Context, shape ...int64) Tensor
-
-	Stack(ctx Context, dim int, s ...Tensor) Tensor
-	Concat(ctx Context, t2 Tensor, dim int) Tensor
-	Rows(ctx Context, t2 Tensor) Tensor
-	Copy(ctx Context, t2 Tensor) Tensor
-}
-
-type number interface {
-	~int | ~int8 | ~int16 | ~int32 | ~int64 |
-		~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
-		~float32 | ~float64 |
-		~complex64 | ~complex128
-}
-
-func mul[T number](s ...T) T {
-	p := T(1)
-	for _, v := range s {
-		p *= v
-	}
-
-	return p
-}
-
-type DumpOptions struct {
-	// Items is the number of elements to print at the beginning and end of each dimension.
-	Items int64
-
-	// Precision is the number of decimal places to print. Applies to float32 and float64.
-	Precision int
-}
-
-func Dump(t Tensor, opts ...DumpOptions) string {
-	if len(opts) < 1 {
-		opts = append(opts, DumpOptions{
-			Items:     3,
-			Precision: 4,
-		})
-	}
-
-	switch t.DType() {
-	case DTypeF32:
-		return dump[[]float32](t, opts[0])
-	case DTypeI32:
-		return dump[[]int32](t, opts[0])
-	default:
-		return "<unsupported>"
-	}
-}
-
-func dump[S ~[]E, E number](t Tensor, opts DumpOptions) string {
-	bts := t.Bytes()
-	if bts == nil {
-		return "<nil>"
-	}
-
-	s := make(S, mul(t.Shape()...))
-	if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
-		panic(err)
-	}
-
-	shape := t.Shape()
-
-	var sb strings.Builder
-	var f func([]int64, int64)
-	f = func(dims []int64, stride int64) {
-		prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
-		fmt.Fprint(&sb, "[")
-		defer func() { fmt.Fprint(&sb, "]") }()
-		for i := int64(0); i < dims[0]; i++ {
-			if i >= opts.Items && i < dims[0]-opts.Items {
-				fmt.Fprint(&sb, "..., ")
-				// skip to next printable element
-				skip := dims[0] - 2*opts.Items
-				if len(dims) > 1 {
-					stride += mul(append(dims[1:], skip)...)
-					fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
-				}
-				i += skip - 1
-			} else if len(dims) > 1 {
-				f(dims[1:], stride)
-				stride += mul(dims[1:]...)
-				if i < dims[0]-1 {
-					fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
-				}
-			} else {
-				fmt.Fprint(&sb, s[stride+i])
-				if i < dims[0]-1 {
-					fmt.Fprint(&sb, ", ")
-				}
-			}
-		}
-	}
-	f(shape, 0)
-
-	return sb.String()
-}
-
-type DType int
-
-const (
-	DTypeF32 DType = iota
-	DTypeI32
-	DTypeOther
-)
@@ -1,5 +0,0 @@
-package backend
-
-import (
-	_ "github.com/ollama/ollama/ml/backend/ggml"
-)
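The deleted `ml/backend.go` used a name-keyed registry: implementations call `RegisterBackend` from an `init` function, pulled in through a blank import like the one in the five-line file above. A minimal sketch of the pattern, with illustrative types:

package main

import "fmt"

type Backend interface{ Name() string }

var backends = map[string]func() Backend{}

// Register panics on duplicates, mirroring the deleted RegisterBackend.
func Register(name string, f func() Backend) {
	if _, ok := backends[name]; ok {
		panic("backend already registered: " + name)
	}
	backends[name] = f
}

func New(name string) (Backend, error) {
	f, ok := backends[name]
	if !ok {
		return nil, fmt.Errorf("unsupported backend %q", name)
	}
	return f(), nil
}

type ggmlBackend struct{}

func (ggmlBackend) Name() string { return "ggml" }

// In the real layout this init lives in the implementation package and
// runs via the blank import shown above.
func init() { Register("ggml", func() Backend { return ggmlBackend{} }) }

func main() {
	b, _ := New("ggml")
	fmt.Println(b.Name()) // ggml
}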
@@ -1,580 +0,0 @@
package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"log/slog"
	"os"
	"sync"
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
	"golang.org/x/sync/errgroup"

	"github.com/ollama/ollama/ml/backend/ggml/ggml/src"
)

type device struct {
	d *C.struct_ggml_backend_device
}

func (d device) LogValue() slog.Value {
	var free, total uint64
	C.ggml_backend_dev_memory(d.d, (*C.size_t)(&free), (*C.size_t)(&total))

	kind := "unknown"
	switch C.ggml_backend_dev_type(d.d) {
	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
		kind = "cpu"
	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
		kind = "gpu"
	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
		kind = "accel"
	}

	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_backend_dev_name(d.d))),
		slog.String("description", C.GoString(C.ggml_backend_dev_description(d.d))),
		slog.String("kind", kind),
		slog.String("free", format.HumanBytes2(free)),
		slog.String("total", format.HumanBytes2(total)),
	)
}

var devices = sync.OnceValue(func() []device {
	ggml.OnceLoad()

	s := make([]device, C.ggml_backend_dev_count())
	for i := range s {
		s[i] = device{C.ggml_backend_dev_get(C.size_t(i))}
	}

	return s
})

type Backend struct {
	meta       *fs.GGML
	cpus, gpus []Context
	tensors    map[string]*Context
}

func New(r *os.File) (ml.Backend, error) {
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

	var cpus, gpus []Context
	for _, d := range devices() {
		switch C.ggml_backend_dev_type(d.d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			slog.Info("cpu", "device", d)
			cpus = append(cpus, Context{
				ctx: C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
					no_alloc: true,
				}),
				backend: C.ggml_backend_dev_init(d.d, nil),
			})
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			slog.Info("gpu", "device", d)
			gpus = append(gpus, Context{
				ctx: C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
					no_alloc: true,
				}),
				backend: C.ggml_backend_dev_init(d.d, nil),
			})
		}
	}

	ctxFunc := func(s []Context) (*Context, error) {
		for _, e := range s {
			return &e, nil
		}

		return nil, fmt.Errorf("no devices available")
	}

	tensors := make(map[*fs.Tensor]*Context, len(meta.Tensors().Items()))
	for _, t := range meta.Tensors().Items() {
		c, err := ctxFunc(append(gpus, cpus...))
		if err != nil {
			return nil, err
		}

		func() {
			tt := C.ggml_new_tensor(c.ctx, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))

			cname := C.CString(t.Name)
			defer C.free(unsafe.Pointer(cname))
			C.ggml_set_name(tt, cname)

			tensors[t] = c
		}()
	}

	for _, b := range append(gpus, cpus...) {
		C.ggml_backend_alloc_ctx_tensors(b.ctx, b.backend)
	}

	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))

	var g errgroup.Group
	for t, c := range tensors {
		g.Go(func() error {
			bts := make([]byte, t.Size())
			n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
			if err != nil {
				return err
			}

			if n != int(t.Size()) {
				return fmt.Errorf("expected %d bytes, got %d", t.Size(), n)
			}

			cname := C.CString(t.Name)
			defer C.free(unsafe.Pointer(cname))

			C.ggml_backend_tensor_set(C.ggml_get_tensor(c.ctx, cname), unsafe.Pointer(&bts[0]), 0, C.size_t(n))
			return nil
		})
	}

	if err := g.Wait(); err != nil {
		return nil, err
	}

	return &Backend{
		meta: meta,
		cpus: cpus,
		gpus: gpus,
	}, nil
}

func init() {
	ml.RegisterBackend("ggml", New)
}

func (b *Backend) Config() ml.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
	cname := C.CString(name)
	defer C.free(unsafe.Pointer(cname))

	for _, c := range append(b.gpus, b.cpus...) {
		if t := C.ggml_get_tensor(c.ctx, cname); t != nil {
			return &Tensor{t: t}
		}
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
	nodes := max(8192, len(b.meta.Tensors().Items())*5)
	bts := make([]byte, C.size_t(nodes)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(nodes), false))
	c := C.ggml_init(C.struct_ggml_init_params{
		mem_buffer: unsafe.Pointer(&bts[0]),
		mem_size:   C.size_t(len(bts)),
		no_alloc:   true,
	})

	backends := make([]*C.struct_ggml_backend, len(b.gpus)+len(b.cpus))
	bufts := make([]*C.struct_ggml_backend_buffer_type, len(b.gpus)+len(b.cpus))
	for i, c := range append(b.gpus, b.cpus...) {
		backends[i] = c.backend
		bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend)
	}

	return &Context{
		ctx:     c,
		backend: backends[0],
		nodes:   nodes,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
			C.int(len(backends)),
			C.size_t(nodes),
			true,
		),
	}
}

type Context struct {
	ctx     *C.struct_ggml_context
	backend *C.struct_ggml_backend

	sched *C.struct_ggml_backend_sched
	graph *C.struct_ggml_cgraph
	nodes int
}

func (c *Context) Forward(t ml.Tensor) {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.nodes), false)
	}

	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
}

func (c *Context) Compute(t ml.Tensor) ml.Tensor {
	c.Forward(t)
	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)

	backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)

	t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
	C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	return t
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	if len(shape) < 1 || len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	var t *C.struct_ggml_tensor
	switch dtype {
	case ml.DTypeF32:
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
	case ml.DTypeI32:
		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
	default:
		panic("unsupported dtype")
	}

	b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	C.ggml_set_zero(t)
	return &Tensor{t: t}
}

func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
	n := len(s)
	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
	}

	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
	return &Tensor{t: t}, nil
}

func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
	return fromSlice(c, s, shape, C.GGML_TYPE_F32)
}

func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	return fromSlice(c, s, shape, C.GGML_TYPE_I32)
}

func (c *Context) Close() error {
	C.ggml_backend_sched_free(c.sched)
	C.ggml_free(c.ctx)
	return nil
}

type Tensor struct {
	t    *C.struct_ggml_tensor
	data []byte
}

func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

func (t *Tensor) Dim(n int) int64 {
	return int64(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int64 {
	return int64(t.t.nb[n])
}

func (t *Tensor) Shape() []int64 {
	shape := make([]int64, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

func (t *Tensor) Bytes() []byte {
	if bts := C.ggml_get_data(t.t); bts != nil {
		return C.GoBytes(bts, C.int(C.ggml_nbytes(t.t)))
	}

	return nil
}

func (t *Tensor) Floats() (f32s []float32) {
	if t.data != nil {
		f32s = make([]float32, C.ggml_nelements(t.t))
		_ = binary.Read(bytes.NewReader(t.data), binary.LittleEndian, f32s)
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	return (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int64) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Unpad(ctx ml.Context, shape ...int64) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

const (
	ropeTypeNorm C.int = iota
)

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
		ropeFactors = &Tensor{}
	}

	return &Tensor{
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
			C.int(ropeDim),
			131072,       // YaRN n_ctx_train
			ropeTypeNorm, // ROPE_TYPE_NORM
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}
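One detail worth calling out from the deleted file: fromSlice validates a shape by repeatedly dividing the element count rather than multiplying the dimensions. A standalone sketch (not part of the diff; the function name and sample values are illustrative) of that check — note that integer division makes it slightly loose, e.g. 6 elements against shape [2 2] divides down to 1 and passes even though 2*2 != 6.

package main

import "fmt"

// shapeMatches reproduces the check in fromSlice: divide the element count
// by each dimension and require exactly 1 to remain.
func shapeMatches(n int, shape []int) bool {
	for _, v := range shape {
		if v == 0 {
			return false // guard; fromSlice itself would panic on division by zero
		}
		n /= v
	}
	return n == 1
}

func main() {
	fmt.Println(shapeMatches(6, []int{2, 3})) // true
	fmt.Println(shapeMatches(6, []int{3, 3})) // false
	fmt.Println(shapeMatches(6, []int{2, 2})) // true, despite 2*2 != 6
}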
@@ -1,7 +1,9 @@
-protect **/*.go
-protect **/*-embed.*
+protect *.go
+protect *-embed.*
 include include/
 include src/
+include src/CMakeLists.txt
+include src/**/CMakeLists.txt
 include src/ggml-blas/
 include src/ggml-cpu/
 include src/ggml-cpu/amx/
@@ -10,12 +12,11 @@ include src/ggml-cuda/
 include src/ggml-cuda/template-instances/
 include src/ggml-hip/
 include src/ggml-metal/
-include **/CMakeLists.txt
-include **/*.c
-include **/*.h
-include **/*.cpp
-include **/*.cu
-include **/*.cuh
-include **/*.m
-include **/*.metal
+include *.c
+include *.h
+include *.cpp
+include *.cu
+include *.cuh
+include *.m
+include *.metal
 exclude *
262 ml/backend/ggml/ggml/CMakeLists.txt vendored
@@ -1,262 +0,0 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("ggml" C CXX)
include(CheckIncludeFileCXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(GGML_STANDALONE ON)

    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

    # configure project version
    # TODO
else()
    set(GGML_STANDALONE OFF)
endif()

if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
    else()
        set(BUILD_SHARED_LIBS_DEFAULT ON)
    endif()
endif()

# remove the lib prefix on win32 mingw
if (WIN32)
    set(CMAKE_STATIC_LIBRARY_PREFIX "")
    set(CMAKE_SHARED_LIBRARY_PREFIX "")
    set(CMAKE_SHARED_MODULE_PREFIX "")
endif()

option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

#
# option list
#

# TODO: mark all options as advanced when not GGML_STANDALONE

if (APPLE)
    set(GGML_METAL_DEFAULT ON)
    set(GGML_BLAS_DEFAULT ON)
    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
else()
    set(GGML_METAL_DEFAULT OFF)
    set(GGML_BLAS_DEFAULT OFF)
    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()

if (CMAKE_CROSSCOMPILING)
    set(GGML_NATIVE_DEFAULT OFF)
else()
    set(GGML_NATIVE_DEFAULT ON)
endif()

# defaults
if (NOT GGML_LLAMAFILE_DEFAULT)
    set(GGML_LLAMAFILE_DEFAULT OFF)
endif()

if (NOT GGML_CUDA_GRAPHS_DEFAULT)
    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()

# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO    "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)

# debug
option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings" ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF                  "ggml: enable gprof" OFF)

# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)

# sanitizers
option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer" OFF)
option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)

# instruction set specific
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
    set(INS_ENB OFF)
else()
    set(INS_ENB ON)
endif()

option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX         "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2        "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512      "ggml: enable AVX512F" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
if (NOT MSVC)
    # in MSVC F16C and FMA is implied with AVX2/AVX512
    option(GGML_FMA  "ggml: enable FMA" ${INS_ENB})
    option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
    # MSVC does not seem to support AMX
    option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
    option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
    option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX  "ggml: enable lsx" ON)
option(GGML_RVV  "ggml: enable rvv" ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")


if (WIN32)
    set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
endif()

# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)

# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS       "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
    "ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})

option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ    "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
    "ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY  "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM        "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS        "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})

option(GGML_HIP     "ggml: use HIP" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN  "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS     "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG             "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG      "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF              "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE          "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS         "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE                  "ggml: use Kompute" OFF)
option(GGML_METAL                    "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16           "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG             "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG       "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY      "ggml: embed Metal library" ${GGML_METAL})
set(GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
    "ggml: metal minimum macOS version")
set(GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC    "ggml: use RPC" OFF)
option(GGML_SYCL   "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set(GGML_SYCL_TARGET "INTEL" CACHE STRING
    "ggml: sycl target device")
set(GGML_SYCL_DEVICE_ARCH "" CACHE STRING
    "ggml: sycl device architecture")

option(GGML_OPENCL                    "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING          "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS      "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)

# extra artifacts
option(GGML_BUILD_TESTS    "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

#
# dependencies
#

set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)

set(THREADS_PREFER_PTHREAD_FLAG ON)

find_package(Threads REQUIRED)

#
# build the library
#

add_subdirectory(src)

#
# tests and examples
#

if (GGML_BUILD_TESTS)
    enable_testing()
    add_subdirectory(tests)
endif ()

if (GGML_BUILD_EXAMPLES)
    add_subdirectory(examples)
endif ()

#
# install
#

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

# all public headers
set(GGML_PUBLIC_HEADERS
    include/ggml.h
    include/ggml-cpu.h
    include/ggml-alloc.h
    include/ggml-backend.h
    include/ggml-blas.h
    include/ggml-cann.h
    include/ggml-cuda.h
    include/ggml-kompute.h
    include/ggml-opt.h
    include/ggml-metal.h
    include/ggml-rpc.h
    include/ggml-sycl.h
    include/ggml-vulkan.h)

set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)

if (GGML_STANDALONE)
    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
        @ONLY)

    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
        DESTINATION share/pkgconfig)
endif()
2 ml/backend/ggml/ggml/src/CMakeLists.txt vendored
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     endforeach()

     ggml_add_cpu_backend_variant_impl(${tag_name})
+    add_dependencies(ggml-cpu ggml-cpu-${tag_name})
 endfunction()

 ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
+    add_custom_target(ggml-cpu)
     ggml_add_cpu_backend_variant(sandybridge AVX)
     ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 FMA)
     ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 FMA AVX512)
@@ -1,5 +1,6 @@
 package cpu

+// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
 // #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
@@ -1,77 +0,0 @@
#!/usr/bin/env python3

from glob import glob
import os

TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]

SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f{vkq_size}.cuh"

DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
"""

SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-wmma-f16.cuh"

"""

SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"

TYPES_MMQ = [
    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
    "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
]

SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE({type});
"""


def get_short_name(long_quant_name):
    return long_quant_name.replace("GGML_TYPE_", "").lower()


def get_head_sizes(type_k, type_v):
    if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
        return [64, 128, 256]
    if type_k == "GGML_TYPE_F16":
        return [64, 128]
    return [128]


for filename in glob("*.cu"):
    os.remove(filename)

for vkq_size in [16, 32]:
    for type_k in TYPES_KV:
        for type_v in TYPES_KV:
            for head_size in get_head_sizes(type_k, type_v):
                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))

for kq_acc_t in ["half", "float"]:
    for cols_per_block in [8, 16, 32]:
        if kq_acc_t == "float" and cols_per_block == 8:
            continue

        with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
            f.write(SOURCE_FATTN_WMMA_START)

            for head_size in [64, 80, 96, 112, 128, 256]:
                if cols_per_block == 8 and head_size % 32 != 0:  # wmma fragment is 8x32
                    continue
                if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256:  # register spilling, bad performance
                    continue
                f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))

for type in TYPES_MMQ:
    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
        f.write(SOURCE_MMQ.format(type=type))
@@ -3,6 +3,7 @@ package ggml
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
 // #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu
+// #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
 // #include <stdlib.h>
 // #include "ggml-backend.h"
 // extern void sink(int level, char *text, void *user_data);
@@ -40,28 +41,53 @@ func sink(level C.int, text *C.char, _ unsafe.Pointer) {
 }

 var OnceLoad = sync.OnceFunc(func() {
-	var lib struct{ name, defaultValue string }
+	exe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to get executable path", "error", err)
+		exe = "."
+	}
+
+	// PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
+	// set by the parent process, however, use a default value
+	// if the environment variable is not set.
+	var name, value string
 	switch runtime.GOOS {
-	case "darwin", "linux":
-		lib.name = "LD_LIBRARY_PATH"
-		lib.defaultValue = "/usr/local/lib:/usr/lib"
+	case "darwin":
+		// On macOS, DYLD_LIBRARY_PATH is often not set, so
+		// we use the directory of the executable as the default.
+		name = "DYLD_LIBRARY_PATH"
+		value = filepath.Dir(exe)
 	case "windows":
-		lib.name = "PATH"
-		lib.defaultValue = "."
+		name = "PATH"
+		value = filepath.Join(filepath.Dir(exe), "lib", "ollama")
 	default:
-		return
+		name = "LD_LIBRARY_PATH"
+		value = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
 	}

-	paths, ok := os.LookupEnv(lib.name)
+	paths, ok := os.LookupEnv(name)
 	if !ok {
-		paths = lib.defaultValue
+		paths = value
 	}

-	for _, path := range filepath.SplitList(paths) {
-		func() {
-			cpath := C.CString(path)
-			defer C.free(unsafe.Pointer(cpath))
-			C.ggml_backend_load_all_from_path(cpath)
-		}()
+	split := filepath.SplitList(paths)
+	visited := make(map[string]struct{}, len(split))
+	for _, path := range split {
+		abspath, err := filepath.Abs(path)
+		if err != nil {
+			slog.Error("failed to get absolute path", "error", err)
+			continue
+		}
+
+		if _, ok := visited[abspath]; !ok {
+			func() {
+				slog.Debug("ggml backend load all from path", "path", abspath)
+				cpath := C.CString(abspath)
+				defer C.free(unsafe.Pointer(cpath))
+				C.ggml_backend_load_all_from_path(cpath)
+			}()

+			visited[abspath] = struct{}{}
+		}
 	}
 })
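The rewritten loop above deduplicates search-path entries by absolute path before loading backends. A standalone sketch (not part of the diff; the sample paths are illustrative) of just that bookkeeping:

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Two spellings of the same directory, as might appear in PATH or
	// LD_LIBRARY_PATH; both resolve to one absolute path.
	paths := "lib/ollama" + string(filepath.ListSeparator) + "./lib/ollama"

	visited := make(map[string]struct{})
	for _, path := range filepath.SplitList(paths) {
		abspath, err := filepath.Abs(path)
		if err != nil {
			continue
		}
		if _, ok := visited[abspath]; ok {
			continue // second spelling of the same directory is skipped
		}
		visited[abspath] = struct{}{}
		fmt.Println("load backends from", abspath)
	}
}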
@@ -1,11 +0,0 @@
package nn

import "github.com/ollama/ollama/ml"

type Conv2D struct {
	Weight ml.Tensor `gguf:"weight"`
}

func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
}
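For reference, the s0/s1, p0/p1, d0/d1 arguments here are the usual per-axis stride, padding, and dilation pairs, and the output extent per axis follows the standard convolution arithmetic. A small sketch (not from the diff) of that formula:

package main

import "fmt"

// convOutDim computes the output size of one convolution axis from input
// size in, kernel size k, stride s, padding p, and dilation d, using the
// standard formula (in + 2p - d*(k-1) - 1)/s + 1.
func convOutDim(in, k, s, p, d int) int {
	return (in+2*p-d*(k-1)-1)/s + 1
}

func main() {
	// e.g. a 224-wide input, 3-wide kernel, stride 2, padding 1, dilation 1
	fmt.Println(convOutDim(224, 3, 2, 1, 1)) // 112
}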
@@ -1,11 +0,0 @@
package nn

import "github.com/ollama/ollama/ml"

type Embedding struct {
	Weight ml.Tensor `gguf:"weight"`
}

func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
	return m.Weight.Rows(ctx, hiddenState)
}
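Embedding lookup here is just a row gather: Forward indexes rows of the weight matrix by token ID. A standalone sketch (not part of the diff) of the same operation on plain slices:

package main

import "fmt"

// rows gathers weight rows by index, which is what Weight.Rows does for an
// embedding table: one row of hidden size per input token ID.
func rows(weight [][]float32, ids []int32) [][]float32 {
	out := make([][]float32, len(ids))
	for i, id := range ids {
		out[i] = weight[id]
	}
	return out
}

func main() {
	weight := [][]float32{{0, 0}, {1, 1}, {2, 2}} // 3 tokens, hidden size 2
	fmt.Println(rows(weight, []int32{2, 0}))      // [[2 2] [0 0]]
}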
@@ -1,17 +0,0 @@
package nn

import "github.com/ollama/ollama/ml"

type Linear struct {
	Weight ml.Tensor `gguf:"weight"`
	Bias   ml.Tensor `gguf:"bias"`
}

func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
	t = m.Weight.Mulmat(ctx, t)
	if m.Bias != nil {
		t = t.Add(ctx, m.Bias)
	}

	return t
}
@@ -1,22 +0,0 @@
package nn

import (
	"github.com/ollama/ollama/ml"
)

type LayerNorm struct {
	Weight ml.Tensor `gguf:"weight"`
	Bias   ml.Tensor `gguf:"bias"`
}

func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
	return t.LayerNorm(ctx, m.Weight, m.Bias, eps)
}

type RMSNorm struct {
	Weight ml.Tensor `gguf:"weight"`
}

func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
	return t.RMSNorm(ctx, m.Weight, eps)
}
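The two removed modules differ mainly in whether the mean is removed before scaling. A standalone sketch (not part of the diff; eps and the sample values are illustrative) of what they compute over a single vector, assuming the usual definitions:

package main

import (
	"fmt"
	"math"
)

// layerNorm subtracts the mean, divides by the standard deviation, then
// applies the learned scale w and bias b.
func layerNorm(x, w, b []float32, eps float32) []float32 {
	var mean, variance float32
	for _, v := range x {
		mean += v
	}
	mean /= float32(len(x))
	for _, v := range x {
		variance += (v - mean) * (v - mean)
	}
	variance /= float32(len(x))
	inv := 1 / float32(math.Sqrt(float64(variance+eps)))
	out := make([]float32, len(x))
	for i, v := range x {
		out[i] = (v-mean)*inv*w[i] + b[i]
	}
	return out
}

// rmsNorm skips the mean and only divides by the root mean square before
// applying the learned scale w.
func rmsNorm(x, w []float32, eps float32) []float32 {
	var ms float32
	for _, v := range x {
		ms += v * v
	}
	ms /= float32(len(x))
	inv := 1 / float32(math.Sqrt(float64(ms+eps)))
	out := make([]float32, len(x))
	for i, v := range x {
		out[i] = v * inv * w[i]
	}
	return out
}

func main() {
	x := []float32{1, 2, 3, 4}
	w := []float32{1, 1, 1, 1}
	b := []float32{0, 0, 0, 0}
	fmt.Println(layerNorm(x, w, b, 1e-5))
	fmt.Println(rmsNorm(x, w, 1e-5))
}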
@@ -1,212 +0,0 @@
package main

import (
	"errors"
	"flag"
	"fmt"
	"image"
	"io"
	"log/slog"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/ollama/ollama/cache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	_ "github.com/ollama/ollama/model/llama"
	_ "github.com/ollama/ollama/model/mllama"
	"github.com/ollama/ollama/sample"
)

var args struct {
	n     int
	debug bool
	image string
	cache bool
}

func temp() error {
	// start := time.Now()
	flag.IntVar(&args.n, "n", 10, "number of samples")
	flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
	flag.StringVar(&args.image, "image", "", "path to image file")
	flag.BoolVar(&args.cache, "cache", false, "enable KV cache")

	flag.Parse()

	var prompt string
	if n := len(flag.Args()); n == 1 {
		bts, err := io.ReadAll(os.Stdin)
		if err != nil {
			return err
		}

		prompt = string(bts)
	} else if n > 1 {
		prompt = strings.Join(flag.Args()[1:], " ")
	} else {
		return fmt.Errorf("usage: %s path/to/file <prompt\n", filepath.Base(os.Args[0]))
	}

	level := slog.LevelInfo
	if args.debug {
		level = slog.LevelDebug
	}

	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level:     level,
		AddSource: true,
		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
			if attr.Key == slog.SourceKey {
				source := attr.Value.Any().(*slog.Source)
				source.File = filepath.Base(source.File)
			}

			return attr
		},
	})))

	m, err := model.New(flag.Arg(0))
	if err != nil {
		return err
	}

	inputIDs, err := m.(model.TextProcessor).Encode(prompt)
	if err != nil {
		return err
	}

	var opts []model.OptionsFunc
	if args.cache {
		opts = append(opts, model.WithCache(&cache.Simple{
			Capacity: 2048,
			DType:    ml.DTypeF32,
		}))
	}

	if args.image != "" {
		if err := func() error {
			f, err := os.Open(args.image)
			if err != nil {
				return err
			}
			defer f.Close()

			img, _, err := image.Decode(f)
			if err != nil {
				return err
			}

			opts = append(opts, model.WithImage(img))
			return nil
		}(); err != nil {
			return err
		}
	}

	// Schema for a list of friends with their info
	// Maps to JSON like:
	// {
	//   "name": "string",
	//   "age": integer,
	//   "is_available": boolean
	// }
	schema := &sample.Schema{
		Name: "root",
		Type: "object",
		Properties: []*sample.Schema{
			{Name: "name", Type: "string"},
			{Name: "age", Type: "integer"},
			{Name: "is_available", Type: "boolean"},
		},
	}

	// fmt.Println("schema", schema)
	// schema = nil
	jsonTransform, err := sample.NewJSONSampler(m.(model.TextProcessor), schema)
	if err != nil {
		return err
	}

	transforms := []sample.Transform{
		jsonTransform,
	}

	var offset int
	var stringBuffer string
	// var ttft time.Duration
	var totalSamplingTime time.Duration
	count := 0
	for range args.n {
		logits, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
		if err != nil {
			return err
		}

		samplingStart := time.Now()
		sampler := sample.Greedy()
		sampledIdx, err := sampler.Sample(logits.Floats(), transforms...)
		if err != nil {
			return err
		}

		samplingTime := time.Since(samplingStart)
		totalSamplingTime += samplingTime

		// fmt.Println("sampling time", samplingTime)
		// fmt.Printf("Sample time: %vms\n", finishTime.Sub(sampleTime).Milliseconds())

		var outputIDs []int32

		if !m.(model.TextProcessor).Is(uint32(sampledIdx), model.SpecialEOS) {
			outputIDs = append(outputIDs, int32(sampledIdx))
		}

		if len(outputIDs) == 0 {
			break
		}

		s, err := m.(model.TextProcessor).Decode(outputIDs)
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			return err
		}

		// if ttft == 0 {
		// 	ttft = time.Since(start)
		// 	fmt.Printf("Time to first token: %vms\n", ttft.Milliseconds())
		// }

		// fmt.Printf("--- token: %q\n", s)
		// fmt.Printf("--- outputIDs: %v\n", outputIDs)
		stringBuffer += s
		count++
		fmt.Println("--- stringBuffer", stringBuffer)

		outputIDs, err = jsonTransform.UpdateState(outputIDs)
		if err != nil {
			return err
		}

		// can do fun shifting stuff here if needed
		inputIDs = append(inputIDs, outputIDs...)
		if args.cache {
			offset = len(inputIDs) - 1
		}
	}
	fmt.Println("\n------ Output: ------")
	fmt.Println(stringBuffer)
	fmt.Println("--------------------")
	fmt.Println("sample average time", totalSamplingTime/time.Duration(count))
	return nil
}

func main() {
	if err := temp(); err != nil {
		fmt.Println("err", err)
		os.Exit(1)
	}
}
Some files were not shown because too many files have changed in this diff.