model: test byte pair encoding

fix linter
2025-01-29 15:17:47 -08:00 · 2025-01-29 15:08:37 -08:00 · 2025-01-29 15:05:24 -08:00 · 2025-01-29 15:03:38 -08:00 · 2025-01-27 00:36:23 -08:00 · 2025-01-25 01:04:07 -08:00
97 changed files with 2178 additions and 5285 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,7 +3,9 @@ ollama
 app
 macapp
 dist
+build
 .env
 .cache
 test_data
-llama/build
+.git
+
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -40,28 +40,106 @@ jobs:

  linux:
    needs: [changes]
-    if: ${{ needs.changes.outputs.changed == 'True' }}
+    if: needs.changes.outputs.changed == 'True'
    strategy:
      matrix:
        include:
-          - container: nvidia/cuda:11.8.0-devel-ubuntu22.04
-            preset: CUDA
-          - container: rocm/dev-ubuntu-22.04:6.1.2
-            preset: ROCm
+          - preset: CPU
+          - preset: CUDA
+            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
+          - preset: ROCm
+            container: rocm/dev-ubuntu-22.04:6.1.2
            extra-packages: rocm-libs
-    runs-on: ubuntu-latest
+            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm'
+    runs-on: linux
    container: ${{ matrix.container }}
    steps:
      - uses: actions/checkout@v4
      - run: |
-          apt-get update
-          apt-get install -y cmake pkg-config ${{ matrix.extra-packages }}
+          [ -n "${{ matrix.container }}" ] || sudo=sudo
+          $sudo apt-get update
+          $sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }}
        env:
          DEBIAN_FRONTEND: noninteractive
+      - uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/ccache
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
      - run: |
-          cmake --preset ${{ matrix.preset }}
+          cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
          cmake --build --preset ${{ matrix.preset }} --parallel

+  windows:
+    needs: [changes]
+    if: needs.changes.outputs.changed == 'True'
+    strategy:
+      matrix:
+        include:
+          - preset: CPU
+          - preset: CUDA
+            install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
+          - preset: ROCm
+            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+            flags: '-DAMDGPU_TARGETS=gfx1010'
+    runs-on: windows
+    steps:
+      - run: |
+          choco install -y --no-progress ccache ninja
+          ccache -o cache_dir=${{ github.workspace }}\.ccache
+      - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm'
+        id: cache-install
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
+            C:\Program Files\AMD\ROCm
+          key: ${{ matrix.install }}
+      - if: matrix.preset == 'CUDA'
+        name: Install CUDA  # the matrix defines no cuda-version key, so the old suffix always rendered empty
+        run: |
+          $ErrorActionPreference = "Stop"
+          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {  # download and run the installer only on a cache miss
+            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.8", "nvcc_11.8", "cublas_11.8", "cublas_dev_11.8")) -NoNewWindow -Wait
+          }

+          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append  # expose the toolkit bin directory to later steps
+      - if: matrix.preset == 'ROCm'
+        name: Install ROCm  # the matrix defines no rocm-version key, so the old suffix always rendered empty
+        run: |
+          $ErrorActionPreference = "Stop"
+          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {  # download and run the installer only on a cache miss
+            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
+            Start-Process -FilePath .\install.exe -ArgumentList '-install' -NoNewWindow -Wait
+          }

+          $hipPath = (Resolve-Path "C:\Program Files\AMD\ROCm\*").path
+          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append  # utf8 like the GITHUB_PATH write; PowerShell 5.1 would otherwise emit UTF-16
+          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - if: ${{ !cancelled() && (matrix.preset == 'CUDA' || matrix.preset == 'ROCm') && steps.cache-install.outputs.cache-hit != 'true' }}
+        uses: actions/cache/save@v4  # same preset guard as the restore step: the CPU entry has no matrix.install, hence no cache key
+        with:
+          path: |
+            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
+            C:\Program Files\AMD\ROCm
+          key: ${{ matrix.install }}
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v4
+        with:
+          path: ${{ github.workspace }}\.ccache
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
+      - run: |
+          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
+          cmake --build --parallel --preset "${{ matrix.preset }}"
+        env:
+          CMAKE_GENERATOR: Ninja
+
  test:
    strategy:
      matrix:
@@ -85,5 +163,5 @@ jobs:
      - uses: actions/checkout@v4
      - name: Verify patches apply cleanly and do not change files
        run: |
-          make -f Makefile2 clean checkout sync
+          make -f Makefile.sync clean checkout sync
          git diff --compact-summary --exit-code
--- a/.gitignore
+++ b/.gitignore
@@ -4,12 +4,13 @@
 .venv
 .swp
 dist
+build
 ollama
 .cache
 *.exe
 .idea
 test_data
 *.crt
-llama/build
 __debug_bin*
-llama/vendor
+llama/build
+llama/vendor
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,11 +19,25 @@ set(GGML_CCACHE ON)
 set(GGML_BACKEND_DL ON)
 set(GGML_BACKEND_SHARED ON)
 set(GGML_SCHED_MAX_COPIES 4)
-set(GGML_CPU_ALL_VARIANTS ON)
-set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
-set(GGML_LLAMAFILE ON)

-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(GGML_LLAMAFILE ON)
+set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
+set(GGML_CUDA_GRAPHS ON)
+
+if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+    OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
+    set(GGML_CPU_ALL_VARIANTS ON)  # non-ARM only: an empty CMAKE_OSX_ARCHITECTURES must defer to the processor check, not pass outright
+endif()
+
+set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
+set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY         ${OLLAMA_BUILD_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG   ${OLLAMA_BUILD_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY         ${OLLAMA_BUILD_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG   ${OLLAMA_BUILD_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})

 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
@@ -34,12 +48,65 @@ set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)

+get_target_property(CPU_VARIANTS ggml-cpu MANUALLY_ADDED_DEPENDENCIES)
+if(NOT CPU_VARIANTS)
+    set(CPU_VARIANTS "ggml-cpu")
+endif()
+
+install(TARGETS ggml-base ${CPU_VARIANTS}
+    RUNTIME_DEPENDENCIES
+        PRE_EXCLUDE_REGEXES ".*"
+    RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
+    LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
+    FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
+)
+
 check_language(CUDA)
 if(CMAKE_CUDA_COMPILER)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES "native")
+    endif()
+
+    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
+    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
+    install(TARGETS ggml-cuda
+        RUNTIME_DEPENDENCIES
+            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
+            PRE_INCLUDE_REGEXES cublas cublasLt cudart
+            PRE_EXCLUDE_REGEXES ".*"
+        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+    )
 endif()

 check_language(HIP)
 if(CMAKE_HIP_COMPILER)
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
+    set(HIP_PLATFORM "amd")
+
+    find_package(hip REQUIRED)
+    if(NOT AMDGPU_TARGETS)
+        list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$")
+    endif()
+
+    if(AMDGPU_TARGETS)
+        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
+        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
+        install(TARGETS ggml-hip
+            RUNTIME_DEPENDENCIES
+                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
+                PRE_INCLUDE_REGEXES amdhip64 hipblas rocblas amd_comgr hsa_runtime64 rocprofiler-register drm_amdgpu drm numa
+                PRE_EXCLUDE_REGEXES ".*"
+                POST_EXCLUDE_REGEXES "system32"
+            RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+        )
+
+        foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
+            if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
+                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
+                break()
+            endif()
+        endforeach()
+    endif()
 endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -4,10 +4,15 @@
    {
      "name": "Default",
      "binaryDir": "${sourceDir}/build",
+      "installDir": "${sourceDir}/dist",
      "cacheVariables": {
        "CMAKE_BUILD_TYPE": "Release"
      }
    },
+    {
+      "name": "CPU",
+      "inherits": [ "Default" ]
+    },
    {
      "name": "CUDA",
      "inherits": [ "Default" ]
@@ -42,20 +47,29 @@
    },
    {
      "name": "ROCm",
-      "inherits": [ "Default" ]
+      "inherits": [ "Default" ],
+      "cacheVariables": {
+        "CMAKE_HIP_PLATFORM": "amd"
+      }
    },
    {
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
-        "CMAKE_HIP_ARCHITECTURES": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      }
    }
  ],
  "buildPresets": [
    {
      "name": "Default",
-      "configurePreset": "Default"
+      "configurePreset": "Default",
+      "configuration": "Release"
+    },
+    {
+      "name": "CPU",
+      "configurePreset": "Default",
+      "targets": [ "ggml-cpu" ]
    },
    {
      "name": "CUDA",
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,201 +1,128 @@
-ARG GOLANG_VERSION=1.22.8
-ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_VERSION_12=12.4.0
-ARG ROCM_VERSION=6.1.2
-ARG JETPACK_6=r36.2.0
-ARG JETPACK_5=r35.4.1
+# vim: filetype=dockerfile

-### To create a local image for building linux binaries on mac or windows with efficient incremental builds
-#
-# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
-# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
-#
-### Then incremental builds will be much faster in this container
-#
-# make -j 10 dist
-#
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-# TODO intel oneapi goes here...
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
+ARG FLAVOR=${TARGETARCH}

-### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
-# Note: this does not contain jetson variants
-#
-# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 .
-# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
-#
-FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
-    dnf config-manager --set-enabled appstream && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH arm64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
+ARG ROCMVERSION=6.1.2
+ARG JETPACK5VERSION=r35.4.1
+ARG JETPACK6VERSION=r36.2.0
+ARG CMAKEVERSION=3.31.2

-FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG OLLAMA_FAST_BUILD
-ARG VERSION
-ARG CUSTOM_CPU_FLAGS
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCMVERSION}-complete AS base-amd64
+RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
+    && yum install -y yum-utils devtoolset-10-gcc devtoolset-10-gcc-c++ \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo \
+    && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1
+ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/devtoolset-11/root/usr/bin:$PATH
+
+FROM --platform=linux/arm64 rockylinux:8 AS base-arm64
+# install epel-release for ccache
+RUN yum install -y yum-utils epel-release \
+    && yum install -y clang ccache \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
+ENV CC=clang CXX=clang++
+
+FROM base-${TARGETARCH} AS base
+ARG CMAKEVERSION
+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+ENV LDFLAGS=-s
+
+FROM base AS cpu
+# amd64 uses gcc which requires devtoolset-11 for AVX extensions while arm64 uses clang
+RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi
+ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
-    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(nproc) dist ; \
-    else \
-        make -j 5 dist ; \
-    fi
-RUN cd dist/linux-$GOARCH && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
+    cmake --preset 'CPU' \
+        && cmake --build --parallel --preset 'CPU' \
+        && cmake --install build --component CPU --strip --parallel 8

-# Jetsons need to be built in discrete stages
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
-ARG GOLANG_VERSION
-RUN apt-get update && apt-get install -y git curl ccache && \
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
-    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-WORKDIR /go/src/github.com/ollama/ollama/
-COPY . .
-ARG CGO_CFLAGS
-ENV GOARCH arm64
-ARG VERSION
+FROM base AS cuda-11
+ARG CUDA11VERSION=11.3
+RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist_cuda_v11 \
-        CUDA_ARCHITECTURES="72;87" \
-        GPU_RUNNER_VARIANT=_jetpack5 \
-        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
-        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+    cmake --preset 'CUDA 11' \
+        && cmake --build --parallel --preset 'CUDA 11' \
+        && cmake --install build --component CUDA --strip --parallel 8

-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
-ARG GOLANG_VERSION
-RUN apt-get update && apt-get install -y git curl ccache && \
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
-    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-WORKDIR /go/src/github.com/ollama/ollama/
-COPY . .
-ARG CGO_CFLAGS
-ENV GOARCH arm64
-ARG VERSION
+FROM base AS cuda-12
+ARG CUDA12VERSION=12.4
+RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
+ENV PATH=/usr/local/cuda-12/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist_cuda_v12 \
-        CUDA_ARCHITECTURES="87" \
-        GPU_RUNNER_VARIANT=_jetpack6 \
-        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
-        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
+    cmake --preset 'CUDA 12' \
+        && cmake --build --parallel --preset 'CUDA 12' \
+        && cmake --install build --component CUDA --strip --parallel 8

-FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_FAST_BUILD
-ARG VERSION
+FROM base AS rocm-6
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-RUN cd dist/linux-$GOARCH && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN cd dist/linux-$GOARCH-jetpack5 && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
-RUN cd dist/linux-$GOARCH-jetpack6 && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
+    cmake --preset 'ROCm 6' \
+        && cmake --build --parallel --preset 'ROCm 6' \
+        && cmake --install build --component HIP --strip --parallel 8

-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH AS dist
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
+ARG CMAKEVERSION
+RUN apt-get update && apt-get install -y curl ccache \
+    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'JetPack 5' \
+        && cmake --build --parallel --preset 'JetPack 5' \
+        && cmake --install build --component CUDA --strip --parallel 8

+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
+ARG CMAKEVERSION
+RUN apt-get update && apt-get install -y curl ccache \
+    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'JetPack 6' \
+        && cmake --build --parallel --preset 'JetPack 6' \
+        && cmake --install build --component CUDA --strip --parallel 8

-# For amd64 container images, filter out cuda/rocm to minimize size
-FROM build-amd64 AS runners-cuda-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
-    ./dist/linux-amd64/lib/ollama/runners/rocm*
+FROM base AS build
+ARG GOVERSION=1.23.4
+RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
+ENV PATH=/usr/local/go/bin:$PATH
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS="'-ldflags=-w -s'"
+ENV CGO_ENABLED=1
+RUN --mount=type=cache,target=/root/.cache/go-build \
+    go build -trimpath -buildmode=pie -o /bin/ollama .

-FROM build-amd64 AS runners-rocm-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
-    ./dist/linux-amd64/lib/ollama/libcu*.so* \
-    ./dist/linux-amd64/lib/ollama/runners/cuda*
+FROM --platform=linux/amd64 scratch AS amd64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+FROM --platform=linux/arm64 scratch AS arm64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 lib/ollama/cuda_jetpack6

-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
+FROM --platform=linux/arm64 scratch AS rocm
+COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm

+FROM ${FLAVOR} AS archive
+COPY --from=cpu dist/lib/ollama /lib/ollama
+COPY --from=build /bin/ollama /bin/ollama

-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-# across releases
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
-
-FROM runtime-$TARGETARCH
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
+FROM ubuntu:20.04
+RUN apt-get update \
+    && apt-get install -y ca-certificates \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=archive /bin /usr/bin
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+COPY --from=archive /lib/ollama /usr/lib/ollama
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_VISIBLE_DEVICES=all
-
+ENV OLLAMA_HOST=0.0.0.0:11434
+EXPOSE 11434
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
--- a/66
+++ b/66
@@ -1,66 +0,0 @@
-ARG CUDA_11_VERSION=11.3
-ARG CUDA_12_VERSION=12.4
-ARG ROCM_VERSION=6.1.2
-ARG JETPACK_5_VERSION=r35.4.1
-ARG JETPACK_6_VERSION=r36.2.0
-ARG CMAKE_VERSION=3.31.2
-
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base
-ARG CMAKE_VERSION
-RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1
-RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
-    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
-
-# FROM --platform=linux/arm64 rockylinux:8 AS base
-# ARG CMAKE_VERSION
-# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
-# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
-
-FROM base AS amd64
-ARG CUDA_11_VERSION
-ARG CUDA_12_VERSION
-RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \
-    && yum install -y cuda-toolkit-${CUDA_12_VERSION//./-}
-COPY CMakeLists.txt CMakeLists.txt
-COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
-
-FROM --platform=linux/amd64 amd64 AS cuda_11
-ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH
-RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-RUN cmake --build build --target ggml-cuda -j
-
-FROM --platform=linux/amd64 amd64 AS cuda_12
-ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH
-RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-RUN cmake --build build --target ggml-cuda -j
-
-FROM --platform=linux/amd64 amd64 AS rocm
-RUN cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-RUN cmake --build build --target ggml-hip -j
-
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5
-ARG CMAKE_VERSION
-RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
-COPY CMakeLists.txt .
-COPY ml/backend/ggml/ggml .
-RUN cmake -S . -B build \
-    -DCMAKE_CUDA_ARCHITECTURES="72;87"
-RUN cmake --build build --target ggml-cuda
-
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6
-ARG CMAKE_VERSION
-RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
-COPY CMakeLists.txt .
-COPY ml/backend/ggml/ggml .
-RUN cmake -S . -B build \
-    -DCMAKE_CUDA_ARCHITECTURES="87"
-RUN cmake --build build --target ggml-cuda
-
-FROM --platform=linux/amd64 golang:1.23
-COPY --from=cuda_11 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-11.so
-COPY --from=cuda_12 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-12.so
-COPY --from=rocm build/ml/backend/ggml/ggml/src/ggml-hip/libggml-hip.so libggml-hip.so
-
-# FROM --platform=linux/arm64 golang:1.23
-# COPY --from=jetpack_5 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-5.so
-# COPY --from=jetpack_6 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-6.so
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -2,24 +2,34 @@ UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
 FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c

-all: sync
+.PHONY: help
+help:
+	@echo "Available targets:"
+	@echo "    sync                 Sync with upstream repositories"
+	@echo "    checkout             Checkout upstream repository"
+	@echo "    apply-patches        Apply patches to local repository"
+	@echo "    format-patches       Format patches from local repository"
+	@echo "    clean                Clean local repository"
+	@echo
+	@echo "Example:"
+	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/llama.cpp ml/backend/ggml/ggml
+sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches

 .PHONY: llama/llama.cpp
-llama/llama.cpp: llama/vendor/ apply_patches
+llama/llama.cpp: llama/vendor/ apply-patches
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

-.PHONY: ml/backend/ggml/ggml apply_patches
-ml/backend/ggml/ggml: llama/vendor/ggml/ apply_patches
+.PHONY: ml/backend/ggml/ggml apply-patches
+ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

 PATCHES=$(wildcard llama/patches/*.patch)

-.PHONY: apply_patches
+.PHONY: apply-patches
 .NOTPARALLEL:
-apply_patches: $(addsuffix ed, $(PATCHES))
+apply-patches: $(addsuffix ed, $(PATCHES))

 %.patched: %.patch
 	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
@@ -32,8 +42,8 @@ checkout: $(WORKDIR)
 $(WORKDIR):
 	git clone $(UPSTREAM) $(WORKDIR)

-.PHONE: format_patches
-format_patches: llama/patches
+.PHONY: format-patches  # declared phony so a stray file named format-patches cannot mask the target
+format-patches: llama/patches
 	git -C $(WORKDIR) format-patch \
 		--no-signature \
 		--no-numbered \
--- a/README.md
+++ b/README.md
@@ -369,6 +369,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
 - [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
 - [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
+- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)

 ### Cloud

@@ -481,6 +482,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
+- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)

 ### Mobile

@@ -539,4 +541,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Observability

 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production. 
+- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
+- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -59,7 +59,7 @@ func getModelfileName(cmd *cobra.Command) (string, error) {

 	_, err = os.Stat(absName)
 	if err != nil {
-		return filename, err
+		return "", err
 	}

 	return absName, nil
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -279,7 +279,7 @@ func TestGetModelfileName(t *testing.T) {
 			name:          "no modelfile specified, no modelfile exists",
 			modelfileName: "",
 			fileExists:    false,
-			expectedName:  "Modelfile",
+			expectedName:  "",
 			expectedErr:   os.ErrNotExist,
 		},
 		{
@@ -293,7 +293,7 @@ func TestGetModelfileName(t *testing.T) {
 			name:          "modelfile specified, no modelfile exists",
 			modelfileName: "crazyfile",
 			fileExists:    false,
-			expectedName:  "crazyfile",
+			expectedName:  "",
 			expectedErr:   os.ErrNotExist,
 		},
 		{
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -191,6 +191,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		conv = &qwen2Model{}
 	case "BertModel":
 		conv = &bertModel{}
+	case "CohereForCausalLM":
+		conv = &commandrModel{}
 	default:
 		return errors.New("unsupported architecture")
 	}
--- a/convert/convert_commandr.go
+++ b/convert/convert_commandr.go
@@ -0,0 +1,76 @@
+package convert
+
+import (
+	"cmp"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// commandrModel holds the configuration values decoded from JSON (see the
+// json tags) for models that ConvertModel maps from the "CohereForCausalLM"
+// architecture. It embeds ModelParameters and adds the fields that the KV
+// method turns into "command-r.*" metadata.
+//
+// MaxLength, MaxPositionEmbeddings and NCtx are alternative sources for the
+// context length; KV picks the first non-zero one. UseQKNorm is decoded but
+// is not referenced by any method in this file.
+type commandrModel struct {
+	ModelParameters
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	LayerNormEPS          float32 `json:"layer_norm_eps"`
+	RopeTheta             float32 `json:"rope_theta"`
+	UseQKNorm             bool    `json:"use_qk_norm"`
+	MaxLength             uint32  `json:"model_max_length"`
+	LogitScale            float32 `json:"logit_scale"`
+	NCtx                  uint32  `json:"n_ctx"`
+}
+
+var _ ModelConverter = (*commandrModel)(nil)
+
+// KV builds the ggml key/value metadata for the model. It starts from the
+// entries returned by the embedded ModelParameters.KV(t) and then sets the
+// "general.*" and "command-r.*" keys from the decoded configuration fields.
+func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "command-r"
+	kv["general.name"] = "command-r"
+	// cmp.Or returns its first non-zero argument, so the fallback order is
+	// model_max_length, then max_position_embeddings, then n_ctx.
+	kv["command-r.context_length"] = cmp.Or(p.MaxLength, p.MaxPositionEmbeddings, p.NCtx)
+	kv["command-r.embedding_length"] = p.HiddenSize
+	kv["command-r.block_count"] = p.HiddenLayers
+	kv["command-r.feed_forward_length"] = p.IntermediateSize
+	kv["command-r.attention.head_count"] = p.NumAttentionHeads
+	kv["command-r.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["command-r.attention.layer_norm_epsilon"] = p.LayerNormEPS
+	kv["command-r.rope.freq_base"] = p.RopeTheta
+	// Same fallback as context_length above, but without the n_ctx candidate.
+	kv["command-r.max_position_embeddings"] = cmp.Or(p.MaxLength, p.MaxPositionEmbeddings)
+	kv["command-r.logit_scale"] = p.LogitScale
+	// The scaling type is always written as "none"; no config field is consulted.
+	kv["command-r.rope.scaling.type"] = "none"
+
+	return kv
+}
+
+// Tensors converts each input Tensor into a ggml.Tensor, copying its name,
+// kind and shape and using the input tensor itself as the WriterTo. The
+// tensors are emitted in input order; this method does no renaming,
+// filtering or reshaping of its own. A nil slice is returned when ts is empty.
+func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
+	for _, t := range ts {
+		out = append(out, ggml.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+// Replacements returns a flat list of strings laid out as consecutive pairs:
+// a fragment of the source tensor name followed by the name to use instead.
+// NOTE(review): presumably the caller applies these pairwise (old, new) to
+// tensor names; the consumer is not visible in this file, so confirm there.
+func (p *commandrModel) Replacements() []string {
+	return []string{
+		"self_attn.q_norm", "attn_q_norm",
+		"self_attn.k_norm", "attn_k_norm",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.o_proj", "attn_output",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.v_proj", "attn_v",
+		"model.norm", "output_norm",
+		"model.embed_tokens", "token_embd",
+	}
+}
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -2,7 +2,6 @@ package convert

 import "github.com/ollama/ollama/fs/ggml"

-
 type qwen2Model struct {
 	ModelParameters
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -109,6 +109,7 @@ func TestConvertModel(t *testing.T) {
 		"all-MiniLM-L6-v2",
 		"gemma-2-9b-it",
 		"Qwen2.5-0.5B-Instruct",
+		"c4ai-command-r-v01",
 	}

 	for i := range cases {
--- a/convert/testdata/c4ai-command-r-v01.json
+++ b/convert/testdata/c4ai-command-r-v01.json
@@ -0,0 +1,344 @@
+{
+    "general.architecture": "command-r",
+    "general.name": "command-r",
+    "command-r.attention.head_count": "64",
+    "command-r.attention.head_count_kv": "64",
+    "command-r.attention.layer_norm_epsilon": "1e-05",
+    "command-r.block_count": "40",
+    "command-r.context_length": "131072",
+    "command-r.embedding_length": "8192",
+    "command-r.feed_forward_length": "22528",
+    "command-r.logit_scale": "0.0625",
+    "command-r.rope.freq_base": "8e+06",
+    "command-r.rope.scaling.type": "none",
+    "tokenizer.ggml.add_bos_token": "true",
+    "tokenizer.ggml.add_eos_token": "false",
+    "tokenizer.ggml.bos_token_id": "5",
+    "tokenizer.ggml.eos_token_id": "255001",
+    "tokenizer.ggml.merges": "902a060cac8884a5793d2a857dd2e53a259de46c8d08c4deb243c239671e1350",
+    "tokenizer.ggml.model": "gpt2",
+    "tokenizer.ggml.padding_token_id": "0",
+    "tokenizer.ggml.token_type": "b7a352ccd1c99d4413bcf452c2db707b0526d0e1216616b865560fab80296462",
+    "tokenizer.ggml.tokens": "815ac90ff23565081522d7258f46648c8a0619eb847a9c7c31b238a9b984e4ae",
+    "blk.0.attn_k.weight": "6fcfdb466f9ceb1229404ce4ec4e480751b8d00da12707a11783dad7256cb864",
+    "blk.0.attn_norm.weight": "6063317f731371864049c7704a70772f1eb632194201ebdc2ed0f8e483507c72",
+    "blk.0.attn_output.weight": "920f49716a1e2fc73b6794ec777947f1c122701e63ed302422ac89e90f06e9da",
+    "blk.0.attn_q.weight": "ddbcd7cde197e632564ac58e4f25d9e3a8ca52917329eeb6081eb41a797932ab",
+    "blk.0.attn_v.weight": "318fc02a189d87420f0cbf57f47f11e00c21ec1ed472ce0a2a895b44f7fa0fca",
+    "blk.0.ffn_down.weight": "aa71975b6eb1f4c77b03d2ac4a194cf8d95718efac741bb12f0f3ff79a27f9bc",
+    "blk.0.ffn_gate.weight": "42967702fa0bc738b88dc50007ace26dbe74a5a9e0978124dd093f818241a9e1",
+    "blk.0.ffn_up.weight": "5282c8788b086bd30f46525e7995a17464882a72703fd27165491afdd8bfd4af",
+    "blk.1.attn_k.weight": "cd248882e64fd2c3402c44790ebe12440133dc671b6893fdad0564c461973adc",
+    "blk.1.attn_norm.weight": "ba84e1c8fd30af6ec94208db4078befac8c921aad3acb887812887f3282ea2be",
+    "blk.1.attn_output.weight": "2efa3ef7c5666ccceb05e339b83ad680cc0d2c3ec78203f5da5959f23a80e14f",
+    "blk.1.attn_q.weight": "5106f2e255358a1303c22e8b5f0ec044852bb30a866c52cabefd30017a7a6b7d",
+    "blk.1.attn_v.weight": "a211a634a1a5df1d5f973645438be0461dd922210f9747c6b04e386c7f1ebe95",
+    "blk.1.ffn_down.weight": "37093afe48d32c578ec956c9ed85242cd000d6aa979e60526aafa10c822dbb10",
+    "blk.1.ffn_gate.weight": "469860819e9159caefb1aad0bc66db790f3393f05fd87b08e52256a7ed256543",
+    "blk.1.ffn_up.weight": "736742c97d35d1a011f9cafd3c0ce947ad559bb2fba6da73c816f6bfd0fa9aeb",
+    "blk.2.attn_k.weight": "92c219d92804d832ab404bd6dc7339c90877bb7cf405dd030c121f8b27757739",
+    "blk.2.attn_norm.weight": "61e4466069474b76b6d1e702566420eb669faf3556b00ff7b824784aca13a2d6",
+    "blk.2.attn_output.weight": "d2fb38a2b2171fd91caf037faa585a62225819aa232d86fd4f7f9d2c3c8a45e9",
+    "blk.2.attn_q.weight": "f6faf5cc6844e3daa4f9f68d90f5458c64879de68a7728860e38374e30c3429d",
+    "blk.2.attn_v.weight": "f340ef8f7341d987a6f37c0e9afe0aef5be67be00c0ce5f57612daf73319cce1",
+    "blk.2.ffn_down.weight": "c7be61a701d779860b621b143fb6365b607bf99ec7c0f153b07908ac8120885a",
+    "blk.2.ffn_gate.weight": "b64f0878187bd3392abfa4c3e8ad2f8b4c133903e54246747ff8f3b4639ad83e",
+    "blk.2.ffn_up.weight": "50b11c712652e90ee7428dbb45cffebb80662ac982bc72bd9eafff361b5eb5a8",
+    "blk.3.attn_k.weight": "2b7bcbe9ee5c9c630c8c8d7483887e78b73581016f4cbb6933db2a147a25f431",
+    "blk.3.attn_norm.weight": "0181dac7f4eee7252980323e8032cf339bef2046ce0a16c0fd72af7c98a8a37b",
+    "blk.3.attn_output.weight": "aef8843b636ce231da9e7c9acbee197883cc15df0e2887709324c6a50f16da7b",
+    "blk.3.attn_q.weight": "55404130fa10e81322d33eb378aa0de31a92990ce7730f1338c0ace0406bb1b1",
+    "blk.3.attn_v.weight": "76f7fb8040d82b957d689ce34fea2302a6640ad5bbaa0052ad2b7ebce270c33d",
+    "blk.3.ffn_down.weight": "648628933eff3b357c3729c33c5b1ae51c28e59b9c19acd1601a2ff7c5d5d9a5",
+    "blk.3.ffn_gate.weight": "6a588885d16e98d5f50ebed05af089154f680085ca9c97691e5b489088630a4a",
+    "blk.3.ffn_up.weight": "e12455a1d702f4986e1a663493e3d5102b367af74d45557522002a35d63ecac2",
+    "blk.4.attn_k.weight": "40d943380a8a85e4eab147934bf6e16f23cc8ab753f6636526382c074d182288",
+    "blk.4.attn_norm.weight": "4ab2c098983d4599fe540eef624c4df954adb7473faebda7471ef0ba4134814c",
+    "blk.4.attn_output.weight": "d14b91e40f58bf4a3c8c2eca0b12bb541de406574af39027d56f6c588a147082",
+    "blk.4.attn_q.weight": "e1224960a3562107488589f883fa32414bae41712fa8dbd47c5f3e3a7801452f",
+    "blk.4.attn_v.weight": "063f297bc4aa6e709fc32c4c32e35af7d07d80e83cb939b76adbba858006c03d",
+    "blk.4.ffn_down.weight": "f88a18020c5e1caaa29596895eb348e76ee5bfad27ed57651a86cd8cd1f9b5aa",
+    "blk.4.ffn_gate.weight": "48e7e1eed3fb52e92e61d3557dd0ec002418327090e034ce4322fd68542266f8",
+    "blk.4.ffn_up.weight": "1ca8a7aa17355b6ce0d9ad5539fdad3899fa47fd359c285fbfb31f19f47bf073",
+    "blk.5.attn_k.weight": "2bdf15f8e73d068d972380f25d207004cf0bf3b5bfa46946803ba6fba07d9175",
+    "blk.5.attn_norm.weight": "60448d7cde6e1b6467aa31bdea012e39cdb08c88081cee7d102dca4f93f766ef",
+    "blk.5.attn_output.weight": "f9f687d7c457537f9fca8a4087a59f1c3bebfaf5537b94e42c831a13224f7799",
+    "blk.5.attn_q.weight": "987db7a2ad68657a92625e1980effbb1f79697c2183f2b9f3b3a0570c51b0ab9",
+    "blk.5.attn_v.weight": "cf696891148f3e4783ad1d20f93462ae091eb8651c656bba9b662253b6263e02",
+    "blk.5.ffn_down.weight": "c0662b0bd0929136005fb9d691fdd9b2c33867d9ce9622339a6a456b720b059a",
+    "blk.5.ffn_gate.weight": "200bbdfab615d7a3a84719b6ced7751e3ce52757ef212d96f87798bc1de5e987",
+    "blk.5.ffn_up.weight": "df5d23e7e035fb1b9d163da7ddfdfe38da6a37e86e96534dc02ad20f011b55b3",
+    "blk.6.attn_k.weight": "c0dae2d272a7c5a2fa004bbb8475dbab362fc1f6d008e73d5a4434a9382ac6ba",
+    "blk.6.attn_norm.weight": "51c57ac8b55e04354d5dca6bb9c0cf4177639d3b038e80209e33036209688f64",
+    "blk.6.attn_output.weight": "229d97892c62f85bcdf431675250e01c976ad69ffa450b01fb543bf88f14a2fb",
+    "blk.6.attn_q.weight": "c20e49621821bd46ed156e6823864a5bda4f317750e71ab8dc54e44eb48cf7c2",
+    "blk.6.attn_v.weight": "53ceb1a2ee43fce3c7b5b33c58a9fc5ee7f44dc1c6f29bc9dbefc37582102dc9",
+    "blk.6.ffn_down.weight": "7923c943b7629d560a032d1efa210d1d75c6692140f1be94464ee7ed24f44ed0",
+    "blk.6.ffn_gate.weight": "57593d350361af753a6a39f53b066282634c0fb44f396f6f2966a574b01d8f8c",
+    "blk.6.ffn_up.weight": "327b6a7a387098b8899d3ded04a4d4e7c658ca61b80d4e7b17594be232721602",
+    "blk.7.attn_k.weight": "9ca48b87a10116fd8868e62b76f211d4bb91f166096be9061439ee2e1c3a5c20",
+    "blk.7.attn_norm.weight": "cd56cfcc4e2ad6b96e23ea7b0d32b4caf236107d99a0b22c56760b62e63c8cfd",
+    "blk.7.attn_output.weight": "7352b509a03cae2491ffc060e577d189341a0f861233f18c96f9d275dc4234bf",
+    "blk.7.attn_q.weight": "2b3791c8c008c33ddbe12bedba8191322ceea2dcce5cf0eb7a93d40ad254e672",
+    "blk.7.attn_v.weight": "3ae721d52466487a3d48150581e57f6d64ea1e83ab929f23b28c3d777422eeb6",
+    "blk.7.ffn_down.weight": "3b6fa8ececdb3c34af3a5363863d6f94289c1c95bf47fce3a3ddcf184c5f0848",
+    "blk.7.ffn_gate.weight": "dbd7df6c5ae5eb4adb859f0d36453813a4e289a359a1ba8f72d67fcbf21c3e22",
+    "blk.7.ffn_up.weight": "de68380a334b4c5cfd4c318b0e9854aec59bd79aa0f0c30af3f56414f83482b0",
+    "blk.8.attn_k.weight": "7303c4e4480abc72a7ee271811311199245fb5c2ea27a2bd3b8cad3a53a03c27",
+    "blk.8.attn_norm.weight": "2e3d1921898d1b943ce1a1b6818546c8b471d6d542da24f51a8b514b8c3dd4ef",
+    "blk.8.attn_output.weight": "30421520887b66bf97a18dbcdc283bc8d0b60590b612fd638a319a6eae923227",
+    "blk.8.attn_q.weight": "73e064d5433c9b500068a1c31744dbd53f4ade298fb450a0e8c97f62cf1f8a8d",
+    "blk.8.attn_v.weight": "27e21f8b9a9a8533e8178ca34a72aa1d786393d57302b7806dcdf3e51de511a8",
+    "blk.8.ffn_down.weight": "bf694bd8e00047982108000e7b3dee7b225db8b19abc595e5697b6bbefd92e7c",
+    "blk.8.ffn_gate.weight": "d55fdbf8606d9141b774b0500c58944fd1253b9e69d1f765eaa9a680b9f2ca40",
+    "blk.8.ffn_up.weight": "1ae3f580655e7c8e8dd6c34fa4ac574fdfc5e3f1a8536da0c5442d3a2976f0e7",
+    "blk.9.attn_k.weight": "b18080626012d8aabcf78542d6c7bf31c712bf55a70172fbfe173fcf34481036",
+    "blk.9.attn_norm.weight": "2e3620620dc09998c6d3063a7d5de5433fbbae8c11e5b00d13f145d39140e162",
+    "blk.9.attn_output.weight": "69c3c0e27ef1c0fc933eeb7b612b70909f18cde238873c0d576a2ba9714ef174",
+    "blk.9.attn_q.weight": "68330e5aa28a28873c9a6e67f032186ef651df2df5844e0f27094ba349fbe4ab",
+    "blk.9.attn_v.weight": "3df8d45a102be082d0793a51cb82aa62a43cd0e9d047ba4115ca0f2414b39325",
+    "blk.9.ffn_down.weight": "1d6cc162b73745b135b4f040a0aac3c06d5135a3dc5b2421e7ee2af48662fd7f",
+    "blk.9.ffn_gate.weight": "034a9d40fb1e32b534b45f4bccd65cbe43c4a6a3f5d01132bd245ca0005de5fc",
+    "blk.9.ffn_up.weight": "c838c38d0e1a0ac0da17eb2a66023ed31929f07d8fcfe1cc546df26096c91f0c",
+    "blk.10.attn_k.weight": "a78507cb72f744b86ceaa032596e74e5571c822d0226d334881169addb32cbd5",
+    "blk.10.attn_norm.weight": "35f48d0b28ee0e6b4cad4e983925737562d64824be5b168b3e26df3d6b260cf1",
+    "blk.10.attn_output.weight": "53712db06796de39b131323e7abf9a58551b6d52da6db66a471580386d396252",
+    "blk.10.attn_q.weight": "efe08429ba196026b81cd1c471e1c7418afd9e966659feb3936b674aa0803b58",
+    "blk.10.attn_v.weight": "7ec6055e134f89da0cbe79ec9f13ef2e442ac584b1f03c3e13e7d0cdad0078bd",
+    "blk.10.ffn_down.weight": "37e66af4bcd1f3079e841e892255b8255070655901864ea3a8c602a7f681a640",
+    "blk.10.ffn_gate.weight": "1825282bc34830d371c6edcc3c1e73e6ecc1e10f4aea0122dbb7acc1d6f7b1bc",
+    "blk.10.ffn_up.weight": "819b3b276a4d4c14a35ed6682d5ef18a5e8ed468e5ce3f12e8c75ec18ac20ec4",
+    "blk.11.attn_k.weight": "5327e6a2af82dfff0619a14971f5864a15553c36fead84e1af42c7630f2729c6",
+    "blk.11.attn_norm.weight": "fec363b3c4a43036d2c635fb8aa9e122dd87ee79811839f2f6cd955be3373e7b",
+    "blk.11.attn_output.weight": "ccf7b38f18ee8798b8a6a35018e2df3eb3e007de62876befb68025dd66c79763",
+    "blk.11.attn_q.weight": "da8c4a1c824ffe174e39f126cd72f7ef83c56aff1259d452a1212de80f98f5e9",
+    "blk.11.attn_v.weight": "d17ae6bb77f03982b55d341eb67acb5969e9ad3da5994b96eafc09793dcfe3a0",
+    "blk.11.ffn_down.weight": "a6bac521e2791345f22c57205fa1c2f2f687794dfd24d0e98d50ae0d0eb6088a",
+    "blk.11.ffn_gate.weight": "5ed902c488cb51ba5635f3df08258c5f84f31a679a00211ea5f9d8b824ef6d9d",
+    "blk.11.ffn_up.weight": "ee9f1437eb890d2cf9df2574afa1cecf20aafdd847cd75b152d7eb74419afd34",
+    "blk.12.attn_k.weight": "5a069c06e1019b0f889088e67458f7a11ec77fa190ada6069e46211f62219947",
+    "blk.12.attn_norm.weight": "194d7e5fcc8c49aea62daf1940532419cf3c505afdce6be377286b677db5db8f",
+    "blk.12.attn_output.weight": "6534995fd4d6fecb55e317add4b1723aba4d825e1e9471d0b08813dfdc247176",
+    "blk.12.attn_q.weight": "4ab51ca519b5995581fa34f846276feca3b907ef2b51f192f6cc0b3263c3f5a2",
+    "blk.12.attn_v.weight": "5652ca3fa81ef9a1ac1543d71fc6813f8517f8ec54b25c701f6f98061614830f",
+    "blk.12.ffn_down.weight": "4b2c263f54c88516b8eb273bb8d9615b01c5c8b484dc70358adb91b50b300edd",
+    "blk.12.ffn_gate.weight": "8f50c3c3e3e8568991d6c1b0e74b500cf4f208e7700bbb8e87c3f6a6d359b6b5",
+    "blk.12.ffn_up.weight": "1c1a581fec1fbe959e1427fa513f400100b5e1ee9d83932630be9905fb49c231",
+    "blk.13.attn_k.weight": "efd7a38c46f08d8376d82974f33c644e3a02220e142d63b1704718699a8a884c",
+    "blk.13.attn_norm.weight": "d28fa4f1bd75abbd063b0e622e08f579c89cd0c0c5ce63c1952ec9f944f8ee13",
+    "blk.13.attn_output.weight": "71e0068a639288718bdb70a6cfdefd50bc8b3ec3993347a65129e70001ca5827",
+    "blk.13.attn_q.weight": "b97077adc92cff07a2e07d80ee38f214ad8713571c69cd5c70ebd43dc501ac87",
+    "blk.13.attn_v.weight": "79b3e2749ab4b459c81e96e322b215f1e8af645eb346e176c326bd00cf6ed2fd",
+    "blk.13.ffn_down.weight": "9f8687d11effa1db7cfecf7bec5631734bcf2962aad74a9f519144491e08ec85",
+    "blk.13.ffn_gate.weight": "7d14dfa0543852e7777fe8fff29ca533744cbcf1ebcf10067e5adfc4eb345e65",
+    "blk.13.ffn_up.weight": "852b9527b97fdab211ff3f832a660ee1d93ccb56906144c50f01319a6e8ee615",
+    "blk.14.attn_k.weight": "79e926b20f36f66d58226cb358881f2f68ae7b468787d33cafae5110287a14a0",
+    "blk.14.attn_norm.weight": "97d481b63deb0df6142c2c6cd23043720c62eb609e390f47a7113751c79974ec",
+    "blk.14.attn_output.weight": "aa6e94d7176d5c79fbb89b96e5f13ce75702ce3dd23ee52986446da436a6c3d6",
+    "blk.14.attn_q.weight": "214becb6d1bb460da9fb8ace0f99b9a5afa9edf7aa7acc19606c7401b11d6305",
+    "blk.14.attn_v.weight": "488b0e6d7f1a7a2ed0972aaa6d10ef9c775ee5373460324efcf5b3e3da9311df",
+    "blk.14.ffn_down.weight": "29c7ad16cf9542e30996a1a01ab95b844533b28051f04cc7949c371afb796471",
+    "blk.14.ffn_gate.weight": "b7ef208f2b054803665b377f5a5980c122c026841809cf855c6ba06d1c3a885a",
+    "blk.14.ffn_up.weight": "76a5cc28100748d79c4398ce7b9176aab4d661548b6293a82f99144812e5b70e",
+    "blk.15.attn_k.weight": "a6b8f9e98ab878fa7ebc5d080978ebf2d050acc2ab2fa8ea9188eb10e27702c8",
+    "blk.15.attn_norm.weight": "a26d07a9752d6dccb68e3a8a2a49fd0752cdd0a415e05547819bc37d9ba63d5e",
+    "blk.15.attn_output.weight": "c63616c69048ccbee801e05be4f56d21fda21aa0cc470f41d57c31b4d9283a4d",
+    "blk.15.attn_q.weight": "fd595a67bf96c6ba16eb148a9d02fa52fa3c1d33ed10be28a08f851409fd6e64",
+    "blk.15.attn_v.weight": "1c5c9d33fa07c05d5f4ed0032c6c4aa83d863f0d31c94a66109d239dcd03cea3",
+    "blk.15.ffn_down.weight": "585ea62ab8aff7d7d212ea5c1a03226fda6b68370c890b776834af70c948dcbc",
+    "blk.15.ffn_gate.weight": "a13c63f86f879b03a573d5dd2a25cfd1f4dc73e8132e6454ecc23e538b4cdf6f",
+    "blk.15.ffn_up.weight": "f7112450f57c12fcd511f049e0dc0b541625a107a7901c3261ed9e984299f65c",
+    "blk.16.attn_k.weight": "2d2c8b11dd71fba6d1c106aa1673c113a5448653cca7eab897c8739212ed5003",
+    "blk.16.attn_norm.weight": "95c2ec7be9469690e18a9a1779684acb3e9da44b13e263a0da840305646fbf8a",
+    "blk.16.attn_output.weight": "31a65046e677f54dae654ded4e733479fcc0f7283d83076b7dc7cbcae8528230",
+    "blk.16.attn_q.weight": "bfc6292b9c6d49b7118d08060242a138182eb182d136ba5dfaf469437c16081d",
+    "blk.16.attn_v.weight": "68f81d037340217d87c7853ff4d6edfbc46d9e827ee6d5bff7c3f6238e3a95ad",
+    "blk.16.ffn_down.weight": "bbd6629691950cef4d5113e1c6670e91b216a9b872cb92cee02dfda4d6c4f7b8",
+    "blk.16.ffn_gate.weight": "63cb56f282b7401ed6c76e5bb6fdf1bf68a64f9af0c82c014209b55bcb5191d0",
+    "blk.16.ffn_up.weight": "b54f39a2541063cbfb6f713aa81c3b69a04100e999aa2ebbeec195dc382eceec",
+    "blk.17.attn_k.weight": "3d9ba49799cc56664ec30a002bcad61eb651294212a68c3ddb573eb042aef5a4",
+    "blk.17.attn_norm.weight": "42ee0db4b9d63257bca0012a30b12737ead1caafeb5ed3d93c8f48ffec4b46de",
+    "blk.17.attn_output.weight": "a38fd100f05c9041c592bc739e287de0b10d08ef2bda41a879225bdca9002f71",
+    "blk.17.attn_q.weight": "8a3bee285b0180a9eb35662e449ee4cbe16d992bdd48fb3a94bc4a347728cfa2",
+    "blk.17.attn_v.weight": "d7f8f1b8b863494ed4392a1656775912e9b264ad36016547b12e832a1d6757d6",
+    "blk.17.ffn_down.weight": "bb7ee58f61da8630972e25b621996fbe8ec06f4dc9ab1e268ab5b120c526ca28",
+    "blk.17.ffn_gate.weight": "6b652dbf167fee09a45ebfd78d500ff6548fb2756dbe5343ffec3f7e6207179f",
+    "blk.17.ffn_up.weight": "3b67f727e55e742715de978fab80457781e7a3762bc48f79d13b45dcb8de664c",
+    "blk.18.attn_k.weight": "ff7fe57c57b90c6fcc0aefc39ec24593c3a7d1ea1c23770480075a015450e0f5",
+    "blk.18.attn_norm.weight": "1d40faca082d2633ef0ccf19e121870dd6c7c3e2154607c7f3543fa96e99cb2d",
+    "blk.18.attn_output.weight": "9adfecaaa397a92db4687efd5fcabfa0daef9e6b0493763b7ff5ebc185c43a6c",
+    "blk.18.attn_q.weight": "ad1803eb9b291948639277afe981e666b07167eb3fcae903ba5b73bf86d8f50b",
+    "blk.18.attn_v.weight": "308cf23399adccf27401a4ab60d74dac6fb9d4cd4b9c5940d9145118d1881b34",
+    "blk.18.ffn_down.weight": "7de4ac9a561fb580619b745687dfd7ca8a69ef70471dee978741b80e9ff7bead",
+    "blk.18.ffn_gate.weight": "0c66970f696b33bd5ee8f1f2fbcb41fd78fa5ccabdc927e11a4d5a4089f19c69",
+    "blk.18.ffn_up.weight": "66a42e988e8a1f468fabf976c48e9e4bb045eaac6916ef16555ac101cd674abc",
+    "blk.19.attn_k.weight": "a928ab50390bacbcebe2e4b66922498134ce22d7b93beaa87d6cf4ab52eb7174",
+    "blk.19.attn_norm.weight": "b4a02c55b46c2a96aec9c64a254087cf48e6c1d4b6f31782c77a46fc4daebad1",
+    "blk.19.attn_output.weight": "b768319c641dff1eac5d1f8ceb960c9899c795bf2b24c1d6bf70aa24fda45f77",
+    "blk.19.attn_q.weight": "79ef3f57d187d3954a26362096e1b6c222d76f537dff73e034d6e9999935b8bc",
+    "blk.19.attn_v.weight": "ce13d6b13e24fcb2d5bc6a2662e5bd295b31b12db10a6d0307f86cf29b8d5001",
+    "blk.19.ffn_down.weight": "cf90d7e2137482cfd50934a8223ad774621d08554969da80a9712df5e6227eb0",
+    "blk.19.ffn_gate.weight": "71ce30150f003b6eeb3bf7464e05b6ae615f135110d8e47f0a47fd973e537c0f",
+    "blk.19.ffn_up.weight": "7f92aca0cc29866633feec701ec01a85a8ee2fd4e2b9630173a6cffb1d9d50ee",
+    "blk.20.attn_k.weight": "a2df23159d6fb74ef28e14b61028fe8b00a693a2fc9234a980be74f20b958682",
+    "blk.20.attn_norm.weight": "c6cd5f1b096fc5efa4eb59ca1c8c4bd28730f3dcedd59a63601663eccc6724ed",
+    "blk.20.attn_output.weight": "896a8a166d0f006d4b09867ae4345426303cbc3fb13a18d3d4e1bde00f16dbdf",
+    "blk.20.attn_q.weight": "01eb79588fe61baea0da43e99f4dc5939590e1bafd01e12dadb8326f102bfea2",
+    "blk.20.attn_v.weight": "bd39630fdd5a7c859ac1addaf53e63faf524c3f32f5f4896d86b6e746b1d5c06",
+    "blk.20.ffn_down.weight": "0304a5d39957a0e3f031c4bcc4549a135d396c8d97c8d276fd1c823ce86560c2",
+    "blk.20.ffn_gate.weight": "117b79d595b1dca0c8b37586beaecc4d84411507276212dc286cde7fc36c9bef",
+    "blk.20.ffn_up.weight": "6e799346db145c125f01783539749d3828fcc451cd4f10c5352f047a47e28714",
+    "blk.21.attn_k.weight": "1c37e4c0664147e775bb006b226b9553e3421140cd96288ea755f81731ab80ba",
+    "blk.21.attn_norm.weight": "00ae783a29000ccda5e4bdbff03df0752fb82805dc3f9b987500ebd80714476e",
+    "blk.21.attn_output.weight": "7588b84f9fb19f15095b5265c60b4a4e7ae74bcc47d4607dfa5d0bfab6f136cb",
+    "blk.21.attn_q.weight": "a65f1c0dd06d45bb97532d3e932689c1eecfe7359089b39174a96a149335cbc1",
+    "blk.21.attn_v.weight": "4220b77e7d5e8709b4eef33a679b5dad11f297085ef44c9977f9e54ef08f7a2d",
+    "blk.21.ffn_down.weight": "b8c082a0530d4b5328e67db0df84c5498f2af956de23c639fa0198ffea853950",
+    "blk.21.ffn_gate.weight": "cd1b656ee72d00e9835ef667c19ef89a88de261eb8eb7c0e936e0f9ddf83ef9f",
+    "blk.21.ffn_up.weight": "dc445f73e36ec7a3bd86884186b728f8e0187f32848c3b8b69d4d41f8571bf31",
+    "blk.22.attn_k.weight": "e37cf0b893ec8b9ee8c78dd139b8d9c45cb997a3bc0c3d93a70ca1c3f6af8859",
+    "blk.22.attn_norm.weight": "248a27838d3c46cc03a5c312facc84e2e0e2c990ef8401e93da25918497f88d1",
+    "blk.22.attn_output.weight": "fc191a18f6d18332c66761f7ab28008bfe295dd1f5c8741a2488442f9e00d0f5",
+    "blk.22.attn_q.weight": "4b193a2ab8bc2b085db18f2bf3eeba26e02b537b2cdd738160c8f14b165d0f5a",
+    "blk.22.attn_v.weight": "7a60ce5ccac7e045e55ba1e1e85bd2a0f93f8c781daee96c5223665e22f0c666",
+    "blk.22.ffn_down.weight": "e0a34fb4244e2c7168f3dbaa1904c15d339ec39999cdf27128bbaf619ee0a237",
+    "blk.22.ffn_gate.weight": "8bac872d4b8549c8812f927efa309f1792b524f33601095fff61b826de5a5615",
+    "blk.22.ffn_up.weight": "b67fa2b94dd901b6ec64c0853ce8ca2d86fe9cb1cc6d2f15fbbbe0e691c0c648",
+    "blk.23.attn_k.weight": "2c32e66ad01942b819ac09a197c71579fe66f02226a264fdd72ad1e02c67a27e",
+    "blk.23.attn_norm.weight": "825fdc94deb439cb93c713eeb077c1052b90ed658d6d464fc4ad3d611e911d48",
+    "blk.23.attn_output.weight": "95ca6707a95b8750b0c7c5d379d368f0f2e7ebef631954e7d4d8ec0f41f13a3a",
+    "blk.23.attn_q.weight": "6eccc84faca5fac015d1b26e2854501edcfd292a302228fe14cf99f5eb59a34b",
+    "blk.23.attn_v.weight": "b343ac3d226040f1033ee049668aa1d89b1774bc18431965682e5dbdce78ccdc",
+    "blk.23.ffn_down.weight": "9fc599befea8d3b1e342d564a110074f66d2542df406c4b90b6bdc5828fbb2b2",
+    "blk.23.ffn_gate.weight": "488556c1b0c9f0b20b0c99b4bac2e0f4046b81edb601d7b91e7e5b3bab47d667",
+    "blk.23.ffn_up.weight": "1088e291d7008dd9c7c2dd6830af686a8a84b724d123a016209bd5156d6898f1",
+    "blk.24.attn_k.weight": "a923fbe35e61e009a53927d7828818e0592bb737d6a1106c4b0b5a1efc367e07",
+    "blk.24.attn_norm.weight": "9b51aaaa939cefafdd9b13a7e5b74ac7fa2d603427e55a16a909d6f3f353750a",
+    "blk.24.attn_output.weight": "1beb2baba56f8409466434b037771248c2f620ec5f53e15f44c271d5a2d9ecf4",
+    "blk.24.attn_q.weight": "4b0194fe5bfae0c6bf6131dcf8cb6e2b994f6ea10b27cb03574f0f4f8cc0c950",
+    "blk.24.attn_v.weight": "6ac34b1ab0f66226d85bca1194a7c212cd93d384ecbc8b8395de48aec0970a61",
+    "blk.24.ffn_down.weight": "5508f74cb732a662c2936b32ac5e90742d172b9f961a747b0e5cba0e5906a89d",
+    "blk.24.ffn_gate.weight": "095e39b8584403835f9bb1ac33e0e81f54175575e4800273d281b845bff381e7",
+    "blk.24.ffn_up.weight": "2d43ec21637dda12973de367b0113ee9840b0d815bf6fce042f7c3f270b0b530",
+    "blk.25.attn_k.weight": "9e2aee029f3d2c7f67dfc7926e72c8228fb978382c8e5a4701bbf82c93801419",
+    "blk.25.attn_norm.weight": "220cd7164fb4cdbe22d26058e4153b26c27c7b5ce2bec8e95bf2c0ea08d23103",
+    "blk.25.attn_output.weight": "a17f4a5dc6aa51f03dbd75602d98e9491767c205cdc2c3a5f8667fc54bbf7c64",
+    "blk.25.attn_q.weight": "f60827496835c440c794bf57ce9780704d10a59d8229886bf75ebb18900ba4ef",
+    "blk.25.attn_v.weight": "9cac217e9e9f4f4c85f14ee51165a77c580165bd4a34b202389169bbe61a1ced",
+    "blk.25.ffn_down.weight": "a0f36949b663e80849581dfb71e7babcc73580793bbcb0c80ab26d5a6e000359",
+    "blk.25.ffn_gate.weight": "df4d1be4d50d6afe5ad3ef0d0e0fac76a33e85c963dea769641d612dd53e7d13",
+    "blk.25.ffn_up.weight": "992da76be762632e25ebc5ef4d03728eece1b43f7c4e31827df19ca724aea694",
+    "blk.26.attn_k.weight": "34199ff856ac32a500c754539d070258574192a34ecba87a182897cb59fdff52",
+    "blk.26.attn_norm.weight": "a8e9dfb2dae5d22b5c0aec5f3675991c0e3c3e6a44153db2579136b73f456e00",
+    "blk.26.attn_output.weight": "1c4f257ffb0d7db0f11cfb275e38b4af736917b43ad82de1badce3f1d227da4d",
+    "blk.26.attn_q.weight": "33d55786274c2e718cf61e8fbecf3dfa5ee0c208f0b716d42b061f55459acb3c",
+    "blk.26.attn_v.weight": "684b636939cd4ffcfec5a6238a0790ffa43d853c95783af9b9e8275e74071a7a",
+    "blk.26.ffn_down.weight": "89d0bf066db154e6d312b5433aed1714f6a28b40f4c52e3e1530ee07703303c8",
+    "blk.26.ffn_gate.weight": "393d649bebe5e2940e1b043649f6c860b4b8b9f380f30e9da1744a830f358156",
+    "blk.26.ffn_up.weight": "179edc85ababd9d8440cc6093eecd1004290aa1cb96434b26ecf7585b6cca17b",
+    "blk.27.attn_k.weight": "334841445a7f1e14731b08f56eb0b1f0938c63823d28bc6d078c4c5f05b36f19",
+    "blk.27.attn_norm.weight": "57344471bbda2e9deffdfdb2dd05a07aa47f8761e24de53525588639145bf551",
+    "blk.27.attn_output.weight": "506126af9ee54b535d49f97e36f630e74834f480329f098d6d62e96246d8d65a",
+    "blk.27.attn_q.weight": "dd984df1acb4783849e25ba7ae378bfd385cd9efc540fb798cd5bdd873f0118f",
+    "blk.27.attn_v.weight": "b4b3fe9a4455d34c297ff20a2f537b647cef424741d840a747b265f23d320ac0",
+    "blk.27.ffn_down.weight": "621fdb185ba0d35ba5476dae73d2c81ec1482a0e878d5bfd5c3b29fe837af013",
+    "blk.27.ffn_gate.weight": "e4fbab45f2ec506fa374103251a0bdb7baa6f576080bdd796f3e9db92098e08f",
+    "blk.27.ffn_up.weight": "a0c57e463e988002bbd6a6c6792baa21a65e6f89ae303a2c301951b0ae6e4bbe",
+    "blk.28.attn_k.weight": "bac36cbd52ec5056841663865e1291ddab4b47ef9a2544dd285d4503bfb0e4a0",
+    "blk.28.attn_norm.weight": "5774a9df2bbb2e86d1f70179c7b92d81e1f401160148b3328fb64db6646a5425",
+    "blk.28.attn_output.weight": "e8712622d1569557000c75f26c3f55fad267fd300463c2c2cfe3afbfa1c8f908",
+    "blk.28.attn_q.weight": "11677751fddee52cc739699c02836f7be54d96038be4240be5d4f53d00161608",
+    "blk.28.attn_v.weight": "e5ee459b8958d65e1445997b9aa1e90e2f5d17761ebcf5357313119a45322507",
+    "blk.28.ffn_down.weight": "3934518f9f85292da8475fe38a8edcbfc4e24ac56c351b472d6351f98750871e",
+    "blk.28.ffn_gate.weight": "6ba735d57e98d0847e487f25ffaa25256deaa8abec76f428cb70bd9774279d83",
+    "blk.28.ffn_up.weight": "977fae6e1e5353114fc645dd98429464749758765cbc6e6457593d596e57850c",
+    "blk.29.attn_k.weight": "8122a457307d580ad6f1e0acea09a2f593d97f595ba0d6737f5fea16d2433642",
+    "blk.29.attn_norm.weight": "d626f721e05aa1202439b01027031d4caf1adace61ed37870a277cb6297c77cc",
+    "blk.29.attn_output.weight": "7fb7122ab1b6b1e6615ca746897da27bc52c92cb70d3147183cdde61795b72b3",
+    "blk.29.attn_q.weight": "be43e94ff6b6e391024dc824101efa0ddf4005d5b002ac26cb03765c0c73c2fa",
+    "blk.29.attn_v.weight": "af93c85ebff908f74f9935b81bde0516ca487c84139868a1ce079c3ae20036b1",
+    "blk.29.ffn_down.weight": "39dae12340ed3120bd19c495fe0872b559613641e41fde69d02d8631900b84c0",
+    "blk.29.ffn_gate.weight": "36fd482439840ef197c9f3b8905d86acfcea49bcf018544106ca465d4bf8d5c7",
+    "blk.29.ffn_up.weight": "5243fbdfdc1e2a1dd84b6210a9869d18a014db9088897e345240cdc99990bd5d",
+    "blk.30.attn_k.weight": "948f263616bd3788b2b968baafd69b9c5bd1b77578665f096c4b7e247b4cea42",
+    "blk.30.attn_norm.weight": "e168df981e744874ff303faf2eb470e5f6868c2040ba5f383f6c5148669975e7",
+    "blk.30.attn_output.weight": "4cf0ccca04b792573b756655a24fc89cfb1f272da8305633f0bc66ef14990b93",
+    "blk.30.attn_q.weight": "21e07d6cba6c50d65350289258209717174a13c42be57e8141d69712cbaf32c1",
+    "blk.30.attn_v.weight": "65a8ca29c7237b3182ccf03e2fc94e84f9a53d0e160fb679ab401c853170dd9c",
+    "blk.30.ffn_down.weight": "8b00500a6d00d84058f6658ee1d6f06fb4fcae2f90d4341792259362923b3c13",
+    "blk.30.ffn_gate.weight": "5bc0e19ab7a31b50ac2118ad1b36e31055271a322cd8ff661d47c3ac0210703c",
+    "blk.30.ffn_up.weight": "f37a0561955725bd59ee2d064fa9f4e00a12a1b620b624db3bc3add5330bc321",
+    "blk.31.attn_k.weight": "9a5663edda227f5d87533897146764f8e8a7481b9e71fae197c39204f8463221",
+    "blk.31.attn_norm.weight": "060a4f438a1ee5e220b5b5278ad2f5c085a428bf38c515766781815597c87529",
+    "blk.31.attn_output.weight": "6ada5d3cad9dea4780ffbb43302bb6ccc2f24eddd0fc4f5f84c9ce0fc0c6e5dd",
+    "blk.31.attn_q.weight": "bb5d08c08603907981ad388d5d8b70fcc9b98034ba264b8474c8890cc0297af0",
+    "blk.31.attn_v.weight": "e01b4252ea9c6a889c32b21144b441a347464d04536ef4f6572425be55759796",
+    "blk.31.ffn_down.weight": "8ba4d679c36e93ba65ba03180385ef35ea86b3b7cdf2fded9df59369f1c09630",
+    "blk.31.ffn_gate.weight": "e5b41dc93645f8b5e8eebae3ada3ea43a18f97ce2654228655170b07b463ccb0",
+    "blk.31.ffn_up.weight": "25b88cdddc8b547af294ed107d3d1312e90b983cae87936fa6062ecd8ea02539",
+    "blk.32.attn_k.weight": "4bcf86dc0858c8ca2fbdf6aa76674d43eb698f78979fdc1a38f556a7af1facc4",
+    "blk.32.attn_norm.weight": "cdcc12f3b8b9773c6722736bfb748a2729230b21478cbcc4104859d3148df815",
+    "blk.32.attn_output.weight": "d43f1196822995ed89a9365c97054753a8b30ce20b6e273c8edcc42673a1e141",
+    "blk.32.attn_q.weight": "ebf2972bb3865cbc5be4840113a322089752038344beab2a0122c7cb4fb399b6",
+    "blk.32.attn_v.weight": "714db81704ff34fa137512903c1013acee7877467473e46600728b9240582eb7",
+    "blk.32.ffn_down.weight": "2cde3da1258bb170a79d5d3cdfe10c86a71eb34b77da46b74c5ed71e7f4fe274",
+    "blk.32.ffn_gate.weight": "c7e1ed792532613ff9d4e5834b6536e2e0f47df2303bc0fdaa90aac0c1f4e8db",
+    "blk.32.ffn_up.weight": "d8d6f13fe66a716e28f79101a29817f0c0d6f99969a6f017d51bafd1a16c600c",
+    "blk.33.attn_k.weight": "a0a28f6cbca88da00cab2ca37094d9b0503bf9defdae77b91895b911c408cbb6",
+    "blk.33.attn_norm.weight": "0251200c24cc8445607ace6dc8c5aa0566567997262b7cca53a11ac23cc564b2",
+    "blk.33.attn_output.weight": "b2423205bdf6a1096d43c44d8d12f1a84fcd4e1bb70fcf6dc8542b8b8a71a13c",
+    "blk.33.attn_q.weight": "00b425c3ef71065ce5e0234e702bf38143b4952da78a85f52ab2c2e3073d97ab",
+    "blk.33.attn_v.weight": "035edd2335df816c42c765a5e66b9d9b9e15a822a8dc1863508145499c942c14",
+    "blk.33.ffn_down.weight": "4894a923a3db75bae4496ba3ce5f28796ad31fe33996a066271fb8654964310e",
+    "blk.33.ffn_gate.weight": "8f6c819b8bbfbe3357fae89e1ac5a3d58be85b3b04be3bacf7b62775869046ff",
+    "blk.33.ffn_up.weight": "257c3544b5b544fd5d839665bf5caf107a329b59dbc3751efcaa24ae63c56179",
+    "blk.34.attn_k.weight": "b6cd8bba892e38dac4a2ebc3ba1bce49e71b967fc436fde30c6d76f54a18935f",
+    "blk.34.attn_norm.weight": "2b3c8e60a064cba9955752bbbbdd92c71ba5c2f1bd721097bdbe88b5abc68787",
+    "blk.34.attn_output.weight": "8cc272551c9aaca9db5a660c6927bab94a0243d74a30b2bc165f06bd577714ea",
+    "blk.34.attn_q.weight": "74b561eb4792484e6a94b58fe2583848c3ae28ff2f1bf3d02939a0cfdfa49990",
+    "blk.34.attn_v.weight": "dba19e24ff05154dc5a1f55c023729303a583d13d68732ce22ea74d4410dc8f0",
+    "blk.34.ffn_down.weight": "76eca5dfeb274c35774e0bf9f22ee420ed9085c8e99aa2cd5a236e4918b44c61",
+    "blk.34.ffn_gate.weight": "9af0862d5fcbc24732846488e653db8242a467765c0cdbc00332b3a40256b4a6",
+    "blk.34.ffn_up.weight": "2a03126bf73587eaba99ece2066103d12e47bcd4ce30ff6c17b2f383b81d40df",
+    "blk.35.attn_k.weight": "52513fc0cd4e997a842729af7d21dd09399bce0a339558374738be266d0fa2f0",
+    "blk.35.attn_norm.weight": "e5281fa911964263ccf1630b14762edbd41d0b9472d6ec695fc600fed4892c35",
+    "blk.35.attn_output.weight": "b391d6705d5dc6f48326b5fd16573f679edf64109d86fb729a498819676590ca",
+    "blk.35.attn_q.weight": "d16446921966db9b0e0539626ad22a2511ace780e59379d6a4162d8c5441440b",
+    "blk.35.attn_v.weight": "9d8cdf23ffdb0c5c74106843390b94b24c9f33ef0eb9998d39f78c73390101ea",
+    "blk.35.ffn_down.weight": "938eb6301f7bbf162d7dd965682a5ed11d0a4a530c6fedd7e5469ce80012fc17",
+    "blk.35.ffn_gate.weight": "5ad84f5a0c8edcfea1ecf1a3e3d21d85ceda0c4ad9e3c6ca68885eeff8ed3c2f",
+    "blk.35.ffn_up.weight": "1c4330d9dc71bf4c98812c34356c51f520f47610a534152aa6d29284b758090d",
+    "blk.36.attn_k.weight": "ef720655e5ca2465f13db2dfc4732fb4ef2c9d53acde52f514fd4f301e974081",
+    "blk.36.attn_norm.weight": "88f4b9310b3c8c2644e3029160cd35678c79dfa59280430e03f5c29a6fe84a58",
+    "blk.36.attn_output.weight": "aec6f915fffd7bb72cd783273e871b4f09605950089d45e72059d1316b6c4b01",
+    "blk.36.attn_q.weight": "72f9408a2405d42f8db6ce5fcf1d26a3660b6f225fc60e77d0277109cfcb82ed",
+    "blk.36.attn_v.weight": "0f3b3d851dc44b3893ef53f6cca5b4acc9658bacfe1cc2d13c3d704ddd409b67",
+    "blk.36.ffn_down.weight": "470aec48ce8c5129a6654d9fd26fcae72776f9fc1429a8bb05818072a876475d",
+    "blk.36.ffn_gate.weight": "7f5f296d09cf55679767b5d15de3eff489c456782119f25204be4b1647f18dcf",
+    "blk.36.ffn_up.weight": "b7ef74a1f7ffb4982711d93f1787be3a70edc3d2358d5203c41d8900508037d4",
+    "blk.37.attn_k.weight": "c4ffa5412e4ff2dcfe1aed991c1f54169fd171a4c7638e4b9f21a1ca64c5e1d6",
+    "blk.37.attn_norm.weight": "4eb6c888d841cccfacf5b963f8611120f6ff24b84af0b5714fd9ab36dcda422f",
+    "blk.37.attn_output.weight": "db2a7bbf9682f9f6eea672dae8e150738f1bf74dbc80edc7022017a3f040c8ac",
+    "blk.37.attn_q.weight": "e38c0462aff139afcbab289189823527e453abc9e541154adde5e7af88cacf0b",
+    "blk.37.attn_v.weight": "952eb2492ed452a72f96bcc12d4b2affad9dfdf46ee39ce4a5d7b57a5dc301e5",
+    "blk.37.ffn_down.weight": "25f23a8fbc44febf6dc4848fd7fe03a580e2822bd3b3b5a51f4990826bfe3e4e",
+    "blk.37.ffn_gate.weight": "707da5eb40118b035305d3262444382351f170a20a537386a70e90c5a83a7817",
+    "blk.37.ffn_up.weight": "d2d2ba5cfc4ef47338dd7384219e22bf030a5a2209e0354d88f5bbaaafd20e87",
+    "blk.38.attn_k.weight": "abc4bb189dedf7ce661e79028427623a4f91ac091c2cd60e31b58bc62b1cda71",
+    "blk.38.attn_norm.weight": "9f4803a7d03fd40fcb83d85f84eb1d5682ea4e5bb084f210c02850675d804c3d",
+    "blk.38.attn_output.weight": "77cb66007f1a41df7135d0e7f900ceb499c2f667dfc3f1a6ac01a3203bbd3ccf",
+    "blk.38.attn_q.weight": "d94a8b26cd375bf2bcaa76597e314aa8268ee50a479d00931e5e0e021feadb5d",
+    "blk.38.attn_v.weight": "660c907888bc5016dc69b7d35fe6f55c7ded697c93be0e2d332a2f17aff88758",
+    "blk.38.ffn_down.weight": "6f06173bae5b00ffaf88ef383619a8b9c6a8d0d5c6494695d17f6c1de1a68a13",
+    "blk.38.ffn_gate.weight": "89f99be149d03f116527bfcabe073c50001c874de40fb6e817f6619027f3cd05",
+    "blk.38.ffn_up.weight": "8d57557c8d5e2d2688b73f01dddf1ce8d5194990cda6358153320aea88aac7f8",
+    "blk.39.attn_k.weight": "21be09c988b46c8393e6c2ec9230f3b5136eb7607dd1953ba92d0811c2f0dd75",
+    "blk.39.attn_norm.weight": "ba7c1912dd1c4e2d16917201f62396fd0600e4a451137eaddff255548c209abd",
+    "blk.39.attn_output.weight": "acfaf4abb3fd27fd899b5563c3877f176b597d8f6cdb2f2fd3f3a0bd4da15ed6",
+    "blk.39.attn_q.weight": "e8adbc140d4c8f0db2a27ca584c5531d5b1e080555fe627e34d80d0814a92bed",
+    "blk.39.attn_v.weight": "92f96b0e1f724e73a0f90a76c145654418844c04a6d4b14c05eb5af8a62bf8dc",
+    "blk.39.ffn_down.weight": "4d9ee7c65fc16fe95d10c47b79ac6a525741947600a64b5fcea5d300a82c50de",
+    "blk.39.ffn_gate.weight": "7e18507989f39b32191133d2657c2ee3b74f42f070579204d727eb72215793d1",
+    "blk.39.ffn_up.weight": "22cda752269c9757ba918abede1df95bb0f83a5c772dea13c8deea3d5f2723d9",
+    "output_norm.weight": "2858cf0e39d32caf52b7861378ace076000241e147f10b9eb21d8a5cd149e3cb"
+}
--- a/discover/amd_common.go
+++ b/discover/amd_common.go
@@ -9,8 +9,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
-
-	"github.com/ollama/ollama/envconfig"
 )

 // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -41,13 +39,10 @@ func commonAMDValidateLibDir() (string, error) {
 	// Favor our bundled version

 	// Installer payload location if we're running the installed binary
-	exe, err := os.Executable()
-	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if rocmLibUsable(rocmTargetDir) {
-			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
-			return rocmTargetDir, nil
-		}
+	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
+	if rocmLibUsable(rocmTargetDir) {
+		slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
+		return rocmTargetDir, nil
 	}

 	// Prefer explicit HIP env var
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -77,8 +77,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {

 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	var supported []string
-	depPaths := LibraryDirs()
-	libDir := ""
+	var libDir string

 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
@@ -353,9 +352,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				})
 				return nil, err
 			}
-			depPaths = append(depPaths, libDir)
 		}
-		gpuInfo.DependencyPath = depPaths
+		gpuInfo.DependencyPath = []string{libDir}

 		if gfxOverride == "" {
 			// Only load supported list once
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"path/filepath"
 	"slices"
 	"strconv"
@@ -50,14 +49,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Info(err.Error())
 		return nil, err
 	}
-	depPaths := LibraryDirs()
+
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
 		err = fmt.Errorf("unable to verify rocm library: %w", err)
 		slog.Warn(err.Error())
 		return nil, err
 	}
-	depPaths = append(depPaths, libDir)

 	var supported []string
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
@@ -113,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,

 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: depPaths,
+				DependencyPath: []string{libDir},
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
@@ -164,9 +162,7 @@ func AMDValidateLibDir() (string, error) {
 	}

 	// Installer payload (if we're running from some other location)
-	localAppData := os.Getenv("LOCALAPPDATA")
-	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
+	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -23,7 +23,6 @@ import (

 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 type cudaHandles struct {
@@ -101,15 +100,7 @@ func initCudaHandles() *cudaHandles {

 	// Aligned with driver, we can't carry as payloads
 	nvcudaMgmtPatterns := NvcudaGlobs
-
-	if runtime.GOOS == "windows" {
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
-	}
-	libDirs := LibraryDirs()
-	for _, d := range libDirs {
-		cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName))
-	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

 	if len(NvmlGlobs) > 0 {
@@ -240,7 +231,7 @@ func GetGPUInfo() GpuInfoList {
 		if err != nil {
 			slog.Warn("error looking up system memory", "error", err)
 		}
-		depPaths := LibraryDirs()
+
 		details, err := GetCPUDetails()
 		if err != nil {
 			slog.Warn("failed to lookup CPU details", "error", err)
@@ -248,11 +239,9 @@ func GetGPUInfo() GpuInfoList {
 		cpus = []CPUInfo{
 			{
 				GpuInfo: GpuInfo{
-					memInfo:        mem,
-					Library:        "cpu",
-					Variant:        runners.GetCPUCapability().String(),
-					ID:             "0",
-					DependencyPath: depPaths,
+					memInfo: mem,
+					Library: "cpu",
+					ID:      "0",
 				},
 				CPUs: details,
 			},
@@ -294,17 +283,13 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
-				if depPaths != nil {
-					gpuInfo.DependencyPath = depPaths
-					// Check for variant specific directory
-					if variant != "" {
-						for _, d := range depPaths {
-							if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil {
-								// Put the variant directory first in the search path to avoid runtime linking to the wrong library
-								gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...)
-								break
-							}
-						}
+
+				// Start with our bundled libraries
+				if variant != "" {
+					variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
+					if _, err := os.Stat(variantPath); err == nil {
+						// Put the variant directory first in the search path to avoid runtime linking to the wrong library
+						gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
 					}
 				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
@@ -376,7 +361,7 @@ func GetGPUInfo() GpuInfoList {
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = depPaths
+						gpuInfo.DependencyPath = []string{LibOllamaPath}
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
@@ -512,33 +497,30 @@ func GetGPUInfo() GpuInfoList {

 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
-	var ldPaths []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

-	// Start with our bundled libraries
-	patterns := []string{}
-	for _, d := range LibraryDirs() {
-		patterns = append(patterns, filepath.Join(d, baseLibName))
-	}
+	// search our bundled libraries first
+	patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}

+	var ldPaths []string
 	switch runtime.GOOS {
 	case "windows":
-		ldPaths = strings.Split(os.Getenv("PATH"), ";")
+		ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator))
 	case "linux":
-		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
-	default:
-		return gpuLibPaths
+		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator))
 	}

-	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
-	for _, ldPath := range ldPaths {
-		d, err := filepath.Abs(ldPath)
+	// then search the system's PATH (Windows) or LD_LIBRARY_PATH (Linux)
+	for _, p := range ldPaths {
+		p, err := filepath.Abs(p)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName))
+		patterns = append(patterns, filepath.Join(p, baseLibName))
 	}
+
+	// finally, search the default patterns provided by the caller
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
 	for _, pattern := range patterns {
@@ -715,23 +697,6 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	}
 }

-func LibraryDirs() []string {
-	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
-	// This can be simplified once we no longer carry runners as payloads
-	exe, err := os.Executable()
-	if err != nil {
-		slog.Warn("failed to lookup executable path", "error", err)
-		return nil
-	}
-
-	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
-	if _, err := os.Stat(lib); err != nil {
-		return nil
-	}
-
-	return []string{lib}
-}
-
 func GetSystemInfo() SystemInfo {
 	gpus := GetGPUInfo()
 	gpuMutex.Lock()
--- a/discover/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@@ -15,7 +15,6 @@ import (
 	"syscall"

 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 const (
@@ -28,7 +27,6 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: runners.GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@@ -51,7 +49,6 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: runners.GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
--- a/discover/path.go
+++ b/discover/path.go
@@ -0,0 +1,53 @@
+package discover
+
+import (
+	"os"
+	"path/filepath"
+	"runtime"
+)
+
+// LibOllamaPath is the directory searched for bundled dynamic libraries.
+// In development it is usually 'build/lib/ollama' (next to the executable or
+// under the working directory). In distribution builds it is 'lib/ollama' on
+// Windows, '../lib/ollama' on Linux, and the executable's directory otherwise
+// (e.g. macOS), all relative to the symlink-resolved executable. It is empty
+// if the executable or working directory cannot be determined. GPU-specific
+// libraries are in subdirectories such as 'cuda_v11', 'cuda_v12', 'rocm', etc.
+var LibOllamaPath string = func() string {
+	exe, err := os.Executable()
+	if err != nil {
+		return ""
+	}
+
+	exe, err = filepath.EvalSymlinks(exe)
+	if err != nil {
+		return ""
+	}
+
+	libPath := filepath.Dir(exe)
+	switch runtime.GOOS {
+	case "windows":
+		libPath = filepath.Join(filepath.Dir(exe), "lib", "ollama")
+	case "linux":
+		libPath = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
+	}
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		return ""
+	}
+
+	// build paths for development; the first one that exists wins over libPath
+	buildPaths := []string{
+		filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"),
+		filepath.Join(cwd, "build", "lib", "ollama"),
+	}
+
+	for _, p := range buildPaths {
+		if _, err := os.Stat(p); err == nil {
+			return p
+		}
+	}
+
+	return libPath
+}()
--- a/discover/types.go
+++ b/discover/types.go
@@ -5,7 +5,6 @@ import (
 	"log/slog"

 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 type memInfo struct {
@@ -107,7 +106,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != runners.CPUCapabilityNone.String() {
+		if info.Variant != "" {
 			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
--- a/docs/README.md
+++ b/docs/README.md
@@ -2,7 +2,7 @@

 ### Getting Started
 * [Quickstart](../README.md#quickstart)
-* [Examples](../examples)
+* [Examples](./examples.md)
 * [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
--- a/docs/development.md
+++ b/docs/development.md
@@ -1,165 +1,120 @@
 # Development

-Install required tools:
+Install prerequisites:

- go version 1.22 or higher
- OS specific C/C++ compiler (see below)
- GNU Make
+- [Go](https://go.dev/doc/install)
+- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://jmeubank.github.io/tdm-gcc/download/) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.

-
-## Overview
-
-Ollama uses a mix of Go and C/C++ code to interface with GPUs.  The C/C++ code is compiled with both CGO and GPU library specific compilers.  A set of GNU Makefiles are used to compile the project.  GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary.  The default make target will build the runners and primary Go Ollama application that will run within the repo directory.  Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build.  You can adjust the job count based on your CPU Core count to reduce build times.  If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location. To learn more about the other make targets use `make help`
-
-Once you have built the GPU/CPU runners, you can compile the main application with `go build .` 
-
-### MacOS
-
-[Download Go](https://go.dev/dl/)
-
-```bash
-make -j 5
-```
-
-Now you can run `ollama`:
-
-```bash
-./ollama
-```
-
-#### Xcode 15 warnings
-
-If you are using Xcode newer than version 14, you may see a warning during `go build` about `ld: warning: ignoring duplicate libraries: '-lobjc'` due to Golang issue https://github.com/golang/go/issues/67799 which can be safely ignored.  You can suppress the warning with `export CGO_LDFLAGS="-Wl,-no_warn_duplicate_libraries"`
-
-### Linux
-
-#### Linux CUDA (NVIDIA)
-
-_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages.
-
-Typically the makefile will auto-detect CUDA, however, if your Linux distro
-or installation approach uses alternative paths, you can specify the location by
-overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
-a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES=50;60;70`)
+Then build and run Ollama from the root directory of the repository:

 ```
-make -j 5
+go run . serve
 ```

-If both v11 and v12 tookkits are detected, runners for both major versions will be built by default.  You can build just v12 with `make cuda_v12`
+## macOS (Apple Silicon)

-#### Older Linux CUDA (NVIDIA)
+macOS Apple Silicon supports Metal, which is built into the Ollama binary. No additional steps are required.

-To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11).  When you build Ollama, you will need to set two make variable to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`.  To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
+## macOS (Intel)

-#### Linux ROCm (AMD)
+Install prerequisites:

-_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
+- [CMake](https://cmake.org/download/) or `brew install cmake`

-Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
-
-Typically the build scripts will auto-detect ROCm, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `HIP_PATH` to the location of the ROCm
-install (typically `/opt/rocm`). You can also customize
-the AMD GPU targets by setting HIP_ARCHS (e.g. `HIP_ARCHS=gfx1101;gfx1102`)
+Then, configure and build the project:

 ```
-make -j 5
+cmake -B build
+cmake --build build
 ```

-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
-#### Containerized Linux Build
-
-If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist`  and by default the script builds both arm64 and amd64 binaries.  If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
-
-### Windows
-
-The following tools are required as a minimal development environment to build CPU inference support.
-
- Go version 1.22 or higher
-  - https://go.dev/dl/
- Git
-  - https://git-scm.com/download/win
- clang with gcc compat and Make.  There are multiple options on how to go about installing these tools on Windows.  We have verified the following, but others may work as well:  
-  - [MSYS2](https://www.msys2.org/)
-    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
-  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
-
-> [!NOTE]  
-> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.
+Lastly, run Ollama:

 ```
-make -j 5
+go run . serve
 ```

-#### GPU Support
+## Windows

-The GPU tools require the Microsoft native build tools.  To build either CUDA or ROCm, you must first install MSVC via Visual Studio:
+Install prerequisites:

- Make sure to select `Desktop development with C++` as a Workload during the Visual Studio install
- You must complete the Visual Studio install and run it once **BEFORE** installing CUDA or ROCm for the tools to properly register
- Add the location of the **64 bit (x64)** compiler (`cl.exe`) to your `PATH`
- Note: the default Developer Shell may configure the 32 bit (x86) compiler which will lead to build failures.  Ollama requires a 64 bit toolchain.
+- [CMake](https://cmake.org/download/)
+- [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/) including the Native Desktop Workload
+- (Optional) AMD GPU support
+    - [ROCm](https://rocm.github.io/install.html)
+    - [Ninja](https://github.com/ninja-build/ninja/releases)
+- (Optional) NVIDIA GPU support
+    - [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)

-#### Windows CUDA (NVIDIA)
+> [!IMPORTANT]
+> Ensure prerequisites are in `PATH` before running CMake.

-In addition to the common Windows development tools and MSVC described above:
+> [!IMPORTANT]
+> ROCm is not compatible with Visual Studio CMake generators. Use `-GNinja` when configuring the project.

- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
+> [!IMPORTANT]
+> CUDA is only compatible with Visual Studio CMake generators.

-#### Windows ROCm (AMD Radeon)
-
-In addition to the common Windows development tools and MSVC described above:
-
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
-
-#### Windows arm64
-
-The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want.  To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
-
-```powershell
-import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
-Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
-```
-
-You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
-
-Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment.  Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
+Then, configure and build the project:

 ```
-pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
+cmake -B build
+cmake --build build --config Release
 ```

-You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
-
-
-## Advanced CPU Vector Settings
-
-On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load.  If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled.  This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility.  Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
-
-To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build.
-
-To build without any vector flags:
+Lastly, run Ollama:

 ```
-make CUSTOM_CPU_FLAGS=""
+go run . serve
 ```

-To build with both AVX and AVX2:
-```
-make CUSTOM_CPU_FLAGS=avx,avx2
-```
+## Windows (ARM)

-To build with AVX512 features turned on:
+Windows ARM does not support additional acceleration libraries at this time.
+
+## Linux
+
+Install prerequisites:
+
+- [CMake](https://cmake.org/download/) or `sudo apt install cmake` or `sudo dnf install cmake`
+- (Optional) AMD GPU support
+    - [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
+- (Optional) NVIDIA GPU support
+    - [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
+
+> [!IMPORTANT]
+> Ensure prerequisites are in `PATH` before running CMake.
+
+
+Then, configure and build the project:

 ```
-make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
+cmake -B build
+cmake --build build
 ```

-> [!NOTE]  
-> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags
+Lastly, run Ollama:
+
+```
+go run . serve
+```
+
+## Docker
+
+```
+docker build .
+```
+
+### ROCm
+
+```
+docker build --build-arg FLAVOR=rocm .
+```
+
+## Running tests
+
+To run tests, use `go test`:
+
+```
+go test ./...
+```
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -38,7 +38,7 @@ Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
 You can discover the UUID of your GPUs by running `nvidia-smi -L` If you want to
 ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1")

-### Laptop Suspend Resume
+### Linux Suspend Resume

 On linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
 your NVIDIA GPU, and fallback to running on the CPU.  You can workaround this
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -67,8 +67,6 @@ To use this:
 3. `ollama run choose-a-model-name`
 4. Start using the model!

-More examples are available in the [examples directory](../examples).
-
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.

  ```bash
@@ -155,7 +153,6 @@ PARAMETER <parameter> <parametervalue>
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |
 | seed           | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0)                                                                                       | int        | seed 42              |
 | stop           | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile.                                      | string     | stop "AI assistant:" |
-| tfs_z          | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1)                                               | float      | tfs_z 1              |
 | num_predict    | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation)                                                                                                                                   | int        | num_predict 42       |
 | top_k          | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)                                                                        | int        | top_k 40             |
 | top_p          | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)                                                                 | float      | top_p 0.9            |
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -288,12 +288,3 @@ func Values() map[string]string {
 func Var(key string) string {
 	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
 }
-
-// On windows, we keep the binary at the top directory, but
-// other platforms use a "bin" directory, so this returns ".."
-func LibRelativeToExe() string {
-	if runtime.GOOS == "windows" {
-		return "."
-	}
-	return ".."
-}
--- a/go.mod
+++ b/go.mod
@@ -24,7 +24,6 @@ require (
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	golang.org/x/image v0.22.0
-	golang.org/x/tools v0.28.0
 	gonum.org/v1/gonum v0.15.0
 )

@@ -72,7 +71,7 @@ require (
 	golang.org/x/arch v0.8.0 // indirect
 	golang.org/x/crypto v0.31.0
 	golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
-	golang.org/x/net v0.32.0 // indirect
+	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.28.0
 	golang.org/x/term v0.27.0
 	golang.org/x/text v0.21.0
--- a/go.sum
+++ b/go.sum
@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
-golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
+golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -309,8 +309,6 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
 golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
 golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
-golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
-golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- a/grammar/bench_test.go
+++ b/grammar/bench_test.go
@@ -1,22 +0,0 @@
-//go:build go1.24
-
-package grammar
-
-import "testing"
-
-func BenchmarkFromSchema(b *testing.B) {
-	for tt := range testCases(b) {
-		b.Run("", func(b *testing.B) {
-			s := []byte(tt.schema)
-
-			b.ReportAllocs()
-			for b.Loop() {
-				_, err := FromSchema(nil, s)
-				if err != nil {
-					b.Fatalf("GrammarFromSchema: %v", err)
-				}
-			}
-		})
-		return
-	}
-}
--- a/grammar/grammar.go
+++ b/grammar/grammar.go
@@ -1,227 +0,0 @@
-package grammar
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"iter"
-	"strconv"
-
-	"github.com/ollama/ollama/grammar/jsonschema"
-)
-
-const jsonTerms = `
-# Unicode
-#
-# Unicode characters can be specified directly in the grammar, for example
-# hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit
-# (\UXXXXXXXX).
-unicode ::= \x{hex}{2} | \u{hex}{4} | \U{hex}{8}
-
-# JSON grammar from RFC 7159
-null    ::= "null"
-object  ::= "{" (kv ("," kv)*)? "}"
-array   ::= "[" (value ("," value)*)? "]"
-kv      ::= string ":" value
-integer ::= "0" | [1-9] [0-9]*
-number  ::= "-"? integer frac? exp?
-frac    ::= "." [0-9]+
-exp     ::= ("e" | "E") ("+" | "-") [0-9]+
-string  ::= "\"" char* "\""
-escape  ::= ["/" | "b" | "f" | "n" | "r" | "t" | unicode]
-char    ::= [^"\\] | escape
-space   ::= (" " | "\t" | "\n" | "\r")*
-hex     ::= [0-9] | [a-f] | [A-F]
-boolean ::= "true" | "false"
-value   ::= object | array | string | number | boolean | "null"
-
-# User-defined
-`
-
-// FromSchema generates a grammar from a JSON schema.
-func FromSchema(buf []byte, jsonSchema []byte) ([]byte, error) {
-	var s *jsonschema.Schema
-	if err := json.Unmarshal(jsonSchema, &s); err != nil {
-		return nil, err
-	}
-
-	var g builder
-
-	// "root" is the only rule that is guaranteed to exist, so we start
-	// with its length for padding, and then adjust it as we go.
-	g.pad = len("root")
-	for id := range dependencies("root", s) {
-		g.pad = max(g.pad, len(id))
-	}
-
-	g.b.WriteString(jsonTerms)
-
-	ids := make(map[*jsonschema.Schema]string)
-	for id, s := range dependencies("root", s) {
-		ids[s] = id
-		g.define(id)
-		if err := fromSchema(&g, ids, s); err != nil {
-			return nil, err
-		}
-	}
-	g.define("root")
-	if err := fromSchema(&g, ids, s); err != nil {
-		return nil, err
-	}
-	g.define("") // finalize the last rule
-	return g.b.Bytes(), nil
-}
-
-func fromSchema(g *builder, ids map[*jsonschema.Schema]string, s *jsonschema.Schema) error {
-	switch typ := s.EffectiveType(); typ {
-	case "array":
-		if len(s.PrefixItems) == 0 && s.Items == nil {
-			g.u("array")
-		} else {
-			g.q("[")
-			for i, s := range s.PrefixItems {
-				if i > 0 {
-					g.q(",")
-				}
-				g.u(ids[s])
-			}
-			if s.Items != nil {
-				g.u("(")
-				if len(s.PrefixItems) > 0 {
-					g.q(",")
-				}
-				g.u(ids[s.Items])
-				g.u(")*")
-			}
-			g.q("]")
-		}
-	case "object":
-		if len(s.Properties) == 0 {
-			g.u("object")
-		} else {
-			g.q("{")
-			for i, p := range s.Properties {
-				name := ids[p]
-				if i > 0 {
-					g.q(",")
-				}
-				g.q(p.Name)
-				g.q(":")
-				g.u(name)
-			}
-			g.q("}")
-		}
-	case "number":
-		buildConstrainedNumber(g, s)
-	case "string":
-		if len(s.Enum) == 0 {
-			g.u("string")
-		} else {
-			g.u("(")
-			for i, e := range s.Enum {
-				if i > 0 {
-					g.q("|")
-				}
-				g.q(string(e))
-			}
-			g.u(")")
-		}
-	case "boolean", "value", "null", "integer":
-		g.u(typ)
-	default:
-		return fmt.Errorf("%s: unsupported type %q", s.Name, typ)
-	}
-	return nil
-}
-
-// dependencies returns a sequence of all child dependencies of the schema in
-// post-order.
-//
-// The first value is the id/pointer to the dependency, and the second value
-// is the schema.
-func dependencies(id string, s *jsonschema.Schema) iter.Seq2[string, *jsonschema.Schema] {
-	return func(yield func(string, *jsonschema.Schema) bool) {
-		for i, p := range s.Properties {
-			id := fmt.Sprintf("%s_%d", id, i)
-			for did, d := range dependencies(id, p) {
-				if !yield(did, d) {
-					return
-				}
-			}
-			if !yield(id, p) {
-				return
-			}
-		}
-		for i, p := range s.PrefixItems {
-			id := fmt.Sprintf("tuple_%d", i)
-			for did, d := range dependencies(id, p) {
-				id := fmt.Sprintf("%s_%s", id, did)
-				if !yield(id, d) {
-					return
-				}
-			}
-			if !yield(id, p) {
-				return
-			}
-		}
-		if s.Items != nil {
-			id := fmt.Sprintf("%s_tuple_%d", id, len(s.PrefixItems))
-			for did, d := range dependencies(id, s.Items) {
-				if !yield(did, d) {
-					return
-				}
-			}
-			if !yield(id, s.Items) {
-				return
-			}
-		}
-	}
-}
-
-type builder struct {
-	b     bytes.Buffer
-	pad   int
-	rules int
-	items int
-}
-
-// define terminates the current rule, if any, and then either starts a new
-// rule or does nothing else if the name is empty.
-func (b *builder) define(name string) {
-	if b.rules > 0 {
-		b.b.WriteString(";\n")
-	}
-	if name == "" {
-		return
-	}
-	fmt.Fprintf(&b.b, "% -*s", b.pad, name)
-	b.b.WriteString(" ::=")
-	b.rules++
-	b.items = 0
-}
-
-// quote appends a terminal to the current rule.
-func (b *builder) q(s string) {
-	if b.items > 0 {
-		b.b.WriteString(" ")
-	}
-	b.b.WriteString(" ")
-	b.b.WriteString(strconv.Quote(s))
-}
-
-// u appends a non-terminal to the current rule.
-func (b *builder) u(s string) {
-	if b.items > 0 {
-		b.b.WriteString(" ")
-	}
-	b.b.WriteString(" ")
-	b.b.WriteString(s)
-}
-
-func buildConstrainedNumber(b *builder, s *jsonschema.Schema) {
-	if s.Minimum == 0 && s.Maximum == 0 {
-		b.u("TODO")
-	} else {
-		b.u("number")
-	}
-}
--- a/grammar/grammar_test.go
+++ b/grammar/grammar_test.go
@@ -1,75 +0,0 @@
-package grammar
-
-import (
-	"bufio"
-	"cmp"
-	"iter"
-	"strings"
-	"testing"
-
-	_ "embed"
-
-	"github.com/ollama/ollama/grammar/internal/diff"
-)
-
-func TestFromSchema(t *testing.T) {
-	for tt := range testCases(t) {
-		t.Run(tt.name, func(t *testing.T) {
-			g, err := FromSchema(nil, []byte(tt.schema))
-			if err != nil {
-				t.Fatalf("FromSchema: %v", err)
-			}
-			got := string(g)
-			got = strings.TrimPrefix(got, jsonTerms)
-			if got != tt.want {
-				t.Logf("schema:\n%s", tt.schema)
-				t.Fatal(string(diff.Diff("got", []byte(got), "want", []byte(tt.want))))
-			}
-		})
-	}
-}
-
-type testCase struct {
-	name   string
-	schema string
-	want   string
-}
-
-//go:embed testdata/schemas.txt
-var tests string
-
-func testCases(t testing.TB) iter.Seq[testCase] {
-	t.Helper()
-	return func(yield func(testCase) bool) {
-		t.Helper()
-		sc := bufio.NewScanner(strings.NewReader(tests))
-		name := ""
-		for sc.Scan() {
-			line := strings.TrimSpace(sc.Text())
-			if line == "" {
-				name = ""
-				continue
-			}
-			if line[0] == '#' {
-				name = cmp.Or(name, strings.TrimSpace(line[1:]))
-				continue
-			}
-			s := sc.Text()
-			g := ""
-			for sc.Scan() {
-				line = strings.TrimSpace(sc.Text())
-				if line == "" || line[0] == '#' {
-					break
-				}
-				g += sc.Text() + "\n"
-			}
-			if !yield(testCase{name, s, g}) {
-				return
-			}
-			name = strings.TrimSpace(strings.TrimPrefix(line, "#"))
-		}
-		if err := sc.Err(); err != nil {
-			t.Fatalf("error reading tests: %v", err)
-		}
-	}
-}
--- a/grammar/internal/diff/diff.go
+++ b/grammar/internal/diff/diff.go
@@ -1,261 +0,0 @@
-// Copyright 2022 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package diff
-
-import (
-	"bytes"
-	"fmt"
-	"sort"
-	"strings"
-)
-
-// A pair is a pair of values tracked for both the x and y side of a diff.
-// It is typically a pair of line indexes.
-type pair struct{ x, y int }
-
-// Diff returns an anchored diff of the two texts old and new
-// in the “unified diff” format. If old and new are identical,
-// Diff returns a nil slice (no output).
-//
-// Unix diff implementations typically look for a diff with
-// the smallest number of lines inserted and removed,
-// which can in the worst case take time quadratic in the
-// number of lines in the texts. As a result, many implementations
-// either can be made to run for a long time or cut off the search
-// after a predetermined amount of work.
-//
-// In contrast, this implementation looks for a diff with the
-// smallest number of “unique” lines inserted and removed,
-// where unique means a line that appears just once in both old and new.
-// We call this an “anchored diff” because the unique lines anchor
-// the chosen matching regions. An anchored diff is usually clearer
-// than a standard diff, because the algorithm does not try to
-// reuse unrelated blank lines or closing braces.
-// The algorithm also guarantees to run in O(n log n) time
-// instead of the standard O(n²) time.
-//
-// Some systems call this approach a “patience diff,” named for
-// the “patience sorting” algorithm, itself named for a solitaire card game.
-// We avoid that name for two reasons. First, the name has been used
-// for a few different variants of the algorithm, so it is imprecise.
-// Second, the name is frequently interpreted as meaning that you have
-// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
-// when in fact the algorithm is faster than the standard one.
-func Diff(oldName string, old []byte, newName string, new []byte) []byte {
-	if bytes.Equal(old, new) {
-		return nil
-	}
-	x := lines(old)
-	y := lines(new)
-
-	// Print diff header.
-	var out bytes.Buffer
-	fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
-	fmt.Fprintf(&out, "--- %s\n", oldName)
-	fmt.Fprintf(&out, "+++ %s\n", newName)
-
-	// Loop over matches to consider,
-	// expanding each match to include surrounding lines,
-	// and then printing diff chunks.
-	// To avoid setup/teardown cases outside the loop,
-	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
-	// in the sequence of matches.
-	var (
-		done  pair     // printed up to x[:done.x] and y[:done.y]
-		chunk pair     // start lines of current chunk
-		count pair     // number of lines from each side in current chunk
-		ctext []string // lines for current chunk
-	)
-	for _, m := range tgs(x, y) {
-		if m.x < done.x {
-			// Already handled scanning forward from earlier match.
-			continue
-		}
-
-		// Expand matching lines as far as possible,
-		// establishing that x[start.x:end.x] == y[start.y:end.y].
-		// Note that on the first (or last) iteration we may (or definitely do)
-		// have an empty match: start.x==end.x and start.y==end.y.
-		start := m
-		for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
-			start.x--
-			start.y--
-		}
-		end := m
-		for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
-			end.x++
-			end.y++
-		}
-
-		// Emit the mismatched lines before start into this chunk.
-		// (No effect on first sentinel iteration, when start = {0,0}.)
-		for _, s := range x[done.x:start.x] {
-			ctext = append(ctext, "-"+s)
-			count.x++
-		}
-		for _, s := range y[done.y:start.y] {
-			ctext = append(ctext, "+"+s)
-			count.y++
-		}
-
-		// If we're not at EOF and have too few common lines,
-		// the chunk includes all the common lines and continues.
-		const C = 3 // number of context lines
-		if (end.x < len(x) || end.y < len(y)) &&
-			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
-			for _, s := range x[start.x:end.x] {
-				ctext = append(ctext, " "+s)
-				count.x++
-				count.y++
-			}
-			done = end
-			continue
-		}
-
-		// End chunk with common lines for context.
-		if len(ctext) > 0 {
-			n := end.x - start.x
-			if n > C {
-				n = C
-			}
-			for _, s := range x[start.x : start.x+n] {
-				ctext = append(ctext, " "+s)
-				count.x++
-				count.y++
-			}
-			done = pair{start.x + n, start.y + n}
-
-			// Format and emit chunk.
-			// Convert line numbers to 1-indexed.
-			// Special case: empty file shows up as 0,0 not 1,0.
-			if count.x > 0 {
-				chunk.x++
-			}
-			if count.y > 0 {
-				chunk.y++
-			}
-			fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
-			for _, s := range ctext {
-				out.WriteString(s)
-			}
-			count.x = 0
-			count.y = 0
-			ctext = ctext[:0]
-		}
-
-		// If we reached EOF, we're done.
-		if end.x >= len(x) && end.y >= len(y) {
-			break
-		}
-
-		// Otherwise start a new chunk.
-		chunk = pair{end.x - C, end.y - C}
-		for _, s := range x[chunk.x:end.x] {
-			ctext = append(ctext, " "+s)
-			count.x++
-			count.y++
-		}
-		done = end
-	}
-
-	return out.Bytes()
-}
-
-// lines returns the lines in the file x, including newlines.
-// If the file does not end in a newline, one is supplied
-// along with a warning about the missing newline.
-func lines(x []byte) []string {
-	l := strings.SplitAfter(string(x), "\n")
-	if l[len(l)-1] == "" {
-		l = l[:len(l)-1]
-	} else {
-		// Treat last line as having a message about the missing newline attached,
-		// using the same text as BSD/GNU diff (including the leading backslash).
-		l[len(l)-1] += "\n\\ No newline at end of file\n"
-	}
-	return l
-}
-
-// tgs returns the pairs of indexes of the longest common subsequence
-// of unique lines in x and y, where a unique line is one that appears
-// once in x and once in y.
-//
-// The longest common subsequence algorithm is as described in
-// Thomas G. Szymanski, “A Special Case of the Maximal Common
-// Subsequence Problem,” Princeton TR #170 (January 1975),
-// available at https://research.swtch.com/tgs170.pdf.
-func tgs(x, y []string) []pair {
-	// Count the number of times each string appears in a and b.
-	// We only care about 0, 1, many, counted as 0, -1, -2
-	// for the x side and 0, -4, -8 for the y side.
-	// Using negative numbers now lets us distinguish positive line numbers later.
-	m := make(map[string]int)
-	for _, s := range x {
-		if c := m[s]; c > -2 {
-			m[s] = c - 1
-		}
-	}
-	for _, s := range y {
-		if c := m[s]; c > -8 {
-			m[s] = c - 4
-		}
-	}
-
-	// Now unique strings can be identified by m[s] = -1+-4.
-	//
-	// Gather the indexes of those strings in x and y, building:
-	//	xi[i] = increasing indexes of unique strings in x.
-	//	yi[i] = increasing indexes of unique strings in y.
-	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
-	var xi, yi, inv []int
-	for i, s := range y {
-		if m[s] == -1+-4 {
-			m[s] = len(yi)
-			yi = append(yi, i)
-		}
-	}
-	for i, s := range x {
-		if j, ok := m[s]; ok && j >= 0 {
-			xi = append(xi, i)
-			inv = append(inv, j)
-		}
-	}
-
-	// Apply Algorithm A from Szymanski's paper.
-	// In those terms, A = J = inv and B = [0, n).
-	// We add sentinel pairs {0,0}, and {len(x),len(y)}
-	// to the returned sequence, to help the processing loop.
-	J := inv
-	n := len(xi)
-	T := make([]int, n)
-	L := make([]int, n)
-	for i := range T {
-		T[i] = n + 1
-	}
-	for i := range n {
-		k := sort.Search(n, func(k int) bool {
-			return T[k] >= J[i]
-		})
-		T[k] = J[i]
-		L[i] = k + 1
-	}
-	k := 0
-	for _, v := range L {
-		if k < v {
-			k = v
-		}
-	}
-	seq := make([]pair, 2+k)
-	seq[1+k] = pair{len(x), len(y)} // sentinel at end
-	lastj := n
-	for i := n - 1; i >= 0; i-- {
-		if L[i] == k && J[i] < lastj {
-			seq[k] = pair{xi[i], yi[J[i]]}
-			k--
-		}
-	}
-	seq[0] = pair{0, 0} // sentinel at start
-	return seq
-}
--- a/grammar/internal/diff/diff_test.go
+++ b/grammar/internal/diff/diff_test.go
@@ -1,44 +0,0 @@
-// Copyright 2022 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package diff
-
-import (
-	"bytes"
-	"path/filepath"
-	"testing"
-
-	"golang.org/x/tools/txtar"
-)
-
-func clean(text []byte) []byte {
-	text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n"))
-	text = bytes.TrimSuffix(text, []byte("^D\n"))
-	return text
-}
-
-func Test(t *testing.T) {
-	files, _ := filepath.Glob("testdata/*.txt")
-	if len(files) == 0 {
-		t.Fatalf("no testdata")
-	}
-
-	for _, file := range files {
-		t.Run(filepath.Base(file), func(t *testing.T) {
-			a, err := txtar.ParseFile(file)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if len(a.Files) != 3 || a.Files[2].Name != "diff" {
-				t.Fatalf("%s: want three files, third named \"diff\"", file)
-			}
-			diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data))
-			want := clean(a.Files[2].Data)
-			if !bytes.Equal(diffs, want) {
-				t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file,
-					diffs, want, Diff("have", diffs, "want", want))
-			}
-		})
-	}
-}
--- a/grammar/internal/diff/testdata/allnew.txt
+++ b/grammar/internal/diff/testdata/allnew.txt
@@ -1,13 +0,0 @@
-- old --
-- new --
-a
-b
-c
-- diff --
-diff old new
--- old
-+++ new
-@@ -0,0 +1,3 @@
-+a
-+b
-+c
--- a/grammar/internal/diff/testdata/allold.txt
+++ b/grammar/internal/diff/testdata/allold.txt
@@ -1,13 +0,0 @@
-- old --
-a
-b
-c
-- new --
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,3 +0,0 @@
-a
-b
-c
--- a/grammar/internal/diff/testdata/basic.txt
+++ b/grammar/internal/diff/testdata/basic.txt
@@ -1,35 +0,0 @@
-Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.”
-https://www.cs.dartmouth.edu/~doug/diff.pdf
-
-- old --
-a
-b
-c
-d
-e
-f
-g
-- new --
-w
-a
-b
-x
-y
-z
-e
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,7 +1,7 @@
-+w
- a
- b
-c
-d
-+x
-+y
-+z
- e
-f
-g
--- a/grammar/internal/diff/testdata/dups.txt
+++ b/grammar/internal/diff/testdata/dups.txt
@@ -1,40 +0,0 @@
-- old --
-a
-
-b
-
-c
-
-d
-
-e
-
-f
-- new --
-a
-
-B
-
-C
-
-d
-
-e
-
-f
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,8 +1,8 @@
- a
- $
-b
-
-c
-+B
-+
-+C
- $
- d
- $
--- a/grammar/internal/diff/testdata/end.txt
+++ b/grammar/internal/diff/testdata/end.txt
@@ -1,38 +0,0 @@
-- old --
-1
-2
-3
-4
-5
-6
-7
-eight
-nine
-ten
-eleven
-- new --
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-- diff --
-diff old new
--- old
-+++ new
-@@ -5,7 +5,6 @@
- 5
- 6
- 7
-eight
-nine
-ten
-eleven
-+8
-+9
-+10
--- a/grammar/internal/diff/testdata/eof.txt
+++ b/grammar/internal/diff/testdata/eof.txt
@@ -1,9 +0,0 @@
-- old --
-a
-b
-c^D
-- new --
-a
-b
-c^D
-- diff --
--- a/grammar/internal/diff/testdata/eof1.txt
+++ b/grammar/internal/diff/testdata/eof1.txt
@@ -1,18 +0,0 @@
-- old --
-a
-b
-c
-- new --
-a
-b
-c^D
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,3 +1,3 @@
- a
- b
-c
-+c
-\ No newline at end of file
--- a/grammar/internal/diff/testdata/eof2.txt
+++ b/grammar/internal/diff/testdata/eof2.txt
@@ -1,18 +0,0 @@
-- old --
-a
-b
-c^D
-- new --
-a
-b
-c
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,3 +1,3 @@
- a
- b
-c
-\ No newline at end of file
-+c
--- a/grammar/internal/diff/testdata/long.txt
+++ b/grammar/internal/diff/testdata/long.txt
@@ -1,62 +0,0 @@
-- old --
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-14½
-15
-16
-17
-18
-19
-20
-- new --
-1
-2
-3
-4
-5
-6
-8
-9
-10
-11
-12
-13
-14
-17
-18
-19
-20
-- diff --
-diff old new
--- old
-+++ new
-@@ -4,7 +4,6 @@
- 4
- 5
- 6
-7
- 8
- 9
- 10
-@@ -12,9 +11,6 @@
- 12
- 13
- 14
-14½
-15
-16
- 17
- 18
- 19
--- a/grammar/internal/diff/testdata/same.txt
+++ b/grammar/internal/diff/testdata/same.txt
@@ -1,5 +0,0 @@
-- old --
-hello world
-- new --
-hello world
-- diff --
--- a/grammar/internal/diff/testdata/start.txt
+++ b/grammar/internal/diff/testdata/start.txt
@@ -1,34 +0,0 @@
-- old --
-e
-pi
-4
-5
-6
-7
-8
-9
-10
-- new --
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,5 +1,6 @@
-e
-pi
-+1
-+2
-+3
- 4
- 5
- 6
--- a/grammar/internal/diff/testdata/triv.txt
+++ b/grammar/internal/diff/testdata/triv.txt
@@ -1,40 +0,0 @@
-Another example from Hunt and McIlroy,
-“An Algorithm for Differential File Comparison.”
-https://www.cs.dartmouth.edu/~doug/diff.pdf
-
-Anchored diff gives up on finding anything,
-since there are no unique lines.
-
-- old --
-a
-b
-c
-a
-b
-b
-a
-- new --
-c
-a
-b
-a
-b
-c
-- diff --
-diff old new
--- old
-+++ new
-@@ -1,7 +1,6 @@
-a
-b
-c
-a
-b
-b
-a
-+c
-+a
-+b
-+a
-+b
-+c
--- a/grammar/jsonschema/decode.go
+++ b/grammar/jsonschema/decode.go
@@ -1,171 +0,0 @@
-package jsonschema
-
-import (
-	"bytes"
-	"encoding/json"
-	"errors"
-)
-
-// Schema holds a JSON schema.
-type Schema struct {
-	// Name is the name of the property. For the parent/root property, this
-	// is "root". For child properties, this is the name of the property.
-	Name string `json:"-"`
-
-	// Type is the type of the property.
-	//
-	// TODO: Union types (e.g. make this a []string).
-	Type string
-
-	// PrefixItems is a list of schemas for each item in a tuple. By
-	// default, the tuple is "closed." unless Items is set to true or a
-	// valid Schema.
-	PrefixItems []*Schema
-
-	// Items is the schema for each item in a list.
-	//
-	// If it is missing, or its JSON value is "null" or "false", it is nil.
-	// If the JSON value is "true", it is set to the empty Schema. If the
-	// JSON value is an object, it will be decoded as a Schema.
-	Items *Schema
-
-	// MinItems specifies the minimum number of items allowed in a list.
-	MinItems int
-
-	// MaxItems specifies the maximum number of items allowed in a list.
-	MaxItems int
-
-	// Properties is the schema for each property of an object.
-	Properties []*Schema
-
-	// Format is the format of the property. This is used to validate the
-	// property against a specific format.
-	//
-	// It is the callers responsibility to validate the property against
-	// the format.
-	Format string
-
-	// Minimum specifies the minimum value for numeric properties.
-	Minimum float64
-
-	// Maximum specifies the maximum value for numeric properties.
-	Maximum float64
-
-	// Enum is a list of valid values for the property.
-	Enum []json.RawMessage
-}
-
-func (s *Schema) UnmarshalJSON(data []byte) error {
-	type S Schema
-	w := struct {
-		Properties props
-		Items      items
-		*S
-	}{
-		S: (*S)(s),
-	}
-	if err := json.Unmarshal(data, &w); err != nil {
-		return err
-	}
-	if w.Items.set {
-		s.Items = &w.Items.Schema
-	}
-	s.Properties = w.Properties
-	return nil
-}
-
-type items struct {
-	Schema
-	set bool
-}
-
-func (s *items) UnmarshalJSON(data []byte) error {
-	switch b := data[0]; b {
-	case 't':
-		*s = items{set: true}
-	case '{':
-		type I items
-		if err := json.Unmarshal(data, (*I)(s)); err != nil {
-			return err
-		}
-		s.set = true
-	case 'n', 'f':
-	default:
-		return errors.New("invalid Items")
-	}
-	return nil
-}
-
-// EffectiveType returns the effective type of the schema. If the Type field is
-// not empty, it is returned; otherwise:
-//
-//   - If the schema has both Properties and Items, it returns an empty string.
-//   - If the schema has Properties, it returns "object".
-//   - If the schema has Items, it returns "array".
-//   - If the schema has neither Properties nor Items, it returns "value".
-//
-// The returned string is never empty.
-func (d *Schema) EffectiveType() string {
-	if d.Type == "" {
-		if len(d.Properties) > 0 {
-			return "object"
-		}
-		if len(d.PrefixItems) > 0 || d.Items != nil {
-			return "array"
-		}
-		return "value"
-	}
-	return d.Type
-}
-
-// props is an ordered list of properties. The order of the properties
-// is the order in which they were defined in the schema.
-type props []*Schema
-
-var _ json.Unmarshaler = (*props)(nil)
-
-func (v *props) UnmarshalJSON(data []byte) error {
-	if len(data) == 0 {
-		return nil
-	}
-	if data[0] != '{' {
-		return errors.New("expected object")
-	}
-
-	d := json.NewDecoder(bytes.NewReader(data))
-
-	// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
-	// llama.cpp, ignore unknown fields, which could be lead to unexpected
-	// behavior for clients of this package, since they may not be aware
-	// that "additionalFields", "itemsPrefix", etc, are being ignored.
-	//
-	// For now, just do what llama.cpp does.
-
-	t, err := d.Token()
-	if err != nil {
-		return err
-	}
-	if t != json.Delim('{') {
-		return errors.New("expected object")
-	}
-	for d.More() {
-		// Use the first token (map key) as the property name, then
-		// decode the rest of the object fields into a Schema and
-		// append.
-		t, err := d.Token()
-		if err != nil {
-			return err
-		}
-		if t == json.Delim('}') {
-			return nil
-		}
-		s := &Schema{
-			Name: t.(string),
-		}
-		if err := d.Decode(s); err != nil {
-			return err
-		}
-		*v = append(*v, s)
-	}
-	return nil
-}
--- a/grammar/jsonschema/decode_test.go
+++ b/grammar/jsonschema/decode_test.go
@@ -1,104 +0,0 @@
-package jsonschema
-
-import (
-	"encoding/json"
-	"reflect"
-	"strings"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-const testSchemaBasic = `
-{
-  "properties": {
-    "tupleClosedEmpty":   { "prefixItems": [] },
-    "tupleClosedMissing": { "prefixItems": [{}] },
-    "tupleClosedNull":    { "prefixItems": [{}], "items": null },
-    "tupleClosedFalse":   { "prefixItems": [{}], "items": false },
-    "tupleOpenTrue":      { "prefixItems": [{}], "items": true },
-    "tupleOpenEmpty":     { "prefixItems": [{}], "items": {} },
-    "tupleOpenTyped":     { "prefixItems": [{}], "items": {"type": "boolean"} },
-    "tupleOpenMax":       { "prefixItems": [{}], "items": true, "maxItems": 3},
-
-    "array": { "items": {"type": "number"} },
-
-    "null": { "type": "null" },
-    "string": { "type": "string" },
-    "boolean": { "type": "boolean" }
-  }
-}
-`
-
-func TestSchemaUnmarshal(t *testing.T) {
-	var got *Schema
-	if err := json.Unmarshal([]byte(testSchemaBasic), &got); err != nil {
-		t.Fatalf("Unmarshal: %v", err)
-	}
-	want := &Schema{
-		Properties: []*Schema{
-			{Name: "tupleClosedEmpty", PrefixItems: []*Schema{}, Items: nil},
-			{Name: "tupleClosedMissing", PrefixItems: []*Schema{{}}, Items: nil},
-			{Name: "tupleClosedNull", PrefixItems: []*Schema{{}}, Items: nil},
-			{Name: "tupleClosedFalse", PrefixItems: []*Schema{{}}, Items: nil},
-
-			{Name: "tupleOpenTrue", PrefixItems: []*Schema{{}}, Items: &Schema{}},
-			{Name: "tupleOpenEmpty", PrefixItems: []*Schema{{}}, Items: &Schema{}},
-			{Name: "tupleOpenTyped", PrefixItems: []*Schema{{}}, Items: &Schema{Type: "boolean"}},
-			{Name: "tupleOpenMax", PrefixItems: []*Schema{{}}, Items: &Schema{}, MaxItems: 3},
-
-			{Name: "array", Items: &Schema{Type: "number"}},
-
-			{Name: "null", Type: "null"},
-			{Name: "string", Type: "string"},
-			{Name: "boolean", Type: "boolean"},
-		},
-	}
-
-	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("(-want, +got)\n%s", diff)
-	}
-}
-
-func TestEffectiveType(t *testing.T) {
-	const schema = `
-		{"properties": {
-			"o": {"type": "object"},
-			"a": {"type": "array"},
-			"n": {"type": "number"},
-			"s": {"type": "string"},
-			"z": {"type": "null"},
-			"b": {"type": "boolean"},
-
-			"t0": {"prefixItems": [{}], "items": {"type": "number"}},
-			"t1": {"items": {"type": "number"}, "maxItems": 3},
-
-			"v": {"maxItems": 3}
-		}}
-	`
-
-	var s *Schema
-	if err := json.Unmarshal([]byte(schema), &s); err != nil {
-		t.Fatalf("json.Unmarshal: %v", err)
-	}
-
-	var got []string
-	for _, p := range s.Properties {
-		got = append(got, p.EffectiveType())
-	}
-
-	want := strings.Fields(`
-		object
-		array
-		number
-		string
-		null
-		boolean
-		array
-		array
-		value
-	`)
-	if !reflect.DeepEqual(want, got) {
-		t.Errorf("\ngot:\n\t%v\nwant:\n\t%v", got, want)
-	}
-}
--- a/grammar/testdata/schemas.txt
+++ b/grammar/testdata/schemas.txt
@@ -1,76 +0,0 @@
-# This file holds tests for JSON schema to EBNF grammar conversions.
-#
-# The format is a JSON schema, followed by the expected EBNF grammar. Each test
-# MAY be preceded by a comment that describes the test (e.g. the test name), followed by
-# the JSON schema and the expected EBNF grammar. If no comment is present, the test
-# name the tests number in the file (e.g. "#0", "#1", etc.)
-#
-# Blank lines signify the end or start of a new test. Comments can be added
-# anywhere in the file, but they must be preceded by a '#' character and start at
-# the beginning of the line.
-
-# default
-{}
-root ::= value;
-
-{"properties": {}}
-root ::= value;
-
-# array
-{"properties": {"a": {"type": "array", "items": {"type": "string"}}}}
-root_0_tuple_0 ::= string;
-root_0         ::= "[" ( root_0_tuple_0 )* "]";
-root           ::= "{" "a" ":" root_0 "}";
-
-# array with nested array
-{"type": "array", "items": {"type": "array", "items": {"type": "string"}}}
-root_tuple_0_tuple_0 ::= string;
-root_tuple_0         ::= "[" ( root_tuple_0_tuple_0 )* "]";
-root                 ::= "[" ( root_tuple_0 )* "]";
-
-# object
-{"properties": {"e": {}}}
-root_0 ::= value;
-root   ::= "{" "e" ":" root_0 "}";
-
-# object with nested object
-{"properties": {"o": {"type": "object", "properties": {"e": {}}}}}
-root_0_0 ::= value;
-root_0   ::= "{" "e" ":" root_0_0 "}";
-root     ::= "{" "o" ":" root_0 "}";
-
-# boolean
-{"type": "boolean"}
-root ::= boolean;
-
-# number
-{"properties": {"n": {"type": "number", "minimum": 123, "maximum": 4567}}}
-root_0 ::= number;
-root   ::= "{" "n" ":" root_0 "}";
-
-# string
-{"type": "string"}
-root ::= string;
-
-# string with enum
-{"type": "string", "enum": ["a", "b", "c"]}
-root ::= ( "\"a\"" "|" "\"b\"" "|" "\"c\"" );
-
-# spaces in key
-{"properties": {"a b": {}}}
-root_0 ::= value;
-root   ::= "{" "a b" ":" root_0 "}";
-
-# issue7978
-{ "type": "object", "properties": { "steps": { "type": "array", "items": { "type": "object", "properties": { "explanation": { "type": "string" }, "output": { "type": "string" } }, "required": [ "explanation", "output" ], "additionalProperties": false } }, "final_answer": { "type": "string" } }, "required": [ "steps", "final_answer" ], "additionalProperties": false }
-root_0_tuple_0_0 ::= string;
-root_0_tuple_0_1 ::= string;
-root_0_tuple_0   ::= "{" "explanation" ":" root_0_tuple_0_0 "," "output" ":" root_0_tuple_0_1 "}";
-root_0           ::= "[" ( root_0_tuple_0 )* "]";
-root_1           ::= string;
-root             ::= "{" "steps" ":" root_0 "," "final_answer" ":" root_1 "}";
-
-# !! # special characters in key
-# !! {"properties": {"a!b": {}}}
-# !! !invalid character '!' in key
-# !! 
--- a/llama/README.md
+++ b/llama/README.md
@@ -1,157 +1,53 @@
 # `llama`

-This package integrates the [llama.cpp](https://github.com/ggerganov/llama.cpp) library as a Go package and makes it easy to build it with tags for different CPU and GPU processors.
-
-Supported:
-
- [x] CPU
- [x] avx, avx2
- [x] macOS Metal
- [x] Windows CUDA
- [x] Windows ROCm
- [x] Linux CUDA
- [x] Linux ROCm
- [x] Llava
-
-Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these shared libraries are created:
-
- `ggml_cuda.dll` on Windows or `ggml_cuda.so` on Linux
- `ggml_hipblas.dll` on Windows or `ggml_hipblas.so` on Linux
-
-> Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a a crash. In a future change the same runtime should be used in both cases to avoid crashes.
-
-## Building
-
-```
-go build .
-```
-
-### AVX
-
-```shell
-go build -tags avx .
-```
-
-### AVX2
-
-```shell
-# go doesn't recognize `-mfma` as a valid compiler flag
-# see https://github.com/golang/go/issues/17895
-go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
-go build -tags=avx,avx2 .
-```
-
-## Linux
-
-### CUDA
-
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
-
-```shell
-make ggml_cuda.so
-go build -tags avx,cuda .
-```
-
-### ROCm
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
-
-```shell
-make ggml_hipblas.so
-go build -tags avx,rocm .
-```
-
-## Windows
-
-Download [w64devkit](https://github.com/skeeto/w64devkit/releases/latest) for a simple MinGW development environment.
-
-### CUDA
-
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build the cuda code:
-
-```shell
-make ggml_cuda.dll
-go build -tags avx,cuda .
-```
-
-### ROCm
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
-
-```shell
-make ggml_hipblas.dll
-go build -tags avx,rocm .
-```
-
-## Building runners
-
-```shell
-# build all runners for this platform
-make -j
-```
+This package provides Go bindings to [llama.cpp](https://github.com/ggerganov/llama.cpp).

 ## Vendoring

-Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
+Ollama vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/llama.cpp/tree/master/ggml/src). While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit.

 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.

 ```
-make apply-patches
+make -f Makefile.sync apply-patches
 ```

 ### Updating Base Commit

 **Pin to new base commit**

-To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
-
-#### Applying patches
+To change the base commit, update `FETCH_HEAD` in Makefile.sync.

 When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.

 Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.

 ```
-make apply-patches
+make -f Makefile.sync apply-patches
 ```

-If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, continue the patch series with `git am --continue`, and then rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+
+Once all patches are applied, commit the changes to the tracking repository.

 ```
-make create-patches sync
+make -f Makefile.sync format-patches sync
 ```

-Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
-
 ### Generating Patches

 When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:

 ```
-make apply-patches
+make -f Makefile.sync clean apply-patches
 ```

-Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
-
-```
-make sync
-make -j 8
-go build .
-```
-
-> [!IMPORTANT]
-> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
-
 Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with

 ```
-make create-patches
+make -f Makefile.sync format-patches
 ```

-> [!IMPORTANT]
-> Once you have completed this step, it is safe to run `apply-patches` since your change is preserved in the patches.
-
 In your `./vendor/` directory, create a branch, and cherry-pick the new commit to that branch, then submit a PR upstream to llama.cpp.

 Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches.
--- a/llama/llama.cpp/src/llama.go
+++ b/llama/llama.cpp/src/llama.go
@@ -3,5 +3,6 @@ package llama
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/../include
 // #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include
+// #cgo windows CPPFLAGS: -D_WIN32_WINNT=0x0602
 import "C"
 import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
--- a/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -0,0 +1,29 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Tue, 14 Jan 2025 15:59:04 -0800
+Subject: [PATCH] add phony target ggml-cpu for all cpu variants
+
+---
+ ggml/src/CMakeLists.txt | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 84101c32..72b488dd 100644
+--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
+@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+     endforeach()
+ 
+     ggml_add_cpu_backend_variant_impl(${tag_name})
++    add_dependencies(ggml-cpu ggml-cpu-${tag_name})
+ endfunction()
+ 
+ ggml_add_backend(CPU)
+@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
+     if (NOT GGML_BACKEND_DL)
+         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+     endif()
++    add_custom_target(ggml-cpu)
+     ggml_add_cpu_backend_variant(sandybridge    AVX)
+     ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
+     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -443,7 +443,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		s.lc.Synchronize()
 	}

-	var totalSamplingTime time.Duration
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -478,12 +477,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}

 		// sample a token
-		samplingStart := time.Now()
 		token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
 		seq.samplingCtx.Accept(token, true)
-		samplingTime := time.Since(samplingStart)
-		totalSamplingTime += samplingTime
-		slog.Info("sampling time", "time", samplingTime)
 		piece := s.model.TokenToPiece(token)

 		seq.numPredicted++
@@ -640,7 +635,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	samplingParams.Seed = uint32(req.Seed)
 	samplingParams.Grammar = req.Grammar

-	start := time.Now()
 	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
 		numPredict:     req.NumPredict,
 		stop:           req.Stop,
@@ -648,7 +642,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		samplingParams: &samplingParams,
 		embedding:      false,
 	})
-	slog.Info("new sequence created", "duration", time.Since(start))
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
 		return
--- a/llm/server.go
+++ b/llm/server.go
@@ -29,7 +29,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/grammar"
 	"github.com/ollama/ollama/llama"
 )

@@ -91,6 +90,8 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
 func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+	var err error
+
 	systemInfo := discover.GetSystemInfo()
 	systemTotalMemory := systemInfo.System.TotalMemory
 	systemFreeMemory := systemInfo.System.FreeMemory
@@ -102,12 +103,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		gpus = discover.GetCPUInfo()
 	}

-	var estimate MemoryEstimate
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		estimate = EstimateGPULayers(gpus, f, projectors, opts)
-	} else {
-		estimate = EstimateGPULayers(gpus, f, projectors, opts)
-
+	estimate := EstimateGPULayers(gpus, f, projectors, opts)
+	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
@@ -234,149 +231,209 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		params = append(params, "--multiuser-cache")
 	}

-	exe, err := os.Executable()
+	// get available libraries
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("could not get libollama dir: %w", err)
 	}

-	// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race
-	port := 0
-	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-		var l *net.TCPListener
-		if l, err = net.ListenTCP("tcp", a); err == nil {
-			port = l.Addr().(*net.TCPAddr).Port
-			l.Close()
+	entries, err := os.ReadDir(discover.LibOllamaPath)
+	if err != nil {
+		return nil, fmt.Errorf("could not read libollama dir: %w", err)
+	}
+
+	libs := make(map[string]string)
+	for _, entry := range entries {
+		if entry.IsDir() {
+			libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
 		}
 	}
-	if port == 0 {
-		slog.Debug("ResolveTCPAddr failed ", "error", err)
-		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-	}
-	finalParams := []string{"runner"}
-	finalParams = append(finalParams, params...)
-	finalParams = append(finalParams, "--port", strconv.Itoa(port))

-	pathEnv := "LD_LIBRARY_PATH"
-	if runtime.GOOS == "windows" {
-		pathEnv = "PATH"
-	}
-	// Start with the server directory for the LD_LIBRARY_PATH/PATH
-	libraryPaths := []string{filepath.Dir(exe)}
-
-	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-		// favor our bundled library dependencies over system libraries
-		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+	lib := gpus[0].RunnerName()
+	requested := envconfig.LLMLibrary()
+	if libs[requested] != "" {
+		slog.Info("using requested gpu library", "requested", requested)
+		lib = requested
 	}

-	// Note: we always put the dependency path first
-	// since this was the exact version we compiled/linked against
-	if gpus[0].DependencyPath != nil {
-		// assume gpus from the same library have the same dependency path
-		libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
+	var compatible []string
+	for k := range libs {
+		// exact match first
+		if k == lib {
+			compatible = append([]string{k}, compatible...)
+			continue
+		}
+
+		// then match the family (e.g. 'cuda')
+		if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
+			compatible = append(compatible, k)
+		}
 	}
+	slog.Debug("compatible gpu libraries", "compatible", compatible)

-	// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
-	s := &llmServer{
-		port:        port,
-		cmd:         exec.Command(exe, finalParams...),
-		status:      NewStatusWriter(os.Stderr),
-		options:     opts,
-		modelPath:   model,
-		estimate:    estimate,
-		numParallel: numParallel,
-		sem:         semaphore.NewWeighted(int64(numParallel)),
-		totalLayers: f.KV().BlockCount() + 1,
-		gpus:        gpus,
-		done:        make(chan error, 1),
-	}
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
+	// without any LD_LIBRARY_PATH flags
+	for {
+		port := 0
+		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+			var l *net.TCPListener
+			if l, err = net.ListenTCP("tcp", a); err == nil {
+				port = l.Addr().(*net.TCPAddr).Port
+				l.Close()
+			}
+		}
+		if port == 0 {
+			slog.Debug("ResolveTCPAddr failed ", "error", err)
+			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+		}
+		finalParams := []string{"runner"}
+		finalParams = append(finalParams, params...)
+		finalParams = append(finalParams, "--port", strconv.Itoa(port))

-	s.cmd.Env = os.Environ()
-	s.cmd.Stdout = os.Stdout
-	s.cmd.Stderr = s.status
-	s.cmd.SysProcAttr = LlamaServerSysProcAttr
+		pathEnv := "LD_LIBRARY_PATH"
+		if runtime.GOOS == "windows" {
+			pathEnv = "PATH"
+		}

-	envWorkarounds := [][2]string{}
-	for _, gpu := range gpus {
-		envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-	}
-	visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
-	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+		var libraryPaths []string
+		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+		}

-	// Update or add the path and visible devices variable with our adjusted version
-	pathNeeded := true
-	devicesNeeded := visibleDevicesEnv != ""
-	for i := range s.cmd.Env {
-		cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-		if strings.EqualFold(cmp[0], pathEnv) {
-			s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-			pathNeeded = false
-		} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
-			s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
-			devicesNeeded = false
-		} else if len(envWorkarounds) != 0 {
-			for _, kv := range envWorkarounds {
-				if strings.EqualFold(cmp[0], kv[0]) {
-					s.cmd.Env[i] = kv[0] + "=" + kv[1]
+		if len(compatible) > 0 {
+			c := compatible[0]
+			if libpath, ok := libs[c]; ok {
+				slog.Debug("adding gpu library", "path", libpath)
+				libraryPaths = append(libraryPaths, libpath)
+			}
+		}
+
+		// Note: we always put the dependency path first
+		// since this was the exact version we compiled/linked against
+		if gpus[0].DependencyPath != nil {
+			slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
+			// assume gpus from the same library have the same dependency path
+			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
+		}
+
+		// finally, add the root library path
+		libraryPaths = append(libraryPaths, discover.LibOllamaPath)
+
+		exe, err := os.Executable()
+		if err != nil {
+			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
+		}
+
+		exe, err = filepath.EvalSymlinks(exe)
+		if err != nil {
+			return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
+		}
+
+		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
+		s := &llmServer{
+			port:        port,
+			cmd:         exec.Command(exe, finalParams...),
+			status:      NewStatusWriter(os.Stderr),
+			options:     opts,
+			modelPath:   model,
+			estimate:    estimate,
+			numParallel: numParallel,
+			sem:         semaphore.NewWeighted(int64(numParallel)),
+			totalLayers: f.KV().BlockCount() + 1,
+			gpus:        gpus,
+			done:        make(chan error, 1),
+		}
+
+		s.cmd.Env = os.Environ()
+		s.cmd.Stdout = os.Stdout
+		s.cmd.Stderr = s.status
+		s.cmd.SysProcAttr = LlamaServerSysProcAttr
+
+		envWorkarounds := [][2]string{}
+		for _, gpu := range gpus {
+			envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
+		}
+		visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
+		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+
+		// Update or add the path and visible devices variable with our adjusted version
+		pathNeeded := true
+		devicesNeeded := visibleDevicesEnv != ""
+		for i := range s.cmd.Env {
+			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
+			if strings.EqualFold(cmp[0], pathEnv) {
+				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
+				pathNeeded = false
+			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
+				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
+				devicesNeeded = false
+			} else if len(envWorkarounds) != 0 {
+				for _, kv := range envWorkarounds {
+					if strings.EqualFold(cmp[0], kv[0]) {
+						s.cmd.Env[i] = kv[0] + "=" + kv[1]
+					}
 				}
 			}
 		}
-	}
-	if pathNeeded {
-		s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-	}
-	if devicesNeeded {
-		s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
-	}
+		if pathNeeded {
+			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
+		}
+		if devicesNeeded {
+			s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
+		}

-	slog.Info("starting llama server", "cmd", s.cmd.String())
-	if envconfig.Debug() {
-		filteredEnv := []string{}
-		for _, ev := range s.cmd.Env {
-			if strings.HasPrefix(ev, "CUDA_") ||
-				strings.HasPrefix(ev, "ROCR_") ||
-				strings.HasPrefix(ev, "ROCM_") ||
-				strings.HasPrefix(ev, "HIP_") ||
-				strings.HasPrefix(ev, "GPU_") ||
-				strings.HasPrefix(ev, "HSA_") ||
-				strings.HasPrefix(ev, "GGML_") ||
-				strings.HasPrefix(ev, "PATH=") ||
-				strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
-				filteredEnv = append(filteredEnv, ev)
+		slog.Info("starting llama server", "cmd", s.cmd.String())
+		if envconfig.Debug() {
+			filteredEnv := []string{}
+			for _, ev := range s.cmd.Env {
+				if strings.HasPrefix(ev, "CUDA_") ||
+					strings.HasPrefix(ev, "ROCR_") ||
+					strings.HasPrefix(ev, "ROCM_") ||
+					strings.HasPrefix(ev, "HIP_") ||
+					strings.HasPrefix(ev, "GPU_") ||
+					strings.HasPrefix(ev, "HSA_") ||
+					strings.HasPrefix(ev, "GGML_") ||
+					strings.HasPrefix(ev, "PATH=") ||
+					strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
+					filteredEnv = append(filteredEnv, ev)
+				}
 			}
-		}
-		// Log at debug as the environment is inherited and might contain sensitive information
-		slog.Debug("subprocess", "environment", filteredEnv)
-	}
-
-	if err = s.cmd.Start(); err != nil {
-		// Detect permission denied and augment the message about noexec
-		if errors.Is(err, os.ErrPermission) {
-			return nil, fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, exe)
+			// Log at debug as the environment is inherited and might contain sensitive information
+			slog.Debug("subprocess", "environment", filteredEnv)
 		}

-		msg := ""
-		if s.status != nil && s.status.LastErrMsg != "" {
-			msg = s.status.LastErrMsg
-		}
-		return nil, fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-	}
-
-	// reap subprocess when it exits
-	go func() {
-		err := s.cmd.Wait()
-		// Favor a more detailed message over the process exit status
-		if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-			slog.Debug("llama runner terminated", "error", err)
-			if strings.Contains(s.status.LastErrMsg, "unknown model") {
-				s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+		if err = s.cmd.Start(); err != nil {
+			var msg string
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			err := fmt.Errorf("error starting runner: %v %s", err, msg)
+			if len(compatible) == 0 {
+				return nil, err
 			}
-			s.done <- errors.New(s.status.LastErrMsg)
-		} else {
-			s.done <- err
-		}
-	}()

-	return s, nil
+			slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
+			compatible = compatible[1:]
+			continue
+		}
+
+		// reap subprocess when it exits
+		go func() {
+			err := s.cmd.Wait()
+			// Favor a more detailed message over the process exit status
+			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+				slog.Error("llama runner terminated", "error", err)
+				if strings.Contains(s.status.LastErrMsg, "unknown model") {
+					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+				}
+				s.done <- errors.New(s.status.LastErrMsg)
+			} else {
+				s.done <- err
+			}
+		}()
+
+		return s, nil
+	}
 }

 type ServerStatus int
@@ -661,9 +718,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			}

 			// User provided a JSON schema
-			g, err := grammar.FromSchema(nil, req.Format)
-			if err != nil {
-				return fmt.Errorf("invalid JSON schema in format: %w", err)
+			g := llama.SchemaToGrammar(req.Format)
+			if g == nil {
+				return fmt.Errorf("invalid JSON schema in format")
 			}
 			request["grammar"] = string(g)
 		}
@@ -683,6 +740,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
 		req.Options.NumPredict = 10 * s.options.NumCtx
 	}
+
 	// Make sure the server is ready
 	status, err := s.getServerStatusRetry(ctx)
 	if err != nil {
--- a/macapp/forge.config.ts
+++ b/macapp/forge.config.ts
@@ -18,8 +18,8 @@ const config: ForgeConfig = {
    asar: true,
    icon: './assets/icon.icns',
    extraResource: [
-      '../dist/ollama',
-      '../dist/darwin-amd64/lib',
+      path.join(__dirname, '../dist/darwin/ollama'),
+      ...fs.readdirSync(path.join(__dirname, '../dist/darwin/amd64')).map(f => path.join(__dirname, '../dist/darwin/amd64', f)),
      path.join(__dirname, './assets/iconTemplate.png'),
      path.join(__dirname, './assets/iconTemplate@2x.png'),
      path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -43,7 +43,7 @@ const config: ForgeConfig = {
        }
      : {}),
    osxUniversal: {
-      x64ArchFiles: '**/ollama*',
+      x64ArchFiles: '*',
    },
  },
  rebuildConfig: {},
--- a/ml/backend/ggml/ggml/.rsync-filter
+++ b/ml/backend/ggml/ggml/.rsync-filter
@@ -1,7 +1,9 @@
-protect **/*.go
-protect **/*-embed.*
+protect *.go
+protect *-embed.*
 include include/
 include src/
+include src/CMakeLists.txt
+include src/**/CMakeLists.txt
 include src/ggml-blas/
 include src/ggml-cpu/
 include src/ggml-cpu/amx/
@@ -10,12 +12,11 @@ include src/ggml-cuda/
 include src/ggml-cuda/template-instances/
 include src/ggml-hip/
 include src/ggml-metal/
-include **/CMakeLists.txt
-include **/*.c
-include **/*.h
-include **/*.cpp
-include **/*.cu
-include **/*.cuh
-include **/*.m
-include **/*.metal
+include *.c
+include *.h
+include *.cpp
+include *.cu
+include *.cuh
+include *.m
+include *.metal
 exclude *
--- a/ml/backend/ggml/ggml/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/CMakeLists.txt
@@ -1,262 +0,0 @@
-cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
-project("ggml" C CXX)
-include(CheckIncludeFileCXX)
-
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-
-if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-    set(GGML_STANDALONE ON)
-
-    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-
-    # configure project version
-    # TODO
-else()
-    set(GGML_STANDALONE OFF)
-endif()
-
-if (EMSCRIPTEN)
-    set(BUILD_SHARED_LIBS_DEFAULT OFF)
-
-    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
-else()
-    if (MINGW)
-        set(BUILD_SHARED_LIBS_DEFAULT OFF)
-    else()
-        set(BUILD_SHARED_LIBS_DEFAULT ON)
-    endif()
-endif()
-
-# remove the lib prefix on win32 mingw
-if (WIN32)
-    set(CMAKE_STATIC_LIBRARY_PREFIX "")
-    set(CMAKE_SHARED_LIBRARY_PREFIX "")
-    set(CMAKE_SHARED_MODULE_PREFIX  "")
-endif()
-
-option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
-option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
-
-#
-# option list
-#
-
-# TODO: mark all options as advanced when not GGML_STANDALONE
-
-if (APPLE)
-    set(GGML_METAL_DEFAULT ON)
-    set(GGML_BLAS_DEFAULT ON)
-    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
-else()
-    set(GGML_METAL_DEFAULT OFF)
-    set(GGML_BLAS_DEFAULT OFF)
-    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
-endif()
-
-if (CMAKE_CROSSCOMPILING)
-    set(GGML_NATIVE_DEFAULT OFF)
-else()
-    set(GGML_NATIVE_DEFAULT ON)
-endif()
-
-# defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
-endif()
-
-if (NOT GGML_CUDA_GRAPHS_DEFAULT)
-    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
-endif()
-
-# general
-option(GGML_STATIC "ggml: static link libraries"                     OFF)
-option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
-option(GGML_LTO    "ggml: enable link time optimization"             OFF)
-option(GGML_CCACHE "ggml: use ccache if available"                   ON)
-
-# debug
-option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
-option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
-option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
-
-# build
-option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
-
-# sanitizers
-option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
-option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
-option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
-
-# instruction set specific
-if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
-    set(INS_ENB OFF)
-else()
-    set(INS_ENB ON)
-endif()
-
-option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
-option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
-option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
-option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
-option(GGML_AVX512           "ggml: enable AVX512F"          OFF)
-option(GGML_AVX512_VBMI      "ggml: enable AVX512-VBMI"      OFF)
-option(GGML_AVX512_VNNI      "ggml: enable AVX512-VNNI"      OFF)
-option(GGML_AVX512_BF16      "ggml: enable AVX512-BF16"      OFF)
-if (NOT MSVC)
-    # in MSVC F16C and FMA is implied with AVX2/AVX512
-    option(GGML_FMA          "ggml: enable FMA"              ${INS_ENB})
-    option(GGML_F16C         "ggml: enable F16C"             ${INS_ENB})
-    # MSVC does not seem to support AMX
-    option(GGML_AMX_TILE     "ggml: enable AMX-TILE"         OFF)
-    option(GGML_AMX_INT8     "ggml: enable AMX-INT8"         OFF)
-    option(GGML_AMX_BF16     "ggml: enable AMX-BF16"         OFF)
-endif()
-option(GGML_LASX             "ggml: enable lasx"             ON)
-option(GGML_LSX              "ggml: enable lsx"              ON)
-option(GGML_RVV              "ggml: enable rvv"              ON)
-
-option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
-
-
-if (WIN32)
-    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
-endif()
-
-# ggml core
-set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
-option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
-
-# 3rd party libs / backends
-option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
-option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
-set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
-                                            "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             ${GGML_LLAMAFILE_DEFAULT})
-
-option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
-option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
-option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
-option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
-option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
-set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                            "ggml: max. batch size for using peer access")
-option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
-option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
-option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
-option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
-
-option(GGML_HIP                             "ggml: use HIP"                                   OFF)
-option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
-option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
-option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
-option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
-option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
-option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
-option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
-option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
-option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
-option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
-option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
-option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
-option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
-option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
-option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
-set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-                                            "ggml: metal minimum macOS version")
-set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
-option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
-option(GGML_RPC                             "ggml: use RPC"                                   OFF)
-option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
-option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
-set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
-                                            "ggml: sycl target device")
-set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
-                                            "ggml: sycl device architecture")
-
-option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF)
-option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
-option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
-option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
-
-# extra artifacts
-option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
-option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
-
-#
-# dependencies
-#
-
-set(CMAKE_C_STANDARD 11)
-set(CMAKE_C_STANDARD_REQUIRED true)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-
-find_package(Threads REQUIRED)
-
-#
-# build the library
-#
-
-add_subdirectory(src)
-
-#
-# tests and examples
-#
-
-if (GGML_BUILD_TESTS)
-    enable_testing()
-    add_subdirectory(tests)
-endif ()
-
-if (GGML_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-endif ()
-
-#
-# install
-#
-
-include(GNUInstallDirs)
-include(CMakePackageConfigHelpers)
-
-# all public headers
-set(GGML_PUBLIC_HEADERS
-    include/ggml.h
-    include/ggml-cpu.h
-    include/ggml-alloc.h
-    include/ggml-backend.h
-    include/ggml-blas.h
-    include/ggml-cann.h
-    include/ggml-cuda.h
-    include/ggml-kompute.h
-    include/ggml-opt.h
-    include/ggml-metal.h
-    include/ggml-rpc.h
-    include/ggml-sycl.h
-    include/ggml-vulkan.h)
-
-set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
-#if (GGML_METAL)
-#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
-#endif()
-install(TARGETS ggml LIBRARY PUBLIC_HEADER)
-install(TARGETS ggml-base LIBRARY)
-
-if (GGML_STANDALONE)
-    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
-        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
-        @ONLY)
-
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
-        DESTINATION share/pkgconfig)
-endif()
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
    endforeach()

    ggml_add_cpu_backend_variant_impl(${tag_name})
+    add_dependencies(ggml-cpu ggml-cpu-${tag_name})
 endfunction()

 ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
    endif()
+    add_custom_target(ggml-cpu)
    ggml_add_cpu_backend_variant(sandybridge    AVX)
    ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
@@ -1,5 +1,6 @@
 package cpu

+// #cgo CFLAGS: -Wno-implicit-function-declaration
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
 // #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
--- a/ml/backend/ggml/ggml/src/ggml.go
+++ b/ml/backend/ggml/ggml/src/ggml.go
@@ -3,6 +3,7 @@ package ggml
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
 // #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu
+// #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
 // #include <stdlib.h>
 // #include "ggml-backend.h"
 // extern void sink(int level, char *text, void *user_data);
@@ -57,11 +58,24 @@ var OnceLoad = sync.OnceFunc(func() {
 		paths = lib.defaultValue
 	}

-	for _, path := range filepath.SplitList(paths) {
-		func() {
-			cpath := C.CString(path)
-			defer C.free(unsafe.Pointer(cpath))
-			C.ggml_backend_load_all_from_path(cpath)
-		}()
+	if runtime.GOOS == "darwin" {
+		if _, ok := os.LookupEnv("DYLD_LIBRARY_PATH"); !ok {
+			os.Setenv("DYLD_LIBRARY_PATH", paths)
+		}
+	}
+
+	split := filepath.SplitList(paths)
+	visited := make(map[string]struct{}, len(split))
+	for _, path := range split {
+		abspath, _ := filepath.Abs(path)
+		if _, ok := visited[abspath]; !ok {
+			func() {
+				cpath := C.CString(path)
+				defer C.free(unsafe.Pointer(cpath))
+				C.ggml_backend_load_all_from_path(cpath)
+			}()
+
+			visited[abspath] = struct{}{}
+		}
 	}
 })
--- a/model/cmd/main.go
+++ b/model/cmd/main.go
@@ -10,7 +10,6 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
-	"time"

 	"github.com/ollama/ollama/cache"
 	"github.com/ollama/ollama/ml"
@@ -28,7 +27,6 @@ var args struct {
 }

 func temp() error {
-	// start := time.Now()
 	flag.IntVar(&args.n, "n", 10, "number of samples")
 	flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
 	flag.StringVar(&args.image, "image", "", "path to image file")
@@ -106,62 +104,30 @@ func temp() error {
 		}
 	}

-	// Schema for a list of friends with their info
-	// Maps to JSON like:
-	// {
-	// 	"name": "string",
-	// 	"age": integer,
-	// 	"is_available": boolean
-	// }
-	schema := &sample.Schema{
-		Name: "root",
-		Type: "object",
-		Properties: []*sample.Schema{
-			{Name: "name", Type: "string"},
-			{Name: "age", Type: "integer"},
-			{Name: "is_available", Type: "boolean"},
-		},
-	}
-
-	// fmt.Println("schema", schema)
-	// schema = nil
-	jsonTransform, err := sample.NewJSONSampler(m.(model.TextProcessor), schema)
-	if err != nil {
-		return err
-	}
-
-	transforms := []sample.Transform{
-		jsonTransform,
-	}
-
 	var offset int
-	var stringBuffer string
-	// var ttft time.Duration
-	var totalSamplingTime time.Duration
-	count := 0
 	for range args.n {
-		logits, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
+		logit, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
 		if err != nil {
 			return err
 		}

-		samplingStart := time.Now()
-		sampler := sample.Greedy()
-		sampledIdx, err := sampler.Sample(logits.Floats(), transforms...)
+		f32s := logit.Floats()
+		f64s := make([]float64, len(f32s))
+		for i, f32 := range f32s {
+			f64s[i] = float64(f32)
+		}
+
+		// do sampling
+		f64s, err = sample.Sample(f64s, sample.Greedy())
 		if err != nil {
 			return err
 		}

-		samplingTime := time.Since(samplingStart)
-		totalSamplingTime += samplingTime
-
-		// fmt.Println("sampling time", samplingTime)
-		// fmt.Printf("Sample time: %vms\n", finishTime.Sub(sampleTime).Milliseconds())
-
 		var outputIDs []int32
-
-		if !m.(model.TextProcessor).Is(uint32(sampledIdx), model.SpecialEOS) {
-			outputIDs = append(outputIDs, int32(sampledIdx))
+		for _, f64 := range f64s {
+			if !m.(model.TextProcessor).Is(uint32(f64), model.SpecialEOS) {
+				outputIDs = append(outputIDs, int32(f64))
+			}
 		}

 		if len(outputIDs) == 0 {
@@ -175,32 +141,14 @@ func temp() error {
 			return err
 		}

-		// if ttft == 0 {
-		// 	ttft = time.Since(start)
-		// fmt.Printf("Time to first token: %vms\n", ttft.Milliseconds())
-		// }
+		fmt.Print(s)

-		// fmt.Printf("--- token: %q\n", s)
-		// fmt.Printf("--- outputIDs: %v\n", outputIDs)
-		stringBuffer += s
-		count++
-		fmt.Println("--- stringBuffer", stringBuffer)
-
-		outputIDs, err = jsonTransform.UpdateState(outputIDs)
-		if err != nil {
-			return err
-		}
-
-		// can do fun shifting stuff here if needed
 		inputIDs = append(inputIDs, outputIDs...)
 		if args.cache {
 			offset = len(inputIDs) - 1
 		}
 	}
-	fmt.Println("\n------ Output: ------")
-	fmt.Println(stringBuffer)
-	fmt.Println("--------------------")
-	fmt.Println("sample average time", totalSamplingTime/time.Duration(count))
+
 	return nil
 }

--- a/model/cmd/test.go
+++ b/model/cmd/test.go
@@ -1 +0,0 @@
-package main
--- a/model/process_text.go
+++ b/model/process_text.go
@@ -21,8 +21,6 @@ type TextProcessor interface {
 	Encode(string) ([]int32, error)
 	Decode([]int32) (string, error)
 	Is(uint32, Special) bool
-
-	GetVocabulary() *Vocabulary
 }

 type Vocabulary struct {
@@ -100,10 +98,6 @@ func (v *Vocabulary) Merge(left, right string) int {
 	return -1
 }

-func (v *Vocabulary) GetVocabulary() *Vocabulary {
-	return v
-}
-
 type BytePairEncoding struct {
 	Pretokenizer string

--- a/model/process_text_test.go
+++ b/model/process_text_test.go
@@ -0,0 +1,228 @@
+package model
+
+import (
+	"reflect"
+	"testing"
+)
+
+// TestBytePairEncoding encodes short inputs with a small hand-built vocabulary,
+// checks the token IDs, and verifies that decoding then re-encoding is stable.
+func TestBytePairEncoding(t *testing.T) {
+	// Expected IDs in the cases below are indexes into Values.
+	testVocab := &Vocabulary{
+		Values: []string{
+			"Hello",
+			"World",
+			"!",
+			"How",
+			"are",
+			"you",
+			"t",
+			"o",
+			"d",
+			"a",
+			"y",
+			"to",
+			"tod",
+			"toda",
+			"today",
+			" ",
+		},
+		Types: []uint32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3}, // only the last entry (the space) is given type 3
+		Merges: []string{
+			"to",
+			"tod",
+			"toda",
+			"today",
+		},
+		BOS: 0,
+		EOS: 1,
+	}
+
+	bpe := BytePairEncoding{
+		Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		Vocabulary:   testVocab,
+	}
+
+	cases := []struct {
+		name    string
+		input   string
+		want    []int32
+		wantErr bool
+	}{
+		{
+			name:  "simple hello world",
+			input: "Hello World!",
+			want:  []int32{0, 15, 1, 2}, // Hello, space, World, !
+		},
+		{
+			// want stays unset, so the expected result is a nil slice
+			name:  "empty string",
+			input: "",
+		},
+		{
+			name:  "just spaces",
+			input: "   ",
+			want:  []int32{15, 15, 15}, // the space entry three times
+		},
+		{
+			name:  "today with merges",
+			input: "today",
+			want:  []int32{14}, // the single "today" entry
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			ids, err := bpe.Encode(tc.input)
+			if failed := err != nil; failed != tc.wantErr {
+				t.Errorf("BytePairEncoding.Encode() error = %v, wantErr %v", err, tc.wantErr)
+				return
+			}
+			if !reflect.DeepEqual(ids, tc.want) {
+				t.Errorf("BytePairEncoding.Encode() = %v, want %v", ids, tc.want)
+			}
+			if err != nil {
+				// An expected encoding failure leaves nothing to round-trip.
+				return
+			}
+
+			// The decoded text may not match the input exactly, so the round
+			// trip is judged by re-encoding it and comparing token IDs.
+			text, err := bpe.Decode(ids)
+			if err != nil {
+				t.Errorf("BytePairEncoding.Decode() error = %v", err)
+				return
+			}
+			again, err := bpe.Encode(text)
+			if err != nil {
+				t.Errorf("BytePairEncoding.Encode() error on round trip = %v", err)
+				return
+			}
+			if !reflect.DeepEqual(again, ids) {
+				t.Errorf("Round trip failed: original tokens = %v, after round trip = %v", ids, again)
+			}
+		})
+	}
+}
+
+// TestBytePairEncodingSpecialTokens encodes inputs that contain special tokens
+// and checks that each special token comes back as its own vocabulary index.
+func TestBytePairEncodingSpecialTokens(t *testing.T) {
+	// The first three entries carry type 3; the two plain words carry type 1.
+	specials := &Vocabulary{
+		Values: []string{
+			"<s>",
+			"</s>",
+			"<pad>",
+			"Hello",
+			"World",
+		},
+		Types: []uint32{3, 3, 3, 1, 1}, // one type per entry in Values
+		BOS:   0,
+		EOS:   1,
+	}
+
+	bpe := BytePairEncoding{
+		Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		Vocabulary:   specials,
+	}
+
+	cases := []struct {
+		name    string
+		input   string
+		want    []int32
+		wantErr bool
+	}{
+		{
+			name:  "text with special token at start",
+			input: "<s>Hello",
+			want:  []int32{0, 3},
+		},
+		{
+			name:  "text with special token at end",
+			input: "World</s>",
+			want:  []int32{4, 1},
+		},
+		{
+			name:  "special token in middle",
+			input: "Hello<pad>World",
+			want:  []int32{3, 2, 4},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			ids, err := bpe.Encode(tc.input)
+			if failed := err != nil; failed != tc.wantErr {
+				t.Errorf("BytePairEncoding.Encode() error = %v, wantErr %v", err, tc.wantErr)
+				return
+			}
+			if equal := reflect.DeepEqual(ids, tc.want); !equal {
+				t.Errorf("BytePairEncoding.Encode() = %v, want %v", ids, tc.want)
+			}
+		})
+	}
+}
+
+// TestBytePairEncodingSplit checks how the pretokenizer pattern splits text.
+func TestBytePairEncodingSplit(t *testing.T) {
+	bpe := BytePairEncoding{
+		Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+	}
+
+	cases := []struct {
+		name, input string
+		want        []string
+		wantErr     bool
+	}{
+		{
+			name:  "basic splitting",
+			input: "Hello World!",
+			want:  []string{"Hello", " World", "!"},
+		},
+		{
+			name:  "contractions",
+			input: "I'm don't won't",
+			want:  []string{"I", "'m", " don", "'t", " won", "'t"},
+		},
+		{
+			name:  "numbers",
+			input: "In 2024 there are 365 days",
+			want:  []string{"In", " ", "202", "4", " there", " are", " ", "365", " days"},
+		},
+		{
+			name:  "special characters",
+			input: "Hello!! ...world",
+			want:  []string{"Hello", "!!", " ...", "world"},
+		},
+		{
+			name:  "multiple spaces",
+			input: "Hello    World",
+			want:  []string{"Hello", "   ", " World"},
+		},
+		{
+			name:  "newlines",
+			input: "Hello\nWorld",
+			want:  []string{"Hello", "\n", "World"},
+		},
+		{
+			name:  "mixed case and punctuation",
+			input: "Hello, WORLD!! How's it going?",
+			want:  []string{"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			parts, err := bpe.split(tc.input)
+			if failed := err != nil; failed != tc.wantErr {
+				t.Errorf("BytePairEncoding.split() error = %v, wantErr %v", err, tc.wantErr)
+				return
+			}
+			if equal := reflect.DeepEqual(parts, tc.want); !equal {
+				t.Errorf("BytePairEncoding.split() = %v, want %v", parts, tc.want)
+			}
+		})
+	}
+}
--- a/parser/expandpath_test.go
+++ b/parser/expandpath_test.go
@@ -4,6 +4,7 @@ import (
 	"os"
 	"os/user"
 	"path/filepath"
+	"runtime"
 	"testing"
 )

@@ -11,14 +12,29 @@ func TestExpandPath(t *testing.T) {
 	mockCurrentUser := func() (*user.User, error) {
 		return &user.User{
 			Username: "testuser",
-			HomeDir:  "/home/testuser",
+			HomeDir: func() string {
+				if os.PathSeparator == '\\' {
+					return filepath.FromSlash("D:/home/testuser")
+				}
+				return "/home/testuser"
+			}(),
 		}, nil
 	}

 	mockLookupUser := func(username string) (*user.User, error) {
 		fakeUsers := map[string]string{
-			"testuser":    "/home/testuser",
-			"anotheruser": "/home/anotheruser",
+			"testuser": func() string {
+				if os.PathSeparator == '\\' {
+					return filepath.FromSlash("D:/home/testuser")
+				}
+				return "/home/testuser"
+			}(),
+			"anotheruser": func() string {
+				if os.PathSeparator == '\\' {
+					return filepath.FromSlash("D:/home/anotheruser")
+				}
+				return "/home/anotheruser"
+			}(),
 		}

 		if homeDir, ok := fakeUsers[username]; ok {
@@ -30,30 +46,78 @@ func TestExpandPath(t *testing.T) {
 		return nil, os.ErrNotExist
 	}

-	tests := []struct {
-		path            string
-		relativeDir     string
-		expected        string
-		windowsExpected string
-		shouldErr       bool
-	}{
-		{"~", "", "/home/testuser", "D:\\home\\testuser", false},
-		{"~/myfolder/myfile.txt", "", "/home/testuser/myfolder/myfile.txt", "D:\\home\\testuser\\myfolder\\myfile.txt", false},
-		{"~anotheruser/docs/file.txt", "", "/home/anotheruser/docs/file.txt", "D:\\home\\anotheruser\\docs\\file.txt", false},
-		{"~nonexistentuser/file.txt", "", "", "", true},
-		{"relative/path/to/file", "", filepath.Join(os.Getenv("PWD"), "relative/path/to/file"), "relative\\path\\to\\file", false},
-		{"/absolute/path/to/file", "", "/absolute/path/to/file", "D:\\absolute\\path\\to\\file", false},
-		{".", os.Getenv("PWD"), "", os.Getenv("PWD"), false},
-		{"somefile", "somedir", filepath.Join(os.Getenv("PWD"), "somedir", "somefile"), "somedir\\somefile", false},
+	pwd, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
 	}

-	for _, test := range tests {
-		result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
-		if (err != nil) != test.shouldErr {
-			t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
+	t.Run("unix tests", func(t *testing.T) {
+		if runtime.GOOS == "windows" {
+			t.Skip("expected paths in this table are unix-specific")
 		}
-		if result != test.expected && result != test.windowsExpected && !test.shouldErr {
-			t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
+
+		tests := []struct {
+			path        string
+			relativeDir string
+			expected    string
+			shouldErr   bool
+		}{
+			{"~", "", "/home/testuser", false},
+			{"~/myfolder/myfile.txt", "", "/home/testuser/myfolder/myfile.txt", false},
+			{"~anotheruser/docs/file.txt", "", "/home/anotheruser/docs/file.txt", false},
+			{"~nonexistentuser/file.txt", "", "", true},
+			{"relative/path/to/file", "", filepath.Join(pwd, "relative/path/to/file"), false},
+			{"/absolute/path/to/file", "", "/absolute/path/to/file", false},
+			{"/absolute/path/to/file", "someotherdir/", "/absolute/path/to/file", false},
+			{".", pwd, pwd, false},
+			{".", "", pwd, false},
+			{"somefile", "somedir", filepath.Join(pwd, "somedir", "somefile"), false},
 		}
-	}
+
+		for _, test := range tests {
+			result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
+			if (err != nil) != test.shouldErr {
+				t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
+			}
+
+			if result != test.expected && !test.shouldErr {
+				t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
+			}
+		}
+	})
+
+	t.Run("windows tests", func(t *testing.T) {
+		if runtime.GOOS != "windows" {
+			t.Skip("expected paths in this table are windows-specific")
+		}
+
+		tests := []struct {
+			path        string
+			relativeDir string
+			expected    string
+			shouldErr   bool
+		}{
+			{"~", "", "D:\\home\\testuser", false},
+			{"~/myfolder/myfile.txt", "", "D:\\home\\testuser\\myfolder\\myfile.txt", false},
+			{"~anotheruser/docs/file.txt", "", "D:\\home\\anotheruser\\docs\\file.txt", false},
+			{"~nonexistentuser/file.txt", "", "", true},
+			{"relative\\path\\to\\file", "", filepath.Join(pwd, "relative\\path\\to\\file"), false},
+			{"D:\\absolute\\path\\to\\file", "", "D:\\absolute\\path\\to\\file", false},
+			{"D:\\absolute\\path\\to\\file", "someotherdir/", "D:\\absolute\\path\\to\\file", false},
+			{".", pwd, pwd, false},
+			{".", "", pwd, false},
+			{"somefile", "somedir", filepath.Join(pwd, "somedir", "somefile"), false},
+		}
+
+		for _, test := range tests {
+			result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
+			if (err != nil) != test.shouldErr {
+				t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
+			}
+
+			if result != test.expected && !test.shouldErr {
+				t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
+			}
+		}
+	})
 }
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -62,7 +62,13 @@ func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error)
 				return nil, err
 			}

-			req.Files = digestMap
+			if req.Files == nil {
+				req.Files = digestMap
+			} else {
+				for k, v := range digestMap {
+					req.Files[k] = v
+				}
+			}
 		case "adapter":
 			path, err := expandPath(c.Args, relativeDir)
 			if err != nil {
@@ -564,7 +570,9 @@ func isValidCommand(cmd string) bool {
 }

 func expandPathImpl(path, relativeDir string, currentUserFunc func() (*user.User, error), lookupUserFunc func(string) (*user.User, error)) (string, error) {
-	if strings.HasPrefix(path, "~") {
+	if filepath.IsAbs(path) || strings.HasPrefix(path, "\\") || strings.HasPrefix(path, "/") {
+		return filepath.Abs(path)
+	} else if strings.HasPrefix(path, "~") {
 		var homeDir string

 		if path == "~" || strings.HasPrefix(path, "~/") {
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -490,7 +490,6 @@ func TestParseFileParameters(t *testing.T) {
 		"top_k 1":                      {"top_k", "1"},
 		"top_p 1.0":                    {"top_p", "1.0"},
 		"min_p 0.05":                   {"min_p", "0.05"},
-		"tfs_z 1.0":                    {"tfs_z", "1.0"},
 		"typical_p 1.0":                {"typical_p", "1.0"},
 		"repeat_last_n 1":              {"repeat_last_n", "1"},
 		"temperature 1.0":              {"temperature", "1.0"},
@@ -793,15 +792,20 @@ func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, s
 }

 func TestCreateRequestFiles(t *testing.T) {
-	name, digest := createBinFile(t, nil, nil)
+	n1, d1 := createBinFile(t, nil, nil)
+	n2, d2 := createBinFile(t, map[string]any{"foo": "bar"}, nil)

 	cases := []struct {
 		input    string
 		expected *api.CreateRequest
 	}{
 		{
-			fmt.Sprintf("FROM %s", name),
-			&api.CreateRequest{Files: map[string]string{name: digest}},
+			fmt.Sprintf("FROM %s", n1),
+			&api.CreateRequest{Files: map[string]string{n1: d1}},
+		},
+		{
+			fmt.Sprintf("FROM %s\nFROM %s", n1, n2),
+			&api.CreateRequest{Files: map[string]string{n1: d1, n2: d2}},
 		},
 	}

--- a/runners/common.go
+++ b/runners/common.go
@@ -1,39 +0,0 @@
-package runners
-
-import (
-	"golang.org/x/sys/cpu"
-)
-
-type CPUCapability uint32
-
-// Override at build time when building base GPU runners
-// var GPURunnerCPUCapability = CPUCapabilityAVX
-
-const (
-	CPUCapabilityNone CPUCapability = iota
-	CPUCapabilityAVX
-	CPUCapabilityAVX2
-	// TODO AVX512
-)
-
-func (c CPUCapability) String() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "avx"
-	case CPUCapabilityAVX2:
-		return "avx2"
-	default:
-		return "no vector extensions"
-	}
-}
-
-func GetCPUCapability() CPUCapability {
-	if cpu.X86.HasAVX2 {
-		return CPUCapabilityAVX2
-	}
-	if cpu.X86.HasAVX {
-		return CPUCapabilityAVX
-	}
-	// else LCD
-	return CPUCapabilityNone
-}
--- a/sample/decode.go
+++ b/sample/decode.go
@@ -1,171 +0,0 @@
-package sample
-
-import (
-	"bytes"
-	"encoding/json"
-	"errors"
-)
-
-// Schema holds a JSON schema.
-type Schema struct {
-	// Name is the name of the property. For the parent/root property, this
-	// is "root". For child properties, this is the name of the property.
-	Name string `json:"-"`
-
-	// Type is the type of the property.
-	//
-	// TODO: Union types (e.g. make this a []string).
-	Type string
-
-	// PrefixItems is a list of schemas for each item in a tuple. By
-	// default, the tuple is "closed." unless Items is set to true or a
-	// valid Schema.
-	PrefixItems []*Schema
-
-	// Items is the schema for each item in a list.
-	//
-	// If it is missing, or its JSON value is "null" or "false", it is nil.
-	// If the JSON value is "true", it is set to the empty Schema. If the
-	// JSON value is an object, it will be decoded as a Schema.
-	Items *Schema
-
-	// MinItems specifies the minimum number of items allowed in a list.
-	MinItems int
-
-	// MaxItems specifies the maximum number of items allowed in a list.
-	MaxItems int
-
-	// Properties is the schema for each property of an object.
-	Properties []*Schema
-
-	// Format is the format of the property. This is used to validate the
-	// property against a specific format.
-	//
-	// It is the callers responsibility to validate the property against
-	// the format.
-	Format string
-
-	// Minimum specifies the minimum value for numeric properties.
-	Minimum float64
-
-	// Maximum specifies the maximum value for numeric properties.
-	Maximum float64
-
-	// Enum is a list of valid values for the property.
-	Enum []json.RawMessage
-}
-
-func (s *Schema) UnmarshalJSON(data []byte) error {
-	type S Schema
-	w := struct {
-		Properties props
-		Items      items
-		*S
-	}{
-		S: (*S)(s),
-	}
-	if err := json.Unmarshal(data, &w); err != nil {
-		return err
-	}
-	if w.Items.set {
-		s.Items = &w.Items.Schema
-	}
-	s.Properties = w.Properties
-	return nil
-}
-
-type items struct {
-	Schema
-	set bool
-}
-
-func (s *items) UnmarshalJSON(data []byte) error {
-	switch b := data[0]; b {
-	case 't':
-		*s = items{set: true}
-	case '{':
-		type I items
-		if err := json.Unmarshal(data, (*I)(s)); err != nil {
-			return err
-		}
-		s.set = true
-	case 'n', 'f':
-	default:
-		return errors.New("invalid Items")
-	}
-	return nil
-}
-
-// EffectiveType returns the effective type of the schema. If the Type field is
-// not empty, it is returned; otherwise:
-//
-//   - If the schema has both Properties and Items, it returns an empty string.
-//   - If the schema has Properties, it returns "object".
-//   - If the schema has Items, it returns "array".
-//   - If the schema has neither Properties nor Items, it returns "value".
-//
-// The returned string is never empty.
-func (d *Schema) EffectiveType() string {
-	if d.Type == "" {
-		if len(d.Properties) > 0 {
-			return "object"
-		}
-		if len(d.PrefixItems) > 0 || d.Items != nil {
-			return "array"
-		}
-		return "value"
-	}
-	return d.Type
-}
-
-// props is an ordered list of properties. The order of the properties
-// is the order in which they were defined in the schema.
-type props []*Schema
-
-var _ json.Unmarshaler = (*props)(nil)
-
-func (v *props) UnmarshalJSON(data []byte) error {
-	if len(data) == 0 {
-		return nil
-	}
-	if data[0] != '{' {
-		return errors.New("expected object")
-	}
-
-	d := json.NewDecoder(bytes.NewReader(data))
-
-	// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
-	// llama.cpp, ignore unknown fields, which could be lead to unexpected
-	// behavior for clients of this package, since they may not be aware
-	// that "additionalFields", "itemsPrefix", etc, are being ignored.
-	//
-	// For now, just do what llama.cpp does.
-
-	t, err := d.Token()
-	if err != nil {
-		return err
-	}
-	if t != json.Delim('{') {
-		return errors.New("expected object")
-	}
-	for d.More() {
-		// Use the first token (map key) as the property name, then
-		// decode the rest of the object fields into a Schema and
-		// append.
-		t, err := d.Token()
-		if err != nil {
-			return err
-		}
-		if t == json.Delim('}') {
-			return nil
-		}
-		s := &Schema{
-			Name: t.(string),
-		}
-		if err := d.Decode(s); err != nil {
-			return err
-		}
-		*v = append(*v, s)
-	}
-	return nil
-}
--- a/sample/feedback.txt
+++ b/sample/feedback.txt
@@ -1,32 +0,0 @@
-// Feedback from code review:
-
-// pushdown_automata.go:
-// 1. The BuildGraph function is quite long and could be split into smaller, more focused functions
-// 2. Consider using constants instead of magic runes like rune(-1) for sentinel values
-// 3. The state machine transitions could be defined more declaratively, perhaps in a config
-// 4. The stringInvalidRunes list needs to handle escape sequences properly
-// 5. The graph building could be optimized to avoid duplicate nodes/transitions
-// 6. Consider adding validation for max nesting depth of braces/brackets
-// 7. The CreateMask function is doing a lot - could be split into smaller pieces
-// 8. isRuneValid has a "garbage interface" per TODO - needs cleaner design
-
-// pushdown_runner.go:
-// 1. The Apply method has a lot of duplicated logic around EOS handling
-// 2. The UpdateState method could use more granular error messages
-// 3. The braceStack validation could be moved to a separate validator
-// 4. Consider adding max length limits for strings/numbers
-// 5. The stateCounter isn't being used effectively yet
-// 6. Need to add penalties for staying in same state too long
-// 7. The maskLogits function could be optimized to avoid allocations
-// 8. Missing proper cleanup/reset functionality
-// 9. Error handling could be more consistent throughout
-// 10. Consider adding debug logging levels instead of raw fmt.Println
-
-// General improvements needed:
-// - More comprehensive testing, especially edge cases
-// - Better documentation of state machine transitions
-// - Performance optimization for large inputs
-// - Memory usage optimization for the graph structure
-// - Cleaner interfaces between components
-// - More robust error handling and recovery
-
--- a/sample/fused_mask_sample.go
+++ b/sample/fused_mask_sample.go
@@ -1,11 +0,0 @@
-package sample
-
-// type fusedMaskSampler struct{}
-
-// func FusedMaskSampler() Sampler {
-// 	return fusedMaskSampler{}
-// }
-
-// func (f fusedMaskSampler) Sample(logits []float64) (int, error) {
-// 	return int(logits[0]), nil
-// }
--- a/sample/greedy.go
+++ b/sample/greedy.go
@@ -8,19 +8,6 @@ func Greedy() Sampler {
 	return greedy{}
 }

-func (s greedy) Sample(logits []float32, transforms ...Transform) (int, error) {
-	logits64 := make([]float64, len(logits))
-	for i, v := range logits {
-		logits64[i] = float64(v)
-	}
-
-	var err error
-	for _, t := range transforms {
-		logits64, err = t.Apply(logits64)
-		if err != nil {
-			return -1, err
-		}
-	}
-
-	return floats.MaxIdx(logits64), nil
+func (s greedy) Sample(t []float64) ([]float64, error) {
+	return []float64{float64(floats.MaxIdx(t))}, nil // one element: the floats.MaxIdx(t) result converted to float64; the error is always nil
 }
--- a/sample/hid.txt
+++ b/sample/hid.txt
@@ -1,296 +0,0 @@
-package sample
-
-import (
-	"slices"
-
-	"github.com/ollama/ollama/model"
-)
-
-var stringInvalidRunes = []rune{'\\', '\n', '\t', '{', '}', ':', ','}
-
-var intInvalidRunes = []rune{'e', 'E', ' ', '\n', '\t', '{', '}', ':', ',', '"'}
-var validIntRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'}
-
-var validNumberRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E'}
-
-var validBoolRunes = []rune{'t', 'r', 'u', 'e', 'f', 'a', 'l', 's', 'e'}
-
-var validNullRunes = []rune{'n', 'u', 'l', 'l'}
-
-type PDANode struct {
-	State             JSONState
-	TransitionEdges   map[rune]*PDANode
-	MaskTokenIDToNode map[int32]JSONState
-}
-
-func NewPDANode(state JSONState) *PDANode {
-	return &PDANode{
-		State:             state,
-		TransitionEdges:   make(map[rune]*PDANode),
-		MaskTokenIDToNode: make(map[int32]JSONState),
-	}
-}
-
-func BuildGraph(proc model.TextProcessor) (*PDANode, map[JSONState]*PDANode, error) {
-	stateToNodeMap := make(map[JSONState]*PDANode)
-
-	startNode := NewPDANode(StateStart)
-	stateToNodeMap[StateStart] = startNode
-
-	objNode := NewPDANode(StateInObject)
-	stateToNodeMap[StateInObject] = objNode
-
-	objEndNode := NewPDANode(StateInObjectEnd)
-	stateToNodeMap[StateInObjectEnd] = objEndNode
-
-	objKeyNode := NewPDANode(StateInObjectKey)
-	stateToNodeMap[StateInObjectKey] = objKeyNode
-
-	objKeyEndNode := NewPDANode(StateInObjectKeyEnd)
-	stateToNodeMap[StateInObjectKeyEnd] = objKeyEndNode
-
-	colonNode := NewPDANode(StateInColon)
-	stateToNodeMap[StateInColon] = colonNode
-
-	commaNode := NewPDANode(StateInComma)
-	stateToNodeMap[StateInComma] = commaNode
-
-	newlineNode := NewPDANode(StateInNewline)
-	stateToNodeMap[StateInNewline] = newlineNode
-
-	spaceNode := NewPDANode(StateInSpace)
-	stateToNodeMap[StateInSpace] = spaceNode
-
-	spaceObjNode := NewPDANode(StateInObjSpace)
-	stateToNodeMap[StateInObjSpace] = spaceObjNode
-
-	tabNode := NewPDANode(StateInTab)
-	stateToNodeMap[StateInTab] = tabNode
-
-	stringNode := NewPDANode(StateInString)
-	stateToNodeMap[StateInString] = stringNode
-
-	stringEndNode := NewPDANode(StateInStringEnd)
-	stateToNodeMap[StateInStringEnd] = stringEndNode
-
-	listNode := NewPDANode(StateInList)
-	stateToNodeMap[StateInList] = listNode
-
-	listCommaNode := NewPDANode(StateInListComma)
-	stateToNodeMap[StateInListComma] = listCommaNode
-
-	listEndNode := NewPDANode(StateListEnd)
-	stateToNodeMap[StateListEnd] = listEndNode
-
-	numberNode := NewPDANode(StateInNumber)
-	stateToNodeMap[StateInNumber] = numberNode
-
-	boolNode := NewPDANode(StateInBool)
-	stateToNodeMap[StateInBool] = boolNode
-
-	nullNode := NewPDANode(StateInNull)
-	stateToNodeMap[StateInNull] = nullNode
-
-	// Defined with structured outputs only
-	intNode := NewPDANode(StateInInt)
-	stateToNodeMap[StateInInt] = intNode
-
-	// TODO:
-	// consider adding a node to just point to values, could be good to compute that
-	// mask rather than many different nodes
-
-	// Connect nodes
-	// TODO: if all are single tokens then this can just be connected instead of defining the token
-	startNode.TransitionEdges['{'] = objNode
-
-	objNode.TransitionEdges['"'] = objKeyNode
-	objNode.TransitionEdges['\n'] = newlineNode
-	// objNode.TransitionEdges['\t'] = tabNode
-
-	newlineNode.TransitionEdges['"'] = objKeyNode
-	newlineNode.TransitionEdges['\t'] = tabNode
-
-	tabNode.TransitionEdges['"'] = objKeyNode
-	// tabNode.TransitionEdges['\t'] = tabNode
-
-	objKeyNode.TransitionEdges[rune(-1)] = objKeyNode
-	objKeyNode.TransitionEdges['"'] = objKeyEndNode
-
-	objKeyEndNode.TransitionEdges[':'] = colonNode
-	objEndNode.TransitionEdges[' '] = spaceNode
-
-	// where values should be
-	// this could be combined but the probs might change, we're alr doing a skip ahead
-	colonNode.TransitionEdges[' '] = spaceNode
-
-	// Leads to a value
-	spaceNode.TransitionEdges['"'] = stringNode
-	spaceNode.TransitionEdges['['] = listNode
-	spaceNode.TransitionEdges['{'] = objNode
-
-	for _, r := range validNumberRunes {
-		spaceNode.TransitionEdges[r] = numberNode
-	}
-	for _, r := range validBoolRunes {
-		spaceNode.TransitionEdges[r] = boolNode
-	}
-
-	for _, r := range validNullRunes {
-		spaceNode.TransitionEdges[r] = nullNode
-	}
-
-	// Values
-	// string node
-	stringNode.TransitionEdges[rune(-1)] = stringNode
-	stringNode.TransitionEdges['"'] = stringEndNode
-
-	stringEndNode.TransitionEdges[','] = commaNode
-	stringEndNode.TransitionEdges['}'] = objEndNode
-	stringEndNode.TransitionEdges[']'] = listEndNode
-
-	// TODO: add counters for allowable number of decimals, e, E, etc
-	// number node
-	for _, r := range validNumberRunes {
-		numberNode.TransitionEdges[r] = numberNode
-	}
-	numberNode.TransitionEdges[','] = commaNode
-	numberNode.TransitionEdges['}'] = objEndNode
-	numberNode.TransitionEdges[']'] = listEndNode
-
-	for _, r := range validBoolRunes {
-		boolNode.TransitionEdges[r] = boolNode
-	}
-
-	// list node
-	listNode.TransitionEdges[','] = commaNode
-	listNode.TransitionEdges['"'] = stringNode
-	// squash states to a value
-	for _, r := range validNumberRunes {
-		listNode.TransitionEdges[r] = numberNode
-	}
-	for _, r := range validBoolRunes {
-		listNode.TransitionEdges[r] = boolNode
-	}
-	for _, r := range validNullRunes {
-		listNode.TransitionEdges[r] = nullNode
-	}
-
-	// null node
-	for _, r := range validNullRunes {
-		nullNode.TransitionEdges[r] = nullNode
-	}
-	nullNode.TransitionEdges[','] = commaNode
-	nullNode.TransitionEdges['}'] = objEndNode
-	nullNode.TransitionEdges[']'] = listEndNode
-
-	// list comma
-	// should point to values
-	listCommaNode.TransitionEdges['"'] = stringNode
-	listCommaNode.TransitionEdges[' '] = listCommaNode
-	listCommaNode.TransitionEdges['{'] = objNode
-	listCommaNode.TransitionEdges['\n'] = newlineNode
-
-	for _, r := range validNumberRunes {
-		listCommaNode.TransitionEdges[r] = numberNode
-	}
-	for _, r := range validBoolRunes {
-		listCommaNode.TransitionEdges[r] = boolNode
-	}
-	for _, r := range validNullRunes {
-		listCommaNode.TransitionEdges[r] = nullNode
-	}
-
-	// bool node
-	for _, r := range validBoolRunes {
-		boolNode.TransitionEdges[r] = boolNode
-	}
-	boolNode.TransitionEdges['}'] = objEndNode
-	boolNode.TransitionEdges[']'] = listEndNode
-	boolNode.TransitionEdges[','] = commaNode
-
-	listEndNode.TransitionEdges['}'] = objEndNode
-	listEndNode.TransitionEdges[','] = commaNode
-
-	commaNode.TransitionEdges['{'] = objNode
-	commaNode.TransitionEdges['\n'] = newlineNode
-	commaNode.TransitionEdges['\t'] = tabNode
-	commaNode.TransitionEdges['"'] = objKeyNode
-	commaNode.TransitionEdges[' '] = spaceObjNode
-
-	spaceObjNode.TransitionEdges['"'] = objKeyNode
-
-	return startNode, stateToNodeMap, nil
-}
-
-func PreComputeValidStates(stateToNodeMap map[JSONState]*PDANode, proc model.TextProcessor) error {
-
-	vocab := proc.GetVocabulary()
-
-	decodedToks := make([]string, len(vocab.Values))
-	for i := range vocab.Values {
-		token, err := proc.Decode([]int32{int32(i)})
-		if err != nil {
-			return err
-		}
-		decodedToks[i] = token
-	}
-
-	var err error
-	for _, node := range stateToNodeMap {
-		for i := range vocab.Values {
-			token := decodedToks[i]
-			// Skip EOS/BOS tokens and empty tokens since they are not valid in JSON
-			if proc.Is(uint32(i), model.SpecialEOS) || proc.Is(uint32(i), model.SpecialBOS) || token == "" {
-				continue
-			}
-			valid := true
-			curNode := node
-			consumedSpecialRunes := make(map[rune]bool)
-			for _, r := range token {
-				valid, curNode, err = isRuneValid(r, curNode, consumedSpecialRunes)
-				if err != nil {
-					return err
-				}
-				if !valid {
-					break
-				}
-			}
-			if valid {
-				node.MaskTokenIDToNode[int32(i)] = curNode.State
-			}
-		}
-	}
-	return nil
-}
-
-func isRuneValid(r rune, curNode *PDANode, consumedSpecialRunes map[rune]bool) (bool, *PDANode, error) {
-	if consumedSpecialRunes[r] {
-		return false, nil, nil
-	}
-
-	specialRune := slices.Contains(stringInvalidRunes, r)
-	if specialRune {
-		if curNode.State == StateInString || curNode.State == StateInObjectKey {
-			return false, nil, nil
-		}
-	}
-
-	// Check for specific rune transition
-	if nextNode, ok := curNode.TransitionEdges[r]; ok {
-		if specialRune {
-			if curNode.State == nextNode.State {
-				return false, nil, nil
-			}
-			// fmt.Println("special rune", r, "consumed")
-			consumedSpecialRunes[r] = true
-		}
-		return true, nextNode, nil
-	}
-
-	// Check for sentinel value - if present, any rune is valid
-	if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
-		return true, nextNode, nil
-	}
-
-	return false, nil, nil
-}
--- a/sample/json_types.go
+++ b/sample/json_types.go
@@ -1,160 +0,0 @@
-package sample
-
-import (
-	"fmt"
-)
-
-type JSONState int
-
-const (
-	StateStart JSONState = iota
-	StateInObject
-	StateInObjectKey
-	StateInStructuredKey
-	StateInStructuredValue
-	StateNewline
-	StateTab
-	StateSpace
-	StateInString
-	StateInInt
-	StateInFloat
-	StateInBool
-	StateInNull
-	StateInColon
-	StateInComma
-	StateInTab
-	StateInSpaceToValue
-	StateInSpaceEndValue
-	StateInNewlineEndValue
-	StateInObjSpace
-	StateInList
-	StateInListComma
-	StateInValue
-	StateInValueEnd
-	StateInListEnd
-	StateInListObjectEnd
-	StateInNewline
-	StateInNumber
-	StateInNumberEnd
-	StateInStringEnd
-	StateInObjectKeyEnd
-	StateTerminate
-	StateInObjectEnd
-	StateTransitioningToTerminate
-	StateInListStartJSON
-)
-
-var JSONStates = []JSONState{
-	StateStart,
-	StateInObject,
-	StateInObjectKey,
-	StateInStructuredKey,
-	StateInStructuredValue,
-	StateNewline,
-	StateTab,
-	StateSpace,
-	StateInString,
-	StateInInt,
-	StateInFloat,
-	StateInBool,
-	StateInNull,
-	StateInColon,
-	StateInComma,
-	StateInTab,
-	StateInSpaceToValue,
-	StateInSpaceEndValue,
-	StateInNewlineEndValue,
-	StateInObjSpace,
-	StateInListStartJSON,
-	StateInList,
-	StateInListComma,
-	StateInValue,
-	StateInValueEnd,
-	StateInListEnd,
-	StateInListObjectEnd,
-	StateInNewline,
-	StateInNumber,
-	StateInNumberEnd,
-	StateInStringEnd,
-	StateInObjectKeyEnd,
-	StateTerminate,
-	StateInObjectEnd,
-	StateTransitioningToTerminate,
-}
-
-func (s JSONState) String() string {
-	switch s {
-	case StateStart:
-		return "StateStart"
-	case StateInObject:
-		return "StateInObject"
-	case StateInObjectKey:
-		return "StateInObjectKey"
-	case StateInStructuredKey:
-		return "StateInStructuredKey"
-	case StateInStructuredValue:
-		return "StateInStructuredValue"
-	case StateNewline:
-		return "StateNewline"
-	case StateTab:
-		return "StateTab"
-	case StateSpace:
-		return "StateSpace"
-	case StateInString:
-		return "StateInString"
-	case StateInInt:
-		return "StateInInt"
-	case StateInFloat:
-		return "StateInFloat"
-	case StateInBool:
-		return "StateInBool"
-	case StateInNull:
-		return "StateInNull"
-	case StateInColon:
-		return "StateInColon"
-	case StateInComma:
-		return "StateInComma"
-	case StateInTab:
-		return "StateInTab"
-	case StateInSpaceToValue:
-		return "StateInSpaceToValue"
-	case StateInSpaceEndValue:
-		return "StateInSpaceEndValue"
-	case StateInNewlineEndValue:
-		return "StateInNewlineEndValue"
-	case StateInObjSpace:
-		return "StateInObjSpace"
-	case StateInList:
-		return "StateInList"
-	case StateInListComma:
-		return "StateInListComma"
-	case StateInValue:
-		return "StateInValue"
-	case StateInValueEnd:
-		return "StateInValueEnd"
-	case StateInListEnd:
-		return "StateInListEnd"
-	case StateInListObjectEnd:
-		return "StateInListObjectEnd"
-	case StateInNewline:
-		return "StateInNewline"
-	case StateInNumber:
-		return "StateInNumber"
-	case StateInNumberEnd:
-		return "StateInNumberEnd"
-	case StateInStringEnd:
-		return "StateInStringEnd"
-	case StateInObjectKeyEnd:
-		return "StateInObjectKeyEnd"
-	case StateTerminate:
-		return "StateTerminate"
-	case StateInObjectEnd:
-		return "StateInObjectEnd"
-	case StateTransitioningToTerminate:
-		return "StateTransitioningToTerminate"
-	case StateInListStartJSON:
-		return "StateInListStartJSON"
-	default:
-		return fmt.Sprintf("Unknown state: %d", s)
-	}
-}
--- a/sample/pushdown_automata.go
+++ b/sample/pushdown_automata.go
@@ -1,326 +0,0 @@
-package sample
-
-import (
-	"fmt"
-	"slices"
-
-	"github.com/ollama/ollama/model"
-)
-
-/*
-Key JSON rules to consider:
-
-1. Whitespace handling:
-   - Need to handle all valid JSON whitespace characters (\r, spaces between tokens)
-   - Current code only handles some whitespace cases
-
-2. Number validation:
-   - Need proper validation for special number cases like -0
-   - Should handle .5 style decimals
-   - Need limits on scientific notation (e, E)
-
-3. String escaping:
-   - Currently marks \ as invalid but should allow escaped sequences:
-     - \"
-     - \n
-     - \u1234 unicode escapes
-
-4. Empty object/array transitions:
-   - Direct {} and [] cases could be more explicit
-   - Need clear transitions for these edge cases
-
-5. Nested depth limits:
-   - No protection against excessive nesting
-   - Could cause stack overflow with deeply nested structures
-*/
-
-// TODO: / should be valid but an escape character
-var stringInvalidRunes = []rune{'\\', '\n', '\t', '{', '}', ':', ',', '/'}
-
-var (
-	intInvalidRunes = []rune{'e', 'E', ' ', '\n', '\t', '{', '}', ':', ',', '"'}
-	validIntRunes   = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'}
-)
-
-var validNumberRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E'}
-
-var validBoolRunes = []rune{'t', 'r', 'u', 'e', 'f', 'a', 'l', 's', 'e'}
-
-var validNullRunes = []rune{'n', 'u', 'l', 'l'}
-
-type PDA struct {
-	State             JSONState
-	TransitionEdges   map[rune]*PDA
-	MaskTokenIDToNode map[int32]*PDA
-}
-
-func NewPDANode(state JSONState) *PDA {
-	return &PDA{
-		State:             state,
-		TransitionEdges:   make(map[rune]*PDA),
-		MaskTokenIDToNode: make(map[int32]*PDA),
-	}
-}
-
-type PDAGraphBuilder struct {
-	proc             model.TextProcessor
-	decodedToks      []string
-	stateToNodeMap   map[JSONState]*PDA
-	tokenToStatesMap map[int32][]JSONState
-}
-
-func (b *PDAGraphBuilder) BuildGraph() error {
-	stateToNodeMap := make(map[JSONState]*PDA)
-	for _, state := range JSONStates {
-		stateToNodeMap[state] = NewPDANode(state)
-	}
-
-	stateToNodeMap[StateStart].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	stateToNodeMap[StateStart].TransitionEdges['['] = stateToNodeMap[StateInListStartJSON]
-
-	// TODO: update naming here - and revisit values
-	stateToNodeMap[StateInListStartJSON].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	stateToNodeMap[StateInListStartJSON].TransitionEdges['['] = stateToNodeMap[StateInListStartJSON]
-
-	stateToNodeMap[StateInObject].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
-	stateToNodeMap[StateInObject].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
-	stateToNodeMap[StateInObject].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
-	stateToNodeMap[StateInObject].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-
-	// new line
-	stateToNodeMap[StateInNewline].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
-	stateToNodeMap[StateInNewline].TransitionEdges['\t'] = stateToNodeMap[StateInTab]
-	stateToNodeMap[StateInNewline].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-	stateToNodeMap[StateInNewline].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
-	// stateToNodeMap[StateInNewline].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-
-	// new line end value
-	// stateToNodeMap[StateInNewlineEndValue].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
-	stateToNodeMap[StateInNewlineEndValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-	stateToNodeMap[StateInNewlineEndValue].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
-
-	stateToNodeMap[StateInObjSpace].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
-	stateToNodeMap[StateInObjSpace].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
-	// TODO: see if this is needed for formatting
-	stateToNodeMap[StateInObjSpace].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
-
-	stateToNodeMap[StateInTab].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
-
-	stateToNodeMap[StateInObjectKey].TransitionEdges[rune(-1)] = stateToNodeMap[StateInObjectKey]
-	stateToNodeMap[StateInObjectKey].TransitionEdges['"'] = stateToNodeMap[StateInObjectKeyEnd]
-
-	stateToNodeMap[StateInObjectKeyEnd].TransitionEdges[':'] = stateToNodeMap[StateInColon]
-
-	stateToNodeMap[StateInObjectEnd].TransitionEdges[','] = stateToNodeMap[StateInComma]
-	stateToNodeMap[StateInObjectEnd].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-
-	// where values should be
-	// this could be combined but the probl might change, we're alr doing a skip ahead
-	stateToNodeMap[StateInColon].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
-	stateToNodeMap[StateInColon].TransitionEdges['\n'] = stateToNodeMap[StateInSpaceToValue]
-
-	stateToNodeMap[StateInColon].TransitionEdges['['] = stateToNodeMap[StateInList]
-	stateToNodeMap[StateInColon].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	addValueConnections(stateToNodeMap[StateInColon], stateToNodeMap)
-
-	// Leads to a value
-	stateToNodeMap[StateInSpaceToValue].TransitionEdges['['] = stateToNodeMap[StateInList]
-	stateToNodeMap[StateInSpaceToValue].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	addValueConnections(stateToNodeMap[StateInSpaceToValue], stateToNodeMap)
-	stateToNodeMap[StateInSpaceToValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-	stateToNodeMap[StateInSpaceToValue].TransitionEdges['\n'] = stateToNodeMap[StateInSpaceToValue]
-
-	// Values
-	// string node
-	stateToNodeMap[StateInString].TransitionEdges[rune(-1)] = stateToNodeMap[StateInString]
-	stateToNodeMap[StateInString].TransitionEdges['"'] = stateToNodeMap[StateInStringEnd]
-
-	// String end node
-	addEnds(stateToNodeMap[StateInStringEnd], stateToNodeMap)
-	// stateToNodeMap[StateInStringEnd].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
-	stateToNodeMap[StateInStringEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	// TODO: add counters for allowable number of decimals, e, E, etc
-	// number node
-	for _, r := range validNumberRunes {
-		stateToNodeMap[StateInNumber].TransitionEdges[r] = stateToNodeMap[StateInNumber]
-	}
-	addEnds(stateToNodeMap[StateInNumber], stateToNodeMap)
-	// stateToNodeMap[StateInNumber].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
-	stateToNodeMap[StateInNumber].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	// list node
-	stateToNodeMap[StateInList].TransitionEdges[','] = stateToNodeMap[StateInComma]
-	stateToNodeMap[StateInList].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	stateToNodeMap[StateInList].TransitionEdges[' '] = stateToNodeMap[StateInList]
-	stateToNodeMap[StateInList].TransitionEdges['\n'] = stateToNodeMap[StateInList]
-	// early end
-	stateToNodeMap[StateInList].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
-
-	// list end node
-	stateToNodeMap[StateInListEnd].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-	// stateToNodeMap[StateInListEnd].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
-	stateToNodeMap[StateInListEnd].TransitionEdges[','] = stateToNodeMap[StateInComma]
-	stateToNodeMap[StateInListEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	// empty list
-	stateToNodeMap[StateInList].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
-	addValueConnections(stateToNodeMap[StateInList], stateToNodeMap)
-
-	// null node
-	for _, r := range validNullRunes {
-		stateToNodeMap[StateInNull].TransitionEdges[r] = stateToNodeMap[StateInNull]
-	}
-	addEnds(stateToNodeMap[StateInNull], stateToNodeMap)
-	stateToNodeMap[StateInNull].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
-	stateToNodeMap[StateInNull].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	// list comma
-	// should point to values
-	stateToNodeMap[StateInListComma].TransitionEdges[' '] = stateToNodeMap[StateInListComma]
-	stateToNodeMap[StateInListComma].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	stateToNodeMap[StateInListComma].TransitionEdges['\n'] = stateToNodeMap[StateInList]
-	stateToNodeMap[StateInListComma].TransitionEdges[' '] = stateToNodeMap[StateInList]
-	stateToNodeMap[StateInListComma].TransitionEdges['\t'] = stateToNodeMap[StateInList]
-
-	addValueConnections(stateToNodeMap[StateInListComma], stateToNodeMap)
-
-	// list object end
-	stateToNodeMap[StateInListObjectEnd].TransitionEdges[','] = stateToNodeMap[StateInListComma]
-	stateToNodeMap[StateInListObjectEnd].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
-	// TODO: not sure if this is needed
-	stateToNodeMap[StateInListObjectEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	// bool node
-	for _, r := range validBoolRunes {
-		stateToNodeMap[StateInBool].TransitionEdges[r] = stateToNodeMap[StateInBool]
-	}
-	stateToNodeMap[StateInBool].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
-	addEnds(stateToNodeMap[StateInBool], stateToNodeMap)
-	// stateToNodeMap[StateInBool].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
-	stateToNodeMap[StateInBool].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	// comma node
-	stateToNodeMap[StateInComma].TransitionEdges['{'] = stateToNodeMap[StateInObject]
-	stateToNodeMap[StateInComma].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
-	stateToNodeMap[StateInComma].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
-	stateToNodeMap[StateInComma].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
-	// todo: review this space transition
-	// stateToNodeMap[StateInComma].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
-
-	// space end value
-	// stateToNodeMap[StateInSpaceEndValue].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
-	stateToNodeMap[StateInSpaceEndValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-	stateToNodeMap[StateInSpaceEndValue].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
-	stateToNodeMap[StateInSpaceEndValue].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
-
-	b.stateToNodeMap = stateToNodeMap
-	if err := b.preComputeValidStates(); err != nil {
-		return err
-	}
-	return nil
-}
-
-func addEnds(node *PDA, stateToNodeMap map[JSONState]*PDA) {
-	node.TransitionEdges[','] = stateToNodeMap[StateInComma]
-	node.TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
-	node.TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
-}
-
-func addValueConnections(node *PDA, stateToNodeMap map[JSONState]*PDA) {
-	node.TransitionEdges['"'] = stateToNodeMap[StateInString]
-	for _, r := range validNumberRunes {
-		node.TransitionEdges[r] = stateToNodeMap[StateInNumber]
-	}
-	// TODO(parthsareen): force the output and shift similar to structured outputs
-	node.TransitionEdges['t'] = stateToNodeMap[StateInBool]
-	node.TransitionEdges['f'] = stateToNodeMap[StateInBool]
-	node.TransitionEdges['n'] = stateToNodeMap[StateInNull]
-}
-
-func (b *PDAGraphBuilder) preComputeValidStates() error {
-	for _, node := range b.stateToNodeMap {
-		// if node.State == StateInObjectKey {
-		// 	if len(b.stateToNodeMap[StateInString].MaskTokenIDToNode) > 0 {
-		// 		b.stateToNodeMap[StateInObjectKey].MaskTokenIDToNode = b.stateToNodeMap[StateInString].MaskTokenIDToNode
-		// 		fmt.Println("copying string mask to object key mask")
-		// 	}
-		// }
-		if err := b.CreateMask(node); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func (b *PDAGraphBuilder) preComputeTokenToStatesMap() error {
-	// TODO: make can be somewhere else too
-	b.tokenToStatesMap = make(map[int32][]JSONState)
-	for i, t := range b.decodedToks {
-		for _, r := range t {
-			if r == '"' {
-				b.tokenToStatesMap[int32(i)] = append(b.tokenToStatesMap[int32(i)], StateInString)
-			}
-		}
-	}
-	return nil
-}
-
-// TODO: the mask for obj key and string should be the same?
-func (b *PDAGraphBuilder) CreateMask(node *PDA) error {
-	if node == nil {
-		return fmt.Errorf("node cannot be nil")
-	}
-	for i := range b.decodedToks {
-		token := b.decodedToks[i]
-		// Skip EOS/BOS tokens and empty tokens since they are not valid in JSON
-		if b.proc.Is(uint32(i), model.SpecialEOS) || b.proc.Is(uint32(i), model.SpecialBOS) || token == "" || token == "\"\"" {
-			continue
-		}
-		curNode := node
-		valid := true
-		consumedSpecialRunes := make(map[rune]bool)
-		for _, r := range token {
-			curNode, valid = isRuneValid(r, curNode, consumedSpecialRunes)
-			if curNode == nil || !valid {
-				break
-			}
-		}
-		if valid {
-			node.MaskTokenIDToNode[int32(i)] = curNode
-		}
-	}
-	return nil
-}
-
-func isRuneValid(r rune, curNode *PDA, consumedSpecialRunes map[rune]bool) (*PDA, bool) {
-	if consumedSpecialRunes[r] {
-		return nil, false
-	}
-
-	specialRune := slices.Contains(stringInvalidRunes, r)
-	if specialRune {
-		if curNode.State == StateInString || curNode.State == StateInObjectKey {
-			return nil, false
-		}
-	}
-
-	// Check for specific rune transition
-	if nextNode, ok := curNode.TransitionEdges[r]; ok {
-		// fmt.Println("next node", nextNode)
-		if specialRune {
-			if curNode.State == nextNode.State {
-				return nil, false
-			}
-			consumedSpecialRunes[r] = true
-		}
-		return nextNode, true
-	}
-
-	// Check for sentinel value - if present, any rune is valid
-	if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
-		return nextNode, true
-	}
-
-	return nil, false
-}
--- a/sample/pushdown_runner.go
+++ b/sample/pushdown_runner.go
@@ -1,255 +0,0 @@
-package sample
-
-import (
-	"fmt"
-	"math"
-	"runtime"
-	"time"
-
-	"github.com/ollama/ollama/model"
-)
-
-// TODO: safety in case of invalid json
-// TODO: partial JSON matching?
-// TODO: interfaces to cleanup with return values
-// TODO this interface shouldn't be the sampler - should just use Sampler
-// TODO: add penalties for string \n stuff
-// TODO: minimize number of fwd passes if there is only one match
-// TODO: greedy sample initially and then backtrack if no match
-
-type PushdownSampler struct {
-	PDAGraphBuilder
-	curNode      *PDA
-	braceStack   []rune
-	stateCounter uint32
-}
-
-// graph should be built once and reused per tokenizer
-func NewPushdownSampler(proc model.TextProcessor) (*PushdownSampler, error) {
-	start := time.Now()
-
-	fmt.Println("--------------------------------")
-	fmt.Println("PDA sampler")
-	fmt.Println("--------------------------------")
-	var m runtime.MemStats
-	runtime.ReadMemStats(&m)
-	before := m.Alloc
-	fmt.Printf("Alloc = %.2f MB\n", float64(before)/(1024*1024))
-
-	vocab := proc.GetVocabulary()
-	decodedToks := make([]string, len(vocab.Values))
-	for i := range vocab.Values {
-		token, err := proc.Decode([]int32{int32(i)})
-		if err != nil {
-			return nil, err
-		}
-		decodedToks[i] = token
-	}
-
-	gb := &PDAGraphBuilder{
-		proc:        proc,
-		decodedToks: decodedToks,
-	}
-
-	if err := gb.BuildGraph(); err != nil {
-		return nil, err
-	}
-
-	runtime.ReadMemStats(&m)
-	after := m.Alloc
-	fmt.Printf("Alloc = %.2f MB\n", float64(after)/(1024*1024))
-	fmt.Printf("Graph memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
-	fmt.Printf("Graph build time = %v\n", time.Since(start))
-
-	// TODO: this can be simplified
-	return &PushdownSampler{
-		curNode:         gb.stateToNodeMap[StateStart],
-		PDAGraphBuilder: *gb,
-		braceStack:      []rune{},
-		stateCounter:    0,
-	}, nil
-}
-
-// TODO: need to add resampling logic if the first sample was not good
-// greedy sample + backtrack?
-func (s *PushdownSampler) Apply(logits []float64) ([]float64, error) {
-	switch s.curNode.State {
-	case StateInString:
-		return s.maskLogits(logits, s.curNode)
-
-	case StateInListEnd:
-		// force finish if no braces left
-		if len(s.braceStack) == 0 {
-			s.curNode = NewPDANode(StateTerminate)
-			return forceFinish(s, logits)
-		}
-
-		logits, err := s.maskLogits(logits, s.curNode)
-		if err != nil {
-			return nil, err
-		}
-		return logits, nil
-
-	case StateTerminate:
-		return forceFinish(s, logits)
-
-	case StateInObjectEnd:
-		// force finish if no braces left
-		if len(s.braceStack) == 0 {
-			s.curNode = NewPDANode(StateTerminate)
-			return forceFinish(s, logits)
-		}
-
-		peek := s.braceStack[len(s.braceStack)-1]
-		if peek == rune('[') {
-			s.curNode = s.stateToNodeMap[StateInListObjectEnd]
-		}
-
-		logits, err := s.maskLogits(logits, s.curNode)
-		if err != nil {
-			return nil, err
-		}
-		return logits, nil
-
-	case StateInComma:
-		peek := s.braceStack[len(s.braceStack)-1]
-		if peek == rune('[') {
-			s.curNode = s.stateToNodeMap[StateInListComma]
-		}
-
-		logits, err := s.maskLogits(logits, s.curNode)
-		if err != nil {
-			return nil, err
-		}
-		return logits, nil
-
-	default:
-		fmt.Println("masking logits current state", s.curNode.State)
-		logits, err := s.maskLogits(logits, s.curNode)
-		if err != nil {
-			return nil, err
-		}
-		return logits, nil
-	}
-}
-
-func forceFinish(s *PushdownSampler, logits []float64) ([]float64, error) {
-	for i := range logits {
-		if s.proc.Is(uint32(i), model.SpecialEOS) {
-			logits[i] = 1.0
-		} else {
-			logits[i] = math.Inf(-1)
-		}
-	}
-	return logits, nil
-}
-
-func (s *PushdownSampler) UpdateState(tokenSlice []int32) ([]int32, error) {
-	fmt.Println("current state - updating", s.curNode.State)
-	mappedString, err := s.proc.Decode(tokenSlice)
-	if err != nil {
-		return nil, err
-	}
-	fmt.Printf(">>> mappedString: %q\n", mappedString)
-
-	// flag := -1
-	// endBraceRunes := []rune{'}', ']'}
-	for _, r := range mappedString {
-		// TODO: if this is enabled again, make sure to appropriately handle the state transitions
-		// if slices.Contains(endBraceRunes, r) && len(s.braceStack) == 0 {
-		// 	fmt.Printf("stack is empty, extra closing brace %c\n", r)
-		// 	// flag = i
-		// 	break
-
-		// }
-		if r == rune('{') {
-			s.braceStack = append(s.braceStack, r)
-		}
-		if r == rune('[') {
-			s.braceStack = append(s.braceStack, r)
-		}
-		if r == rune('}') {
-			if len(s.braceStack) == 0 {
-				return nil, fmt.Errorf("stack is empty, extra closing brace %c", r)
-			}
-			top := s.braceStack[len(s.braceStack)-1]
-			if top != rune('{') {
-				return nil, fmt.Errorf("unmatched closing brace, got%c, want%c", top, '{')
-			}
-			s.braceStack = s.braceStack[:len(s.braceStack)-1]
-		}
-
-		if r == rune(']') {
-			if len(s.braceStack) == 0 {
-				return nil, fmt.Errorf("stack is empty, extra closing brace %c", r)
-			}
-			top := s.braceStack[len(s.braceStack)-1]
-			if top != rune('[') {
-				return nil, fmt.Errorf("unmatched closing brace, got%c, want%c", top, '[')
-			}
-			s.braceStack = s.braceStack[:len(s.braceStack)-1]
-		}
-	}
-
-	// if flag != -1 {
-	// 	tokenSlice = tokenSlice[:flag]
-	// }
-	// fmt.Println("flag!", flag)
-	for _, tokenID := range tokenSlice {
-		// transition to the next node
-		nextNode, ok := s.curNode.MaskTokenIDToNode[tokenID]
-		if !ok {
-			return nil, fmt.Errorf("invalid token: %q", mappedString)
-		}
-		fmt.Println("transitioning to", nextNode.State)
-
-		// TODO: add a penalty for staying in the same state too long
-		if nextNode.State == s.curNode.State {
-			s.stateCounter++
-		} else {
-			s.stateCounter = 0
-		}
-		s.curNode = nextNode
-		fmt.Println("updated curNode state", s.curNode.State)
-	}
-	return tokenSlice, nil
-}
-
-// greedy sample + backtrack?
-func (s *PushdownSampler) maskLogits(logits []float64, node *PDA) ([]float64, error) {
-	// Create a new slice with same length as logits, initialized to -Inf
-	maskedLogits := make([]float64, len(logits))
-	for i := range maskedLogits {
-		maskedLogits[i] = math.Inf(-1)
-	}
-
-	// Only update values for valid token IDs from the mask map
-	for tokenID := range node.MaskTokenIDToNode {
-		if int(tokenID) < len(logits) {
-			maskedLogits[tokenID] = logits[tokenID]
-		}
-	}
-
-	return maskedLogits, nil
-}
-
-func (s *PushdownSampler) fastMaskLogits(logits []float64, node *PDA) ([]float64, error) {
-	maxLogit := math.Inf(-1)
-	maxIndex := -1
-
-	// Find the maximum logit value among valid tokens
-	for tokenID := range node.MaskTokenIDToNode {
-		if int(tokenID) < len(logits) && logits[tokenID] > maxLogit {
-			maxLogit = logits[tokenID]
-			maxIndex = int(tokenID)
-		}
-	}
-
-	if maxIndex == -1 {
-		return nil, fmt.Errorf("no valid tokens found in mask")
-	}
-
-	logits[0] = float64(maxIndex)
-	return logits, nil
-	// return maxIndex, nil
-}
--- a/sample/sample.go
+++ b/sample/sample.go
@@ -1,190 +1,74 @@
 package sample

 import (
-	"cmp"
-	"errors"
-	"math"
 	"slices"

-	pq "github.com/emirpasic/gods/v2/queues/priorityqueue"
-	"golang.org/x/exp/rand"
 	"gonum.org/v1/gonum/floats"
 	"gonum.org/v1/gonum/stat/sampleuv"
 )

-type Transform interface {
-	Apply([]float64) ([]float64, error)
-}
-
 type Sampler interface {
-	Sample([]float32, ...Transform) (int, error)
-}
-
-// TODO(parthsareen): potentially cache softmax values
-func softmax(logits []float64) []float64 {
-	var sum float64
-	tt := make([]float64, len(logits))
-	for i, v := range logits {
-		tt[i] = math.Exp(v)
-		sum += tt[i]
-	}
-	floats.Scale(1/sum, tt)
-	return tt
+	Sample([]float64) ([]float64, error)
 }

 type Temperature float64

-func (t Temperature) Apply(logits []float64) ([]float64, error) {
-	if t == 0 {
-		return nil, errors.New("use Greedy sampler instead of Temperature(0)")
-	}
-	if t < 0 || t > 2 {
-		return nil, errors.New("temperature must be between 0 and 2")
-	}
-	temp := math.Max(float64(t), 1e-7)
-
-	// subtracting max logit to avoid under/overflow
-	maxLogit := slices.Max(logits)
-	for i := range logits {
-		logits[i] = (logits[i] - maxLogit) / temp
-	}
-
-	return logits, nil
+func (s Temperature) Sample(t []float64) ([]float64, error) {
+	floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
+	return t, nil
 }

-type logitMap struct {
-	index int
-	logit float64
+type softmax struct{}
+
+func Softmax() Sampler {
+	return softmax{}
 }

-func logitMapComparator(a, b logitMap) int {
-	return -cmp.Compare(a.logit, b.logit)
+func (softmax) Sample(t []float64) ([]float64, error) {
+	return t, nil
 }

 type TopK int

-// TODO(parthsareen): avoid having to check all logits after this transform
-func (k TopK) Apply(logits []float64) ([]float64, error) {
-	if k <= 0 {
-		return nil, errors.New("k must be greater than 0")
-	}
-	if int(k) >= len(logits) {
-		return logits, nil
-	}
-
-	q := pq.NewWith(logitMapComparator)
-	for i, logit := range logits {
-		q.Enqueue(logitMap{index: i, logit: logit})
-	}
-
-	validLogits := make(map[int]float64)
-	for range k {
-		logitMap, _ := q.Dequeue()
-		validLogits[logitMap.index] = logitMap.logit
-	}
-
-	for i := range logits {
-		if _, ok := validLogits[i]; !ok {
-			logits[i] = math.Inf(-1)
-		}
-	}
-
-	return logits, nil
+func (s TopK) Sample(t []float64) ([]float64, error) {
+	return t, nil
 }

-type TopP float64
+type TopP float32

-func (p TopP) Apply(logits []float64) ([]float64, error) {
-	if p <= 0 || p >= 1 {
-		return nil, errors.New("p must be between 0 and 1")
-	}
-
-	probs := softmax(logits)
-	indices := make([]int, len(probs))
-	for i := range indices {
-		indices[i] = i
-	}
-
-	// sort in descending order
-	slices.SortFunc(indices, func(i, j int) int {
-		return cmp.Compare(probs[j], probs[i])
-	})
-
-	var cumSum float64
-	for i, idx := range indices {
-		cumSum += probs[idx]
-		if cumSum > float64(p) {
-			for _, idx := range indices[i+1:] {
-				logits[idx] = math.Inf(-1)
-			}
-			break
-		}
-	}
-	return logits, nil
+func (s TopP) Sample(t []float64) ([]float64, error) {
+	return t, nil
 }

-type MinP float64
+type MinP float32

-func (p MinP) Apply(logits []float64) ([]float64, error) {
-	if p <= 0 || p >= 1 {
-		return nil, errors.New("p must be between 0 and 1")
-	}
-
-	probs := softmax(logits)
-	threshold := slices.Max(probs) * float64(p)
-
-	for i, prob := range probs {
-		if prob < threshold {
-			logits[i] = math.Inf(-1)
-		}
-	}
-
-	return logits, nil
+func (s MinP) Sample(t []float64) ([]float64, error) {
+	return t, nil
 }

-type weighted struct {
-	src rand.Source
+type weighed struct{}
+
+func Weighed() Sampler {
+	return weighed{}
 }

-func Weighted(seed *int64) Sampler {
-	var src rand.Source
-	if seed != nil {
-		src = rand.NewSource(uint64(*seed))
+func (s weighed) Sample(t []float64) ([]float64, error) {
+	w := sampleuv.NewWeighted(t, nil)
+	if v, ok := w.Take(); ok {
+		return []float64{float64(v)}, nil
 	}
-	return weighted{src: src}
+
+	return t, nil
 }

-func (s weighted) Sample(logits []float32, transforms ...Transform) (int, error) {
-	logits64 := make([]float64, len(logits))
-	for i, v := range logits {
-		logits64[i] = float64(v)
-	}
-
+func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
 	var err error
-	for _, t := range transforms {
-		logits64, err = t.Apply(logits64)
+	for _, sampler := range samplers {
+		floats, err = sampler.Sample(floats)
 		if err != nil {
-			return -1, err
+			return nil, err
 		}
 	}

-	logitsCopy := make([]float64, 0, len(logits))
-	indices := make([]int, 0, len(logits))
-	for i, logit := range logits64 {
-		if !math.IsInf(logit, -1) {
-			logitsCopy = append(logitsCopy, logit)
-			indices = append(indices, i)
-		}
-	}
-
-	if len(logitsCopy) == 0 {
-		return -1, errors.New("no valid logits found for weighed sampling")
-	}
-
-	probs := softmax(logitsCopy)
-	w := sampleuv.NewWeighted(probs, s.src)
-	if idx, ok := w.Take(); ok {
-		return indices[idx], nil
-	}
-	return -1, errors.New("weighed sampler failed, no valid token found")
+	return floats, nil
 }
--- a/sample/sample_test.go
+++ b/sample/sample_test.go
@@ -1,242 +0,0 @@
-package sample
-
-import (
-	"fmt"
-	"math"
-	"math/rand/v2"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func TestTemperature(t *testing.T) {
-	logits, err := Temperature(0.5).Apply([]float64{2, -1, 4, -3, 1, -2, 0})
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	want := []float64{-4, -10, 0, -14, -6, -12, -8}
-	if diff := cmp.Diff(want, logits); diff != "" {
-		t.Errorf("logits mismatch (-want +got):\n%s", diff)
-	}
-
-	logits, err = Temperature(-1).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err == nil {
-		t.Errorf("expected error for temperature=-1, got %v", logits)
-	}
-	logits, err = Temperature(0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err == nil {
-		t.Errorf("expected error for temperature=0, got %v", logits)
-	}
-	logits, err = Temperature(2.1).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err == nil {
-		t.Errorf("expected error for temperature=2.1, got %v", logits)
-	}
-}
-
-func TestSoftmax(t *testing.T) {
-	probs := softmax([]float64{-3, -2, -1, 0, 1, 2, 4})
-
-	expectedProbs := []float64{0.000751406628089903, 0.0020425349829204676, 0.005552185728064613, 0.015092405572827691, 0.04102541181635154, 0.11151863144543739, 0.8240174238263085}
-	if diff := cmp.Diff(expectedProbs, probs); diff != "" {
-		t.Errorf("probs mismatch (-want +got):\n%s", diff)
-	}
-}
-
-func TestTopK(t *testing.T) {
-	logits, err := TopK(3).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	expectedlogits := []float64{math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), 1, 2, 4}
-	if diff := cmp.Diff(expectedlogits, logits); diff != "" {
-		t.Errorf("logits mismatch (-want +got):\n%s", diff)
-	}
-
-	_, err = TopK(0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err == nil {
-		t.Errorf("expected error for k=0, got %v", err)
-	}
-
-	logits, err = TopK(10).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	expectedlogits = []float64{-3, -2, -1, 0, 1, 2, 4}
-	if diff := cmp.Diff(expectedlogits, logits); diff != "" {
-		t.Errorf("logits mismatch (-want +got):\n%s", diff)
-	}
-}
-
-func TestTopP(t *testing.T) {
-	logits, err := TopP(0.9).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	want := []float64{math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), 2, 4}
-	if diff := cmp.Diff(want, logits); diff != "" {
-		t.Errorf("logits mismatch (-want +got):\n%s", diff)
-	}
-
-	_, err = TopP(1.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err == nil {
-		t.Error("expected error for p=1.0")
-	}
-	_, err = TopP(0.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
-	if err == nil {
-		t.Error("expected error for p=0.0")
-	}
-}
-
-func TestMinP(t *testing.T) {
-	logits, err := MinP(0.2).Apply([]float64{-3, -2, -1, 0, 1, 2, 4, 3})
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	want := []float64{math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), 4, 3}
-	if diff := cmp.Diff(want, logits); diff != "" {
-		t.Errorf("logits mismatch (-want +got):\n%s", diff)
-	}
-
-	_, err = MinP(1.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 3, 4})
-	if err == nil {
-		t.Error("expected error for p=1.0")
-	}
-	_, err = MinP(0.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 3, 4})
-	if err == nil {
-		t.Error("expected error for p=0.0")
-	}
-}
-
-func TestWeighed(t *testing.T) {
-	idx, err := Weighted(nil).Sample([]float32{float32(math.Inf(-1)), 2, float32(math.Inf(-1)), float32(math.Inf(-1))})
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	want := 1
-	if diff := cmp.Diff(want, idx); diff != "" {
-		t.Errorf("index mismatch (-want +got):\n%s", diff)
-	}
-
-	idx, err = Weighted(nil).Sample([]float32{float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1))})
-	if err == nil {
-		t.Error("expected error for no valid tokens, got index", idx)
-	}
-}
-
-func TestSample(t *testing.T) {
-	input := []float32{1, 2, 3, 4}
-
-	var callOrder []int
-	mock1 := &testTransform{
-		id:        1,
-		callOrder: &callOrder,
-	}
-	mock2 := &testTransform{
-		id:        2,
-		callOrder: &callOrder,
-	}
-	mock3 := &testTransform{
-		id:        3,
-		callOrder: &callOrder,
-	}
-
-	got, err := Greedy().Sample(input, mock1, mock2, mock3)
-	if err != nil {
-		t.Error(err)
-		return
-	}
-
-	want := 3 // Greedy sampler should pick highest logit
-	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("sampled index mismatch (-want +got):\n%s", diff)
-	}
-
-	_, err = Weighted(nil).Sample(input, mock1, mock2, mock3)
-	if err != nil {
-		t.Error(err)
-		return
-	}
-	wantOrder := []int{1, 2, 3}
-	if diff := cmp.Diff(wantOrder, callOrder); diff != "" {
-		t.Errorf("call order mismatch (-want +got):\n%s", diff)
-	}
-
-	errMock := &testTransform{
-		returnErr: fmt.Errorf("mock error"),
-	}
-	_, err = Weighted(nil).Sample(input, mock1, errMock, mock2)
-	if err == nil {
-		t.Error("Expected error from sampler")
-	}
-}
-
-type testTransform struct {
-	id        int
-	callOrder *[]int
-	returnErr error
-}
-
-func (ts *testTransform) Apply(logits []float64) ([]float64, error) {
-	if ts.callOrder != nil {
-		*ts.callOrder = append(*ts.callOrder, ts.id)
-	}
-	if ts.returnErr != nil {
-		return nil, ts.returnErr
-	}
-	return logits, nil
-}
-
-func BenchmarkTransform(b *testing.B) {
-	transforms := map[string]Transform{
-		"Temperature": Temperature(0.5),
-		"TopK":        TopK(10),
-		"TopP":        TopP(0.9),
-		"MinP":        MinP(0.2),
-	}
-
-	logits := make([]float64, 1<<16)
-	for i := range logits {
-		logits[i] = rand.Float64()
-	}
-
-	for name, transform := range transforms {
-		b.Run(name, func(b *testing.B) {
-			b.ResetTimer()
-			for range b.N {
-				_, err := transform.Apply(logits)
-				if err != nil {
-					b.Error(err)
-				}
-			}
-		})
-	}
-}
-
-func BenchmarkSample(b *testing.B) {
-	samplers := map[string]Sampler{
-		"Greedy":   Greedy(),
-		"Weighted": Weighted(nil),
-	}
-
-	logits := make([]float32, 1<<16)
-	for i := range logits {
-		logits[i] = rand.Float32()
-	}
-
-	for name, s := range samplers {
-		b.Run(name, func(b *testing.B) {
-			b.ResetTimer()
-			for range b.N {
-				if _, err := s.Sample(logits); err != nil {
-					b.Error(err)
-				}
-			}
-		})
-	}
-}
--- a/sample/structured_outputs.go
+++ b/sample/structured_outputs.go
@@ -1,296 +0,0 @@
-package sample
-
-import (
-	"fmt"
-	"runtime"
-	"time"
-
-	"github.com/ollama/ollama/model"
-)
-
-type JSONSampler struct {
-	schema        *Schema
-	propIdx       int
-	propToNodeMap map[string]*PDA
-	pdaSampler    *PushdownSampler
-	decodedToks   []string
-}
-
-func NewJSONSampler(proc model.TextProcessor, schema *Schema) (*JSONSampler, error) {
-	if proc == nil {
-		return nil, fmt.Errorf("TextProcessor cannot be nil")
-	}
-
-	pdaSampler, err := NewPushdownSampler(proc)
-	if err != nil {
-		return nil, fmt.Errorf("failed to create PushdownSampler: %w", err)
-	}
-
-	if schema == nil {
-		return &JSONSampler{
-			schema:        nil,
-			propIdx:       -1,
-			propToNodeMap: nil,
-			pdaSampler:    pdaSampler,
-		}, nil
-	}
-
-	// fmt.Println("schema not nil")
-	so := &JSONSampler{
-		schema:        schema,
-		propIdx:       -1,
-		propToNodeMap: make(map[string]*PDA),
-		pdaSampler:    pdaSampler,
-	}
-
-	so.schemaToGraph()
-
-	// Benchmark token decoding
-	start := time.Now()
-	var m runtime.MemStats
-	runtime.ReadMemStats(&m)
-	before := m.Alloc
-
-	vocab := proc.GetVocabulary()
-	decodedToks := make([]string, len(vocab.Values))
-	for i := range vocab.Values {
-		token, err := proc.Decode([]int32{int32(i)})
-		if err != nil {
-			return nil, err
-		}
-		decodedToks[i] = token
-	}
-	so.decodedToks = decodedToks
-
-	runtime.ReadMemStats(&m)
-	after := m.Alloc
-	fmt.Printf("Token decode memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
-	fmt.Printf("Token decode time = %v\n", time.Since(start))
-
-	fmt.Println("--------------------------------")
-	fmt.Println("SOSampler")
-	fmt.Println("--------------------------------")
-	// Benchmark this section
-	start = time.Now()
-	runtime.ReadMemStats(&m)
-	before = m.Alloc
-
-	// TODO: still messed up
-	// TODO: recursion use case
-	// key masks
-	for _, prop := range so.schema.Properties {
-		node := so.propToNodeMap[prop.Name]
-		// propName -> node
-		curState := node.State
-		fromNode := node
-		so.pdaSampler.CreateMask(fromNode)
-		for curState == StateInStructuredKey {
-			// there is only one edge
-			for r, toNode := range fromNode.TransitionEdges {
-				fmt.Println("rune", r, "edge", toNode.State)
-				so.pdaSampler.CreateMask(toNode)
-				fmt.Printf("created mask for %c\n", r)
-				curState = toNode.State
-				fmt.Println("next state", curState)
-				// TODO: theres an extra gen for " right now
-				fromNode = toNode
-			}
-		}
-
-		if curState != StateInColon {
-			return nil, fmt.Errorf("expected state to be StateInColon, got %v", curState)
-		}
-
-		// so.pdaSampler.CreateMask(fromNode)
-
-		fromNode = fromNode.TransitionEdges[' ']
-
-		so.pdaSampler.CreateMask(fromNode)
-		curState = fromNode.State
-		for _, toNode := range fromNode.TransitionEdges {
-			fmt.Println("toNode", toNode.State)
-		}
-	}
-
-	// runtime.ReadMemStats(&m)
-	// after = m.Alloc
-	// fmt.Printf("Mask creation memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
-	// fmt.Printf("Mask creation time = %v\n", time.Since(start))
-	// fmt.Println("--------------------------------")
-
-	return so, nil
-}
-
-func (s *JSONSampler) schemaToGraph() {
-	schemaType := s.schema.EffectiveType()
-	switch schemaType {
-	case "object":
-		// TODO: see if we need to connect these to the JSON graph
-
-		// each prop is a key
-		for _, prop := range s.schema.Properties {
-			// name of key
-			name := prop.Name
-			keyNode := &PDA{
-				State:             StateInStructuredKey, // this is unchanging, will impact sampling
-				TransitionEdges:   make(map[rune]*PDA),
-				MaskTokenIDToNode: make(map[int32]*PDA),
-			}
-
-			prevNode := keyNode
-			for _, r := range name {
-				runeNode := &PDA{
-					State:             StateInStructuredKey, // this is unchanging, will impact sampling
-					TransitionEdges:   make(map[rune]*PDA),
-					MaskTokenIDToNode: make(map[int32]*PDA),
-				}
-				// fmt.Println("runeNode created", runeNode.State)
-				// fmt.Printf("runeNode created %c\n", r)
-
-				// since alloc on heap connections wil still map
-				prevNode.TransitionEdges[r] = runeNode
-				prevNode = runeNode
-			}
-
-			// point to end of object key node after all chars are done
-			// prevNode.TransitionEdges['"'] = s.pdaSampler.stateToNodeMap[StateInObjectKeyEnd]
-
-			// link to value node
-			// Create a node for the end of the key (after the closing quote)
-			stringEndNode := &PDA{
-				State:             StateInStructuredKey,
-				TransitionEdges:   make(map[rune]*PDA),
-				MaskTokenIDToNode: make(map[int32]*PDA),
-			}
-			prevNode.TransitionEdges['"'] = stringEndNode
-			prevNode = stringEndNode
-
-			// Add transition for colon after key
-			colonNode := &PDA{
-				State:             StateInColon,
-				TransitionEdges:   make(map[rune]*PDA),
-				MaskTokenIDToNode: make(map[int32]*PDA),
-			}
-			prevNode.TransitionEdges[':'] = colonNode
-			prevNode = colonNode
-
-			// Add transition for space after colon
-			spaceNode := &PDA{
-				State:             StateInSpaceToValue,
-				TransitionEdges:   make(map[rune]*PDA),
-				MaskTokenIDToNode: make(map[int32]*PDA),
-			}
-			prevNode.TransitionEdges[' '] = spaceNode
-			prevNode = spaceNode
-
-			value := prop.Type
-			switch value {
-			case "object":
-				fmt.Println("object under key: ", name)
-			case "array":
-				fmt.Println("array under key: ", name)
-			case "string":
-				fmt.Println("string under key: ", name)
-				prevNode.TransitionEdges['"'] = s.pdaSampler.stateToNodeMap[StateInString]
-			case "number":
-				fmt.Println("number under key: ", name)
-				for _, r := range validNumberRunes {
-					prevNode.TransitionEdges[r] = s.pdaSampler.stateToNodeMap[StateInNumber]
-				}
-			case "boolean":
-				fmt.Println("boolean under key: ", name)
-				prevNode.TransitionEdges['t'] = s.pdaSampler.stateToNodeMap[StateInBool]
-				prevNode.TransitionEdges['f'] = s.pdaSampler.stateToNodeMap[StateInBool]
-				prevNode.TransitionEdges['n'] = s.pdaSampler.stateToNodeMap[StateInNull]
-			}
-
-			// points to start of the key
-			s.propToNodeMap[name] = keyNode
-			fmt.Println("name", name, "keyNode", keyNode.State)
-		}
-	}
-	// TODO: do values + recursion
-}
-
-func (s *JSONSampler) Apply(logits []float64) ([]float64, error) {
-	if s.schema == nil {
-		return s.pdaSampler.Apply(logits)
-	}
-
-	switch s.pdaSampler.curNode.State {
-	// TODO: doesnt account for multi rune case
-	case StateInObjectKey:
-		if s.propIdx > len(s.schema.Properties)-1 {
-			return nil, fmt.Errorf("propIdx out of bounds")
-		}
-		// fmt.Println("in object key - structured outputs")
-		// TODO: this tracking should probably be coming from a stack to track nested objects
-		// simple case
-		s.propIdx++
-		fmt.Println("propIdx", s.propIdx)
-		prop := s.schema.Properties[s.propIdx]
-		fmt.Println("prop", prop.Name)
-		s.pdaSampler.curNode = s.propToNodeMap[prop.Name]
-		fmt.Println("changed curNode state to", s.pdaSampler.curNode.State)
-		logits, err := s.pdaSampler.maskLogits(logits, s.pdaSampler.curNode)
-		if err != nil {
-			return nil, err
-		}
-		return logits, nil
-
-	default:
-
-		// Will only happen for the last prop - can also be precomputed.
-		if s.propIdx == len(s.schema.Properties)-1 {
-			// todo: if i incremenet propidx then i know im in last value as well
-			switch s.pdaSampler.curNode.State {
-			case StateInObjectEnd:
-				fmt.Println("<<<<< in obj end - generating mask for", s.pdaSampler.curNode.State)
-				s.pdaSampler.curNode.TransitionEdges = make(map[rune]*PDA)
-				s.pdaSampler.curNode = NewPDANode(StateTerminate)
-				s.propIdx++
-
-			// TODO: this needs to be optimized in some way, computing mask on the fly is expensive
-			case StateInNumber, StateInString, StateInBool, StateInNull, StateInListEnd:
-				fmt.Println("<<<<< last prop - generating mask for", s.pdaSampler.curNode.State)
-				delete(s.pdaSampler.curNode.TransitionEdges, ',')
-				s.pdaSampler.curNode.MaskTokenIDToNode = make(map[int32]*PDA)
-
-				s.pdaSampler.CreateMask(s.pdaSampler.curNode)
-				s.propIdx++
-			}
-		}
-		return s.pdaSampler.Apply(logits)
-	}
-}
-
-func (s *JSONSampler) UpdateState(tokenSlice []int32) ([]int32, error) {
-	tokenSlice, err := s.pdaSampler.UpdateState(tokenSlice)
-	if err != nil {
-		return nil, err
-	}
-
-	if s.schema == nil {
-		// Don't need to update state for unconstrained JSON sampling
-		return tokenSlice, nil
-	}
-
-	switch s.pdaSampler.curNode.State {
-	case StateInObjectKey:
-		s.propIdx++
-		fmt.Println("propIdx", s.propIdx)
-		prop := s.schema.Properties[s.propIdx]
-		fmt.Println("prop", prop.Name)
-		s.pdaSampler.curNode = s.propToNodeMap[prop.Name]
-		// TODO: this does not work - mike
-		// str, err := s.pdaSampler.proc.Decode(tokenSlice)
-		// if err != nil {
-		// 	return nil, err
-		// }
-		// fmt.Println("str", str)
-
-		return tokenSlice, nil
-	default:
-		return tokenSlice, nil
-	}
-}
--- a/sample/trace.out
+++ b/sample/trace.out
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -1,21 +0,0 @@
-#!/bin/sh
-
-set -eu
-
-usage() {
-    echo "usage: $(basename $0) VERSION"
-    exit 1
-}
-
-[ "$#" -eq 1 ] || usage
-
-export VERSION="$1"
-
-# build universal MacOS binary
-sh $(dirname $0)/build_darwin.sh
-
-# # build arm64 and amd64 Linux binaries
-sh $(dirname $0)/build_linux.sh
-
-# # build arm64 and amd64 Docker images
-sh $(dirname $0)/build_docker.sh
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -2,53 +2,92 @@

 set -e

-. $(dirname $0)/env.sh
+status() { echo >&2 ">>> $@"; }
+usage() {
+    echo "usage: $(basename $0) [-a ARCHS] [-h] [build] [sign] [macapp]"
+    exit 1
+}

-mkdir -p dist
+export VERSION=${VERSION:-$(git describe --tags --dirty)}
+export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${VERSION#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+export CGO_CPPFLAGS='-mmacosx-version-min=11.3'

-# These require Xcode v13 or older to target MacOS v11
-# If installed to an alternate location use the following to enable
-# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
-export CGO_CPPFLAGS=-mmacosx-version-min=11.3
+ARCHS="arm64 amd64"
+while getopts "a:h" OPTION; do
+    case $OPTION in
+        a) ARCHS=$OPTARG ;;
+        h) usage ;;
+    esac
+done

-rm -rf llama/build dist/darwin-*
+shift $(( $OPTIND - 1 ))

-# Generate the universal ollama binary for stand-alone usage: metal + avx
-echo "Building binary"
-echo "Building darwin arm64"
-GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
-echo "Building darwin amd64 with AVX enabled"
-GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
-lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
+_build_darwin() {
+    for ARCH in $ARCHS; do
+        status "Building darwin $ARCH"
+        INSTALL_PREFIX=dist/darwin-$ARCH/
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 go build -o $INSTALL_PREFIX .

-# sign the binary and rename it
-if [ -n "$APPLE_IDENTITY" ]; then
-    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
-else
-    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
+        if [ "$ARCH" = "amd64" ]; then
+            status "Building darwin $ARCH dynamic backends"
+            cmake -B build/darwin-$ARCH \
+                -DCMAKE_OSX_ARCHITECTURES=x86_64 \
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3
+            cmake --build build/darwin-$ARCH --target ggml-cpu -j
+            install build/darwin-$ARCH/lib/ollama/*.{dylib,so} $INSTALL_PREFIX
+        fi
+    done
+}
+
+_sign_darwin() {
+    status "Creating universal binary..."
+    mkdir -p dist/darwin  # _build_darwin only creates dist/darwin-$ARCH/, never dist/darwin/
+    lipo -create -output dist/darwin/ollama dist/darwin-*/ollama
+
+    if [ -z "$APPLE_IDENTITY" ]; then
+        status "No APPLE_IDENTITY set, skipping code signing"
+        return
+    fi
+
+    for F in dist/darwin/ollama dist/darwin-amd64/lib*; do
+        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime "$F"
+    done
+
+    # create a temporary zip for notarization and submit that same archive
+    TEMP=$(mktemp -u).zip
+    ditto -c -k --keepParent dist/darwin/ollama "$TEMP"
+    xcrun notarytool submit "$TEMP" --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+    rm -f "$TEMP"
+    # create a universal tarball; strip counts match the dist/darwin/ and dist/darwin-amd64/ prefixes
+    tar -cf dist/ollama-darwin.tar --strip-components 2 dist/darwin/ollama
+    tar -rf dist/ollama-darwin.tar --strip-components 2 dist/darwin-amd64/lib*
+    gzip -9vc <dist/ollama-darwin.tar >dist/ollama-darwin.tgz
+}
+
+_build_macapp() {
+    # install the app's npm dependencies, then package it (signed only when an identity is set)
+    npm install --prefix macapp
+    if [ -z "$APPLE_IDENTITY" ]; then
+        npm run --prefix macapp make
+    else
+        npm run --prefix macapp make:sign
+    fi
+
+    mv ./macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
+}
+
+if [ "$#" -eq 0 ]; then
+    _build_darwin
+    _sign_darwin
+    _build_macapp
+    exit 0
 fi
-ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
-if [ -n "$APPLE_IDENTITY" ]; then
-    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
-fi
-rm -f dist/temp.zip
-
-# Build the app bundle
-echo "Building app"
-echo "Building darwin amd64 with runners"
-rm dist/darwin-amd64/bin/ollama
-GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
-
-# Generate the universal ollama binary for the app bundle: metal + no-avx
-lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
-
-# build and optionally sign the mac app
-npm install --prefix macapp
-if [ -n "$APPLE_IDENTITY" ]; then
-    npm run --prefix macapp make:sign
-else 
-    npm run --prefix macapp make
-fi
-cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip

+for CMD in "$@"; do
+    case $CMD in
+        build) _build_darwin ;;
+        sign) _sign_darwin ;;
+        macapp) _build_macapp ;;
+        *) usage ;;
+    esac
+done
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -18,7 +18,7 @@ docker buildx build \
        --output type=local,dest=./dist/ \
        --platform=${PLATFORM} \
        ${OLLAMA_COMMON_BUILD_ARGS} \
-        --target dist \
+        --target archive \
        -f Dockerfile \
        .

@@ -26,4 +26,4 @@ docker buildx build \
 if echo $PLATFORM | grep "," > /dev/null ; then 
        mv -f ./dist/linux_*64/ollama* ./dist/
        rmdir ./dist/linux_*64
-fi
+fi
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -80,18 +80,61 @@ function checkEnv() {

 function buildOllama() {
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
-        write-host "Building ollama runners"
        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        & make -j 12 dist
+        New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
+
+
+        # Default first, then conditionally ROCm and cuda v11
+        write-host "Building Default native backend libraries"
+         $env:CMAKE_GENERATOR="ninja"
+        & cmake --preset Default
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --build --preset Default -j 12
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --install build -j 12
+        
+        # TODO - add steps for v11 and ROCm
+        #
+        # if ("$script:CUDA_DIRS".Contains("v11") -and "$script:CUDA_DIRS".Contains("v12")) {
+        #     # We assume the default is v12, so override for v11
+        #     $origCUDA_PATH=$env:CUDA_PATH
+        #     $hashEnv = @{}
+        #     Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
+        #     $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
+        #     write-host "$v11"
+        #     # $env:CUDA_PATH=$hashEnv[$v11]
+        #     # $env:CUDACXX=$hashEnv[$v11]+"\bin\nvcc.exe"
+        #     $env:CUDAToolkit_ROOT=$hashEnv[$v11]
+        #     # ls env:
+        #     write-host "Building CUDA v11 backend libraries"
+        #     & cmake --preset "CUDA 11"
+        #     $env:CUDA_PATH=$origCUDA_PATH
+        #     exit(1)
+        #     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        #     # & cmake --build --preset "CUDA 11" -j 12
+        #     # if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        # }
+
+        # if ($env:HIP_PATH) {
+        #     write-host "Building ROCm backend libraries"
+        #     $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
+        #     $env:HIP_PLATFORM="amd"
+        #     $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+        #     & cmake --preset "ROCm"
+        #     $env:HIPCXX=""
+        #     $env:HIP_PLATFORM=""
+        #     $env:CMAKE_PREFIX_PATH=""
+        #     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        #     & cmake --build --preset "ROCm" -j 12
+        #     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        # }
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
    }
    write-host "Building ollama CLI"
    & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
+    cp .\ollama.exe "${script:DIST_DIR}\"
 }

 function buildApp() {
--- a/scripts/fast.sh
+++ b/scripts/fast.sh
@@ -1,20 +0,0 @@
-#/bin/sh
-
-# Wrapper script to speed up builds by disabling some permutations and reduce compatibility matrix
-# Don't use for release builds, but suitable for local developer iteration
-
-# Only build cuda v12
-export OLLAMA_SKIP_CUDA_11_GENERATE=1
-# Major versions only
-export CUDA_V12_ARCHITECTURES="60;70;80;90"
-# Skip ROCm
-export OLLAMA_SKIP_ROCM_GENERATE=1
-# Disable various less common quants and fattn
-export OLLAMA_FAST_BUILD=1
-
-if [ $# -ne 1 ] ; then
-    echo "Usage: ./scripts/fast.sh <build_script>"
-    exit 1
-fi
-
-exec $1
--- a/scripts/publish.sh
+++ b/scripts/publish.sh
@@ -1,25 +0,0 @@
-# Set your variables here.
-REPO="jmorganca/ollama"
-
-# Check if VERSION is set
-if [[ -z "${VERSION}" ]]; then
-  echo "VERSION is not set. Please set the VERSION environment variable."
-  exit 1
-fi
-
-OS=$(go env GOOS)
-
-./script/build_${OS}.sh
-
-# Create a new tag if it doesn't exist.
-if ! git rev-parse v$VERSION >/dev/null 2>&1; then
-  git tag v$VERSION
-fi
-
-git push origin v$VERSION
-
-# Create a new release.
-gh release create -p v$VERSION -t v$VERSION
-
-# Upload the zip file.
-gh release upload v$VERSION ./dist/* --clobber
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -1,78 +0,0 @@
-#!/bin/sh
-
-# Script for common Dockerfile dependency installation in redhat linux based images
-
-set -ex
-set -o pipefail
-MACHINE=$(uname -m)
-
-if grep -i "centos" /etc/system-release >/dev/null; then
-    # As of 7/1/2024 mirrorlist.centos.org has been taken offline, so adjust accordingly
-    sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-    sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-    sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-
-    # Centos 7 derivatives have too old of a git version to run our generate script
-    # uninstall and ignore failures
-    yum remove -y git
-    yum -y install epel-release centos-release-scl
-
-    # The release packages reinstate the mirrors, undo that again
-    sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-    sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-    sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-
-    yum -y install dnf
-    if [ "${MACHINE}" = "x86_64" ]; then
-        yum -y install https://repo.ius.io/ius-release-el7.rpm
-        dnf install -y git236
-    else
-        dnf install -y rh-git227-git
-        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
-    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz findutils
-elif grep -i "rocky" /etc/system-release >/dev/null; then
-    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
-    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
-[vault]
-name=Rocky Vault
-baseurl=https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/
-gpgcheck=1
-enabled=1
-countme=1
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
-EOF
-    dnf install -y git \
-        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
-        findutils \
-        yum-utils \
-        pigz
-else
-    echo "ERROR Unexpected distro"
-    exit 1
-fi
-
-if [ "${MACHINE}" = "x86_64" ] ; then
-    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
-    mv /tmp/ccache /usr/local/bin/
-else
-    yum -y install epel-release
-    yum install -y ccache
-fi
-
-if [ -n "${CMAKE_VERSION}" ]; then
-    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
-fi
-
-if [ -n "${GOLANG_VERSION}" ]; then
-    if [ "${MACHINE}" = "x86_64" ]; then
-        GO_ARCH="amd64"
-    else
-        GO_ARCH="arm64"
-    fi
-    mkdir -p /usr/local
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local
-    ln -s /usr/local/go/bin/go /usr/local/bin/go
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt
-fi
--- a/server/create.go
+++ b/server/create.go
@@ -178,12 +178,37 @@ func convertModelFromFiles(files map[string]string, baseLayers []*layerGGML, isA
 }

 func detectModelTypeFromFiles(files map[string]string) string {
-	// todo make this more robust by actually introspecting the files
 	for fn := range files {
 		if strings.HasSuffix(fn, ".safetensors") {
 			return "safetensors"
-		} else if strings.HasSuffix(fn, ".bin") || strings.HasSuffix(fn, ".gguf") {
+		} else if strings.HasSuffix(fn, ".gguf") {
 			return "gguf"
+		} else {
+			// try to see if we can find a gguf file even without the file extension
+			blobPath, err := GetBlobsPath(files[fn])
+			if err != nil {
+				slog.Error("error getting blobs path", "file", fn)
+				return ""
+			}
+
+			f, err := os.Open(blobPath)
+			if err != nil {
+				slog.Error("error reading file", "error", err)
+				return ""
+			}
+			defer f.Close()
+
+			buf := make([]byte, 4)
+			_, err = f.Read(buf)
+			if err != nil {
+				slog.Error("error reading file", "error", err)
+				return ""
+			}
+
+			ct := ggml.DetectContentType(buf)
+			if ct == "gguf" {
+				return "gguf"
+			}
 		}
 	}

--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -3,6 +3,7 @@ package server
 import (
 	"bytes"
 	"cmp"
+	"crypto/sha256"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -710,3 +711,100 @@ func TestCreateDetectTemplate(t *testing.T) {
 		})
 	})
 }
+
+func TestDetectModelTypeFromFiles(t *testing.T) {
+	t.Run("gguf file", func(t *testing.T) {
+		_, digest := createBinFile(t, nil, nil)
+		files := map[string]string{
+			"model.gguf": digest,
+		}
+
+		modelType := detectModelTypeFromFiles(files)
+		if modelType != "gguf" {
+			t.Fatalf("expected model type 'gguf', got %q", modelType)
+		}
+	})
+
+	t.Run("gguf file w/o extension", func(t *testing.T) {
+		_, digest := createBinFile(t, nil, nil)
+		files := map[string]string{
+			fmt.Sprintf("%x", digest): digest,
+		}
+
+		modelType := detectModelTypeFromFiles(files)
+		if modelType != "gguf" {
+			t.Fatalf("expected model type 'gguf', got %q", modelType)
+		}
+	})
+
+	t.Run("safetensors file", func(t *testing.T) {
+		files := map[string]string{
+			"model.safetensors": "sha256:abc123",
+		}
+
+		modelType := detectModelTypeFromFiles(files)
+		if modelType != "safetensors" {
+			t.Fatalf("expected model type 'safetensors', got %q", modelType)
+		}
+	})
+
+	t.Run("unsupported file type", func(t *testing.T) {
+		p := t.TempDir()
+		t.Setenv("OLLAMA_MODELS", p)
+
+		data := []byte("12345678")
+		digest := fmt.Sprintf("sha256:%x", sha256.Sum256(data))
+		if err := os.MkdirAll(filepath.Join(p, "blobs"), 0o755); err != nil {
+			t.Fatal(err)
+		}
+
+		f, err := os.Create(filepath.Join(p, "blobs", fmt.Sprintf("sha256-%s", strings.TrimPrefix(digest, "sha256:"))))
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer f.Close()
+
+		if _, err := f.Write(data); err != nil {
+			t.Fatal(err)
+		}
+
+		files := map[string]string{
+			"model.bin": digest,
+		}
+
+		modelType := detectModelTypeFromFiles(files)
+		if modelType != "" {
+			t.Fatalf("expected empty model type for unsupported file, got %q", modelType)
+		}
+	})
+
+	t.Run("file with less than 4 bytes", func(t *testing.T) {
+		p := t.TempDir()
+		t.Setenv("OLLAMA_MODELS", p)
+
+		data := []byte("123")
+		digest := fmt.Sprintf("sha256:%x", sha256.Sum256(data))
+		if err := os.MkdirAll(filepath.Join(p, "blobs"), 0o755); err != nil {
+			t.Fatal(err)
+		}
+
+		f, err := os.Create(filepath.Join(p, "blobs", fmt.Sprintf("sha256-%s", strings.TrimPrefix(digest, "sha256:"))))
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer f.Close()
+
+		if _, err := f.Write(data); err != nil {
+			t.Fatal(err)
+		}
+
+		files := map[string]string{
+			"noext": digest,
+		}
+
+		modelType := detectModelTypeFromFiles(files)
+		if modelType != "" {
+			t.Fatalf("expected empty model type for small file, got %q", modelType)
+		}
+	})
+}
--- a/template/command-r.gotmpl
+++ b/template/command-r.gotmpl
@@ -0,0 +1,67 @@
+{{- if or .Tools .System }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+{{- if .Tools }}# Safety Preamble
+The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+# System Preamble
+## Basic Rules
+You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+{{ if .System }}# User Preamble
+{{ .System }}
+{{- end }}
+
+## Available Tools
+Here is a list of tools that you have available to you:
+{{- range .Tools }}
+
+```python
+def {{ .Function.Name }}(
+{{- range $name, $property := .Function.Parameters.Properties }}{{ $name }}: {{ $property.Type }}, {{ end }}) -> List[Dict]:
+    '''{{ .Function.Description }}
+
+{{- if .Function.Parameters.Properties }}
+
+    Args:
+{{- range $name, $property := .Function.Parameters.Properties }}
+        {{ $name }} ({{ $property.Type }}): {{ $property.Description }}
+{{- end }}
+{{- end }}
+    '''
+    pass
+```
+{{- end }}
+{{- else if .System }}{{ .System }}
+{{- end }}<|END_OF_TURN_TOKEN|>
+{{- end }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- continue }}
+{{- end }}<|START_OF_TURN_TOKEN|>
+{{- if eq .Role "user" }}<|USER_TOKEN|>{{ .Content }}
+{{- if $.Tools }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+```json
+[
+    {
+        "tool_name": title of the tool in the specification,
+        "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+    }
+]```
+{{- end }}
+{{- else if eq .Role "assistant" }}<|CHATBOT_TOKEN|>
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}
+Action: ```json
+[
+{{- range $i, $call := .ToolCalls }}{{- /* comma before every entry except the first, so several tool calls form a valid JSON list */ -}}{{ if $i }},{{ end }}
+    {
+        "tool_name": "{{ .Function.Name }}",
+        "parameters": {{ .Function.Arguments }}
+    }
+{{- end }}
+]```
+{{- end }}
+{{- else if eq .Role "tool" }}<|SYSTEM_TOKEN|><results>
+console_output: {{ .Content }}
+</results>
+{{- end }}<|END_OF_TURN_TOKEN|>
+{{- end }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
--- a/template/command-r.json
+++ b/template/command-r.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "<|START_OF_TURN_TOKEN|>",
+    "<|END_OF_TURN_TOKEN|>"
+  ]
+}
--- a/template/index.json
+++ b/template/index.json
@@ -138,5 +138,9 @@
  {
    "template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}",
    "name": "solar-instruct"
+  },
+  {
+    "template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
+    "name": "command-r"
  }
 ]
--- a/template/testdata/command-r.gotmpl/system-user-assistant-user
+++ b/template/testdata/command-r.gotmpl/system-user-assistant-user
@@ -0,0 +1 @@
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I'm doing great. How can I help you today?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>I'd like to show off how chat templating works!<|END_OF_TURN_TOKEN|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
--- a/template/testdata/command-r.gotmpl/user
+++ b/template/testdata/command-r.gotmpl/user
@@ -0,0 +1 @@
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
--- a/template/testdata/command-r.gotmpl/user-assistant-user
+++ b/template/testdata/command-r.gotmpl/user-assistant-user
@@ -0,0 +1 @@
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I'm doing great. How can I help you today?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>I'd like to show off how chat templating works!<|END_OF_TURN_TOKEN|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
Author	SHA1	Message	Date
Bruce MacDonald	60b2d494bc	model: test byte pair encoding	2025-01-29 15:17:47 -08:00
Michael Yang	b21482e4a9	fix linter	2025-01-29 15:08:37 -08:00
Michael Yang	6a4120143f	next	2025-01-29 15:05:24 -08:00
Michael Yang	dcfb7a105c	next build (#8539 ) * add build to .dockerignore * test: only build one arch * add build to .gitignore * fix ccache path * filter amdgpu targets * only filter if autodetecting * Don't clobber gpu list for default runner This ensures the GPU specific environment variables are set properly * explicitly set CXX compiler for HIP * Update build_windows.ps1 This isn't complete, but is close. Dependencies are missing, and it only builds the "default" preset. * build: add ollama subdir * add .git to .dockerignore * docs: update development.md * update build_darwin.sh * remove unused scripts * llm: add cwd and build/lib/ollama to library paths * default DYLD_LIBRARY_PATH to LD_LIBRARY_PATH in runner on macOS * add additional cmake output vars for msvc * interim edits to make server detection logic work with dll directories like lib/ollama/cuda_v12 * remove unncessary filepath.Dir, cleanup * add hardware-specific directory to path * use absolute server path * build: linux arm * cmake install targets * remove unused files * ml: visit each library path once * build: skip cpu variants on arm * build: install cpu targets * build: fix workflow * shorter names * fix rocblas install * docs: clean up development.md * consistent build dir removal in development.md * silence -Wimplicit-function-declaration build warnings in ggml-cpu * update readme * update development readme * llm: update library lookup logic now that there is one runner (#8587) * tweak development.md * update docs * add windows cuda/rocm tests --------- Co-authored-by: jmorganca <jmorganca@gmail.com> Co-authored-by: Daniel Hiltgen <daniel@ollama.com>	2025-01-29 15:03:38 -08:00
Xiaofu Huang	2ef3c803a1	readme: add AI Toolkit for VSCode to community integrations (#8604 )	2025-01-27 00:36:23 -08:00
Matěj Štágl	453e4d090b	readme: add LlmTornado to community integrations (#8551 )	2025-01-25 01:04:07 -08:00
Daniel Jalkut	ca2f9843c8	docs: remove reference to the deleted examples folder (#8524 )	2025-01-22 22:52:15 -08:00
frob	294b6f5a22	docs: remove tfs_z option from documentation (#8515 )	2025-01-21 09:28:59 -08:00
EndoTheDev	7bb356c680	docs: update suspend header in gpu.md (#8487 )	2025-01-19 18:45:35 -08:00
Jannik Maierhöfer	021817e59a	readme: add link to Langfuse (#8455 )	2025-01-16 22:41:12 -08:00
Patrick Devine	a420a453b4	fix default modelfile for create (#8452 )	2025-01-16 01:14:04 -08:00
Jeffrey Morgan	42cf4db601	parser: fix parsing Modelfiles with multiple FROM commands (#8449 )	2025-01-16 00:14:04 -08:00
Josh	93a8daf285	convert: import support for command-r models from safetensors (#6063 ) --------- Co-authored-by: Patrick Devine <patrick@infrahq.com>	2025-01-15 16:31:22 -08:00
Gloryjaw	a041b4df7c	docs: fix path to examples (#8438 )	2025-01-15 11:49:12 -08:00
Patrick Devine	2539f2dbf9	Fix absolute path names + gguf detection (#8428 )	2025-01-14 19:01:24 -08:00
				`@@ -0,0 +1 @@`
				`<\|START_OF_TURN_TOKEN\|><\|SYSTEM_TOKEN\|>You are a helpful assistant.<\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|USER_TOKEN\|>Hello, how are you?<\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|CHATBOT_TOKEN\|>I'm doing great. How can I help you today?<\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|USER_TOKEN\|>I'd like to show off how chat templating works!<\|END_OF_TURN_TOKEN\|><\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|CHATBOT_TOKEN\|>`
				`@@ -0,0 +1 @@`
				`<\|START_OF_TURN_TOKEN\|><\|USER_TOKEN\|>Hello, how are you?<\|END_OF_TURN_TOKEN\|><\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|CHATBOT_TOKEN\|>`