draft: mlx

no runners
estimate log value
2024-12-31 11:13:09 -08:00 · 2024-12-21 01:31:48 -08:00 · 2024-12-21 01:31:48 -08:00 · 2024-12-21 01:31:25 -08:00 · 2024-12-20 16:56:03 -08:00 · 2024-12-20 16:28:53 -08:00
626 changed files with 117223 additions and 36723 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored
 llama/**/*.m linguist-vendored
 llama/**/*.metal linguist-vendored

+ml/backend/**/*.c linguist-vendored
+ml/backend/**/*.h linguist-vendored
+ml/backend/**/*.cpp linguist-vendored
+ml/backend/**/*.hpp linguist-vendored
+ml/backend/**/*.cu linguist-vendored
+ml/backend/**/*.cuh linguist-vendored
+ml/backend/**/*.m linguist-vendored
+ml/backend/**/*.metal linguist-vendored
+
 * text=auto
 *.go text eol=lf
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -85,13 +85,12 @@ jobs:
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make
+          make dist
        name: make
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
-            build/**/*
            dist/windows-amd64/**

  # ROCm generation step
@@ -143,13 +142,12 @@ jobs:
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          make rocm
+          make help-runners
+          make dist_rocm
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
-            build/**/*
            dist/windows-amd64/**

  # CUDA generation step
@@ -226,12 +224,11 @@ jobs:
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+          make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
-            build/**/*
            dist/windows-amd64/**

  # windows arm64 generate, go build, and zip file (no installer)
@@ -463,7 +460,6 @@ jobs:
        with:
          name: windows-arm64
          path: dist
-      - run: dir build
      - run: |
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -269,6 +269,15 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
+      - name: Add msys paths
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@@ -300,6 +309,15 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
+      - name: Add msys paths
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
--- a/.gitignore
+++ b/.gitignore
@@ -10,9 +10,6 @@ ollama
 .idea
 test_data
 *.crt
-llm/build
-build/*/*/*
-!build/**/placeholder
 llama/build
 __debug_bin*
 llama/vendor
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -5,10 +5,10 @@ linters:
    - asasalint
    - bidichk
    - bodyclose
+    - copyloopvar
    - containedctx
    - contextcheck
    - errcheck
-    - exportloopref
    - gci
    - gocheckcompilerdirectives
    - gofmt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Top-level build for the native backend libraries under ml/backend.
+cmake_minimum_required(VERSION 3.21)
+
+project(Ollama C CXX)
+
+# Default to an optimized build, but respect a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overriding it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Make add_library() calls without an explicit type produce shared libraries.
+set(BUILD_SHARED_LIBS ON)
+
+# set_target_output_directory(<target>)
+#
+# Redirects the runtime, library and archive outputs of <target> to
+# ${CMAKE_BINARY_DIR}/lib. Does nothing when <target> does not exist, so it
+# can be called for targets that are only defined in some configurations.
+function(set_target_output_directory _target)
+    if(TARGET ${_target})
+        set_target_properties(${_target} PROPERTIES
+            RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+            LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+            ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+        )
+    endif()
+endfunction()
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml)
+
+# The MLX backend is only added when targeting Darwin.
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/mlx)
+endif()
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,6 @@
 ARG GOLANG_VERSION=1.22.8
-ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@@ -15,24 +12,22 @@ ARG JETPACK_5=r35.4.1
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+# make -j 10 dist
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
    dnf clean all && \
    dnf install -y \
    zsh \
-    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
+    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
+    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 # TODO intel oneapi goes here...
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
@@ -46,12 +41,11 @@ ENTRYPOINT [ "zsh" ]
 # docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
 #
 FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
    dnf config-manager --set-enabled appstream && \
    dnf clean all && \
@@ -62,37 +56,28 @@ RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
+ENV GOARCH arm64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]

-FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
+FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) dist ; \
    else \
-        make -j 5 ; \
+        make -j 5 dist ; \
+    fi
+RUN cd dist/linux-$GOARCH && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
+    cd dist/linux-$GOARCH-rocm && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
    fi
-
-FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5

 # Jetsons need to be built in discrete stages
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
@@ -107,10 +92,9 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v11 \
+    make -j 5 dist_cuda_v11 \
        CUDA_ARCHITECTURES="72;87" \
        GPU_RUNNER_VARIANT=_jetpack5 \
-        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5

@@ -126,69 +110,26 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v12 \
+    make -j 5 dist_cuda_v12 \
        CUDA_ARCHITECTURES="87" \
        GPU_RUNNER_VARIANT=_jetpack6 \
-        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6

-
-# Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 centos:7 AS builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH amd64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/amd64 builder-amd64 AS build-amd64
+FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64
 COPY . .
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-ARG OLLAMA_SKIP_ROCM_GENERATE
+ARG OLLAMA_SKIP_CUDA_GENERATE
+ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
-
-FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH arm64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/arm64 builder-arm64 AS build-arm64
-COPY . .
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+    make -j 5 dist
 COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
 RUN cd dist/linux-$GOARCH-jetpack5 && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
 RUN cd dist/linux-$GOARCH-jetpack6 && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz

 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -197,30 +138,13 @@ COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz
 FROM dist-$TARGETARCH AS dist


-# Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-
-FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-
 # For amd64 container images, filter out cuda/rocm to minimize size
-FROM runners-amd64 AS runners-cuda-amd64
+FROM build-amd64 AS runners-cuda-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
    ./dist/linux-amd64/lib/ollama/runners/rocm*

-FROM runners-amd64 AS runners-rocm-amd64
+FROM build-amd64 AS runners-rocm-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
    ./dist/linux-amd64/lib/ollama/libcu*.so* \
@@ -230,15 +154,15 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
 COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/

@@ -251,7 +175,7 @@ COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/l
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 EXPOSE 11434
--- a/66
+++ b/66
@@ -0,0 +1,80 @@
+# Builds the ggml GPU backend libraries with CMake, one stage per toolkit,
+# then gathers the resulting shared objects in the final stage.
+ARG CUDA_11_VERSION=11.3
+ARG CUDA_12_VERSION=12.4
+ARG ROCM_VERSION=6.1.2
+ARG JETPACK_5_VERSION=r35.4.1
+ARG JETPACK_6_VERSION=r36.2.0
+ARG CMAKE_VERSION=3.31.2
+
+# base: ROCm CentOS 7 image with a CMake release unpacked into /usr, the
+# CentOS repos pointed at vault.centos.org, and the NVIDIA CUDA repo added.
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base
+ARG CMAKE_VERSION
+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1
+RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
+
+# Disabled arm64 variant of the base stage (kept commented out).
+# FROM --platform=linux/arm64 rockylinux:8 AS base
+# ARG CMAKE_VERSION
+# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
+# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
+
+# amd64: installs both CUDA toolkits and copies in the sources for the CMake build.
+FROM base AS amd64
+ARG CUDA_11_VERSION
+ARG CUDA_12_VERSION
+# ${VAR//./-} swaps dots for dashes (11.3 -> 11-3) to form the package name; bash-style syntax, assumes the RUN shell supports it.
+RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \
+    && yum install -y cuda-toolkit-${CUDA_12_VERSION//./-}
+COPY CMakeLists.txt CMakeLists.txt
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+
+# cuda_11: prepends the CUDA 11 bin directory to PATH and builds the ggml-cuda target.
+FROM --platform=linux/amd64 amd64 AS cuda_11
+ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH
+RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+RUN cmake --build build --target ggml-cuda -j
+
+# cuda_12: same as cuda_11, using the CUDA 12 bin directory and a newer architecture list.
+FROM --platform=linux/amd64 amd64 AS cuda_12
+ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH
+RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
+RUN cmake --build build --target ggml-cuda -j
+
+# rocm: builds the ggml-hip target for the listed AMD GPU architectures.
+FROM --platform=linux/amd64 amd64 AS rocm
+RUN cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+RUN cmake --build build --target ggml-hip -j
+
+# jetpack_5: arm64 build of ggml-cuda on the JetPack 5 image.
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5
+ARG CMAKE_VERSION
+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
+COPY CMakeLists.txt .
+COPY ml/backend/ggml/ggml .
+RUN cmake -S . -B build \
+    -DCMAKE_CUDA_ARCHITECTURES="72;87"
+RUN cmake --build build --target ggml-cuda
+
+# jetpack_6: arm64 build of ggml-cuda on the JetPack 6 image.
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6
+ARG CMAKE_VERSION
+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
+COPY CMakeLists.txt .
+COPY ml/backend/ggml/ggml .
+RUN cmake -S . -B build \
+    -DCMAKE_CUDA_ARCHITECTURES="87"
+RUN cmake --build build --target ggml-cuda
+
+# Final amd64 stage: copies the CUDA 11, CUDA 12 and ROCm libraries into a golang:1.23 image under distinct file names.
+FROM --platform=linux/amd64 golang:1.23
+COPY --from=cuda_11 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-11.so
+COPY --from=cuda_12 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-12.so
+COPY --from=rocm build/ml/backend/ggml/ggml/src/ggml-hip/libggml-hip.so libggml-hip.so
+
+# Disabled arm64 counterpart of the final stage, collecting the JetPack builds.
+# FROM --platform=linux/arm64 golang:1.23
+# COPY --from=jetpack_5 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-5.so
+# COPY --from=jetpack_6 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-6.so
--- a/Makefile
+++ b/Makefile
@@ -1,4 +0,0 @@
-GOALS := $(or $(MAKECMDGOALS),all)
-.PHONY: $(GOALS)
-$(GOALS):
-	$(MAKE) -C llama $@
--- a/48
+++ b/48
@@ -0,0 +1,53 @@
+# Syncs the vendored llama.cpp and ggml sources from a pinned upstream checkout.
+UPSTREAM=https://github.com/ggerganov/llama.cpp.git
+WORKDIR=llama/vendor
+FETCH_HEAD=40c6d79fb52f995f47507fedfeaae2ac05d9b35c
+
+all: sync
+
+.PHONY: sync
+sync: llama/llama.cpp ml/backend/ggml/ggml
+
+.PHONY: llama/llama.cpp
+llama/llama.cpp: llama/vendor apply_patches
+	rsync -arvzc --delete -f "merge $@/.rsync-filter" $< $@
+	rsync -arvzc --delete --include LICENSE --exclude $(WORKDIR) $@
+
+.PHONY: ml/backend/ggml/ggml apply_patches
+ml/backend/ggml/ggml: llama/vendor/ggml apply_patches
+	rsync -arvzc --delete -f "merge $@/.rsync-filter" $< $@
+	rsync -arvzc --delete --include LICENSE --exclude $(WORKDIR) $@
+
+PATCHES=$(wildcard llama/patches/*.patch)
+
+# Each foo.patch is tracked by a foo.patched stamp file once it has been applied.
+.PHONY: apply_patches
+.NOTPARALLEL:
+apply_patches: $(addsuffix ed, $(PATCHES))
+
+# Apply one patch with `git am -3`; on failure abort the am session and fail the build.
+%.patched: %.patch
+	if git -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
+
+.PHONY: checkout
+checkout: $(WORKDIR)
+	git -C $(WORKDIR) fetch
+	git -C $(WORKDIR) checkout -f $(FETCH_HEAD)
+
+$(WORKDIR):
+	git clone $(UPSTREAM) $(WORKDIR)
+
+# Write the commits on top of FETCH_HEAD in WORKDIR out as patches into llama/patches.
+.PHONY: format_patches
+format_patches: llama/patches
+	git -C $(WORKDIR) format-patch \
+		--no-signature \
+		--no-numbered \
+		--zero-commit \
+		-o $(realpath $<) \
+		$(FETCH_HEAD)
+
+# Reset the vendor checkout to FETCH_HEAD (via checkout) and remove the .patched stamps.
+.PHONY: clean
+clean: checkout
+	$(RM) $(addsuffix ed, $(PATCHES))
--- a/README.md
+++ b/README.md
@@ -49,12 +49,12 @@ Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
+| Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
 | Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
 | Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
-| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |
@@ -357,9 +357,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
 - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
 - [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux)
+- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application available for Mac/Windows/Linux)
 - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
 - [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
+- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)

 ### Cloud

@@ -408,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
+- [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)

 ### Package managers

@@ -512,6 +514,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
+- [AI Summary Helper plugin](https://github.com/philffm/ai-summary-helper)
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)

--- a/api/types.go
+++ b/api/types.go
@@ -216,7 +216,6 @@ type Options struct {
 	TopK             int      `json:"top_k,omitempty"`
 	TopP             float32  `json:"top_p,omitempty"`
 	MinP             float32  `json:"min_p,omitempty"`
-	TFSZ             float32  `json:"tfs_z,omitempty"`
 	TypicalP         float32  `json:"typical_p,omitempty"`
 	RepeatLastN      int      `json:"repeat_last_n,omitempty"`
 	Temperature      float32  `json:"temperature,omitempty"`
@@ -595,7 +594,6 @@ func DefaultOptions() Options {
 		Temperature:      0.8,
 		TopK:             40,
 		TopP:             0.9,
-		TFSZ:             1.0,
 		TypicalP:         1.0,
 		RepeatLastN:      64,
 		RepeatPenalty:    1.1,
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -97,7 +97,6 @@ Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Chec
 Source: "..\dist\windows-arm64\vc_redist.arm64.exe"; DestDir: "{tmp}"; Check: IsArm64() and vc_redist_needed(); Flags: deleteafterinstall
 Source: "..\dist\windows-arm64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: IsArm64();  Flags: ignoreversion 64bit
 Source: "..\dist\windows-arm64\ollama.exe"; DestDir: "{app}"; Check: IsArm64(); Flags: ignoreversion 64bit
-Source: "..\dist\windows-arm64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: IsArm64(); Flags: ignoreversion 64bit recursesubdirs
 #endif

 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
--- a/app/tray/wintray/eventloop.go
+++ b/app/tray/wintray/eventloop.go
@@ -98,7 +98,7 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
 		}
 		err = t.wcex.unregister()
 		if err != nil {
-			slog.Error(fmt.Sprintf("failed to uregister windo %s", err))
+			slog.Error(fmt.Sprintf("failed to unregister window %s", err))
 		}
 	case WM_DESTROY:
 		// same as WM_ENDSESSION, but throws 0 exit code after all
--- a/build/darwin/amd64/placeholder
+++ b/build/darwin/amd64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/build/darwin/arm64/placeholder
+++ b/build/darwin/arm64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/build/embed_darwin_amd64.go
+++ b/build/embed_darwin_amd64.go
@@ -1,8 +0,0 @@
-package build
-
-import "embed"
-
-// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
-
-//go:embed darwin/amd64/*
-var EmbedFS embed.FS
--- a/build/embed_darwin_arm64.go
+++ b/build/embed_darwin_arm64.go
@@ -1,8 +0,0 @@
-package build
-
-import "embed"
-
-// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
-
-//go:embed darwin/arm64/*
-var EmbedFS embed.FS
--- a/build/embed_linux.go
+++ b/build/embed_linux.go
@@ -1,6 +0,0 @@
-package build
-
-import "embed"
-
-//go:embed linux/*
-var EmbedFS embed.FS
--- a/build/embed_unused.go
+++ b/build/embed_unused.go
@@ -1,8 +0,0 @@
-//go:build !linux && !darwin
-
-package build
-
-import "embed"
-
-// unused on windows
-var EmbedFS embed.FS
--- a/build/linux/amd64/placeholder
+++ b/build/linux/amd64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/build/linux/arm64/placeholder
+++ b/build/linux/arm64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/cache/cache.go
+++ b/cache/cache.go
@@ -0,0 +1,92 @@
+// Package cache provides a key/value tensor cache used through the Cache
+// interface, along with Simple, a basic implementation.
+package cache
+
+import (
+	"github.com/ollama/ollama/ml"
+)
+
+// Options holds the per-call parameters for Cache.Put.
+type Options struct {
+	// Position scales the write offset in Put and is added to key.Dim(2) when
+	// sizing the returned views. Presumably the sequence position of the
+	// first new entry; confirm against callers.
+	Position int
+}
+
+// Cache stores key and value tensors and hands back views over what has been
+// stored so far.
+type Cache interface {
+	// Sub returns the cache for slot i (presumably one slot per layer; confirm
+	// against callers).
+	Sub(i int) Cache
+	// Put stores key and value and returns the key and value tensors to use in
+	// their place.
+	Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
+}
+
+// Simple is a Cache backed by one lazily allocated key tensor and one value
+// tensor per slot.
+type Simple struct {
+	// DType is the element type passed to ctx.Zeros when a slot is allocated.
+	DType    ml.DType
+	// Capacity multiplies the per-entry element count when a slot is allocated
+	// and caps the length of the views returned by Put.
+	Capacity int
+
+	// keys and values hold one tensor per slot; nil until the first Put.
+	keys, values []ml.Tensor
+}
+
+// Sub returns a Simple for slot i, growing keys and values first if i is out
+// of range. The result shares DType and Capacity with c, and its one-element
+// slices alias c's backing arrays as they are at the time of the call.
+func (c *Simple) Sub(i int) Cache {
+	if i >= len(c.keys) {
+		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
+		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
+	}
+
+	return &Simple{
+		keys:     c.keys[i : i+1],
+		values:   c.values[i : i+1],
+		Capacity: c.Capacity,
+		DType:    c.DType,
+	}
+}
+
+// Put writes key and value into the slot's tensors at an offset derived from
+// opts.Position and returns views over the stored keys and values. The slot's
+// tensors are created with ctx.Zeros on first use. It indexes keys[0] and
+// values[0], so a Simple with empty slices (one not obtained from Sub) panics.
+func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
+	// First use of this slot: allocate Dim(0)*Dim(1)*Capacity zeroed elements.
+	if c.keys[0] == nil || c.values[0] == nil {
+		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
+		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
+	}
+
+	// Copy the new key/value into a view starting at Stride(2)*Position and hand the copy to ctx.Forward.
+	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
+	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
+
+	// n is the third-dimension length of the returned views, capped at Capacity.
+	n := min(c.Capacity, int(key.Dim(2))+opts.Position)
+
+	// Replace key and value with views from offset 0; arguments presumably alternate dimension and stride (confirm against ml.Tensor.View).
+	key = c.keys[0].View(ctx, 0,
+		int(key.Dim(0)), int(key.Stride(1)),
+		int(key.Dim(1)), int(key.Stride(2)),
+		n,
+	)
+
+	value = c.values[0].View(ctx, 0,
+		int(value.Dim(0)), int(value.Stride(1)),
+		int(value.Dim(1)), int(value.Stride(2)),
+		n,
+	)
+
+	// TODO shift context if necessary
+
+	return key, value
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -37,6 +37,8 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/llama/runner"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/server"
@@ -1036,6 +1038,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		return nil
 	}

+	if opts.Format == "json" {
+		opts.Format = `"` + opts.Format + `"`
+	}
+
 	req := &api.ChatRequest{
 		Model:    opts.Model,
 		Messages: opts.Messages,
@@ -1121,6 +1127,10 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		}
 	}

+	if opts.Format == "json" {
+		opts.Format = `"` + opts.Format + `"`
+	}
+
 	request := api.GenerateRequest{
 		Model:     opts.Model,
 		Prompt:    opts.Prompt,
@@ -1412,6 +1422,19 @@ func NewCLI() *cobra.Command {
 		RunE:    DeleteHandler,
 	}

+	runnerCmd := &cobra.Command{
+		Use:    "runner",
+		Short:  llama.PrintSystemInfo(),
+		Hidden: true,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runner.Execute(os.Args[1:])
+		},
+		FParseErrWhitelist: cobra.FParseErrWhitelist{UnknownFlags: true},
+	}
+	runnerCmd.SetHelpFunc(func(cmd *cobra.Command, args []string) {
+		_ = runner.Execute(args[1:])
+	})
+
 	envVars := envconfig.AsMap()

 	envs := []envconfig.EnvVar{envVars["OLLAMA_HOST"]}
@@ -1468,6 +1491,7 @@ func NewCLI() *cobra.Command {
 		psCmd,
 		copyCmd,
 		deleteCmd,
+		runnerCmd,
 	)

 	return rootCmd
--- a/cmd/runner/main.go
+++ b/cmd/runner/main.go
@@ -0,0 +1,15 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/ollama/ollama/llama/runner"
+)
+
+func main() {
+	if err := runner.Execute(os.Args[1:]); err != nil { // pass every argument after the program name to the runner
+		fmt.Fprintf(os.Stderr, "error: %s\n", err) // report the failure on stderr
+		os.Exit(1) // exit with a non-zero status when Execute returns an error
+	}
+}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"strings"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }

-func (ModelParameters) KV(t *Tokenizer) llm.KV {
-	kv := llm.KV{
+func (ModelParameters) KV(t *Tokenizer) ggml.KV {
+	kv := ggml.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p AdapterParameters) KV() llm.KV {
+func (p AdapterParameters) KV() ggml.KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
 		alpha = p.LoraParameters.Alpha
 	}

-	kv := llm.KV{
+	kv := ggml.KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+	return ggml.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+	return ggml.WriteGGUF(ws, kv, ts)
 }

 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) llm.KV
+	KV(*Tokenizer) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }

 type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {

 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(llm.KV) llm.KV
+	KV(ggml.KV) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string

-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -8,7 +8,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }

-func (p *bertModel) KV(t *Tokenizer) llm.KV {
+func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 			continue
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {

 var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -1,8 +1,6 @@
 package convert

-import (
-	"github.com/ollama/ollama/llm"
-)
+import "github.com/ollama/ollama/fs/ggml"

 type gemma2Model struct {
 	gemmaModel
@@ -11,7 +9,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {

 var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {

 var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor

 	if p.RopeScaling.factors != nil {
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {

 var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    shape,
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 		return true
 	})

-	var out []llm.Tensor
+	var out []ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {

 var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 	var addRopeFactors sync.Once

-	out := make([]llm.Tensor, 0, len(ts)+2)
+	out := make([]ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
+				out = append(out, ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
+				}, ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 			})
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -20,7 +20,7 @@ import (

 	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape   []int  `json:"shape"`
 }

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	t.Helper()

 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })

-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := ggml.Decode(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()

-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			m, _, err := ggml.Decode(r, math.MaxInt)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/convert/sentencepiece/sentencepiece_model.pb.go
+++ b/convert/sentencepiece/sentencepiece_model.pb.go
@@ -331,7 +331,7 @@ type TrainerSpec struct {
 	// Reserved special meta tokens.
 	// * -1 is not used.
 	// * unk_id must not be -1.
-	// Id must starts with 0 and be contigous.
+	// Id must start with 0 and be contiguous.
 	UnkId    *int32  `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"`  // <unk>
 	BosId    *int32  `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"`  // <s>
 	EosId    *int32  `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"`  // </s>
--- a/convert/sentencepiece_model.proto
+++ b/convert/sentencepiece_model.proto
@@ -213,7 +213,7 @@ message TrainerSpec {
  // Reserved special meta tokens.
  // * -1 is not used.
  // * unk_id must not be -1.
-  // Id must starts with 0 and be contigous.
+  // Id must start with 0 and be contiguous.
  optional int32 unk_id = 40 [default = 0];   // <unk>
  optional int32 bos_id = 41 [default = 1];   // <s>
  optional int32 eos_id = 42 [default = 2];   // </s>
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -77,6 +77,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {

 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	var supported []string
+	depPaths := LibraryDirs()
 	libDir := ""

 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
@@ -300,8 +301,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			})
 			continue
 		}
-
-		if int(major) < RocmComputeMin {
+		minVer, err := strconv.Atoi(RocmComputeMajorMin)
+		if err != nil {
+			slog.Error("invalid RocmComputeMajorMin setting", "value", RocmComputeMajorMin, "error", err)
+		}
+		if int(major) < minVer {
 			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
 			slog.Warn(reason, "gpu", gpuID)
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
@@ -349,8 +353,9 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				})
 				return nil, err
 			}
+			depPaths = append(depPaths, libDir)
 		}
-		gpuInfo.DependencyPath = []string{libDir}
+		gpuInfo.DependencyPath = depPaths

 		if gfxOverride == "" {
 			// Only load supported list once
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@@ -50,12 +50,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Info(err.Error())
 		return nil, err
 	}
+	depPaths := LibraryDirs()
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
 		err = fmt.Errorf("unable to verify rocm library: %w", err)
 		slog.Warn(err.Error())
 		return nil, err
 	}
+	depPaths = append(depPaths, libDir)

 	var supported []string
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
@@ -111,7 +113,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,

 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: []string{libDir},
+				DependencyPath: depPaths,
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
@@ -182,7 +184,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
-		return nil
+		return err
 	}
 	defer hl.Release()

--- a/discover/cpu_common.go
+++ b/discover/cpu_common.go
@@ -5,21 +5,8 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
-
-	"golang.org/x/sys/cpu"
 )

-func GetCPUCapability() CPUCapability {
-	if cpu.X86.HasAVX2 {
-		return CPUCapabilityAVX2
-	}
-	if cpu.X86.HasAVX {
-		return CPUCapabilityAVX
-	}
-	// else LCD
-	return CPUCapabilityNone
-}
-
 func IsNUMA() bool {
 	if runtime.GOOS != "linux" {
 		// numa support in llama.cpp is linux only
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -16,12 +16,14 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"

 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/runners"
 )

 type cudaHandles struct {
@@ -45,7 +47,6 @@ const (
 var (
 	gpuMutex      sync.Mutex
 	bootstrapped  bool
-	cpuCapability CPUCapability
 	cpus          []CPUInfo
 	cudaGPUs      []CudaGPUInfo
 	nvcudaLibPath string
@@ -64,9 +65,13 @@ var (
 )

 // With our current CUDA compile flags, older than 5.0 will not work properly
-var CudaComputeMin = [2]C.int{5, 0}
+// (string values used to allow ldflags overrides at build time)
+var (
+	CudaComputeMajorMin = "5"
+	CudaComputeMinorMin = "0"
+)

-var RocmComputeMin = 9
+var RocmComputeMajorMin = "9"

 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
@@ -101,9 +106,9 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
-	libDir := LibraryDir()
-	if libDir != "" {
-		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
+	libDirs := LibraryDirs()
+	for _, d := range libDirs {
+		cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName))
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

@@ -219,16 +224,23 @@ func GetGPUInfo() GpuInfoList {

 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
+		cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
+		if err != nil {
+			slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
+		}
+		cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
+		if err != nil {
+			slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
+		}
 		bootstrapErrors = []error{}
 		needRefresh = false
-		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t

 		mem, err := GetCPUMem()
 		if err != nil {
 			slog.Warn("error looking up system memory", "error", err)
 		}
-		depPath := LibraryDir()
+		depPaths := LibraryDirs()
 		details, err := GetCPUDetails()
 		if err != nil {
 			slog.Warn("failed to lookup CPU details", "error", err)
@@ -238,24 +250,14 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo:        mem,
 					Library:        "cpu",
-					Variant:        cpuCapability.String(),
+					Variant:        runners.GetCPUCapability().String(),
 					ID:             "0",
-					DependencyPath: []string{depPath},
+					DependencyPath: depPaths,
 				},
 				CPUs: details,
 			},
 		}

-		// Fallback to CPU mode if we're lacking required vector extensions on x86
-		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled.  Required:%s  Detected:%s", GPURunnerCPUCapability, cpuCapability)
-			slog.Warn(err.Error())
-			bootstrapErrors = append(bootstrapErrors, err)
-			bootstrapped = true
-			// No need to do any GPU discovery, since we can't run on them
-			return GpuInfoList{cpus[0].GpuInfo}
-		}
-
 		// Load ALL libraries
 		cHandles = initCudaHandles()

@@ -292,19 +294,23 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
-				if depPath != "" {
-					gpuInfo.DependencyPath = []string{depPath}
+				if depPaths != nil {
+					gpuInfo.DependencyPath = depPaths
 					// Check for variant specific directory
 					if variant != "" {
-						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-							gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
+						for _, d := range depPaths {
+							if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil {
+								// Put the variant directory first in the search path to avoid runtime linking to the wrong library
+								gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...)
+								break
+							}
 						}
 					}
 				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.Variant = variant

-				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
 					unsupportedGPUs = append(unsupportedGPUs,
 						UnsupportedGPUInfo{
 							GpuInfo: gpuInfo.GpuInfo,
@@ -370,7 +376,7 @@ func GetGPUInfo() GpuInfoList {
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = []string{depPath}
+						gpuInfo.DependencyPath = depPaths
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
@@ -385,6 +391,8 @@ func GetGPUInfo() GpuInfoList {
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
 		}
+
+		// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
 	}

 	// For detected GPUs, load library if not loaded
@@ -509,7 +517,10 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	slog.Debug("Searching for GPU library", "name", baseLibName)

 	// Start with our bundled libraries
-	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
+	patterns := []string{}
+	for _, d := range LibraryDirs() {
+		patterns = append(patterns, filepath.Join(d, baseLibName))
+	}

 	switch runtime.GOOS {
 	case "windows":
@@ -705,32 +716,21 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	}
 }

-func LibraryDir() string {
-	// On Windows/linux we bundle the dependencies at the same level as the executable
-	appExe, err := os.Executable()
+func LibraryDirs() []string {
+	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
+	// This can be simplified once we no longer carry runners as payloads
+	exe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
+		return nil
 	}
-	cwd, err := os.Getwd()
-	if err != nil {
-		slog.Warn("failed to lookup working directory", "error", err)
+
+	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
+	if _, err := os.Stat(lib); err != nil {
+		return nil
 	}
-	// Scan for any of our dependeices, and pick first match
-	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
-		libDep := filepath.Join("lib", "ollama")
-		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
-			return filepath.Join(root, libDep)
-		}
-		// Developer mode, local build
-		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-	}
-	slog.Warn("unable to locate gpu dependency libraries")
-	return ""
+
+	return []string{lib}
 }

 func GetSystemInfo() SystemInfo {
--- a/discover/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@@ -15,6 +15,7 @@ import (
 	"syscall"

 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/runners"
 )

 const (
@@ -27,7 +28,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability().String(),
+				Variant: runners.GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@@ -50,7 +51,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability().String(),
+			Variant: runners.GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
--- a/discover/gpu_windows.go
+++ b/discover/gpu_windows.go
@@ -209,7 +209,7 @@ func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 		}
 	}

-	// Sumarize the results
+	// Summarize the results
 	for i, pkg := range packages {
 		slog.Info("", "package", i, "cores", pkg.coreCount, "efficiency", pkg.efficiencyCoreCount, "threads", pkg.threadCount)
 	}
--- a/discover/types.go
+++ b/discover/types.go
@@ -5,6 +5,7 @@ import (
 	"log/slog"

 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/runners"
 )

 type memInfo struct {
@@ -47,6 +48,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// TODO other performance capability info to help in scheduling decisions
 }

+func (gpu GpuInfo) RunnerName() string {
+	if gpu.Variant == "" { // no variant: the library name alone is the runner name
+		return gpu.Library
+	}
+	return gpu.Library + "_" + gpu.Variant // otherwise "<library>_<variant>"
+}
+
 type CPUInfo struct {
 	GpuInfo
 	CPUs []CPU
@@ -99,7 +107,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone.String() {
+		if info.Variant != runners.CPUCapabilityNone.String() {
 			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
@@ -140,29 +148,6 @@ func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }

-type CPUCapability uint32
-
-// Override at build time when building base GPU runners
-var GPURunnerCPUCapability = CPUCapabilityAVX
-
-const (
-	CPUCapabilityNone CPUCapability = iota
-	CPUCapabilityAVX
-	CPUCapabilityAVX2
-	// TODO AVX512
-)
-
-func (c CPUCapability) String() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "avx"
-	case CPUCapabilityAVX2:
-		return "avx2"
-	default:
-		return "no vector extensions"
-	}
-}
-
 type SystemInfo struct {
 	System          CPUInfo              `json:"system"`
 	GPUs            []GpuInfo            `json:"gpus"`
--- a/docs/api.md
+++ b/docs/api.md
@@ -45,7 +45,7 @@ Generate a response for a given prompt with a provided model. This is a streamin

 Advanced parameters (optional):

- `format`: the format to return a response in. Currently the only accepted value is `json`
+- `format`: the format to return a response in. Format can be `json` or a JSON schema
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
@@ -54,6 +54,10 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory

+#### Structured outputs
+
+Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
+
 #### JSON mode

 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
@@ -185,6 +189,52 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```

+#### Request (Structured outputs)
+
+##### Request
+
+```shell
+curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
+  "model": "llama3.1:8b",
+  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
+  "stream": false,
+  "format": {
+    "type": "object",
+    "properties": {
+      "age": {
+        "type": "integer"
+      },
+      "available": {
+        "type": "boolean"
+      }
+    },
+    "required": [
+      "age",
+      "available"
+    ]
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.1:8b",
+  "created_at": "2024-12-06T00:48:09.983619Z",
+  "response": "{\n  \"age\": 22,\n  \"available\": true\n}",
+  "done": true,
+  "done_reason": "stop",
+  "context": [1, 2, 3],
+  "total_duration": 1075509083,
+  "load_duration": 567678166,
+  "prompt_eval_count": 28,
+  "prompt_eval_duration": 236000000,
+  "eval_count": 16,
+  "eval_duration": 269000000
+}
+```
+
 #### Request (JSON mode)

 > [!IMPORTANT]
@@ -337,7 +387,6 @@ curl http://localhost:11434/api/generate -d '{
    "top_k": 20,
    "top_p": 0.9,
    "min_p": 0.0,
-    "tfs_z": 0.5,
    "typical_p": 0.7,
    "repeat_last_n": 33,
    "temperature": 0.8,
@@ -456,11 +505,15 @@ The `message` object has the following fields:

 Advanced parameters (optional):

- `format`: the format to return a response in. Currently the only accepted value is `json`
+- `format`: the format to return a response in. Format can be `json` or a JSON schema. 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

+### Structured outputs
+
+Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
+
 ### Examples

 #### Chat Request (Streaming)
@@ -551,6 +604,54 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```

+#### Chat request (Structured outputs)
+
+##### Request
+
+```shell
+curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
+  "model": "llama3.1",
+  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
+  "stream": false,
+  "format": {
+    "type": "object",
+    "properties": {
+      "age": {
+        "type": "integer"
+      },
+      "available": {
+        "type": "boolean"
+      }
+    },
+    "required": [
+      "age",
+      "available"
+    ]
+  },
+  "options": {
+    "temperature": 0
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.1",
+  "created_at": "2024-12-06T00:46:58.265747Z",
+  "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 2254970291,
+  "load_duration": 574751416,
+  "prompt_eval_count": 34,
+  "prompt_eval_duration": 1502000000,
+  "eval_count": 12,
+  "eval_duration": 175000000
+}
+```
+
 #### Chat request (With History)

 Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
--- a/docs/development.md
+++ b/docs/development.md
@@ -3,35 +3,24 @@
 Install required tools:

 - go version 1.22 or higher
- gcc version 11.4.0 or higher
+- OS specific C/C++ compiler (see below)
+- GNU Make


+## Overview
+
+Ollama uses a mix of Go and C/C++ code to interface with GPUs.  The C/C++ code is compiled with both CGO and GPU library specific compilers.  A set of GNU Makefiles is used to compile the project.  GPU libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary.  The default make target will build the runners and primary Go Ollama application that will run within the repo directory.  Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build.  You can adjust the job count based on your CPU core count to reduce build times.  If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location.  To learn more about the other make targets, use `make help`.
+
+Once you have built the GPU/CPU runners, you can compile the main application with `go build .` 
+
 ### MacOS

 [Download Go](https://go.dev/dl/)

-Optionally enable debugging and more verbose logging:
-
-```bash
-# At build time
-export CGO_CFLAGS="-g"
-
-# At runtime
-export OLLAMA_DEBUG=1
-```
-
-Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)
-
 ```bash
 make -j 5
 ```

-Then build ollama:
-
-```bash
-go build .
-```
-
 Now you can run `ollama`:

 ```bash
@@ -51,64 +40,42 @@ _Your operating system distribution may already have packages for NVIDIA CUDA. D
 Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 development and runtime packages.

-Typically the build scripts will auto-detect CUDA, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-
-Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
+Typically the makefile will auto-detect CUDA, however, if your Linux distro
+or installation approach uses alternative paths, you can specify the location by
+overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
+a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES=50;60;70`)

 ```
 make -j 5
 ```

-Then build the binary:
+If both v11 and v12 toolkits are detected, runners for both major versions will be built by default.  You can build just v12 with `make cuda_v12`

-```
-go build .
-```
+#### Older Linux CUDA (NVIDIA)
+
+To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11).  When you build Ollama, you will need to set two make variables to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`.  To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).

 #### Linux ROCm (AMD)

-_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
+_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_

-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
+Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.

 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `ROCM_PATH` to the location of the ROCm
-install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
-the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-
-Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
+setting the environment variable `HIP_PATH` to the location of the ROCm
+install (typically `/opt/rocm`). You can also customize
+the AMD GPU targets by setting `HIP_ARCHS` (e.g. `HIP_ARCHS=gfx1101;gfx1102`)

 ```
 make -j 5
 ```

-Then build the binary:
-
-```
-go build .
-```
-
 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

-#### Advanced CPU Settings
-
-By default, running `make` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. 
-
-Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition.
-
 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist`  and by default the script builds both arm64 and amd64 binaries.  If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`

 ### Windows

@@ -126,12 +93,8 @@ The following tools are required as a minimal development environment to build C
 > [!NOTE]  
 > Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.

-Then, build the `ollama` binary:
-
-```powershell
-$env:CGO_ENABLED="1"
-make -j 8
-go build .
+```
+make -j 5
 ```

 #### GPU Support
@@ -173,3 +136,30 @@ pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw
 ```

 You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
+
+
+## Advanced CPU Vector Settings
+
+On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load.  If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled.  This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility.  Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
+
+To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, set `CUSTOM_CPU_FLAGS` during the build.
+
+To build without any vector flags:
+
+```
+make CUSTOM_CPU_FLAGS=""
+```
+
+To build with both AVX and AVX2:
+```
+make CUSTOM_CPU_FLAGS=avx,avx2
+```
+
+To build with AVX512 features turned on:
+
+```
+make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
+```
+
+> [!NOTE]  
+> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -28,6 +28,7 @@ Check your compute compatibility to see if your card is supported:
 | 5.0                | GeForce GTX         | `GTX 750 Ti` `GTX 750` `NVS 810`                                                                            |
 |                    | Quadro              | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M`  |

+For building locally to support older GPUs, see [development.md](./development.md#older-linux-cuda-nvidia)

 ### GPU Selection

--- a/docs/linux.md
+++ b/docs/linux.md
@@ -10,6 +10,9 @@ curl -fsSL https://ollama.com/install.sh | sh

 ## Manual install

+> [!NOTE]
+> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
+
 Download and extract the package:

 ```shell
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -59,6 +59,40 @@ embeddings = client.embeddings.create(
    input=["why is the sky blue?", "why is the grass green?"],
 )
 ```
+#### Structured outputs
+```py
+from pydantic import BaseModel
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
+
+# Define the schema for the response
+class FriendInfo(BaseModel):
+    name: str
+    age: int 
+    is_available: bool
+
+class FriendList(BaseModel):
+    friends: list[FriendInfo]
+
+try:
+    completion = client.beta.chat.completions.parse(
+        temperature=0,
+        model="llama3.1:8b",
+        messages=[
+            {"role": "user", "content": "I have two friends. The first is Ollama 22 years old busy saving the world, and the second is Alonso 23 years old and wants to hang out. Return a list of friends in JSON format"}
+        ],
+        response_format=FriendList,
+    )
+
+    friends_response = completion.choices[0].message
+    if friends_response.parsed:
+        print(friends_response.parsed)
+    elif friends_response.refusal:
+        print(friends_response.refusal)
+except Exception as e:
+    print(f"Error: {e}")
+```

 ### OpenAI JavaScript library

@@ -181,7 +215,7 @@ curl http://localhost:11434/v1/embeddings \
 - [x] JSON mode
 - [x] Reproducible outputs
 - [x] Vision
- [x] Tools (streaming support coming soon)
+- [x] Tools
 - [ ] Logprobs

 #### Supported request fields
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -80,7 +80,7 @@ If you are using a container to run Ollama, make sure you've set up the containe

 Sometimes the Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem

- If you are using a container, is the container runtime working?  Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU.
+- If you are using a container, is the container runtime working?  Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
 - Is the uvm driver loaded? `sudo nvidia-modprobe -u`
 - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
 - Try rebooting
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -83,3 +83,6 @@ If you'd like to install or integrate Ollama as a service, a standalone
 and GPU library dependencies for Nvidia and AMD. This allows for embedding
 Ollama in existing applications, or running it as a system service via `ollama
 serve` with tools such as [NSSM](https://nssm.cc/).
+
+> [!NOTE]  
+> If you are upgrading from a prior version, you should remove the old directories first.
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -175,7 +175,6 @@ func String(s string) func() string {

 var (
 	LLMLibrary = String("OLLAMA_LLM_LIBRARY")
-	TmpDir     = String("OLLAMA_TMPDIR")

 	CudaVisibleDevices    = String("CUDA_VISIBLE_DEVICES")
 	HipVisibleDevices     = String("HIP_VISIBLE_DEVICES")
@@ -250,7 +249,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
-		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},

 		// Informational
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,3 +1,14 @@
 # Examples

 This directory contains different examples of using Ollama.
+
+## Python examples
+Ollama Python examples at [ollama-python/examples](https://github.com/ollama/ollama-python/tree/main/examples)
+
+
+## JavaScript examples
+Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/ollama-js/tree/main/examples)
+
+
+## OpenAI compatibility examples
+Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)
--- a/examples/python-grounded-factuality-rag-check/README.md
+++ b/examples/python-grounded-factuality-rag-check/README.md
@@ -1,6 +1,6 @@
 # RAG Hallucination Checker using Bespoke-Minicheck

-This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retreived from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations. 
+This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retrieved from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.

 ## Running the Example

--- a/examples/python-grounded-factuality-rag-check/main.py
+++ b/examples/python-grounded-factuality-rag-check/main.py
@@ -115,7 +115,7 @@ if __name__ == "__main__":

        print(f"\nRetrieved chunks: \n{sourcetext}\n")

-        # Give the retreived chunks and question to the chat model
+        # Give the retrieved chunks and question to the chat model
        system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"

        ollama_response = ollama.generate(
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -1,15 +1,15 @@
-package llm
+package ggml

 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"log/slog"
 	"slices"
 	"strings"
-	"sync"

-	"github.com/ollama/ollama/util/bufioutil"
+	"github.com/ollama/ollama/fs/util/bufioutil"
 )

 type GGML struct {
@@ -19,145 +19,168 @@ type GGML struct {

 type model interface {
 	KV() KV
-	Tensors() *Tensors
+	Tensors() Tensors
 }

 type KV map[string]any

-func (kv KV) u64(key string) uint64 {
-	switch v := kv[key].(type) {
-	case uint64:
-		return v
-	case uint32:
-		return uint64(v)
-	case float64:
-		return uint64(v)
-	default:
-		return 0
-	}
-}
-
 func (kv KV) Architecture() string {
-	if s, ok := kv["general.architecture"].(string); ok {
-		return s
-	}
-
-	return "unknown"
+	return kv.String("general.architecture", "unknown")
 }

 func (kv KV) Kind() string {
-	if s, ok := kv["general.type"].(string); ok {
-		return s
-	}
-
-	return "unknown"
+	return kv.String("general.kind", "unknown")
 }

 func (kv KV) ParameterCount() uint64 {
-	return kv.u64("general.parameter_count")
+	return keyValue[uint64](kv, "general.parameter_count")
 }

 func (kv KV) FileType() fileType {
-	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
+	if t := kv.Uint("general.file_type"); t > 0 {
+		return fileType(t)
 	}

 	return fileTypeUnknown
 }

 func (kv KV) BlockCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
+	return uint64(kv.Uint("block_count"))
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return uint64(kv.Uint("embedding_length"))
 }

 func (kv KV) HeadCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
+	return uint64(kv.Uint("attention.head_count"))
 }

 func (kv KV) HeadCountKV() uint64 {
-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
-		return headCountKV
-	}
-
-	return 1
+	return uint64(kv.Uint("attention.head_count_kv", 1))
 }

 func (kv KV) EmbeddingHeadCount() uint64 {
 	if heads := kv.HeadCount(); heads > 0 {
-		return kv.EmbeddingLength() / kv.HeadCount()
+		return kv.EmbeddingLength() / heads
 	}

 	return 0
 }

 func (kv KV) EmbeddingHeadCountK() uint64 {
-	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
-		return k
-	}
-
-	return kv.EmbeddingHeadCount()
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
 }

 func (kv KV) EmbeddingHeadCountV() uint64 {
-	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
-		return v
-	}
-
-	return kv.EmbeddingHeadCount()
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
 }

 func (kv KV) GQA() uint64 {
 	return kv.HeadCount() / kv.HeadCountKV()
 }

-func (kv KV) EmbeddingLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
-}
-
 func (kv KV) ContextLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
+	return uint64(kv.Uint("context_length"))
 }

 func (kv KV) ChatTemplate() string {
-	s, _ := kv["tokenizer.chat_template"].(string)
+	return kv.String("tokenizer.chat_template")
+}
+
+func (kv KV) String(key string, defaultValue ...string) string {
+	return keyValue(kv, key, append(defaultValue, "")...)
+}
+
+func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
+	return keyValue(kv, key, append(defaultValue, 0)...)
+}
+
+func (kv KV) Float(key string, defaultValue ...float32) float32 {
+	return keyValue(kv, key, append(defaultValue, 0)...)
+}
+
+func (kv KV) Strings(key string, defaultValue ...[]string) []string {
+	r := keyValue(kv, key, &array{})
+	s := make([]string, r.size)
+	for i := range r.size {
+		s[i] = r.values[i].(string)
+	}
+
 	return s
 }

-type Tensors struct {
-	Items  []*Tensor
-	Offset uint64
+func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]uint32, r.size)
+	for i := range r.size {
+		s[i] = uint32(r.values[i].(int32))
+	}

-	layers     map[string]Layer
-	layersOnce sync.Once
+	return s
 }

-func (ts *Tensors) Layers() map[string]Layer {
-	ts.layersOnce.Do(func() {
-		ts.layers = make(map[string]Layer)
-		for _, t := range ts.Items {
-			parts := strings.Split(t.Name, ".")
-			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
-				if len(parts) > index+2 {
-					// blk and mm should have a number after them, join it
-					parts = append(
-						[]string{strings.Join(parts[:index+2], ".")},
-						parts[index+2:]...)
-				}
-			}
+func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
+	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
+		key = kv.Architecture() + "." + key
+	}

-			if _, ok := ts.layers[parts[0]]; !ok {
-				ts.layers[parts[0]] = make(Layer)
-			}
+	if val, ok := kv[key]; ok {
+		return val.(T)
+	}

-			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	slog.Warn("key not found", "key", key, "default", defaultValue[0])
+	return defaultValue[0]
+}
+
+type Tensors struct {
+	items  []*Tensor
+	Offset uint64
+}
+
+func (s Tensors) Items(prefix ...string) []*Tensor {
+	if len(prefix) == 0 {
+		return s.items
+	}
+
+	var items []*Tensor
+	for _, t := range s.items {
+		if strings.HasPrefix(t.Name, prefix[0]) {
+			items = append(items, t)
 		}
-	})
+	}

-	return ts.layers
+	return items
+}
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts.items {
+		parts := strings.Split(t.Name, ".")
+		if i := slices.Index(parts, "blk"); i > 0 {
+			parts = append([]string{
+				strings.Join(parts[:i], "."),
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
+		} else if i == 0 {
+			parts = append([]string{
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
 }

 type Layer map[string]*Tensor

-func (l Layer) size() (size uint64) {
+func (l Layer) Size() (size uint64) {
 	for _, t := range l {
 		size += t.Size()
 	}
@@ -255,8 +278,6 @@ func (t Tensor) typeSize() uint64 {
 		return 8
 	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
-		return 2
 	default:
 		return 0
 	}
@@ -295,7 +316,7 @@ const (

 var ErrUnsupportedFormat = errors.New("unsupported model format")

-func DetectGGMLType(b []byte) string {
+func DetectContentType(b []byte) string {
 	switch binary.LittleEndian.Uint32(b[:4]) {
 	case FILE_MAGIC_GGML:
 		return "ggml"
@@ -312,12 +333,12 @@ func DetectGGMLType(b []byte) string {
 	}
 }

-// DecodeGGML decodes a GGML model from the given reader.
+// Decode decodes a GGML model from the given reader.
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	if maxArraySize == 0 {
 		maxArraySize = 1024
 	}
@@ -331,10 +352,6 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {

 	var c container
 	switch magic {
-	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
-	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
 		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
@@ -530,21 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }

 // SupportsKVCacheType checks if the requested cache type is supported
-func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
-	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
-	return slices.Contains(validKVCacheTypes, cacheType)
+func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }

 // SupportsFlashAttention checks if the model supports flash attention
-func (ggml GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
+func (llm GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}

 	// Check head counts match and are non-zero
-	headCountK := ggml.KV().EmbeddingHeadCountK()
-	headCountV := ggml.KV().EmbeddingHeadCountV()
+	headCountK := llm.KV().EmbeddingHeadCountK()
+	headCountV := llm.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }

--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -1,4 +1,4 @@
-package llm
+package ggml

 import (
 	"bytes"
@@ -8,10 +8,9 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"maps"
 	"slices"
 	"strings"
-
-	"golang.org/x/exp/maps"
 )

 type containerGGUF struct {
@@ -110,9 +109,9 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }

-func (llm *gguf) Tensors() *Tensors {
-	return &Tensors{
-		Items:  llm.tensors,
+func (llm *gguf) Tensors() Tensors {
+	return Tensors{
+		items:  llm.tensors,
 		Offset: llm.tensorOffset,
 	}
 }
@@ -523,7 +522,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 		return err
 	}

-	keys := maps.Keys(kv)
+	keys := slices.Collect(maps.Keys(kv))
 	slices.Sort(keys)

 	for _, key := range keys {
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -1,4 +1,4 @@
-package llm
+package ggml

 import "fmt"

@@ -32,10 +32,9 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
-	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ2_M
 	fileTypeIQ4_XS
+	fileTypeIQ2_M
 	fileTypeIQ1_M
 	fileTypeBF16

@@ -94,8 +93,6 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
-	case "IQ3_M":
-		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
 	case "IQ4_XS":
@@ -163,8 +160,6 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
-	case fileTypeIQ3_M:
-		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
--- a/fs/util/bufioutil/buffer_seeker.go
+++ b/fs/util/bufioutil/buffer_seeker.go
--- a/fs/util/bufioutil/buffer_seeker_test.go
+++ b/fs/util/bufioutil/buffer_seeker_test.go
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/ollama/ollama

-go 1.22.8
+go 1.23.4

 require (
 	github.com/containerd/console v1.0.3
@@ -18,11 +18,14 @@ require (
 require (
 	github.com/agnivade/levenshtein v1.1.1
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
+	github.com/dlclark/regexp2 v1.11.4
+	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	golang.org/x/image v0.22.0
+	gonum.org/v1/gonum v0.15.0
 )

 require (
@@ -42,7 +45,6 @@ require (
 	github.com/xtgo/set v1.0.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
-	gonum.org/v1/gonum v0.15.0 // indirect
 	gorgonia.org/vecf32 v0.9.0 // indirect
 	gorgonia.org/vecf64 v0.9.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -42,8 +42,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
+github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
+github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
 github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
+github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
+github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -207,7 +207,7 @@ func TestMultiModelStress(t *testing.T) {
 		chosenModels = mediumModels
 		// default:
 		// 	slog.Info("selecting large models")
-		// 	chosenModels = largModels
+		// 	chosenModels = largeModels
 	}

 	req, resp := GenerateRequests()
@@ -232,7 +232,7 @@ func TestMultiModelStress(t *testing.T) {
 	var wg sync.WaitGroup
 	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
 	for i := 0; i < len(req); i++ {
-		// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
+		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
 		if i > 1 && consumed > maxVram {
 			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
 			break
--- a/integration/max_queue_test.go
+++ b/integration/max_queue_test.go
@@ -20,7 +20,7 @@ import (

 func TestMaxQueue(t *testing.T) {
 	if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
-		t.Skip("Max Queue test requires spawing a local server so we can adjust the queue size")
+		t.Skip("Max Queue test requires spawning a local server so we can adjust the queue size")
 		return
 	}

@@ -67,7 +67,7 @@ func TestMaxQueue(t *testing.T) {
 	busyCount := 0
 	resetByPeerCount := 0
 	canceledCount := 0
-	succesCount := 0
+	successCount := 0
 	counterMu := sync.Mutex{}
 	var embedwg sync.WaitGroup
 	for i := 0; i < threadCount; i++ {
@@ -88,7 +88,7 @@ func TestMaxQueue(t *testing.T) {
 			defer counterMu.Unlock()
 			switch {
 			case genErr == nil:
-				succesCount++
+				successCount++
 				require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
 			case errors.Is(genErr, context.Canceled):
 				canceledCount++
@@ -107,7 +107,7 @@ func TestMaxQueue(t *testing.T) {
 	slog.Info("generate done, waiting for embeds")
 	embedwg.Wait()

-	slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
+	slog.Info("embeds completed", "success", successCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
 	require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
 	require.True(t, busyCount > 0, "no requests hit busy error but some should have")
 	require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
--- a/llama/Makefile
+++ b/llama/Makefile
@@ -1,57 +0,0 @@
-# top level makefile for Go server
-include make/common-defs.make
-
-RUNNER_TARGETS := default
-
-# Determine which if any GPU runners we should build
-ifeq ($(OS),windows)
-	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
-	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
-	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
-	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-else ifeq ($(OS),linux)
-	HIP_PATH?=/opt/rocm
-	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-	CUDA_PATH?=/usr/local/cuda
-	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
-endif
-
-ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
-ifneq ($(CUDA_11),)
-	RUNNER_TARGETS += cuda_v11
-endif
-ifneq ($(CUDA_12),)
-	RUNNER_TARGETS += cuda_v12
-endif
-endif
-ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
-ifneq ($(HIP_LIB_DIR),)
-	RUNNER_TARGETS += rocm
-endif
-endif
-
-
-all: clean-payload .WAIT runners
-
-runners: $(RUNNER_TARGETS)
-
-$(RUNNER_TARGETS):
-	$(MAKE) -f make/Makefile.$@
-
-help-sync apply-patches create-patches sync:
-	$(MAKE) -f make/Makefile.sync $@
-
-clean:
-	rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-	go clean -cache
-
-clean-payload:
-	rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)
-
-.PHONY: all runners clean clean-payload $(RUNNER_TARGETS) .WAIT
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
--- a/llama/README.md
+++ b/llama/README.md
@@ -37,8 +37,7 @@ go build -tags avx .
 ```shell
 # go doesn't recognize `-mfma` as a valid compiler flag
 # see https://github.com/golang/go/issues/17895
-go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c"
-go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
+go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
 go build -tags=avx,avx2 .
 ```

--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "3f1ae2e32cde00c39b96be6d01c2997c29bae555";
+char const *LLAMA_COMMIT = "40c6d79fb52f995f47507fedfeaae2ac05d9b35c";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/ggml-aarch64.h
+++ b/llama/ggml-aarch64.h
@@ -1,65 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
-#pragma once
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-
-#include "ggml.h"
-
-// GGML internal header
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Quantization
-void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
-
-// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-
-// GEMV
-void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-// GEMM
-void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-#ifdef __cplusplus
-}
-#endif
-
--- a/llama/ggml-backend-impl.h
+++ b/llama/ggml-backend-impl.h
@@ -1,180 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        // allocate a buffer of this type
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        // tensor alignment
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
-        // max buffer size that can be allocated
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
-        // data size needed to allocate the tensor, including padding
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // check if tensor data is in host memory
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)      (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)   (ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)      (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
-        void         (*GGML_CALL set_tensor)    (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor)    (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor)    (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)         (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)         (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
-        void (*GGML_CALL free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan (not used currently)
-        // create a new plan for a graph
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
-        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
-        // compute the graph with the plan
-        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend can compute an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend can use tensors allocated in a buffer type
-        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // (optional) event synchronization
-        // create a new event that can record events on this backend instance
-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        // record an event on the backend instance that created it
-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
-        // wait for an event on on a different backend instance
-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
-        // block until an event is recorded
-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
-    };
-
-    struct ggml_backend {
-        ggml_guid_t guid;
-
-        struct ggml_backend_i iface;
-        ggml_backend_context_t context;
-    };
-
-    struct ggml_backend_event {
-        ggml_backend_t backend;
-        void * context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/llama/ggml-backend.h
+++ b/llama/ggml-backend.h
@@ -1,267 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend_event * ggml_backend_event_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
-
-    // buffer
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
-    };
-
-    GGML_API           const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API           void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API           void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API           bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API           void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API           enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
-    GGML_API           ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API           void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
-
-    //
-    // Backend
-    //
-
-    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    // "offset" refers to the offset of the tensor data for setting/getting data
-    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
-    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // asynchronous copy
-    // the copy is performed after all the currently queued operations in backend_src
-    // backend_dst will wait for the copy to complete before performing other operations
-    // automatic fallback to sync copy if async is not supported
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // events
-    GGML_API ggml_backend_event_t   ggml_backend_event_new        (ggml_backend_t backend);
-    GGML_API void                   ggml_backend_event_free       (ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_record     (ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event);
-
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    // Create a backend buffer from an existing pointer
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backends to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-        // preferrably to run on the same backend as the buffer
-        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
-
-        // initialize buffers from a max size graph (optional)
-        reserve_graph = build_graph(sched, max_batch_size);
-
-        // manually assign nodes to a backend (optional, should not be needed in most cases)
-        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
-
-        ggml_backend_sched_reserve(sched, reserve_graph);
-
-        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
-
-        // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
-    }
-    */
-
-    struct ggml_backend_sched;
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // when ask == true, the scheduler wants to know if the user wants to observe this node
-    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
-    //
-    // when ask == false, the scheduler is passing the node tensor to the user for observation
-    // if the user returns false, the scheduler will cancel the graph compute
-    //
-    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
-    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
-    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
-
-    // Initialize backend buffers from a measure graph
-    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-
-    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
-
-    // Get the number of splits of the last graph
-    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
-
-    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-
-    // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    // Set a callback to be called for each resulting node during graph compute
-    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
-
-
-#ifdef  __cplusplus
-}
-#endif
--- a/llama/ggml-blas.h
+++ b/llama/ggml-blas.h
@@ -1,49 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
-
-GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
-
-// number of threads used for conversion to float
-// for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
-
-
-#ifdef  __cplusplus
-}
-#endif
--- a/llama/ggml-cpu-impl.h
+++ b/llama/ggml-cpu-impl.h
@@ -1,640 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-// GGML CPU internal header
-
-#include "ggml.h"
-#include "ggml-impl.h"
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-//#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if defined(_MSC_VER)
-
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───┐
- *     0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───────────────────┐
- *     0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- *       ┌sign
- *       │
- *       │  ┌exponent
- *       │  │
- *       │  │    ┌mantissa
- *       │  │    │
- *       │┌─┴─┐┌─┴──────┐
- *     0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This is binary identical with Google Brain float conversion.
- * Floats shall round to nearest even, and NANs shall be quiet.
- * Subnormals aren't flushed to zero, except perhaps when used.
- * This code should vectorize nicely if using modern compilers.
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
-    ggml_bf16_t h;
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.f = s;
-    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
-        h.bits = (u.i >> 16) | 64; /* force to quiet */
-        return h;
-    }
-    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
-    return h;
-}
-
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#include <sys/prctl.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddlvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddlvq_s16(int16x8_t v) {
-    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
-    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-#define ggml_vqtbl1q_s8   vqtbl1q_s8
-#define ggml_vqtbl1q_u8   vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
-/* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
-}
-
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
-}
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
-
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif // __ARM_FEATURE_SVE
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -1,75 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#elif defined(GGML_USE_MUSA)
-#define GGML_CUDA_NAME "MUSA"
-#define GGML_CUBLAS_NAME "muBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-// device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
-
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
-
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
-#ifdef  __cplusplus
-}
-#endif
--- a/llama/ggml-cuda/acc.cuh
+++ b/llama/ggml-cuda/acc.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_ACC_BLOCK_SIZE 256
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/arange.cu
+++ b/llama/ggml-cuda/arange.cu
@@ -1,60 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arange.cuh"
-
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
-}
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    float start;
-    float stop;
-    float step;
-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
-    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
-    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
-}
--- a/llama/ggml-cuda/arange.cuh
+++ b/llama/ggml-cuda/arange.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_ARANGE_BLOCK_SIZE 256
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/argsort.cuh
+++ b/llama/ggml-cuda/argsort.cuh
@@ -1,29 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/binbcast.cuh
+++ b/llama/ggml-cuda/binbcast.cuh
@@ -1,35 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/clamp.cu
+++ b/llama/ggml-cuda/clamp.cu
@@ -1,60 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "clamp.cuh"
-
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
-    // One thread per element: writes dst[idx] = x[idx] limited to [min, max] for idx < k.
-    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
-    if (idx < k) { // threads whose index falls past the last element do nothing
-        const float v = x[idx];
-        // The lower bound is tested first, then the upper bound; otherwise the value passes through.
-        dst[idx] = v < min ? min : (v > max ? max : v);
-    }
-}
-
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
-    const int grid_size = (CUDA_CLAMP_BLOCK_SIZE - 1 + k) / CUDA_CLAMP_BLOCK_SIZE; // k / block size rounded up, so every element gets a thread
-    clamp_f32<<<grid_size, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k); // no dynamic shared memory; launched on the caller's stream
-}
-
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    // Clamp op entry point: the tensor to clamp is dst's first source operand.
-    const ggml_tensor * src0 = dst->src[0];
-    const float * in_data  = (const float *) src0->data;
-    float       * out_data = (float *) dst->data;
-    cudaStream_t queue = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32); // only f32 input ...
-    GGML_ASSERT( dst->type == GGML_TYPE_F32); // ... and f32 output are handled
-
-    // op_params holds the two bounds as consecutive floats: [0] is the lower bound, [1] the upper bound.
-    float bounds[2];
-    memcpy(bounds, dst->op_params, sizeof(bounds));
-
-    clamp_f32_cuda(in_data, out_data, bounds[0], bounds[1], ggml_nelements(src0), queue);
-}
--- a/llama/ggml-cuda/clamp.cuh
+++ b/llama/ggml-cuda/clamp.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_CLAMP_BLOCK_SIZE 256 // threads per block used by clamp_f32_cuda when it launches clamp_f32
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // clamp entry point; defined in clamp.cu
--- a/llama/ggml-cuda/concat.cuh
+++ b/llama/ggml-cuda/concat.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_CONCAT_BLOCK_SIZE 256 // presumably threads per block for the concat kernels; its use is not visible in this header - TODO confirm
-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // prototype only; the definition is not part of this header
--- a/llama/ggml-cuda/conv-transpose-1d.cuh
+++ b/llama/ggml-cuda/conv-transpose-1d.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256 // NOTE(review): the name is spelled TRANPOSE (sic); presumably threads per block for the kernel - TODO confirm
-
-void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // prototype only; the definition is not part of this header
--- a/llama/ggml-cuda/convert.cuh
+++ b/llama/ggml-cuda/convert.cuh
@@ -1,39 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 // presumably threads per block for the dequantize kernels; its use is not visible in this header - TODO confirm
-
-template<typename T> // T is the element type written to the output buffer
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); // function pointer: untyped input x, typed output y, count k (presumably in elements - TODO confirm), CUDA stream
-
-typedef to_t_cuda_t<float> to_fp32_cuda_t; // converter signature with float output
-typedef to_t_cuda_t<half> to_fp16_cuda_t; // converter signature with half output
-
-to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); // looks up the to-half converter for a ggml type; result for unsupported types is not visible here
-
-to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); // looks up the to-float converter for a ggml type; result for unsupported types is not visible here
--- a/llama/ggml-cuda/cpy.cuh
+++ b/llama/ggml-cuda/cpy.cuh
@@ -1,35 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_CPY_BLOCK_SIZE 32 // presumably threads per block for the copy kernels; its use is not visible in this header - TODO confirm
-
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); // src0 is const and src1 is not, so src1 is presumably the copy destination - TODO confirm
-
-void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // prototype only; the definition is not part of this header
-
-void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); // returns an untyped pointer, presumably to the kernel chosen for this tensor pair - TODO confirm
--- a/llama/ggml-cuda/cross-entropy-loss.cuh
+++ b/llama/ggml-cuda/cross-entropy-loss.cuh
@@ -1,33 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256 // presumably threads per block for the loss kernels; its use is not visible in this header - TODO confirm
-
-void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // prototype only; the definition is not part of this header
-
-void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // presumably the backward pass of the loss above - TODO confirm
--- a/llama/ggml-cuda/diagmask.cuh
+++ b/llama/ggml-cuda/diagmask.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 // presumably threads per block for the diag-mask kernel; its use is not visible in this header - TODO confirm
-
-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); // prototype only; the definition is not part of this header
--- a/llama/ggml-cuda/dmmv.cu
+++ b/llama/ggml-cuda/dmmv.cu
@@ -1,709 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "dmmv.cuh"
-#include "dequantize.cuh"
-#include "convert.cuh"
-
-#ifndef K_QUANTS_PER_ITERATION // may be supplied by the build; otherwise default it below
-#define K_QUANTS_PER_ITERATION 2 // default used by the k-quant kernels in this file
-#else // a value was supplied externally: reject anything other than 1 or 2 at compile time
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-    // Q2_K matrix-vector product: dst[row] = dot(row of block_q2_K matrix vx, float vector yy); ncols is the row length, nrows the row count.
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return; // valid rows are 0..nrows-1; the former '>' let row == nrows read and write one row past the end
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;       // byte view of aux[0..1]: low nibbles of the scale bytes
-    const uint8_t * m = (const uint8_t *)(aux + 2); // byte view of aux[2..3]: high nibbles of the scale bytes
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2; // scaled quant sum minus scaled offset sum for this block
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-    // Q3_K matrix-vector product: dst[row] = dot(row of block_q3_K matrix vx, float vector yy); ncols is the row length, nrows the row count.
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return; // valid rows are 0..nrows-1; the former '>' let row == nrows read and write one row past the end
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);                       // base bit tested in hmask: bit 0 or bit 4
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;             // byte view of the 8 scales rebuilt below
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-    // Q4_K matrix-vector product: dst[row] = dot(row of block_q4_K matrix vx, float vector yy); ncols is the row length, nrows the row count.
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return; // valid rows are 0..nrows-1; the former '>' let row == nrows read and write one row past the end
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;           // byte view of the 8 scale/min values rebuilt below
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0; // high nibbles stay unshifted (value * 16); compensated by the 1.f/16.f factors below
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
-            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0; // high nibbles stay unshifted (value * 16); compensated by the 1.f/16.f factors below
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) { // single writer: the former 'tid == 0' also matched threadIdx.x == 1 when K_QUANTS_PER_ITERATION == 2
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-    // Q5_K matrix-vector product: dst[row] = dot(row of block_q5_K matrix vx, float vector yy); ncols is the row length.
-    const int row = blockIdx.x; // one block per row; there is no nrows parameter and no bounds check here
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/2;  // 0...15
-    const int ix  = threadIdx.x%2;  // hard-coded split by 2; K_QUANTS_PER_ITERATION is not used in this kernel
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im); // qh bit tested for the y1 terms: bit 0 or bit 2
-    const uint8_t hm2  = hm1 << 4;    // qh bit tested for the y2 terms: bit 4 or bit 6
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux; // byte view of the 8 scale/min values rebuilt below
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16; // byte view of the 16 low-nibble values extracted below
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32; // 32 uint16_t = 64 bytes further into qs
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) // a set qh bit adds 16, i.e. the fifth bit of the quant
-                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-    // Q6_K matrix-vector product: dst[row] = dot(row of block_q6_K matrix vx, float vector yy); ncols is the row length, nrows the row count.
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return; // valid rows are 0..nrows-1; the former '>' let row == nrows read and write one row past the end
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) { // each quant: 4 bits from ql plus 2 bits from qh, re-centered by -32
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) { // single writer: the former 'tid == 0' also matched threadIdx.x == 1 when K_QUANTS_PER_ITERATION == 2
-        dst[row] = tmp;
-    }
-}
-
-static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    // Loads two consecutive half values starting at element ib + iqs of vx into v.
-    const half * src = (const half *) vx + ib + iqs;
-    // the half -> float conversion happens implicitly on assignment when dfloat is float
-    v.x = src[0];
-    v.y = src[1];
-}
-
-static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { // compile-time map from a ggml type to its dequantize function
-    return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 : // kept as one return expression; each row pairs a type with its function
-        type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
-        type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
-        type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
-        type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
-        type == GGML_TYPE_F16 ? convert_f16 : // F16 needs no dequantization, only the half load in convert_f16
-        nullptr; // any type not listed above yields a null function pointer
-}
-
-template <ggml_type type>
-static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-    constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
-    constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
-    constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
-
-    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
-#else
-            tmp += v.x * y[iybs + iqs + j/qr + 0];
-            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q4_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q4_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32, 1, 1);
-    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<GGML_TYPE_F16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-    GGML_UNUSED(ctx);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
-    half * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-        GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ABORT("fatal error");
-            break;
-    }
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_ddq_i);
-    GGML_UNUSED(src1_ncols);
-    GGML_UNUSED(src1_padded_row_size);
-}
-
-bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
-    return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
-        src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
-        src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
-        src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
-        src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
-        src0_type == GGML_TYPE_F16;
-}
--- a/llama/ggml-cuda/dmmv.cuh
+++ b/llama/ggml-cuda/dmmv.cuh
@@ -1,46 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-// dmmv = dequantize_mul_mat_vec
-
-// TODO: remove this?
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
-
-bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
--- a/llama/ggml-cuda/fattn-tile-f16.cuh
+++ b/llama/ggml-cuda/fattn-tile-f16.cuh
@@ -1,29 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/fattn-tile-f32.cuh
+++ b/llama/ggml-cuda/fattn-tile-f32.cuh
@@ -1,29 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/fattn.cuh
+++ b/llama/ggml-cuda/fattn.cuh
@@ -1,29 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/getrows.cuh
+++ b/llama/ggml-cuda/getrows.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-
-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/im2col.cuh
+++ b/llama/ggml-cuda/im2col.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_IM2COL_BLOCK_SIZE 256
-
-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/mmvq.cuh
+++ b/llama/ggml-cuda/mmvq.cuh
@@ -1,35 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
-
-void ggml_cuda_op_mul_mat_vec_q(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
--- a/llama/ggml-cuda/norm.cuh
+++ b/llama/ggml-cuda/norm.cuh
@@ -1,33 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/opt-step-adamw.cu
+++ b/llama/ggml-cuda/opt-step-adamw.cu
@@ -1,106 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "opt-step-adamw.cuh"
-
-#include <cstdint>
-
-static __global__ void opt_step_adamw_f32(
-    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
-    const float beta1h, const float beta2h) {
-
-    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    const float gi = g[i];
-    const float gmi = g_m[i]*beta1 +    gi*(1.0f - beta1);
-    const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
-
-    g_m[i] = gmi;
-    g_v[i] = gvi;
-
-    const float mh =       gmi*beta1h;
-    const float vh = sqrtf(gvi*beta2h) + eps;
-
-    x[i] = x[i]*(1.0f - alpha*wd) - mh/vh;
-}
-
-static void opt_step_adamw_f32_cuda(
-    float * x, const float * g, float * g_m, float * g_v, const int64_t k,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
-    const float beta1h, const float beta2h, cudaStream_t stream) {
-
-    const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
-    const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
-    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h);
-}
-
-void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0        = dst->src[0];
-    const ggml_tensor * src0_grad   = dst->src[1];
-    const ggml_tensor * src0_grad_m = dst->src[2];
-    const ggml_tensor * src0_grad_v = dst->src[3];
-
-    GGML_ASSERT(src0->type        == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad->type   == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src0_grad));
-    GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
-    GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
-    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
-    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
-    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
-
-    float       * src0_d        = (float       *) src0->data;
-    const float * src0_grad_d   = (const float *) src0_grad->data;
-    float       * src0_grad_m_d = (float       *) src0_grad_m->data;
-    float       * src0_grad_v_d = (float       *) src0_grad_v->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    const int64_t ne = ggml_nelements(src0);
-
-    int64_t iter;  memcpy(&iter,  &dst->op_params[0], sizeof(int64_t));
-    float   alpha; memcpy(&alpha, &dst->op_params[2], sizeof(float));
-    float   beta1; memcpy(&beta1, &dst->op_params[3], sizeof(float));
-    float   beta2; memcpy(&beta2, &dst->op_params[4], sizeof(float));
-    float   eps;   memcpy(&eps,   &dst->op_params[5], sizeof(float));
-    float   wd;    memcpy(&wd,    &dst->op_params[6], sizeof(float));
-
-    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
-    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
-
-    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream);
-
-    iter++;
-    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
-}
--- a/llama/ggml-cuda/opt-step-adamw.cuh
+++ b/llama/ggml-cuda/opt-step-adamw.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256
-
-void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/out-prod.cuh
+++ b/llama/ggml-cuda/out-prod.cuh
@@ -1,29 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/pad.cuh
+++ b/llama/ggml-cuda/pad.cuh
@@ -1,32 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_PAD_BLOCK_SIZE 256
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/llama/ggml-cuda/pool2d.cuh
+++ b/llama/ggml-cuda/pool2d.cuh
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_POOL2D_BLOCK_SIZE 256
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Michael Yang	6f09d63862	draft: mlx	2024-12-31 11:13:09 -08:00
Michael Yang	67bcb55941	no runners	2024-12-21 01:31:48 -08:00
Michael Yang	cef3cf353a	estimate log value	2024-12-21 01:31:48 -08:00
Michael Yang	512bbb8044	add log sink	2024-12-21 01:31:25 -08:00
Michael Yang	bca6ed0ccc	llama.cpp subdirs	2024-12-20 16:56:03 -08:00
Michael Yang	756bfebe1b	backend libraries	2024-12-20 16:28:53 -08:00
Michael Yang	054e31146e	build: recursive make ggml-cuda	2024-12-20 16:28:53 -08:00
Michael Yang	9f7d3ff2a8	preserve vendor directory structure	2024-12-20 16:28:53 -08:00
Bruce MacDonald	3c5f96b738	model: test bpe tokenizer	2024-12-20 13:49:23 -08:00
Michael Yang	e699b8f5b9	concurrent load tensors	2024-12-11 10:22:38 -08:00
Michael Yang	b7943d941d	merge ggml file decoding	2024-12-11 10:22:38 -08:00
Michael Yang	2c5fb24855	wip: next ollama runner implement llama and mllama model architectures in go using ggml (through cgo)	2024-12-11 10:22:38 -08:00
Michael Yang	8d15a7a964	go1.23	2024-12-11 10:22:37 -08:00
Daniel Hiltgen	cf4d7c52c4	win: builtin arm runner (#8039 ) The new build embeds the arm runner in the main binary, so there is no longer a lib/ollama	2024-12-11 08:32:13 -08:00
Daniel Hiltgen	6a6328a5e9	ci: build dir changed (#8037 ) Remove no longer relevant build log dir	2024-12-10 20:33:34 -08:00
Jeffrey Morgan	527cc97899	llama: update vendored code to commit 40c6d79f (#7875 )	2024-12-10 19:21:34 -08:00
Blake Mizerany	a37f4a86a7	go.mod: go 1.22.8 -> 1.23.4 (#8036 )	2024-12-10 18:16:16 -08:00
湛露先生	46f74e0cb5	Return err when NewHipLib() detect error. (#8012 ) Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>	2024-12-10 16:32:29 -08:00
Phil Wornath	7622ea21af	readme: add AI summary helper plugin to community-integrations (#7202 )	2024-12-10 16:13:06 -08:00
Tao Zuhong	c5d3947084	readme: add Kangaroo, an AI-powered SQL admin tool to community integrations (#7948 )	2024-12-10 13:48:32 -08:00
frob	757eeacc1b	server: lowercase hostname for Host header check (#5851 )	2024-12-10 13:43:22 -08:00
Dr. Daniel Bender	dd42acf737	readme: add aidful-ollama-model-delete to community integrations (#8024 )	2024-12-10 13:03:19 -08:00
Daniel Hiltgen	b9ccb3741e	Remove unused runner CpuFeatures (#8032 ) The final implementation of #7499 removed dynamic vector requirements in favor of a simpler filename based model, and this was left over logic that is no longer needed.	2024-12-10 12:59:39 -08:00
Stefan Weil	abfdc4710f	all: fix typos in documentation, code, and comments (#7021 )	2024-12-10 12:58:06 -08:00
Daniel Hiltgen	82a02e18d9	build: fix typo in override variable (#8031 ) The "F" was missing.	2024-12-10 10:51:16 -08:00
Daniel Hiltgen	4879a234c4	build: Make target improvements (#7499 ) * llama: wire up builtin runner This adds a new entrypoint into the ollama CLI to run the cgo built runner. On Mac arm64, this will have GPU support, but on all other platforms it will be the lowest common denominator CPU build. After we fully transition to the new Go runners more tech-debt can be removed and we can stop building the "default" runner via make and rely on the builtin always. * build: Make target improvements Add a few new targets and help for building locally. This also adjusts the runner lookup to favor local builds, then runners relative to the executable, and finally payloads. * Support customized CPU flags for runners This implements a simplified custom CPU flags pattern for the runners. When built without overrides, the runner name contains the vector flag we check for (AVX) to ensure we don't try to run on unsupported systems and crash. If the user builds a customized set, we omit the naming scheme and don't check for compatibility. This avoids checking requirements at runtime, so that logic has been removed as well. This can be used to build GPU runners with no vector flags, or CPU/GPU runners with additional flags (e.g. AVX512) enabled. * Use relative paths If the user checks out the repo in a path that contains spaces, make gets really confused so use relative paths for everything in-repo to avoid breakage. * Remove payloads from main binary * install: clean up prior libraries This removes support for v0.3.6 and older versions (before the tar bundle) and ensures we clean up prior libraries before extracting the bundle(s). Without this change, runners and dependent libraries could leak when we update and lead to subtle runtime errors.	2024-12-10 09:47:19 -08:00
frob	63269668c0	Prevent underflow when FreeMemory < overhead (#8014 ) Co-authored-by: Richard Lyons <frob@cloudstaff.com>	2024-12-10 09:10:40 -08:00
Jesse Gross	900f64e6be	prompt: Don't trim whitespace from prompts New lines can be an important part of a user's prompt and trimming it can alter the results. We previously only trimmed prompts with images but refactoring brought this behavior to all prompts, where it became more noticable. The /generate endpoint adds less whitespace and therefore doesn't need to trim it out - this brings the same behavior to /chat. Thanks to @gabe-l-hart for spotting the issue! Fixes #7795	2024-12-09 11:02:55 -08:00
Yannick Gloster	da09488fbf	docs: remove comment regarding tool streaming in openai.md (#7960 )	2024-12-07 22:16:21 -08:00
湛露先生	7f0ccc8a9d	docs: fix syntax error in openai.md (#7986 )	2024-12-07 22:14:36 -08:00
Parth Sareen	de52b6c2f9	bugfix: "null" value json mode (#7979 )	2024-12-06 14:13:15 -08:00
Michael	acd7d03266	readme: add llama3.3 to readme (#7975 ) readme: add llama3.3 to readme	2024-12-06 14:05:11 -05:00
Parth Sareen	f6e87fd628	docs: update readmes for structured outputs (#7962 )	2024-12-06 10:35:37 -08:00
				`@@ -1 +0,0 @@`
				`This is here to make sure the build/ directory exists for the go:embed command`