disable execstack for amd libraries

2024-03-10 15:08:46 -07:00
160 changed files with 4030 additions and 45496 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +0,0 @@
 llm/ext_server/* linguist-vendored
--- a/.github/ISSUE_TEMPLATE/10_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/10_bug_report.yml
@@ -1,60 +0,0 @@
 name: Bug report
 labels: [bug]
 description: Something isn't working right.
 body:
  - type: textarea
    id: description
    attributes:
      label: What is the issue?
      description: What happened? What did you expect to happen?
    validations:
      required: true
  - type: dropdown
    id: os
    attributes:
      label: OS
      description: Which operating system are you using?
      multiple: true
      options:
        - Linux
        - macOS
        - Windows
        - Docker
        - WSL2
    validations:
      required: false
  - type: dropdown
    id: gpu
    attributes:
      label: GPU
      description: Which GPU are you using?
      multiple: true
      options:
        - Nvidia
        - AMD
        - Intel
        - Apple
        - Other
    validations:
      required: false
  - type: dropdown
    id: cpu
    attributes:
      label: CPU
      description: Which CPU are you using?
      multiple: true
      options:
        - Intel
        - AMD
        - Apple
        - Other
    validations:
      required: false
  - type: input
    id: version
    attributes:
      label: Ollama version
      description: What version of Ollama are you using? (`ollama --version`)
      placeholder: e.g., 0.1.32
    validations:
      required: false
--- a/.github/ISSUE_TEMPLATE/20_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/20_feature_request.md
@@ -1,6 +0,0 @@
 ---
 name: Feature request
 about: Request a new feature
 labels: feature request
 ---
--- a/.github/ISSUE_TEMPLATE/30_model_request.md
+++ b/.github/ISSUE_TEMPLATE/30_model_request.md
@@ -1,5 +0,0 @@
 ---
 name: Model request
 about: Request support for a new model to be added to Ollama
 labels: model request
 ---
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,8 +0,0 @@
 blank_issues_enabled: true
 contact_links:
  - name: Help
    url: https://discord.com/invite/ollama
    about: Please join our Discord server for help using Ollama
  - name: Troubleshooting
    url: https://github.com/ollama/ollama/blob/main/docs/faq.md#faq
    about: See the FAQ for common issues and solutions
--- a/.github/workflows/latest.yaml
+++ b/.github/workflows/latest.yaml
@@ -1,24 +0,0 @@
 name: latest
 on:
  release:
    types: [released]
 jobs:
  update-latest:
    environment: release
    runs-on: linux
    steps:
      - uses: actions/checkout@v4
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - name: Tag images as latest
        env:
          PUSH: "1"
        shell: bash
        run: |
          export "VERSION=${GITHUB_REF_NAME#v}"
          ./scripts/tag_latest.sh
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,473 +0,0 @@
 name: release
 on:
  push:
    tags:
      - 'v*'
 jobs:
  # Full build of the Mac assets
  build-darwin:
    runs-on: macos-12
    environment: release
    steps:
      - uses: actions/checkout@v4
      - name: Set Version
        shell: bash
        run: |
          echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
          echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
      - name: key
        env:
          MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
          MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
        run: |
          echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
          security create-keychain -p password build.keychain
          security default-keychain -s build.keychain
          security unlock-keychain -p password build.keychain
          security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
          security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - name: Build Darwin
        env:
          APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
          APPLE_ID: ${{ vars.APPLE_ID }}
          SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
        run: |
          ./scripts/build_darwin.sh
      - uses: actions/upload-artifact@v4
        with:
          name: dist-darwin
          path: |
            dist/*arwin*
            !dist/*-cov
  # Windows builds take a long time to both install the dependencies and build, so parallelize
  # CPU generation step
  generate-windows-cpu:
    environment: release
    runs-on: windows
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
      - uses: 'google-github-actions/auth@v2'
        with:
          project_id: 'ollama'
          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
      - name: install Windows SDK 8.1 to get signtool
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading SDK"
          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
          write-host "Win SDK 8.1 installed"
          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
      - name: install signing plugin
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading plugin"
          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          go generate -x ./...
        name: go generate
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
            llm/build/**/bin/*
            llm/build/**/*.a
  # ROCm generation step
  generate-windows-rocm:
    environment: release
    runs-on: windows
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
      - uses: 'google-github-actions/auth@v2'
        with:
          project_id: 'ollama'
          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
      - name: install Windows SDK 8.1 to get signtool
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading SDK"
          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
          write-host "Win SDK 8.1 installed"
          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
      - name: install signing plugin
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading plugin"
          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - name: 'Install ROCm'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading AMD HIP Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP"
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          go generate -x ./...
        name: go generate
      - name: 'gather rocm dependencies'
        run: |
          $HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          md "dist\deps\bin\rocblas\library"
          cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
          cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
          cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
          path: llm/build/**/bin/*
      - uses: actions/upload-artifact@v4
        with:
          name: windows-rocm-deps
          path: dist/deps/*
  # CUDA generation step
  generate-windows-cuda:
    environment: release
    runs-on: windows
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
      - uses: 'google-github-actions/auth@v2'
        with:
          project_id: 'ollama'
          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
      - name: install Windows SDK 8.1 to get signtool
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading SDK"
          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
          write-host "Win SDK 8.1 installed"
          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
      - name: install signing plugin
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading plugin"
          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - name: 'Install CUDA'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
          echo "$cudaPath\bin" >> $env:GITHUB_PATH
          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
      - name: 'Verify CUDA'
        run: nvcc -V
      - run: go get ./...
      - name: go generate
        run: |
          $gopath=(get-command go).source | split-path -parent
          $cudabin=(get-command nvcc).source | split-path
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$cudabin;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          go generate -x ./...
      - name: 'gather cuda dependencies'
        run: |
          $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
          md "dist\deps"
          cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
          cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda
          path: llm/build/**/bin/*
      - uses: actions/upload-artifact@v4
        with:
          name: windows-cuda-deps
          path: dist/deps/*
  # Import the prior generation steps and build the final windows assets
  build-windows:
    environment: release
    runs-on: windows
    needs:
      - generate-windows-cuda
      - generate-windows-rocm
      - generate-windows-cpu
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
      - uses: 'google-github-actions/auth@v2'
        with:
          project_id: 'ollama'
          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
      - name: install Windows SDK 8.1 to get signtool
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading SDK"
          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
          write-host "Win SDK 8.1 installed"
          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
      - name: install signing plugin
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading plugin"
          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - run: go get
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cpu
          path: llm/build
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cuda
          path: llm/build
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps
          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
          path: llm/build
      - run: dir llm/build
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_GENERATE="1"
          $env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
          $env:HIP_PATH=$(resolve-path ".\dist\deps")
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
          name: dist-windows
          path: dist/*.exe
  # Linux x86 assets built using the container based build
  build-linux-amd64:
    environment: release
    runs-on: linux
    env:
      OLLAMA_SKIP_MANIFEST_CREATE: '1'
      BUILD_ARCH: amd64
      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
          mv dist/deps/* dist/
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
          path: |
            dist/*linux*
            !dist/*-cov
  # Linux ARM assets built using the container based build
  # (at present, docker isn't pre-installed on arm ubuntu images)
  build-linux-arm64:
    environment: release
    runs-on: linux-arm64
    env:
      OLLAMA_SKIP_MANIFEST_CREATE: '1'
      BUILD_ARCH: arm64
      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
      - name: 'Install Docker'
        run: |
          # Add Docker's official GPG key:
          env
          uname -a
          sudo apt-get update
          sudo apt-get install -y ca-certificates curl
          sudo install -m 0755 -d /etc/apt/keyrings
          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
          sudo chmod a+r /etc/apt/keyrings/docker.asc
          # Add the repository to Apt sources:
          echo \
            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
          sudo apt-get update
          sudo apt-get install -y docker-ce docker-ce-cli containerd.io
          sudo usermod -aG docker $USER
          sudo apt-get install acl
          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-arm64
          path: |
            dist/*linux*
            !dist/*-cov
  # Aggregate all the assets and ship a release
  release:
    needs:
      - build-darwin
      - build-windows
      - build-linux-amd64
      - build-linux-arm64
    runs-on: linux
    environment: release
    permissions:
      contents: write
    env:
      OLLAMA_SKIP_IMAGE_BUILD: '1'
      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
      - name: Set Version
        shell: bash
        run: |
          echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
          echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: ./scripts/build_docker.sh
      - name: Retrieve built artifact
        uses: actions/download-artifact@v4
        with:
          path: dist
          pattern: dist-*
          merge-multiple: true
      - run: |
          ls -lh dist/
          (cd dist; sha256sum * > sha256sum.txt)
          cat dist/sha256sum.txt
      - uses: ncipollo/release-action@v1
        with:
          name: ${{ env.RELEASE_VERSION }}
          allowUpdates: true
          artifacts: 'dist/*'
          draft: true
          prerelease: true
          omitBodyDuringUpdate: true
          generateReleaseNotes: true
          omitDraftDuringUpdate: true
          omitPrereleaseDuringUpdate: true
          replacesArtifacts: true
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -5,35 +5,11 @@ on:
    paths:
      - '**/*'
      - '!docs/**'
      - '!examples/**'
      - '!README.md'
 jobs:
  changes:
    runs-on: ubuntu-latest
    outputs:
      GENERATE: ${{ steps.changes.outputs.GENERATE }}
      GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
      GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - id: changes
        run: |
          changed() {
            git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
              | xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
          }
          {
            echo GENERATE=$(changed llm/)
            echo GENERATE_CUDA=$(changed llm/)
            echo GENERATE_ROCM=$(changed llm/)
          } >>$GITHUB_OUTPUT
  generate:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE == 'True' }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
@@ -50,32 +26,26 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH"
+          $env:PATH="$gopath;$env:PATH"
          echo $env:PATH
          go generate -x ./...
        if: ${{ startsWith(matrix.os, 'windows-') }}
-        name: 'Windows Go Generate'
+        name: "Windows Go Generate"
      - run: go generate -x ./...
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        name: 'Unix Go Generate'
+        name: "Unix Go Generate"
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: |
+          path: llm/llama.cpp/build/**/lib/*
            llm/build/**/bin/*
            llm/build/**/*.a
  generate-cuda:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
    strategy:
      matrix:
        cuda-version:
@@ -92,7 +62,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v4
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -103,14 +73,12 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: cuda-${{ matrix.cuda-version }}-libraries
-          path: llm/build/**/bin/*
+          path: llm/llama.cpp/build/**/lib/*
  generate-rocm:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
    strategy:
      matrix:
        rocm-version:
-          - '6.0.2'
+          - '6.0'
    runs-on: linux
    container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
    steps:
@@ -123,7 +91,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v4
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -134,87 +102,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: rocm-${{ matrix.rocm-version }}-libraries
-          path: llm/build/**/bin/*
+          path: llm/llama.cpp/build/**/lib/*
  # ROCm generation step
  generate-windows-rocm:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - name: 'Install ROCm'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading AMD HIP Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP"
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          go generate -x ./...
        name: go generate
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
      # TODO - do we need any artifacts?
  # CUDA generation step
  generate-windows-cuda:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - name: 'Install CUDA'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
          echo "$cudaPath\bin" >> $env:GITHUB_PATH
          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
      - name: 'Verify CUDA'
        run: nvcc -V
      - run: go get ./...
      - name: go generate
        run: |
          $gopath=(get-command go).source | split-path -parent
          $cudabin=(get-command nvcc).source | split-path
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$cudabin;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
      # TODO - do we need any artifacts?
  lint:
    strategy:
      matrix:
@@ -237,31 +125,24 @@ jobs:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: false
      - run: |
-          case ${{ matrix.arch }} in
+          mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
-            amd64) echo ARCH=x86_64 ;;
+          touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
      - run: |
          mkdir -p llm/build/linux/$ARCH/stub/bin
          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
+          mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
+          touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
          touch llm/llama.cpp/ggml-metal.metal
        if: ${{ startsWith(matrix.os, 'macos-') }}
      - run: |
-          mkdir -p llm/build/windows/$ARCH/stub/bin
+          mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
-          touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
+          touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
        if: ${{ startsWith(matrix.os, 'windows-') }}
-        shell: bash
+      - uses: golangci/golangci-lint-action@v3
      - uses: golangci/golangci-lint-action@v4
        with:
          args: --timeout 8m0s
  test:
    needs: generate
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
@@ -275,36 +156,19 @@ jobs:
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: '1'
      OLLAMA_CPU_TARGET: 'static'
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get
-      - run: |
+      - uses: actions/download-artifact@v4
-          case ${{ matrix.arch }} in
+        with:
-            amd64) echo ARCH=x86_64 ;;
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-            arm64) echo ARCH=arm64 ;;
+          path: llm/llama.cpp/build
          esac >>$GITHUB_ENV
        shell: bash
      - run: |
          mkdir -p llm/build/linux/$ARCH/stub/bin
          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
      - run: |
          mkdir -p llm/build/darwin/$ARCH/stub/bin
          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
        if: ${{ startsWith(matrix.os, 'macos-') }}
      - run: |
          mkdir -p llm/build/windows/$ARCH/stub/bin
          touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
        if: ${{ startsWith(matrix.os, 'windows-') }}
        shell: bash
      - run: go generate ./...
      - run: go build
      - run: go test -v ./...
      - uses: actions/upload-artifact@v4
--- a/.gitignore
+++ b/.gitignore
@@ -10,5 +10,4 @@ ggml-metal.metal
 *.exe
 .idea
 test_data
-*.crt
+*.crt
 llm/build
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -15,3 +15,13 @@ linters:
    - misspell
    - nilerr
    - unused
 linters-settings:
  errcheck:
    # exclude the following functions since we don't generally
    # need to be concerned with the returned errors
    exclude-functions:
      - encoding/binary.Read
      - (*os.File).Seek
      - (*bufio.Writer).WriteString
      - (*github.com/spf13/pflag.FlagSet).Set
      - (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek
--- a/83
+++ b/83
@@ -1,8 +1,7 @@
 ARG GOLANG_VERSION=1.22.1
 ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
-ARG ROCM_VERSION=6.0.2
+ARG ROCM_VERSION=6.0
 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@@ -15,20 +14,20 @@ ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -36,18 +35,18 @@ COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 ENV LIBRARY_PATH /opt/amdgpu/lib64
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 RUN mkdir /tmp/scratch && \
-    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
+    for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
-        cp ${dep} /tmp/scratch/ || exit 1 ; \
+    cp ${dep} /tmp/scratch/ || exit 1 ; \
    done && \
    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
-    mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
+    mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
-    (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
+    (cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
@@ -56,76 +55,68 @@ ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
 RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
-FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
+FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 # Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
 RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama
+WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build .
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
-WORKDIR /go/src/github.com/ollama/ollama
+WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build .
 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
 RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 <div align="center">
-  <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
+  <img alt="ollama" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
 </div>
 # Ollama
@@ -22,7 +22,7 @@ Get up and running with large language models locally.
 curl -fsSL https://ollama.com/install.sh | sh
 ```
-[Manual install instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md)
+[Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)
 ### Docker
@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 ## Quickstart
-To run and chat with [Llama 3](https://ollama.com/library/llama3):
+To run and chat with [Llama 2](https://ollama.com/library/llama2):
 ```
-ollama run llama3
+ollama run llama2
 ```
 ## Model library
@@ -49,8 +49,7 @@ Here are some example models that can be downloaded:
 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
+| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
 | Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
 | Dolphin Phi        | 2.7B       | 1.6GB | `ollama run dolphin-phi`       |
 | Phi-2              | 2.7B       | 1.7GB | `ollama run phi`               |
@@ -61,10 +60,10 @@ Here are some example models that can be downloaded:
 | Llama 2 13B        | 13B        | 7.3GB | `ollama run llama2:13b`        |
 | Llama 2 70B        | 70B        | 39GB  | `ollama run llama2:70b`        |
 | Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
 | Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
 | Gemma              | 2B         | 1.4GB | `ollama run gemma:2b`          |
 | Gemma              | 7B         | 4.8GB | `ollama run gemma:7b`          |
 | Solar              | 10.7B      | 6.1GB | `ollama run solar`             |
 > Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -98,16 +97,16 @@ See the [guide](docs/import.md) on importing models for more information.
 ### Customize a prompt
-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama2` model:
 ```
-ollama pull llama3
+ollama pull llama2
 ```
 Create a `Modelfile`:
 ```
-FROM llama3
+FROM llama2
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -142,7 +141,7 @@ ollama create mymodel -f ./Modelfile
 ### Pull a model
 ```
-ollama pull llama3
+ollama pull llama2
 ```
 > This command can also be used to update a local model. Only the diff will be pulled.
@@ -150,13 +149,13 @@ ollama pull llama3
 ### Remove a model
 ```
-ollama rm llama3
+ollama rm llama2
 ```
 ### Copy a model
 ```
-ollama cp llama3 my-model
+ollama cp llama2 my-llama2
 ```
 ### Multiline input
@@ -180,7 +179,7 @@ The image features a yellow smiley face, which is likely the central focus of th
 ### Pass in prompt as arguments
 ```
-$ ollama run llama3 "Summarize this file: $(cat README.md)"
+$ ollama run llama2 "Summarize this file: $(cat README.md)"
 Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```
@@ -214,7 +213,7 @@ Then build the binary:
 go build .
 ```
-More detailed instructions can be found in the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
+More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
 ### Running local builds
@@ -227,7 +226,7 @@ Next, start the server:
 Finally, in a separate shell, run a model:
 ```
-./ollama run llama3
+./ollama run llama2
 ```
 ## REST API
@@ -238,7 +237,7 @@ Ollama has a REST API for running and managing models.
 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama2",
  "prompt":"Why is the sky blue?"
 }'
 ```
@@ -247,7 +246,7 @@ curl http://localhost:11434/api/generate -d '{
 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "mistral",
  "messages": [
    { "role": "user", "content": "why is the sky blue?" }
  ]
@@ -260,12 +259,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Web & Desktop
 - [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
 - [LibreChat](https://github.com/danny-avila/LibreChat)
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Saddle](https://github.com/jikkuatwork/saddle)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
@@ -276,24 +272,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
 - [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
 - [Dify.AI](https://github.com/langgenius/dify)
 - [MindMac](https://mindmac.app)
 - [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
 - [Msty](https://msty.app)
 - [Chatbox](https://github.com/Bin-Huang/Chatbox)
 - [WinForm Ollama Copilot](https://github.com/tgraupmann/WinForm_Ollama_Copilot)
 - [NextChat](https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web) with [Get Started Doc](https://docs.nextchat.dev/models/ollama)
 - [Alpaca WebUI](https://github.com/mmo80/alpaca-webui)
 - [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
 - [OpenAOE](https://github.com/InternLM/OpenAOE)
 - [Odin Runes](https://github.com/leonid20000/OdinRunes)
 - [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
 - [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
 - [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
 ### Terminal
@@ -302,23 +288,18 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Emacs client](https://github.com/zweifisch/ollama)
 - [gen.nvim](https://github.com/David-Kunz/gen.nvim)
 - [ollama.nvim](https://github.com/nomnivore/ollama.nvim)
 - [ollero.nvim](https://github.com/marco-souza/ollero.nvim)
 - [ollama-chat.nvim](https://github.com/gerazov/ollama-chat.nvim)
 - [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
 - [gptel Emacs client](https://github.com/karthink/gptel)
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
 - [cmdh](https://github.com/pgibler/cmdh)
 - [ooo](https://github.com/npahlfer/ooo)
 - [tenere](https://github.com/pythops/tenere)
 - [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
 - [typechat-cli](https://github.com/anaisbetts/typechat-cli)
 - [ShellOracle](https://github.com/djcopley/ShellOracle)
 - [tlm](https://github.com/yusufcanb/tlm)
 ### Database
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
+- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
 ### Package managers
@@ -331,6 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LangChain4j](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-ollama)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
 - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
@@ -347,7 +329,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama for R - rollama](https://github.com/JBGruber/rollama)
 - [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
 - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
 - [Testcontainers](https://testcontainers.com/modules/ollama/)
 ### Mobile
@@ -369,15 +350,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
 - [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
 - [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 ### Supported backends 
 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov. 
--- a/api/client.go
+++ b/api/client.go
@@ -1,9 +1,3 @@
 // Package api implements the client-side API for code wishing to interact
 // with the ollama service. The methods of the [Client] type correspond to
 // the ollama REST API as described in https://github.com/ollama/ollama/blob/main/docs/api.md
 //
 // The ollama command-line client itself uses this package to interact with
 // the backend service.
 package api
 import (
@@ -11,6 +5,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
 	"net"
@@ -20,12 +15,10 @@ import (
 	"runtime"
 	"strings"
-	"github.com/ollama/ollama/format"
+	"github.com/jmorganca/ollama/format"
-	"github.com/ollama/ollama/version"
+	"github.com/jmorganca/ollama/version"
 )
 // Client encapsulates client state for interacting with the ollama
 // service. Use [ClientFromEnvironment] to create new Clients.
 type Client struct {
 	base *url.URL
 	http *http.Client
@@ -47,15 +40,6 @@ func checkError(resp *http.Response, body []byte) error {
 	return apiError
 }
 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
 // port on which the ollama service is listenting. The format of this variable
 // is:
 //
 //	<scheme>://<host>:<port>
 //
 // If the variable is not specified, a default ollama host and port will be
 // used.
 func ClientFromEnvironment() (*Client, error) {
 	defaultPort := "11434"
@@ -207,14 +191,8 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	return nil
 }
 // GenerateResponseFunc is a function that [Client.Generate] invokes every time
 // a response is received from the service. If this function returns an error,
 // [Client.Generate] will stop generating and return this error.
 type GenerateResponseFunc func(GenerateResponse) error
 // Generate generates a response for a given prompt. The req parameter should
 // be populated with prompt details. fn is called for each response (there may
 // be multiple responses, e.g. in case streaming is enabled).
 func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn GenerateResponseFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/generate", req, func(bts []byte) error {
 		var resp GenerateResponse
@@ -226,15 +204,8 @@ func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn Generate
 	})
 }
 // ChatResponseFunc is a function that [Client.Chat] invokes every time
 // a response is received from the service. If this function returns an error,
 // [Client.Chat] will stop generating and return this error.
 type ChatResponseFunc func(ChatResponse) error
 // Chat generates the next message in a chat. [ChatRequest] may contain a
 // sequence of messages which can be used to maintain chat history with a model.
 // fn is called for each response (there may be multiple responses, e.g. if case
 // streaming is enabled).
 func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
 		var resp ChatResponse
@@ -246,14 +217,8 @@ func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc
 	})
 }
 // PullProgressFunc is a function that [Client.Pull] invokes every time there
 // is progress with a "pull" request sent to the service. If this function
 // returns an error, [Client.Pull] will stop the process and return this error.
 type PullProgressFunc func(ProgressResponse) error
 // Pull downloads a model from the ollama library. fn is called each time
 // progress is made on the request and can be used to display a progress bar,
 // etc.
 func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/pull", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -336,7 +301,18 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
 }
 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
-	return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
+	if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
 		var statusError StatusError
 		if !errors.As(err, &statusError) || statusError.StatusCode != http.StatusNotFound {
 			return err
 		}
 		if err := c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 func (c *Client) Version(ctx context.Context) (string, error) {
--- a/api/types.go
+++ b/api/types.go
@@ -2,7 +2,6 @@ package api
 import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -34,46 +33,18 @@ func (e StatusError) Error() string {
 type ImageData []byte
 // GenerateRequest describes a request sent by [Client.Generate]. While you
 // have to specify the Model and Prompt fields, all the other fields have
 // reasonable defaults for basic uses.
 type GenerateRequest struct {
-	// Model is the model name; it should be a name familiar to Ollama from
+	Model     string      `json:"model"`
-	// the library at https://ollama.com/library
+	Prompt    string      `json:"prompt"`
-	Model string `json:"model"`
+	System    string      `json:"system"`
 	Template  string      `json:"template"`
 	Context   []int       `json:"context,omitempty"`
 	Stream    *bool       `json:"stream,omitempty"`
 	Raw       bool        `json:"raw,omitempty"`
 	Format    string      `json:"format"`
 	KeepAlive *Duration   `json:"keep_alive,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	// Prompt is the textual prompt to send to the model.
 	Prompt string `json:"prompt"`
 	// System overrides the model's default system message/prompt.
 	System string `json:"system"`
 	// Template overrides the model's default prompt template.
 	Template string `json:"template"`
 	// Context is the context parameter returned from a previous call to
 	// Generate call. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`
 	// Stream specifies whether the response is streaming; it is true by default.
 	Stream *bool `json:"stream,omitempty"`
 	// Raw set to true means that no formatting will be applied to the prompt.
 	Raw bool `json:"raw,omitempty"`
 	// Format specifies the format to return a response in.
 	Format string `json:"format"`
 	// KeepAlive controls how long the model will stay loaded in memory following
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 	// Images is an optional list of base64-encoded images accompanying this
 	// request, for multimodal models.
 	Images []ImageData `json:"images,omitempty"`
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]interface{} `json:"options"`
 }
@@ -138,24 +109,21 @@ type Options struct {
 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
-	UseNUMA   bool `json:"numa,omitempty"`
+	UseNUMA            bool    `json:"numa,omitempty"`
-	NumCtx    int  `json:"num_ctx,omitempty"`
+	NumCtx             int     `json:"num_ctx,omitempty"`
-	NumBatch  int  `json:"num_batch,omitempty"`
+	NumBatch           int     `json:"num_batch,omitempty"`
-	NumGQA    int  `json:"num_gqa,omitempty"`
+	NumGQA             int     `json:"num_gqa,omitempty"`
-	NumGPU    int  `json:"num_gpu,omitempty"`
+	NumGPU             int     `json:"num_gpu,omitempty"`
-	MainGPU   int  `json:"main_gpu,omitempty"`
+	MainGPU            int     `json:"main_gpu,omitempty"`
-	LowVRAM   bool `json:"low_vram,omitempty"`
+	LowVRAM            bool    `json:"low_vram,omitempty"`
-	F16KV     bool `json:"f16_kv,omitempty"`
+	F16KV              bool    `json:"f16_kv,omitempty"`
-	LogitsAll bool `json:"logits_all,omitempty"`
+	LogitsAll          bool    `json:"logits_all,omitempty"`
-	VocabOnly bool `json:"vocab_only,omitempty"`
+	VocabOnly          bool    `json:"vocab_only,omitempty"`
-	UseMMap   bool `json:"use_mmap,omitempty"`
+	UseMMap            bool    `json:"use_mmap,omitempty"`
-	UseMLock  bool `json:"use_mlock,omitempty"`
+	UseMLock           bool    `json:"use_mlock,omitempty"`
-	NumThread int  `json:"num_thread,omitempty"`
+	RopeFrequencyBase  float32 `json:"rope_frequency_base,omitempty"`
 	// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
 	RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
 	// Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
 	RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
 	NumThread          int     `json:"num_thread,omitempty"`
 }
 type EmbeddingRequest struct {
@@ -171,11 +139,10 @@ type EmbeddingResponse struct {
 }
 type CreateRequest struct {
-	Model        string `json:"model"`
+	Model     string `json:"model"`
-	Path         string `json:"path"`
+	Path      string `json:"path"`
-	Modelfile    string `json:"modelfile"`
+	Modelfile string `json:"modelfile"`
-	Stream       *bool  `json:"stream,omitempty"`
+	Stream    *bool  `json:"stream,omitempty"`
 	Quantization string `json:"quantization,omitempty"`
 	// Name is deprecated, see Model
 	Name string `json:"name"`
@@ -308,7 +275,7 @@ func (m *Metrics) Summary() {
 	}
 }
-var ErrInvalidOpts = errors.New("invalid options")
+var ErrInvalidOpts = fmt.Errorf("invalid options")
 func (opts *Options) FromMap(m map[string]interface{}) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
@@ -415,16 +382,18 @@ func DefaultOptions() Options {
 		Runner: Runner{
 			// options set when the model is loaded
-			NumCtx:    2048,
+			NumCtx:             2048,
-			NumBatch:  512,
+			RopeFrequencyBase:  10000.0,
-			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
+			RopeFrequencyScale: 1.0,
-			NumGQA:    1,
+			NumBatch:           512,
-			NumThread: 0, // let the runtime decide
+			NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
-			LowVRAM:   false,
+			NumGQA:             1,
-			F16KV:     true,
+			NumThread:          0, // let the runtime decide
-			UseMLock:  false,
+			LowVRAM:            false,
-			UseMMap:   true,
+			F16KV:              true,
-			UseNUMA:   false,
+			UseMLock:           false,
 			UseMMap:            true,
 			UseNUMA:            false,
 		},
 	}
 }
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -1,50 +0,0 @@
 package api
 import (
 	"encoding/json"
 	"math"
 	"testing"
 	"time"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 func TestKeepAliveParsingFromJSON(t *testing.T) {
 	tests := []struct {
 		name string
 		req  string
 		exp  *Duration
 	}{
 		{
 			name: "Positive Integer",
 			req:  `{ "keep_alive": 42 }`,
 			exp:  &Duration{42 * time.Second},
 		},
 		{
 			name: "Positive Integer String",
 			req:  `{ "keep_alive": "42m" }`,
 			exp:  &Duration{42 * time.Minute},
 		},
 		{
 			name: "Negative Integer",
 			req:  `{ "keep_alive": -1 }`,
 			exp:  &Duration{math.MaxInt64},
 		},
 		{
 			name: "Negative Integer String",
 			req:  `{ "keep_alive": "-1m" }`,
 			exp:  &Duration{math.MaxInt64},
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			var dec ChatRequest
 			err := json.Unmarshal([]byte(test.req), &dec)
 			require.NoError(t, err)
 			assert.Equal(t, test.exp, dec.KeepAlive)
 		})
 	}
 }
--- a/app/lifecycle/lifecycle.go
+++ b/app/lifecycle/lifecycle.go
@@ -9,8 +9,8 @@ import (
 	"os/signal"
 	"syscall"
-	"github.com/ollama/ollama/app/store"
+	"github.com/jmorganca/ollama/app/store"
-	"github.com/ollama/ollama/app/tray"
+	"github.com/jmorganca/ollama/app/tray"
 )
 func Run() {
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -11,7 +11,7 @@ import (
 	"path/filepath"
 	"time"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
 )
 func getCLIFullPath(command string) string {
@@ -83,38 +83,6 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 		io.Copy(logFile, stderr) //nolint:errcheck
 	}()
 	// Re-wire context done behavior to attempt a graceful shutdown of the server.
 	// On cancellation the server is asked to terminate and is then polled every
 	// 10ms until it has exited. If the terminate request fails, or the process is
 	// still alive after 5 seconds, it is killed instead.
 	cmd.Cancel = func() error {
 		if cmd.Process == nil {
 			// the process was never started, so there is nothing to stop
 			return nil
 		}
 		if err := terminate(cmd); err != nil {
 			slog.Warn("error trying to gracefully terminate server", "err", err)
 			return cmd.Process.Kill()
 		}
 		tick := time.NewTicker(10 * time.Millisecond)
 		defer tick.Stop()
 		// The deadline must be armed once, outside the loop. A time.After inside
 		// the select is re-created on every iteration, so the 10ms ticker always
 		// wins and the 5s timeout can never fire.
 		deadline := time.NewTimer(5 * time.Second)
 		defer deadline.Stop()
 		for {
 			select {
 			case <-tick.C:
 				exited, err := isProcessExited(cmd.Process.Pid)
 				if err != nil {
 					return err
 				}
 				if exited {
 					return nil
 				}
 			case <-deadline.C:
 				slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
 				return cmd.Process.Kill()
 			}
 		}
 	}
 	// run the command and wait for it to finish
 	if err := cmd.Start(); err != nil {
 		return done, fmt.Errorf("failed to start server %w", err)
@@ -137,7 +105,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 			select {
 			case <-ctx.Done():
-				slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
+				slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code))
 				done <- code
 				return
 			default:
--- a/app/lifecycle/server_unix.go
+++ b/app/lifecycle/server_unix.go
@@ -4,35 +4,9 @@ package lifecycle
 import (
 	"context"
 	"errors"
 	"fmt"
 	"os"
 	"os/exec"
 	"syscall"
 )
 func getCmd(ctx context.Context, cmd string) *exec.Cmd {
 	return exec.CommandContext(ctx, cmd, "serve")
 }
 // terminate asks the process started by cmd to stop by sending it
 // os.Interrupt, and returns the error from delivering that signal.
 // cmd.Process is used without a nil check, so cmd must already be started.
 func terminate(cmd *exec.Cmd) error {
 	proc := cmd.Process
 	return proc.Signal(os.Interrupt)
 }
 func isProcessExited(pid int) (bool, error) {
 	proc, err := os.FindProcess(pid)
 	if err != nil {
 		return false, fmt.Errorf("failed to find process: %v", err)
 	}
 	err = proc.Signal(syscall.Signal(0))
 	if err != nil {
 		if errors.Is(err, os.ErrProcessDone) || errors.Is(err, syscall.ESRCH) {
 			return true, nil
 		}
 		return false, fmt.Errorf("error signaling process: %v", err)
 	}
 	return false, nil
 }
--- a/app/lifecycle/server_windows.go
+++ b/app/lifecycle/server_windows.go
@@ -2,88 +2,12 @@ package lifecycle
 import (
 	"context"
 	"fmt"
 	"os/exec"
 	"syscall"
 	"golang.org/x/sys/windows"
 )
 func getCmd(ctx context.Context, exePath string) *exec.Cmd {
 	cmd := exec.CommandContext(ctx, exePath, "serve")
-	cmd.SysProcAttr = &syscall.SysProcAttr{
+	cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: true, CreationFlags: 0x08000000}
 		HideWindow:    true,
 		CreationFlags: windows.CREATE_NEW_PROCESS_GROUP,
 	}
 	return cmd
 }
 // terminate asks the process started by cmd to stop by raising console control
 // events at it through kernel32.dll. It calls, in order, AttachConsole(pid),
 // SetConsoleCtrlHandler(0, 1) and GenerateConsoleCtrlEvent, the last first with
 // CTRL_BREAK_EVENT and then with CTRL_C_EVENT. The first failing lookup or call
 // is returned as the error; nil means every call reported success.
 // cmd.Process is used without a nil check, so cmd must already be started.
 func terminate(cmd *exec.Cmd) error {
 	dll, err := windows.LoadDLL("kernel32.dll")
 	if err != nil {
 		return err
 	}
 	defer dll.Release() // nolint: errcheck
 	pid := cmd.Process.Pid
 	f, err := dll.FindProc("AttachConsole")
 	if err != nil {
 		return err
 	}
 	r1, _, err := f.Call(uintptr(pid))
 	// a zero result is a failure, but ERROR_ACCESS_DENIED is tolerated here
 	// NOTE(review): presumably that error means a console is already attached — confirm
 	if r1 == 0 && err != syscall.ERROR_ACCESS_DENIED {
 		return err
 	}
 	f, err = dll.FindProc("SetConsoleCtrlHandler")
 	if err != nil {
 		return err
 	}
 	// NOTE(review): (0, 1) looks like (NULL handler, TRUE), which should keep this
 	// process from reacting to the events raised below — confirm against Win32 docs
 	r1, _, err = f.Call(0, 1)
 	if r1 == 0 {
 		return err
 	}
 	f, err = dll.FindProc("GenerateConsoleCtrlEvent")
 	if err != nil {
 		return err
 	}
 	// both event kinds are raised in turn, each call being checked on its own
 	r1, _, err = f.Call(windows.CTRL_BREAK_EVENT, uintptr(pid))
 	if r1 == 0 {
 		return err
 	}
 	r1, _, err = f.Call(windows.CTRL_C_EVENT, uintptr(pid))
 	if r1 == 0 {
 		return err
 	}
 	return nil
 }
 const STILL_ACTIVE = 259
 // isProcessExited reports whether the process identified by pid has finished.
 // It opens the process with PROCESS_QUERY_INFORMATION access, reads its exit
 // code and treats any value other than STILL_ACTIVE as "exited". Failures to
 // open the process or to read the exit code are returned as errors.
 func isProcessExited(pid int) (bool, error) {
 	handle, openErr := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION, false, uint32(pid))
 	if openErr != nil {
 		return false, fmt.Errorf("failed to open process: %v", openErr)
 	}
 	defer windows.CloseHandle(handle) // nolint: errcheck
 	var code uint32
 	if codeErr := windows.GetExitCodeProcess(handle, &code); codeErr != nil {
 		return false, fmt.Errorf("failed to get exit code: %v", codeErr)
 	}
 	return code != STILL_ACTIVE, nil
 }
--- a/app/lifecycle/updater.go
+++ b/app/lifecycle/updater.go
@@ -18,8 +18,8 @@ import (
 	"strings"
 	"time"
-	"github.com/ollama/ollama/auth"
+	"github.com/jmorganca/ollama/auth"
-	"github.com/ollama/ollama/version"
+	"github.com/jmorganca/ollama/version"
 )
 var (
--- a/app/main.go
+++ b/app/main.go
@@ -4,7 +4,7 @@ package main
 // go build -ldflags="-H windowsgui" .
 import (
-	"github.com/ollama/ollama/app/lifecycle"
+	"github.com/jmorganca/ollama/app/lifecycle"
 )
 func main() {
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -28,8 +28,8 @@ AppPublisher={#MyAppPublisher}
 AppPublisherURL={#MyAppURL}
 AppSupportURL={#MyAppURL}
 AppUpdatesURL={#MyAppURL}
-ArchitecturesAllowed=x64 arm64
+ArchitecturesAllowed=x64
-ArchitecturesInstallIn64BitMode=x64 arm64
+ArchitecturesInstallIn64BitMode=x64
 DefaultDirName={localappdata}\Programs\{#MyAppName}
 DefaultGroupName={#MyAppName}
 DisableProgramGroupPage=yes
--- a/app/tray/tray.go
+++ b/app/tray/tray.go
@@ -4,8 +4,8 @@ import (
 	"fmt"
 	"runtime"
-	"github.com/ollama/ollama/app/assets"
+	"github.com/jmorganca/ollama/app/assets"
-	"github.com/ollama/ollama/app/tray/commontray"
+	"github.com/jmorganca/ollama/app/tray/commontray"
 )
 func NewTray() (commontray.OllamaTray, error) {
@@ -24,5 +24,10 @@ func NewTray() (commontray.OllamaTray, error) {
 		return nil, fmt.Errorf("failed to load icon %s: %w", iconName, err)
 	}
-	return InitPlatformTray(icon, updateIcon)
+	tray, err := InitPlatformTray(icon, updateIcon)
 	if err != nil {
 		return nil, err
 	}
 	return tray, nil
 }
--- a/app/tray/tray_nonwindows.go
+++ b/app/tray/tray_nonwindows.go
@@ -5,7 +5,7 @@ package tray
 import (
 	"fmt"
-	"github.com/ollama/ollama/app/tray/commontray"
+	"github.com/jmorganca/ollama/app/tray/commontray"
 )
 func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {
--- a/app/tray/tray_windows.go
+++ b/app/tray/tray_windows.go
@@ -1,8 +1,8 @@
 package tray
 import (
-	"github.com/ollama/ollama/app/tray/commontray"
+	"github.com/jmorganca/ollama/app/tray/commontray"
-	"github.com/ollama/ollama/app/tray/wintray"
+	"github.com/jmorganca/ollama/app/tray/wintray"
 )
 func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -13,7 +13,7 @@ import (
 	"sync"
 	"unsafe"
-	"github.com/ollama/ollama/app/tray/commontray"
+	"github.com/jmorganca/ollama/app/tray/commontray"
 	"golang.org/x/sys/windows"
 )
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -30,12 +30,12 @@ import (
 	"golang.org/x/exp/slices"
 	"golang.org/x/term"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
-	"github.com/ollama/ollama/format"
+	"github.com/jmorganca/ollama/format"
-	"github.com/ollama/ollama/parser"
+	"github.com/jmorganca/ollama/parser"
-	"github.com/ollama/ollama/progress"
+	"github.com/jmorganca/ollama/progress"
-	"github.com/ollama/ollama/server"
+	"github.com/jmorganca/ollama/server"
-	"github.com/ollama/ollama/version"
+	"github.com/jmorganca/ollama/version"
 )
 func CreateHandler(cmd *cobra.Command, args []string) error {
@@ -105,48 +105,24 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 				zf := zip.NewWriter(tf)
-				files := []string{}
+				files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
 				tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
 				if err != nil {
 					return err
 				} else if len(tfiles) == 0 {
 					tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
 					if err != nil {
 						return err
 					}
 				}
 				files = append(files, tfiles...)
 				if len(files) == 0 {
-					return fmt.Errorf("no models were found in '%s'", path)
+					return fmt.Errorf("no safetensors files were found in '%s'", path)
 				}
-				// add the safetensor/torch config file + tokenizer
+				// add the safetensor config file + tokenizer
 				files = append(files, filepath.Join(path, "config.json"))
 				files = append(files, filepath.Join(path, "params.json"))
 				files = append(files, filepath.Join(path, "added_tokens.json"))
 				files = append(files, filepath.Join(path, "tokenizer.model"))
 				for _, fn := range files {
 					f, err := os.Open(fn)
-
+					if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
-					// just skip whatever files aren't there
+						continue
 					if os.IsNotExist(err) {
 						if strings.HasSuffix(fn, "tokenizer.model") {
 							// try the parent dir before giving up
 							parentDir := filepath.Dir(path)
 							newFn := filepath.Join(parentDir, "tokenizer.model")
 							f, err = os.Open(newFn)
 							if os.IsNotExist(err) {
 								continue
 							} else if err != nil {
 								return err
 							}
 						} else {
 							continue
 						}
 					} else if err != nil {
 						return err
 					}
@@ -218,9 +194,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}
-	quantization, _ := cmd.Flags().GetString("quantization")
+	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile)}
 	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
 	if err := client.Create(cmd.Context(), &request, fn); err != nil {
 		return err
 	}
@@ -239,10 +213,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
 	if _, err := io.Copy(hash, bin); err != nil {
 		return "", err
 	}
-
+	bin.Seek(0, io.SeekStart)
 	if _, err := bin.Seek(0, io.SeekStart); err != nil {
 		return "", err
 	}
 	digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
 	if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
@@ -929,7 +900,8 @@ func NewCLI() *cobra.Command {
 	cobra.EnableCommandSorting = false
 	if runtime.GOOS == "windows" {
-		console.ConsoleFromFile(os.Stdin) //nolint:errcheck
+		// Enable colorful ANSI escape code in Windows terminal (disabled by default)
 		console.ConsoleFromFile(os.Stdout) //nolint:errcheck
 	}
 	rootCmd := &cobra.Command{
@@ -961,7 +933,6 @@ func NewCLI() *cobra.Command {
 	}
 	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
 	createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")
 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@@ -999,11 +970,9 @@ func NewCLI() *cobra.Command {
 	serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
 Environment Variables:
-    OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
+    OLLAMA_HOST       The host:port to bind to (default "127.0.0.1:11434")
-    OLLAMA_ORIGINS      A comma separated list of allowed origins.
+    OLLAMA_ORIGINS    A comma separated list of allowed origins.
-    OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
+    OLLAMA_MODELS     The path to the models directory (default is "~/.ollama/models")
    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
    OLLAMA_DEBUG        Set to 1 to enable additional debug logging
 `)
 	pullCmd := &cobra.Command{
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -14,9 +14,9 @@ import (
 	"github.com/spf13/cobra"
 	"golang.org/x/exp/slices"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
-	"github.com/ollama/ollama/progress"
+	"github.com/jmorganca/ollama/progress"
-	"github.com/ollama/ollama/readline"
+	"github.com/jmorganca/ollama/readline"
 )
 type MultilineState int
@@ -295,14 +295,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					opts.WordWrap = false
 					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
-					if err := cmd.Flags().Set("verbose", "true"); err != nil {
+					cmd.Flags().Set("verbose", "true")
 						return err
 					}
 					fmt.Println("Set 'verbose' mode.")
 				case "quiet":
-					if err := cmd.Flags().Set("verbose", "false"); err != nil {
+					cmd.Flags().Set("verbose", "false")
 						return err
 					}
 					fmt.Println("Set 'quiet' mode.")
 				case "format":
 					if len(args) < 3 || args[2] != "json" {
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -7,7 +7,7 @@ import (
 	"github.com/stretchr/testify/assert"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
 )
 func TestExtractFilenames(t *testing.T) {
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@@ -7,7 +7,7 @@ import (
 	"os/exec"
 	"strings"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
 )
 func startApp(ctx context.Context, client *api.Client) error {
--- a/cmd/start_default.go
+++ b/cmd/start_default.go
@@ -6,7 +6,7 @@ import (
 	"context"
 	"fmt"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
 )
 func startApp(ctx context.Context, client *api.Client) error {
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -10,7 +10,7 @@ import (
 	"strings"
 	"syscall"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
 )
 func startApp(ctx context.Context, client *api.Client) error {
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -1,20 +1,23 @@
 package convert
 import (
 	"bytes"
 	"cmp"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"regexp"
 	"slices"
 	"strings"
 	"github.com/mitchellh/mapstructure"
 	"google.golang.org/protobuf/proto"
-	"github.com/ollama/ollama/convert/sentencepiece"
+	"github.com/jmorganca/ollama/convert/sentencepiece"
-	"github.com/ollama/ollama/llm"
+	"github.com/jmorganca/ollama/llm"
 )
 type Params struct {
@@ -27,58 +30,137 @@ type Params struct {
 	AttentionHeads   int      `json:"num_attention_heads"` // n_head
 	KeyValHeads      int      `json:"num_key_value_heads"`
 	NormEPS          float64  `json:"rms_norm_eps"`
 	RopeFreqBase     float64  `json:"rope_theta"`
 	BoSTokenID       int      `json:"bos_token_id"`
 	EoSTokenID       int      `json:"eos_token_id"`
 	HeadDimension    int      `json:"head_dim"`
 	PaddingTokenID   int      `json:"pad_token_id"`
 	ByteOrder
 }
-type ByteOrder interface {
+type MetaData struct {
-	binary.ByteOrder
+	Type    string `mapstructure:"dtype"`
-	binary.AppendByteOrder
+	Shape   []int  `mapstructure:"shape"`
 	Offsets []int  `mapstructure:"data_offsets"`
 }
-type ModelArch interface {
+func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
-	GetTensors() error
+	f, err := os.Open(fn)
-	LoadVocab() error
+	if err != nil {
-	WriteGGUF() (string, error)
+		return []llm.Tensor{}, 0, err
 	}
 	defer f.Close()
 	var jsonSize uint64
 	binary.Read(f, binary.LittleEndian, &jsonSize)
 	buf := make([]byte, jsonSize)
 	_, err = io.ReadFull(f, buf)
 	if err != nil {
 		return []llm.Tensor{}, 0, err
 	}
 	d := json.NewDecoder(bytes.NewBuffer(buf))
 	d.UseNumber()
 	var parsed map[string]interface{}
 	if err = d.Decode(&parsed); err != nil {
 		return []llm.Tensor{}, 0, err
 	}
 	var keys []string
 	for k := range parsed {
 		keys = append(keys, k)
 	}
 	slices.Sort(keys)
 	slog.Info("converting layers")
 	var tensors []llm.Tensor
 	for _, k := range keys {
 		vals := parsed[k].(map[string]interface{})
 		var data MetaData
 		if err = mapstructure.Decode(vals, &data); err != nil {
 			return []llm.Tensor{}, 0, err
 		}
 		var size uint64
 		var kind uint32
 		switch len(data.Shape) {
 		case 0:
 			// metadata
 			continue
 		case 1:
 			// convert to float32
 			kind = 0
 			size = uint64(data.Shape[0] * 4)
 		case 2:
 			// convert to float16
 			kind = 1
 			size = uint64(data.Shape[0] * data.Shape[1] * 2)
 		}
 		ggufName, err := GetTensorName(k)
 		if err != nil {
 			slog.Error("%v", err)
 			return []llm.Tensor{}, 0, err
 		}
 		shape := [4]uint64{1, 1, 1, 1}
 		for cnt, s := range data.Shape {
 			shape[cnt] = uint64(s)
 		}
 		t := llm.Tensor{
 			Name:          ggufName,
 			Kind:          kind,
 			Offset:        offset,
 			Shape:         shape[:],
 			FileName:      fn,
 			OffsetPadding: 8 + jsonSize,
 			FileOffsets:   []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
 		}
 		slog.Debug(fmt.Sprintf("%v", t))
 		tensors = append(tensors, t)
 		offset += size
 	}
 	return tensors, offset, nil
 }
-type ModelFormat interface {
+func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
-	GetLayerName(string) (string, error)
+	var tensors []llm.Tensor
-	GetTensors(string, *Params) ([]llm.Tensor, error)
+	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
-	GetParams(string) (*Params, error)
+	if err != nil {
-	GetModelArch(string, string, *Params) (ModelArch, error)
+		return []llm.Tensor{}, err
 	}
 	var offset uint64
 	for _, f := range files {
 		var t []llm.Tensor
 		var err error
 		t, offset, err = ReadSafeTensors(f, offset)
 		if err != nil {
 			slog.Error("%v", err)
 			return []llm.Tensor{}, err
 		}
 		tensors = append(tensors, t...)
 	}
 	return tensors, nil
 }
-type ModelData struct {
+func GetParams(dirpath string) (*Params, error) {
-	Path    string
+	f, err := os.Open(filepath.Join(dirpath, "config.json"))
-	Name    string
+	if err != nil {
-	Params  *Params
+		return nil, err
-	Vocab   *Vocab
+	}
-	Tensors []llm.Tensor
+	defer f.Close()
 	Format  ModelFormat
 }
-func GetModelFormat(dirname string) (ModelFormat, error) {
+	var params Params
-	files, err := filepath.Glob(filepath.Join(dirname, "*"))
+
 	d := json.NewDecoder(f)
 	err = d.Decode(&params)
 	if err != nil {
 		return nil, err
 	}
-	for _, fn := range files {
+	return &params, nil
 		slog.Debug(fmt.Sprintf("file = %s", fn))
 		if strings.HasSuffix(fn, ".safetensors") {
 			return &SafetensorFormat{}, nil
 		} else if strings.HasSuffix(fn, ".bin") {
 			slog.Debug("model is torch")
 			return &TorchFormat{}, nil
 		}
 	}
 	return nil, fmt.Errorf("couldn't determine model format")
 }
 // Details on gguf's tokenizer can be found at:
@@ -89,7 +171,7 @@ type Vocab struct {
 	Types  []int32
 }
-func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
+func LoadTokens(dirpath string) (*Vocab, error) {
 	slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
 	in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
 	if err != nil {
@@ -114,14 +196,6 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 		v.Tokens = append(v.Tokens, p.GetPiece())
 		v.Scores = append(v.Scores, p.GetScore())
 		t := p.GetType()
 		switch t {
 		case sentencepiece.ModelProto_SentencePiece_UNKNOWN:
 		case sentencepiece.ModelProto_SentencePiece_CONTROL:
 		case sentencepiece.ModelProto_SentencePiece_UNUSED:
 		case sentencepiece.ModelProto_SentencePiece_BYTE:
 		default:
 			t = sentencepiece.ModelProto_SentencePiece_NORMAL
 		}
 		v.Types = append(v.Types, int32(t))
 	}
@@ -169,15 +243,89 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 	}
 	slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
-	if params.VocabSize > len(v.Tokens) {
+	return v, nil
-		missingTokens := params.VocabSize - len(v.Tokens)
+}
-		slog.Warn(fmt.Sprintf("vocab is missing %d tokens", missingTokens))
+
-		for cnt := 0; cnt < missingTokens; cnt++ {
+func GetTensorName(n string) (string, error) {
-			v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
+	tMap := map[string]string{
-			v.Scores = append(v.Scores, -1)
+		"model.embed_tokens.weight":                           "token_embd.weight",
-			v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
+		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
 		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
 		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
 		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
 		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
 		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
 		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
 		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
 		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
 		"lm_head.weight":    "output.weight",
 		"model.norm.weight": "output_norm.weight",
 	}
 	v, ok := tMap[n]
 	if ok {
 		return v, nil
 	}
 	// quick hack to rename the layers to gguf format
 	for k, v := range tMap {
 		re := regexp.MustCompile(k)
 		newName := re.ReplaceAllString(n, v)
 		if newName != n {
 			return newName, nil
 		}
 	}
-	return v, nil
+	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
 }
 // WriteGGUF encodes tensors, together with metadata taken from params and
 // vocab, as a little-endian GGUF model using the "llama" architecture, and
 // writes it to a new temporary file.
 //
 // It returns the path of the written file. If the file cannot be created,
 // encoded or closed, an error is returned and no temporary file is left behind.
 func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
 	c := llm.ContainerGGUF{
 		ByteOrder: binary.LittleEndian,
 	}
 	// The rope dimension is the per-head embedding size. Fall back to the
 	// previous fixed value of 128 when the config has no usable head count.
 	ropeDim := uint32(128)
 	if params.AttentionHeads > 0 && params.HiddenSize >= params.AttentionHeads {
 		ropeDim = uint32(params.HiddenSize / params.AttentionHeads)
 	}
 	m := llm.NewGGUFModel(&c)
 	m.Tensors = tensors
 	m.KV["general.architecture"] = "llama"
 	m.KV["general.name"] = name
 	m.KV["llama.context_length"] = uint32(params.ContextSize)
 	m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
 	m.KV["llama.block_count"] = uint32(params.HiddenLayers)
 	m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
 	m.KV["llama.rope.dimension_count"] = ropeDim
 	m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
 	m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
 	m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
 	m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
 	m.KV["general.file_type"] = uint32(1)
 	m.KV["tokenizer.ggml.model"] = "llama"
 	m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
 	m.KV["tokenizer.ggml.scores"] = vocab.Scores
 	m.KV["tokenizer.ggml.token_type"] = vocab.Types
 	m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
 	m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
 	m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
 	m.KV["tokenizer.ggml.add_bos_token"] = true
 	m.KV["tokenizer.ggml.add_eos_token"] = false
 	// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
 	c.V3.NumTensor = uint64(len(tensors))
 	c.V3.NumKV = uint64(len(m.KV))
 	f, err := os.CreateTemp("", "ollama-gguf")
 	if err != nil {
 		return "", err
 	}
 	if err := m.Encode(f); err != nil {
 		// close before removing so the cleanup also works where open files
 		// cannot be deleted
 		f.Close()           //nolint:errcheck
 		os.Remove(f.Name()) //nolint:errcheck
 		return "", err
 	}
 	// a failed close can mean the data never reached disk, so report it
 	if err := f.Close(); err != nil {
 		os.Remove(f.Name()) //nolint:errcheck
 		return "", err
 	}
 	return f.Name(), nil
 }
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -1,137 +0,0 @@
 package convert
 import (
 	"encoding/binary"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"strings"
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/ollama/ollama/llm"
 )
 // GemmaModel is the converter state for a model written out with the "gemma"
 // GGUF architecture. It adds no fields of its own and embeds ModelData.
 type GemmaModel struct {
 	ModelData
 }
 // gemmaLayerHandler converts one layer while it is written out. It reads
 // r.end-r.start bytes from f, decodes them from bfloat16 to float32, passes
 // the values through addOnes and writes the result to w using r's byte order.
 func gemmaLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
 	slog.Debug(fmt.Sprintf("converting '%s'", r.t.Name))
 	raw := make([]byte, r.end-r.start)
 	if readErr := binary.Read(f, r.bo, raw); readErr != nil {
 		return readErr
 	}
 	shifted, addErr := addOnes(bfloat16.DecodeFloat32(raw), int(r.t.Shape[0]))
 	if addErr != nil {
 		return addErr
 	}
 	return binary.Write(w, r.bo, shifted)
 }
 // addOnes wraps data in a tensor of length vectorSize, adds a tensor of ones
 // of the same length to it and returns the result flattened into a single
 // float32 slice. On failure it returns an empty slice and the error.
 func addOnes(data []float32, vectorSize int) ([]float32, error) {
 	vec := tensor.New(tensor.WithShape(vectorSize), tensor.WithBacking(data))
 	sum, err := vec.Add(tensor.Ones(tensor.Float32, vectorSize))
 	if err != nil {
 		return []float32{}, err
 	}
 	rows, err := native.SelectF32(sum, 0)
 	if err != nil {
 		return []float32{}, err
 	}
 	// stitch the selected rows back together in order
 	var flat []float32
 	for i := range rows {
 		flat = append(flat, rows[i]...)
 	}
 	return flat, nil
 }
 // GetTensors loads the model's tensors through m.Format and stores them in
 // m.Tensors. Every tensor whose name ends in "norm.weight" gets
 // gemmaLayerHandler installed on its writer.
 //
 // It returns the error from the format loader, or an error when a norm tensor
 // does not carry a safetensors writer. In both cases m.Tensors is unchanged.
 func (m *GemmaModel) GetTensors() error {
 	t, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
 	slog.Debug(fmt.Sprintf("Total tensors: %d", len(t)))
 	tensors := make([]llm.Tensor, 0, len(t))
 	for _, l := range t {
 		if strings.HasSuffix(l.Name, "norm.weight") {
 			// a checked assertion turns an unexpected writer type into an
 			// error instead of a panic
 			wt, ok := l.WriterTo.(safetensorWriterTo)
 			if !ok {
 				return fmt.Errorf("tensor '%s' has unsupported writer type %T", l.Name, l.WriterTo)
 			}
 			wt.handler = gemmaLayerHandler
 			l.WriterTo = wt
 		}
 		tensors = append(tensors, l)
 	}
 	m.Tensors = tensors
 	return nil
 }
 // LoadVocab reads the sentencepiece vocabulary for the model at m.Path and
 // stores it in m.Vocab. On failure m.Vocab is left untouched and the loader's
 // error is returned.
 func (m *GemmaModel) LoadVocab() error {
 	vocab, err := LoadSentencePieceTokens(m.Path, m.Params)
 	if err == nil {
 		m.Vocab = vocab
 	}
 	return err
 }
 // WriteGGUF encodes the model's metadata and tensors as GGUF v3, in the byte
 // order given by m.Params, into a new temporary file.
 //
 // It returns the path of the written file. If the file cannot be created,
 // encoded or closed, an error is returned and no temporary file is left behind.
 func (m *GemmaModel) WriteGGUF() (string, error) {
 	kv := llm.KV{
 		"general.architecture":                   "gemma",
 		"general.name":                           m.Name,
 		"gemma.context_length":                   uint32(m.Params.ContextSize),
 		"gemma.embedding_length":                 uint32(m.Params.HiddenSize),
 		"gemma.block_count":                      uint32(m.Params.HiddenLayers),
 		"gemma.feed_forward_length":              uint32(m.Params.IntermediateSize),
 		"gemma.attention.head_count":             uint32(m.Params.AttentionHeads),
 		"gemma.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
 		"gemma.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
 		"gemma.attention.key_length":             uint32(m.Params.HeadDimension),
 		"gemma.attention.value_length":           uint32(m.Params.HeadDimension),
 		"general.file_type":                      uint32(1),
 		"tokenizer.ggml.model":                   "llama",
 		"tokenizer.ggml.tokens":                  m.Vocab.Tokens,
 		"tokenizer.ggml.scores":                  m.Vocab.Scores,
 		"tokenizer.ggml.token_type":              m.Vocab.Types,
 		"tokenizer.ggml.bos_token_id":            uint32(m.Params.BoSTokenID),
 		"tokenizer.ggml.eos_token_id":            uint32(m.Params.EoSTokenID),
 		"tokenizer.ggml.padding_token_id":        uint32(m.Params.PaddingTokenID),
 		"tokenizer.ggml.unknown_token_id":        uint32(3),
 		"tokenizer.ggml.add_bos_token":           true,
 		"tokenizer.ggml.add_eos_token":           false,
 	}
 	f, err := os.CreateTemp("", "ollama-gguf")
 	if err != nil {
 		return "", err
 	}
 	mod := llm.NewGGUFV3(m.Params.ByteOrder)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		// close before removing so the cleanup also works where open files
 		// cannot be deleted
 		f.Close()           //nolint:errcheck
 		os.Remove(f.Name()) //nolint:errcheck
 		return "", err
 	}
 	// a failed close can mean the data never reached disk, so report it
 	if err := f.Close(); err != nil {
 		os.Remove(f.Name()) //nolint:errcheck
 		return "", err
 	}
 	return f.Name(), nil
 }
--- a/convert/llama.go
+++ b/convert/llama.go
@@ -1,176 +0,0 @@
 package convert
 import (
 	"encoding/binary"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"regexp"
 	"strings"
 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/x448/float16"
 	"github.com/ollama/ollama/llm"
 )
 // LlamaModel is the converter state for a model written out with the "llama"
 // GGUF architecture. It adds no fields of its own and embeds ModelData.
 type LlamaModel struct {
 	ModelData
 }
 // llamaLayerHandler repacks one attention layer while it is written out. The
 // layer's half-precision storage is converted to float16 bit patterns,
 // rearranged by llamaRepack and written to w using r's byte order.
 //
 // The head count is r.params.AttentionHeads for "attn_q" layers and
 // r.params.KeyValHeads for "attn_k" layers, where a zero KeyValHeads falls
 // back to AttentionHeads. Any other layer name, or storage that is not
 // *pytorch.HalfStorage, yields an error.
 func llamaLayerHandler(w io.Writer, r torchWriterTo) error {
 	slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name))
 	// a checked assertion turns unexpected storage into an error instead of a panic
 	storage, ok := r.storage.(*pytorch.HalfStorage)
 	if !ok {
 		return fmt.Errorf("layer '%s': expected half precision storage, got %T", r.t.Name, r.storage)
 	}
 	tData := make([]uint16, len(storage.Data))
 	for cnt, v := range storage.Data {
 		tData[cnt] = uint16(float16.Fromfloat32(v))
 	}
 	var heads uint32
 	switch {
 	case strings.Contains(r.t.Name, "attn_q"):
 		heads = uint32(r.params.AttentionHeads)
 	case strings.Contains(r.t.Name, "attn_k"):
 		heads = uint32(r.params.KeyValHeads)
 		if heads == 0 {
 			heads = uint32(r.params.AttentionHeads)
 		}
 	default:
 		return fmt.Errorf("unknown layer type")
 	}
 	slog.Debug(fmt.Sprintf("heads = %d", heads))
 	tData, err := llamaRepack(tData, int(heads), r.t.Shape)
 	if err != nil {
 		return err
 	}
 	return binary.Write(w, r.bo, tData)
 }
 // llamaRepack rearranges a 2-D attention weight matrix for gguf. data is
 // viewed as a shape[0] x shape[1] tensor, split into
 // (heads, 2, shape[0]/heads/2, shape[1]), has axes 1 and 2 swapped, and is
 // restored to its original shape. It is then transposed and flattened into
 // the returned slice.
 //
 // An error is returned when shape has fewer than two dimensions, when heads is
 // not positive, or when any tensor operation fails.
 func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
 	if len(shape) < 2 {
 		return nil, fmt.Errorf("expected a 2 dimensional shape, got %d dimension(s)", len(shape))
 	}
 	// heads is used as a divisor below, so zero would otherwise panic
 	if heads <= 0 {
 		return nil, fmt.Errorf("invalid head count %d", heads)
 	}
 	n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
 	origShape := n.Shape().Clone()
 	// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
 	if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
 		return nil, err
 	}
 	if err := n.T(0, 2, 1, 3); err != nil {
 		return nil, err
 	}
 	if err := n.Reshape(origShape...); err != nil {
 		return nil, err
 	}
 	if err := n.Transpose(); err != nil {
 		return nil, err
 	}
 	newN, err := native.SelectU16(n, 1)
 	if err != nil {
 		return nil, err
 	}
 	// the repacked tensor holds as many elements as the input
 	fullTensor := make([]uint16, 0, len(data))
 	for _, v := range newN {
 		fullTensor = append(fullTensor, v...)
 	}
 	return fullTensor, nil
 }
 // GetTensors loads the model's tensors through m.Format and stores them in
 // m.Tensors. Every tensor named blk.<n>.attn_q.weight or blk.<n>.attn_k.weight
 // gets llamaLayerHandler installed on its writer.
 //
 // It returns the error from the format loader, or an error when a matching
 // tensor does not carry a torch writer. In both cases m.Tensors is unchanged.
 func (m *LlamaModel) GetTensors() error {
 	t, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
 	pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
 	re, err := regexp.Compile(pattern)
 	if err != nil {
 		return err
 	}
 	tensors := make([]llm.Tensor, 0, len(t))
 	for _, l := range t {
 		// only a yes/no match is needed, the capture groups are not used
 		if re.MatchString(l.Name) {
 			slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
 			// a checked assertion turns an unexpected writer type into an
 			// error instead of a panic
 			wt, ok := l.WriterTo.(torchWriterTo)
 			if !ok {
 				return fmt.Errorf("tensor '%s' has unsupported writer type %T", l.Name, l.WriterTo)
 			}
 			wt.handler = llamaLayerHandler
 			l.WriterTo = wt
 		}
 		tensors = append(tensors, l)
 	}
 	m.Tensors = tensors
 	return nil
 }
 // LoadVocab reads the sentencepiece vocabulary for the model at m.Path and
 // stores it in m.Vocab, logging at debug level before and after the load.
 // On failure m.Vocab is left untouched and the loader's error is returned.
 func (m *LlamaModel) LoadVocab() error {
 	slog.Debug("loading vocab")
 	vocab, loadErr := LoadSentencePieceTokens(m.Path, m.Params)
 	if loadErr != nil {
 		return loadErr
 	}
 	slog.Debug("vocab loaded")
 	m.Vocab = vocab
 	return nil
 }
 // WriteGGUF encodes the model's metadata and tensors as GGUF v3, in the byte
 // order given by m.Params, into a new temporary file.
 //
 // It returns the path of the written file. An error is returned when the
 // attention head count is not positive, since it is the divisor for the rope
 // dimension. An error is also returned when the file cannot be created,
 // encoded or closed, and in those cases no temporary file is left behind.
 func (m *LlamaModel) WriteGGUF() (string, error) {
 	// guard the division below, a zero head count would otherwise panic
 	if m.Params.AttentionHeads <= 0 {
 		return "", fmt.Errorf("invalid attention head count %d", m.Params.AttentionHeads)
 	}
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
 		"llama.vocab_size":                       uint32(len(m.Vocab.Tokens)),
 		"llama.context_length":                   uint32(m.Params.ContextSize),
 		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
 		"llama.block_count":                      uint32(m.Params.HiddenLayers),
 		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
 		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
 		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
 		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
 		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
 		"general.file_type":                      uint32(1),
 		"tokenizer.ggml.model":                   "llama",
 		"tokenizer.ggml.tokens":                  m.Vocab.Tokens,
 		"tokenizer.ggml.scores":                  m.Vocab.Scores,
 		"tokenizer.ggml.token_type":              m.Vocab.Types,
 		"tokenizer.ggml.bos_token_id":            uint32(m.Params.BoSTokenID),
 		"tokenizer.ggml.eos_token_id":            uint32(m.Params.EoSTokenID),
 		"tokenizer.ggml.unknown_token_id":        uint32(0),
 		"tokenizer.ggml.add_bos_token":           true,
 		"tokenizer.ggml.add_eos_token":           false,
 	}
 	f, err := os.CreateTemp("", "ollama-gguf")
 	if err != nil {
 		return "", err
 	}
 	mod := llm.NewGGUFV3(m.Params.ByteOrder)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		// close before removing so the cleanup also works where open files
 		// cannot be deleted
 		f.Close()           //nolint:errcheck
 		os.Remove(f.Name()) //nolint:errcheck
 		return "", err
 	}
 	// a failed close can mean the data never reached disk, so report it
 	if err := f.Close(); err != nil {
 		os.Remove(f.Name()) //nolint:errcheck
 		return "", err
 	}
 	slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
 	return f.Name(), nil
 }
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -1,173 +0,0 @@
 package convert
 import (
 	"encoding/binary"
 	"fmt"
 	"io"
 	"os"
 	"regexp"
 	"strings"
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/x448/float16"
 	"github.com/ollama/ollama/llm"
 )
 // MistralModel is the model type returned for the "MistralForCausalLM"
 // architecture. It embeds ModelData and declares no fields of its own.
 type MistralModel struct {
 	ModelData
 }
 // mistralLayerHandler converts a single attn_q or attn_k tensor: it reads the
 // tensor's 16-bit values from f, repacks them per attention head, decodes them
 // as bfloat16 and writes them to w as float16.
 //
 // The head count comes from AttentionHeads for attn_q layers and from
 // KeyValHeads for attn_k layers, falling back to AttentionHeads when
 // KeyValHeads is zero. Any other layer name yields an error.
 func mistralLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
 	// every element is two bytes wide
 	raw := make([]uint16, (r.end-r.start)/2)
 	if err := binary.Read(f, r.bo, raw); err != nil {
 		return err
 	}
 	var heads uint32
 	switch {
 	case strings.Contains(r.t.Name, "attn_q"):
 		heads = uint32(r.params.AttentionHeads)
 	case strings.Contains(r.t.Name, "attn_k"):
 		heads = uint32(r.params.KeyValHeads)
 		if heads == 0 {
 			heads = uint32(r.params.AttentionHeads)
 		}
 	default:
 		return fmt.Errorf("unknown layer type")
 	}
 	repacked, err := repack(raw, int(heads), r.t.Shape)
 	if err != nil {
 		return err
 	}
 	// serialize the repacked values back to bytes for the bfloat16 decoder
 	var encoded []byte
 	for i := range repacked {
 		encoded = r.bo.AppendUint16(encoded, repacked[i])
 	}
 	halves := make([]uint16, len(repacked))
 	for i, v := range bfloat16.DecodeFloat32(encoded) {
 		halves[i] = uint16(float16.Fromfloat32(v))
 	}
 	return binary.Write(w, r.bo, halves)
 }
 // repack rearranges the elements of a 2-D tensor for gguf and returns them as
 // a new flat slice. data backs a tensor of shape[0] x shape[1]; it is reshaped
 // to (heads, 2, shape[0]/heads/2, shape[1]), axes 1 and 2 are swapped, and the
 // result is reshaped back to the original shape and transposed.
 //
 // An error is returned when heads is not positive, when shape has fewer than
 // two dimensions, or when any tensor operation fails.
 func repack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
 	// heads is used as a divisor below; reject values that would panic
 	if heads <= 0 {
 		return nil, fmt.Errorf("invalid head count %d", heads)
 	}
 	if len(shape) < 2 {
 		return nil, fmt.Errorf("expected a shape with at least 2 dimensions, got %d", len(shape))
 	}
 	n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
 	origShape := n.Shape().Clone()
 	// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
 	if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
 		return nil, err
 	}
 	if err := n.T(0, 2, 1, 3); err != nil {
 		return nil, err
 	}
 	if err := n.Reshape(origShape...); err != nil {
 		return nil, err
 	}
 	if err := n.Transpose(); err != nil {
 		return nil, err
 	}
 	newN, err := native.SelectU16(n, 1)
 	if err != nil {
 		return nil, err
 	}
 	// the rows are concatenated into one slice; reserve the space up front
 	fullTensor := make([]uint16, 0, len(data))
 	for _, v := range newN {
 		fullTensor = append(fullTensor, v...)
 	}
 	return fullTensor, nil
 }
 // GetTensors collects the tensors reported by the model's format into
 // m.Tensors. Tensors named blk.<n>.attn_q.weight or blk.<n>.attn_k.weight get
 // mistralLayerHandler installed on their writer; all others are kept as is.
 func (m *MistralModel) GetTensors() error {
 	layers, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
 	m.Tensors = []llm.Tensor{}
 	qkPattern, err := regexp.Compile(`^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`)
 	if err != nil {
 		return err
 	}
 	for _, layer := range layers {
 		if qkPattern.MatchString(layer.Name) {
 			// the writer is held by value, so modify a copy and store it back
 			writer := layer.WriterTo.(safetensorWriterTo)
 			writer.handler = mistralLayerHandler
 			layer.WriterTo = writer
 		}
 		m.Tensors = append(m.Tensors, layer)
 	}
 	return nil
 }
 // LoadVocab loads the vocabulary through LoadSentencePieceTokens and stores it
 // in m.Vocab; on failure the error is returned and m.Vocab is not modified.
 func (m *MistralModel) LoadVocab() error {
 	vocab, err := LoadSentencePieceTokens(m.Path, m.Params)
 	if err == nil {
 		m.Vocab = vocab
 	}
 	return err
 }
 // WriteGGUF encodes the model's key/value metadata and tensors as GGUF v3
 // into a newly created temporary file and returns that file's path.
 //
 // An error is returned when the attention head count is zero, when the
 // temporary file cannot be created, or when encoding fails. On an encoding
 // failure the partially written temporary file is removed.
 func (m *MistralModel) WriteGGUF() (string, error) {
 	// the rope dimension count below divides by the head count; without this
 	// check a zero value would panic with an integer division by zero
 	if m.Params.AttentionHeads == 0 {
 		return "", fmt.Errorf("invalid attention head count: 0")
 	}
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
 		"llama.context_length":                   uint32(m.Params.ContextSize),
 		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
 		"llama.block_count":                      uint32(m.Params.HiddenLayers),
 		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
 		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
 		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
 		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
 		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
 		"general.file_type":                      uint32(1),
 		"tokenizer.ggml.model":                   "llama",
 		"tokenizer.ggml.tokens":                  m.Vocab.Tokens,
 		"tokenizer.ggml.scores":                  m.Vocab.Scores,
 		"tokenizer.ggml.token_type":              m.Vocab.Types,
 		"tokenizer.ggml.bos_token_id":            uint32(m.Params.BoSTokenID),
 		"tokenizer.ggml.eos_token_id":            uint32(m.Params.EoSTokenID),
 		"tokenizer.ggml.add_bos_token":           true,
 		"tokenizer.ggml.add_eos_token":           false,
 		"tokenizer.ggml.unknown_token_id":        uint32(0),
 	}
 	f, err := os.CreateTemp("", "ollama-gguf")
 	if err != nil {
 		return "", err
 	}
 	defer f.Close()
 	mod := llm.NewGGUFV3(m.Params.ByteOrder)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		// the caller only gets an empty path back, so nothing else could
 		// ever clean up the partial file; close first so the removal also
 		// works on platforms that refuse to delete open files
 		f.Close()
 		os.Remove(f.Name())
 		return "", err
 	}
 	return f.Name(), nil
 }
--- a/convert/safetensors.go
+++ b/convert/safetensors.go
@@ -1,304 +0,0 @@
 package convert
 import (
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"regexp"
 	"slices"
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/mitchellh/mapstructure"
 	"github.com/x448/float16"
 	"github.com/ollama/ollama/llm"
 )
 // safetensorWriterTo writes the data of one tensor stored in a safetensors
 // file; see its WriteTo method.
 type safetensorWriterTo struct {
 	// t is the tensor this writer belongs to; its Name, Kind and Shape are
 	// consulted while writing
 	t *llm.Tensor
 	params *Params
 	bo     ByteOrder
 	// filename is the safetensors file, opened anew on every WriteTo call
 	filename string
 	// start and end are the tensor's data offsets taken from the file header;
 	// padding (8 + header size) is added to start when seeking to the data
 	start, end, padding uint64
 	// handler, when non-nil, replaces the default conversion done by WriteTo
 	handler             func(w io.Writer, r safetensorWriterTo, f *os.File) error
 }
 // tensorMetaData is the per-tensor entry of a safetensors JSON header, decoded
 // with mapstructure from the "dtype", "shape" and "data_offsets" keys.
 type tensorMetaData struct {
 	Type    string `mapstructure:"dtype"`
 	Shape   []int  `mapstructure:"shape"`
 	Offsets []int  `mapstructure:"data_offsets"`
 }
 // SafetensorFormat carries the methods for reading models stored as
 // safetensors files; it has no state of its own.
 type SafetensorFormat struct{}
 // GetTensors returns the tensors described by every model-*.safetensors file
 // in dirpath. The files are read in the order filepath.Glob reports them and
 // the tensor offsets continue from one file into the next.
 //
 // It returns an error when the glob pattern is rejected or when any file
 // cannot be read.
 func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
 	slog.Debug("getting tensor data")
 	var tensors []llm.Tensor
 	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
 	if err != nil {
 		return nil, err
 	}
 	var offset uint64
 	for _, f := range files {
 		var t []llm.Tensor
 		var err error
 		t, offset, err = m.readTensors(f, offset, params)
 		if err != nil {
 			// slog takes a message followed by key/value pairs, not a
 			// printf-style format string
 			slog.Error("couldn't read tensors", "file", f, "error", err)
 			return nil, err
 		}
 		tensors = append(tensors, t...)
 	}
 	slog.Debug(fmt.Sprintf("all tensors = %d", len(tensors)))
 	return tensors, nil
 }
 // readTensors parses the JSON header of the safetensors file fn and returns
 // one llm.Tensor per header entry that has a shape, in sorted key order,
 // together with the offset that follows the last tensor.
 //
 // offset is where the first tensor of this file starts in the output.
 // 1-D tensors are sized as float32 (kind 0) and 2-D tensors as float16
 // (kind 1); entries without a shape are skipped.
 //
 // An error is returned when the file cannot be read, the header is malformed,
 // a tensor has more than two dimensions or lacks its two data offsets, or no
 // gguf name is known for a tensor.
 func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ([]llm.Tensor, uint64, error) {
 	f, err := os.Open(fn)
 	if err != nil {
 		return nil, 0, err
 	}
 	defer f.Close()
 	// the file begins with the little-endian size of the JSON header
 	var jsonSize uint64
 	if err := binary.Read(f, binary.LittleEndian, &jsonSize); err != nil {
 		return nil, 0, err
 	}
 	// a corrupt size would otherwise trigger an enormous allocation; the
 	// header can never be larger than what follows the 8 byte size field
 	fi, err := f.Stat()
 	if err != nil {
 		return nil, 0, err
 	}
 	if size := fi.Size(); size >= 8 && jsonSize > uint64(size-8) {
 		return nil, 0, fmt.Errorf("invalid header size %d in '%s'", jsonSize, fn)
 	}
 	buf := make([]byte, jsonSize)
 	if _, err = io.ReadFull(f, buf); err != nil {
 		return nil, 0, err
 	}
 	d := json.NewDecoder(bytes.NewBuffer(buf))
 	d.UseNumber()
 	var parsed map[string]interface{}
 	if err = d.Decode(&parsed); err != nil {
 		return nil, 0, err
 	}
 	// map iteration order is random; sort so offsets are assigned predictably
 	keys := make([]string, 0, len(parsed))
 	for k := range parsed {
 		keys = append(keys, k)
 	}
 	slices.Sort(keys)
 	slog.Info("converting layers")
 	var tensors []llm.Tensor
 	for _, k := range keys {
 		vals, ok := parsed[k].(map[string]interface{})
 		if !ok {
 			return nil, 0, fmt.Errorf("unexpected header entry for '%s' in '%s'", k, fn)
 		}
 		var data tensorMetaData
 		if err = mapstructure.Decode(vals, &data); err != nil {
 			slog.Error("couldn't decode properly")
 			return nil, 0, err
 		}
 		slog.Debug(fmt.Sprintf("metadata = %#v", data))
 		var size uint64
 		var kind uint32
 		switch len(data.Shape) {
 		case 0:
 			// metadata
 			continue
 		case 1:
 			// convert to float32
 			kind = 0
 			size = uint64(data.Shape[0] * 4)
 		case 2:
 			// convert to float16
 			kind = 1
 			size = uint64(data.Shape[0] * data.Shape[1] * 2)
 		default:
 			// no size is known for higher ranks, so every following offset
 			// would be wrong
 			return nil, 0, fmt.Errorf("unsupported number of dimensions (%d) for '%s'", len(data.Shape), k)
 		}
 		if len(data.Offsets) != 2 {
 			return nil, 0, fmt.Errorf("expected 2 data offsets for '%s', got %d", k, len(data.Offsets))
 		}
 		ggufName, err := m.GetLayerName(k)
 		if err != nil {
 			// slog takes a message followed by key/value pairs, not a
 			// printf-style format string
 			slog.Error(err.Error())
 			return nil, 0, err
 		}
 		shape := []uint64{0, 0, 0, 0}
 		for i := range data.Shape {
 			shape[i] = uint64(data.Shape[i])
 		}
 		t := llm.Tensor{
 			Name:   ggufName,
 			Kind:   kind,
 			Offset: offset,
 			Shape:  shape[:],
 		}
 		t.WriterTo = safetensorWriterTo{
 			t:        &t,
 			params:   params,
 			bo:       params.ByteOrder,
 			filename: fn,
 			start:    uint64(data.Offsets[0]),
 			end:      uint64(data.Offsets[1]),
 			padding:  8 + jsonSize,
 		}
 		tensors = append(tensors, t)
 		offset += size
 	}
 	slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
 	slog.Debug(fmt.Sprintf("offset = %d", offset))
 	return tensors, offset, nil
 }
 // GetParams decodes dirpath/config.json into a Params value and sets its byte
 // order to little endian. Errors from opening or decoding the file are
 // returned unchanged.
 func (m *SafetensorFormat) GetParams(dirpath string) (*Params, error) {
 	cfg, err := os.Open(filepath.Join(dirpath, "config.json"))
 	if err != nil {
 		return nil, err
 	}
 	defer cfg.Close()
 	params := new(Params)
 	if err := json.NewDecoder(cfg).Decode(params); err != nil {
 		return nil, err
 	}
 	params.ByteOrder = binary.LittleEndian
 	return params, nil
 }
 // GetLayerName maps the tensor name n to its gguf name. Exact names are looked
 // up first; otherwise each per-layer pattern is applied in turn and the first
 // one that changes n wins. An error is returned when nothing matches.
 func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
 	exact := map[string]string{
 		"model.embed_tokens.weight": "token_embd.weight",
 		"lm_head.weight":            "output.weight",
 		"model.norm.weight":         "output_norm.weight",
 	}
 	if name, found := exact[n]; found {
 		return name, nil
 	}
 	perLayer := map[string]string{
 		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
 		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
 		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
 		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
 		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
 		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
 		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
 		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
 		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
 	}
 	// quick hack to rename the layers to gguf format: a pattern is considered
 	// a match when substituting it changes the name
 	for pattern, replacement := range perLayer {
 		if renamed := regexp.MustCompile(pattern).ReplaceAllString(n, replacement); renamed != n {
 			return renamed, nil
 		}
 	}
 	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
 }
 // WriteTo converts the tensor's data from the safetensors file and writes it
 // to w. The data is decoded as bfloat16 and written as float32 for kind 0 or
 // as float16 for kind 1; when a handler is set, it does the conversion
 // instead.
 //
 // The number of bytes written is not tracked: the returned n is always 0.
 // An error is returned when the file cannot be opened or read, when the data
 // offsets are inverted, when the file ends before all of the tensor's data
 // was read, or when writing to w fails.
 func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
 	f, err := os.Open(r.filename)
 	if err != nil {
 		return 0, err
 	}
 	defer f.Close()
 	if _, err = f.Seek(int64(r.padding+r.start), 0); err != nil {
 		return 0, err
 	}
 	// use the handler if one is present
 	if r.handler != nil {
 		return 0, r.handler(w, r, f)
 	}
 	// the subtraction below is unsigned and would wrap around
 	if r.end < r.start {
 		return 0, fmt.Errorf("invalid data offsets for '%s': %d-%d", r.t.Name, r.start, r.end)
 	}
 	remaining := r.end - r.start
 	// one buffer is reused for every chunk
 	buf := make([]byte, min(uint64(10240), remaining))
 	for remaining > 0 {
 		data := buf[:min(uint64(len(buf)), remaining)]
 		if _, err := io.ReadFull(f, data); err != nil {
 			// hitting the end of the file here means the tensor is
 			// truncated; report it instead of writing a zero-filled chunk
 			if err == io.EOF {
 				err = io.ErrUnexpectedEOF
 			}
 			return 0, err
 		}
 		remaining -= uint64(len(data))
 		// convert bfloat16 -> ieee float32
 		tDataF32 := bfloat16.DecodeFloat32(data)
 		switch r.t.Kind {
 		case 0:
 			if err := binary.Write(w, r.bo, tDataF32); err != nil {
 				return 0, err
 			}
 		case 1:
 			// convert float32 -> float16
 			tempBuf := make([]uint16, len(data)/2)
 			for cnt, v := range tDataF32 {
 				tDataF16 := float16.Fromfloat32(v)
 				tempBuf[cnt] = uint16(tDataF16)
 			}
 			if err := binary.Write(w, r.bo, tempBuf); err != nil {
 				return 0, err
 			}
 		}
 	}
 	return 0, nil
 }
 // GetModelArch returns the model implementation for the single architecture
 // named in params.Architectures: MistralModel for "MistralForCausalLM" and
 // GemmaModel for "GemmaForCausalLM". It returns an error when no architecture
 // is given, when more than one is given, or when the architecture is not
 // supported.
 func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
 	if len(params.Architectures) == 0 {
 		return nil, fmt.Errorf("No architecture specified to convert")
 	}
 	if len(params.Architectures) != 1 {
 		return nil, fmt.Errorf("Unknown error")
 	}
 	data := ModelData{
 		Name:   name,
 		Path:   dirPath,
 		Params: params,
 		Format: m,
 	}
 	switch arch := params.Architectures[0]; arch {
 	case "MistralForCausalLM":
 		return &MistralModel{data}, nil
 	case "GemmaForCausalLM":
 		return &GemmaModel{data}, nil
 	default:
 		return nil, fmt.Errorf("Models based on '%s' are not yet supported", arch)
 	}
 }
--- a/convert/torch.go
+++ b/convert/torch.go
@@ -1,286 +0,0 @@
 package convert
 import (
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"regexp"
 	"strings"
 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/nlpodyssey/gopickle/types"
 	"github.com/x448/float16"
 	"github.com/ollama/ollama/llm"
 )
 // torchWriterTo writes the data of one tensor loaded from a pytorch
 // checkpoint; see its WriteTo method.
 type torchWriterTo struct {
 	// t is the tensor this writer belongs to; its Name and Kind are
 	// consulted while writing
 	t *llm.Tensor
 	params *Params
 	bo     ByteOrder
 	// storage holds the tensor's unpickled data
 	storage pytorch.StorageInterface
 	// handler, when non-nil, replaces the default conversion done by WriteTo
 	handler func(w io.Writer, r torchWriterTo) error
 }
 // TorchFormat carries the methods for reading models stored as pytorch
 // checkpoint files; it has no state of its own.
 type TorchFormat struct{}
 // GetTensors returns the tensors found in every pytorch_model-*.bin file in
 // dirpath, with offsets continuing from one file into the next. Keys ending
 // in "self_attn.rotary_emb.inv_freq" and tensors without a shape are skipped.
 // 1-D tensors are sized as float32 (kind 0) and 2-D tensors as float16
 // (kind 1).
 //
 // An error is returned when the glob pattern is rejected, a file cannot be
 // unpickled or does not hold a dictionary of tensors keyed by string, a
 // tensor has more than two dimensions, or no gguf name is known for a tensor.
 func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
 	slog.Debug("getting torch tensors")
 	files, err := filepath.Glob(filepath.Join(dirpath, "pytorch_model-*.bin"))
 	if err != nil {
 		slog.Error("didn't find any torch files")
 		return nil, err
 	}
 	var offset uint64
 	var tensors []llm.Tensor
 	for _, fn := range files {
 		m, err := pytorch.Load(fn)
 		if err != nil {
 			slog.Error(fmt.Sprintf("error unpickling: %q", err))
 			return []llm.Tensor{}, err
 		}
 		// the file contents are untrusted; verify types instead of panicking
 		dict, ok := m.(*types.Dict)
 		if !ok {
 			return nil, fmt.Errorf("unexpected contents in '%s': %T", fn, m)
 		}
 		for _, k := range dict.Keys() {
 			name, ok := k.(string)
 			if !ok {
 				return nil, fmt.Errorf("unexpected key type in '%s': %T", fn, k)
 			}
 			if strings.HasSuffix(name, "self_attn.rotary_emb.inv_freq") {
 				continue
 			}
 			t, _ := dict.Get(k)
 			pt, ok := t.(*pytorch.Tensor)
 			if !ok {
 				return nil, fmt.Errorf("unexpected value for '%s' in '%s': %T", name, fn, t)
 			}
 			tshape := pt.Size
 			var size uint64
 			var kind uint32
 			switch len(tshape) {
 			case 0:
 				continue
 			case 1:
 				// convert to float32
 				kind = 0
 				size = uint64(tshape[0] * 4)
 			case 2:
 				// convert to float16
 				kind = 1
 				size = uint64(tshape[0] * tshape[1] * 2)
 			default:
 				// no size is known for higher ranks, so every following
 				// offset would be wrong
 				return nil, fmt.Errorf("unsupported number of dimensions (%d) for '%s'", len(tshape), name)
 			}
 			ggufName, err := tf.GetLayerName(name)
 			if err != nil {
 				// slog takes a message followed by key/value pairs, not a
 				// printf-style format string
 				slog.Error(err.Error())
 				return nil, err
 			}
 			slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", name, ggufName))
 			shape := []uint64{0, 0, 0, 0}
 			for i := range tshape {
 				shape[i] = uint64(tshape[i])
 			}
 			tensor := llm.Tensor{
 				Name:   ggufName,
 				Kind:   kind,
 				Offset: offset, // calculate the offset
 				Shape:  shape[:],
 			}
 			tensor.WriterTo = torchWriterTo{
 				t:       &tensor,
 				params:  params,
 				bo:      params.ByteOrder,
 				storage: pt.Source,
 			}
 			tensors = append(tensors, tensor)
 			offset += size
 		}
 	}
 	return tensors, nil
 }
 // getAltParams builds Params from dirpath/params.json, the alternative to
 // config.json. The context size is not read from the file: it is 16384 when
 // rope_theta is 1000000, 4096 when norm_eps is 1e-06, and 2048 otherwise.
 // The byte order is set to little endian.
 func getAltParams(dirpath string) (*Params, error) {
 	f, err := os.Open(filepath.Join(dirpath, "params.json"))
 	if err != nil {
 		slog.Error("no params.json")
 		return nil, err
 	}
 	defer f.Close()
 	var raw struct {
 		HiddenSize     int     `json:"dim"`
 		AttentionHeads int     `json:"n_heads"`
 		KeyValHeads    int     `json:"n_kv_heads"`
 		HiddenLayers   int     `json:"n_layers"`
 		RopeTheta      int     `json:"rope_theta"`
 		NormEPS        float64 `json:"norm_eps"`
 	}
 	if err := json.NewDecoder(f).Decode(&raw); err != nil {
 		return nil, err
 	}
 	params := &Params{
 		HiddenSize:     raw.HiddenSize,
 		AttentionHeads: raw.AttentionHeads,
 		KeyValHeads:    raw.KeyValHeads,
 		HiddenLayers:   raw.HiddenLayers,
 		NormEPS:        raw.NormEPS,
 	}
 	if raw.RopeTheta == 1000000 {
 		// Codellama
 		params.ContextSize = 16384
 	} else if raw.NormEPS == 1e-06 {
 		// llama2
 		slog.Debug("Found llama2 - setting context size to 4096")
 		params.ContextSize = 4096
 	} else {
 		params.ContextSize = 2048
 	}
 	params.ByteOrder = binary.LittleEndian
 	return params, nil
 }
 // GetParams decodes dirpath/config.json into a Params value and sets its byte
 // order to little endian. When config.json does not exist, the parameters are
 // read from params.json through getAltParams instead. Any other error from
 // opening or decoding the file is returned unchanged.
 func (m *TorchFormat) GetParams(dirpath string) (*Params, error) {
 	f, err := os.Open(filepath.Join(dirpath, "config.json"))
 	if err != nil {
 		if os.IsNotExist(err) {
 			// try params.json instead
 			return getAltParams(dirpath)
 		}
 		return nil, err
 	}
 	// release the file handle once decoding is done
 	defer f.Close()
 	var params Params
 	d := json.NewDecoder(f)
 	err = d.Decode(&params)
 	if err != nil {
 		return nil, err
 	}
 	params.ByteOrder = binary.LittleEndian
 	return &params, nil
 }
 // GetLayerName maps the tensor name n to its gguf name. Exact names are looked
 // up first; otherwise each per-layer pattern is applied in turn and the first
 // one that changes n wins. An error is returned when nothing matches.
 func (m *TorchFormat) GetLayerName(n string) (string, error) {
 	exact := map[string]string{
 		"tok_embeddings.weight":     "token_embd.weight",
 		"output.weight":             "output.weight",
 		"norm.weight":               "output_norm.weight",
 		"rope.freqs":                "rope_freqs.weight",
 		"model.embed_tokens.weight": "token_embd.weight",
 		"lm_head.weight":            "output.weight",
 		"model.norm.weight":         "output_norm.weight",
 	}
 	if name, found := exact[n]; found {
 		return name, nil
 	}
 	perLayer := map[string]string{
 		"layers.(\\d+).attention_norm.weight":                 "blk.$1.attn_norm.weight",
 		"layers.(\\d+).attention_output_norm.weight":          "blk.$1.attn_norm.weight",
 		"layers.(\\d+).feed_forward.w2.weight":                "blk.$1.ffn_down.weight",
 		"layers.(\\d+).feed_forward.w1.weight":                "blk.$1.ffn_gate.weight",
 		"layers.(\\d+).feed_forward.w3.weight":                "blk.$1.ffn_up.weight",
 		"layers.(\\d+).ffn_norm.weight":                       "blk.$1.ffn_norm.weight",
 		"layers.(\\d+).attention.wk.weight":                   "blk.$1.attn_k.weight",
 		"layers.(\\d+).attention.wo.weight":                   "blk.$1.attn_output.weight",
 		"layers.(\\d+).attention.wq.weight":                   "blk.$1.attn_q.weight",
 		"layers.(\\d+).attention.wv.weight":                   "blk.$1.attn_v.weight",
 		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
 		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
 		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
 		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
 		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
 		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
 		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
 		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
 		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
 	}
 	// quick hack to rename the layers to gguf format: a pattern is considered
 	// a match when substituting it changes the name
 	for pattern, replacement := range perLayer {
 		if renamed := regexp.MustCompile(pattern).ReplaceAllString(n, replacement); renamed != n {
 			return renamed, nil
 		}
 	}
 	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
 }
 // WriteTo writes the tensor's data to w. A handler, when set, does all the
 // work. Otherwise half storage is written unchanged for kind 0 or converted
 // to float16 for kind 1; float storage is skipped with a warning, and any
 // other storage type writes nothing. The returned n is always 0.
 func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) {
 	// use the handler if one is present
 	if r.handler != nil {
 		return 0, r.handler(w, r)
 	}
 	switch storage := r.storage.(type) {
 	case *pytorch.FloatStorage:
 		slog.Warn(fmt.Sprintf("unexpected storage found for layer '%s'; skipping", r.t.Name))
 		return 0, nil
 	case *pytorch.HalfStorage:
 		values := storage.Data
 		if r.t.Kind == 0 {
 			slog.Debug(fmt.Sprintf("%35s F32 (%d)", r.t.Name, len(values)))
 			if err := binary.Write(w, r.bo, values); err != nil {
 				return 0, err
 			}
 		} else if r.t.Kind == 1 {
 			halves := make([]uint16, len(values))
 			for i := range values {
 				halves[i] = uint16(float16.Fromfloat32(values[i]))
 			}
 			slog.Debug(fmt.Sprintf("%35s F16 (%d)", r.t.Name, len(halves)))
 			if err := binary.Write(w, r.bo, halves); err != nil {
 				return 0, err
 			}
 		}
 	}
 	return 0, nil
 }
 // GetModelArch returns the model implementation for the single architecture
 // named in params.Architectures; only "LlamaForCausalLM" is handled. It
 // returns an error when no architecture is given, when more than one is
 // given, or when the architecture is not supported.
 func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
 	if len(params.Architectures) == 0 {
 		return nil, fmt.Errorf("No architecture specified to convert")
 	}
 	if len(params.Architectures) != 1 {
 		return nil, fmt.Errorf("Unknown error")
 	}
 	if arch := params.Architectures[0]; arch != "LlamaForCausalLM" {
 		return nil, fmt.Errorf("Models based on '%s' are not yet supported", arch)
 	}
 	data := ModelData{
 		Name:   name,
 		Path:   dirPath,
 		Params: params,
 		Format: m,
 	}
 	return &LlamaModel{data}, nil
 }
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,7 +3,7 @@
 ### Getting Started
 * [Quickstart](../README.md#quickstart)
 * [Examples](../examples)
-* [Importing models](./import.md)
+* [Importing models](./import.md) from GGUF, Pytorch and Safetensors
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
--- a/docs/api.md
+++ b/docs/api.md
@@ -90,7 +90,7 @@ The final response in the stream also includes additional data about the generat
 - `load_duration`: time spent in nanoseconds loading the model
 - `prompt_eval_count`: number of tokens in the prompt
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
- `eval_count`: number of tokens in the response
+- `eval_count`: number of tokens in the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
 - `response`: empty if the response was streamed, if not streamed, this will contain the full response
@@ -394,6 +394,7 @@ Advanced parameters (optional):
 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
--- a/docs/development.md
+++ b/docs/development.md
@@ -69,7 +69,7 @@ go build .
 _Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
@@ -116,30 +116,29 @@ Note: The windows build for Ollama is still under development.
 Install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
+- MSVC toolchain - C/C++ and cmake as minimal requirements - You must build from a "Developer Shell" with the environment variables set
- Go version 1.22 or higher
+- go version 1.22 or higher
 - MinGW (pick one variant) with GCC.
-  - [MinGW-w64](https://www.mingw-w64.org/)
+  - <https://www.mingw-w64.org/>
-  - [MSYS2](https://www.msys2.org/)
+  - <https://www.msys2.org/>
 ```powershell
 $env:CGO_ENABLED="1"
 go generate ./...
 go build .
 ```
 #### Windows CUDA (NVIDIA)
-In addition to the common Windows development tools described above, install CUDA after installing MSVC.
+In addition to the common Windows development tools described above, install CUDA **AFTER** you install MSVC.
 - [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
 #### Windows ROCm (AMD Radeon)
-In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
+In addition to the common Windows development tools described above, install AMD's HIP package **AFTER** you install MSVC.
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
+- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
 - [Strawberry Perl](https://strawberryperl.com/)
 Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -14,10 +14,6 @@ curl -fsSL https://ollama.com/install.sh | sh
 Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
 ## Is my GPU compatible with Ollama?
 Please refer to the [GPU docs](./gpu.md).
 ## How can I specify the context window size?
 By default, Ollama uses a context window size of 2048 tokens.
@@ -99,37 +95,6 @@ Ollama binds 127.0.0.1 port 11434 by default. Change the bind address with the `
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
 ## How can I use Ollama with a proxy server?
 Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
 ```
 server {
    listen 80;
    server_name example.com;  # Replace with your domain or IP
    location / {
        proxy_pass http://localhost:11434;
        proxy_set_header Host localhost:11434;
    }
 }
 ```
 ## How can I use Ollama with ngrok?
 Ollama can be accessed using a range of tunneling tools. For example, with Ngrok:
 ```
 ngrok http 11434 --host-header="localhost:11434"
 ```
 ## How can I use Ollama with Cloudflare Tunnel?
 To use Ollama with Cloudflare Tunnel, use the `--url` and `--http-host-header` flags:
 ```
 cloudflared tunnel --url http://localhost:11434 --http-host-header="localhost:11434"
 ```
 ## How can I allow additional web origins to access Ollama?
 Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
@@ -154,7 +119,7 @@ No. Ollama runs locally, and conversation data does not leave your machine.
 ## How can I use Ollama in Visual Studio Code?
-There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
+There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
 ## How do I use Ollama behind a proxy?
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,102 +0,0 @@
 # GPU
 ## Nvidia
 Ollama supports Nvidia GPUs with compute capability 5.0+.
 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
 | Compute Capability | Family              | Cards                                                                                                       |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
 | 9.0                | NVIDIA              | `H100`                                                                                                      |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti`                                                           |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
 | 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060`         |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2`                          |
 | 8.0                | NVIDIA              | `A100` `A30`                                                                                                |
 | 7.5                | GeForce GTX/RTX     | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060`                                    |
 |                    | NVIDIA Professional | `T4` `RTX 5000` `RTX 4000` `RTX 3000` `T2000` `T1200` `T1000` `T600` `T500`                                 |
 |                    | Quadro              | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000`                                                                 |
 | 7.0                | NVIDIA              | `TITAN V` `V100` `Quadro GV100`                                                                             |
 | 6.1                | NVIDIA TITAN        | `TITAN Xp` `TITAN X`                                                                                        |
 |                    | GeForce GTX         | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050`                                     |
 |                    | Quadro              | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
 |                    | Tesla               | `P40` `P4`                                                                                                  |
 | 6.0                | NVIDIA              | `Tesla P100` `Quadro GP100`                                                                                 |
 | 5.2                | GeForce GTX         | `GTX TITAN X` `GTX 980 Ti` `GTX 980` `GTX 970` `GTX 960` `GTX 950`                                          |
 |                    | Quadro              | `M6000 24GB` `M6000` `M5000` `M5500M` `M4000` `M2200` `M2000` `M620`                                        |
 |                    | Tesla               | `M60` `M40`                                                                                                 |
 | 5.0                | GeForce GTX         | `GTX 750 Ti` `GTX 750` `NVS 810`                                                                            |
 |                    | Quadro              | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M`  |
 ### GPU Selection
 If you have multiple NVIDIA GPUs in your system and want to limit Ollama to use
 a subset, you can set `CUDA_VISIBLE_DEVICES` to a comma separated list of GPUs.
 Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
 You can discover the UUID of your GPUs by running `nvidia-smi -L`. If you want to
 ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1").
 ### Laptop Suspend Resume
 On Linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
 your NVIDIA GPU, and fall back to running on the CPU.  You can work around this
 driver bug by reloading the NVIDIA UVM driver with `sudo rmmod nvidia_uvm &&
 sudo modprobe nvidia_uvm`.
 ## AMD Radeon
 Ollama supports the following AMD GPUs:
 | Family         | Cards and accelerators                                                                                                               |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56`    |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
 | AMD Instinct   | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50`                                                               |
 ### Overrides
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
 some cases you can force the system to try to use a similar LLVM target that is
 close.  For example, the Radeon RX 5400 is `gfx1034` (also known as 10.3.4)
 however, ROCm does not currently support this target. The closest support is
 `gfx1030`.  You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with
 `x.y.z` syntax.  So for example, to force the system to run on the RX 5400, you
 would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
 server.  If you have an unsupported AMD GPU you can experiment using the list of
 supported types below.
 At this time, the known supported GPU types are the following LLVM Targets.
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
 |-----------------|---------------------|
 | gfx900 | Radeon RX Vega 56 |
 | gfx906 | Radeon Instinct MI50 |
 | gfx908 | Radeon Instinct MI100 |
 | gfx90a | Radeon Instinct MI210 |
 | gfx940 | Radeon Instinct MI300 |
 | gfx941 | |
 | gfx942 | |
 | gfx1030 | Radeon PRO V620 |
 | gfx1100 | Radeon PRO W7900 |
 | gfx1101 | Radeon PRO W7700 |
 | gfx1102 | Radeon RX 7600 |
 AMD is working on enhancing ROCm v6 to broaden support for families of GPUs in a
 future release which should increase support for more GPUs.
 Reach out on [Discord](https://discord.gg/ollama) or file an
 [issue](https://github.com/ollama/ollama/issues) for additional help.
 ### GPU Selection
 If you have multiple AMD GPUs in your system and want to limit Ollama to use a
 subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
 You can see the list of devices with `rocminfo`.  If you want to ignore the GPUs
 and force CPU usage, use an invalid GPU ID (e.g., "-1")
 ### Container Permission
 In some Linux distributions, SELinux can prevent containers from
 accessing the AMD GPU devices.  On the host system you can run 
 `sudo setsebool container_use_devices=1` to allow containers to use devices.
 ### Metal (Apple GPUs)
 Ollama supports GPU acceleration on Apple devices via the Metal API.
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -113,7 +113,7 @@ FROM llama2
 ```
 A list of available base models:
-<https://github.com/ollama/ollama#model-library>
+<https://github.com/jmorganca/ollama#model-library>
 #### Build from a `bin` file
@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
 PARAMETER <parameter> <parametervalue>
 ```
-#### Valid Parameters and Values
+### Valid Parameters and Values
 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -139,6 +139,9 @@ PARAMETER <parameter> <parametervalue>
 | mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)                        | float      | mirostat_eta 0.1     |
 | mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                                         | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
 | num_gqa        | The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b                                                                                                                                         | int        | num_gqa 1            |
 | num_gpu        | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable.                                                                                                                                            | int        | num_gpu 50           |
 | num_thread     | Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). | int        | num_thread 8         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |
@@ -180,7 +183,7 @@ SYSTEM """<system message>"""
 ### ADAPTER
-The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
+The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
 ```modelfile
 ADAPTER ./ollama-lora.bin
@@ -198,22 +201,7 @@ LICENSE """
 ### MESSAGE
-The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
+The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
 ```modelfile
 MESSAGE <role> <message>
 ```
 #### Valid roles
 | Role      | Description                                                  |
 | --------- | ------------------------------------------------------------ |
 | system    | Alternate way of providing the SYSTEM message for the model. |
 | user      | An example message of what the user could have asked.        |
 | assistant | An example message of how the model should respond.          |
 #### Example conversation
 ```modelfile
 MESSAGE user Is Toronto in Canada?
@@ -224,7 +212,6 @@ MESSAGE user Is Ontario in Canada?
 MESSAGE assistant yes
 ```
 ## Notes
 - The **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make them easier to distinguish from arguments.
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -1,6 +1,6 @@
 # OpenAI compatibility
-> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
+> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).
 Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -67,19 +67,49 @@ You can see what features your CPU has with the following.
 cat /proc/cpuinfo| grep flags  | head -1
 ```
-## Installing older or pre-release versions on Linux
+## AMD Radeon GPU Support
-If you run into problems on Linux and want to install an older version, or you'd
+Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
-like to try out a pre-release before it's officially released, you can tell the
+some cases you can force the system to try to use a similar LLVM target that is
-install script which version to install.
+close.  For example, the Radeon RX 5400 is `gfx1034` (also known as 10.3.4);
 however, ROCm does not currently support this target. The closest support is
 `gfx1030`.  You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with
 `x.y.z` syntax.  So for example, to force the system to run on the RX 5400, you
 would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
 server.  If you have an unsupported AMD GPU you can experiment using the list of
 supported types below.
 At this time, the known supported GPU types are the following LLVM Targets.
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
 |-----------------|---------------------|
 | gfx900 | Radeon RX Vega 56 |
 | gfx906 | Radeon Instinct MI50 |
 | gfx908 | Radeon Instinct MI100 |
 | gfx90a | Radeon Instinct MI210 |
 | gfx940 | Radeon Instinct MI300 |
 | gfx941 | |
 | gfx942 | |
 | gfx1030 | Radeon PRO V620 |
 | gfx1100 | Radeon PRO W7900 |
 | gfx1101 | Radeon PRO W7700 |
 | gfx1102 | Radeon RX 7600 |
 AMD is working on enhancing ROCm v6 to broaden support for families of GPUs in a
 future release which should increase support for more GPUs.
 Reach out on [Discord](https://discord.gg/ollama) or file an
 [issue](https://github.com/ollama/ollama/issues) for additional help.
 ## Installing older versions on Linux
 If you run into problems on Linux and want to install an older version you can tell the install script
 which version to install.
 ```sh
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
 ```
-## Linux tmp noexec 
+## Known issues
-If your system is configured with the "noexec" flag where Ollama stores its
+* N/A
 temporary executable files, you can specify an alternate location by setting
 OLLAMA_TMPDIR to a location writable by the user ollama runs as.  For example
 OLLAMA_TMPDIR=/usr/share/ollama/
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -18,7 +18,7 @@ const ollama = new Ollama({
  model: "llama2",
 });
-const answer = await ollama.invoke(`why is the sky blue?`);
+const answer = await ollama.call(`why is the sky blue?`);
 console.log(answer);
 ```
--- a/docs/tutorials/nvidia-jetson.md
+++ b/docs/tutorials/nvidia-jetson.md
@@ -1,15 +1,38 @@
 # Running Ollama on NVIDIA Jetson Devices
-Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions. 
+With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack).
-The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
+NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications.
 Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only"
 mode. This can be verified by using a monitoring tool like jtop.
 In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameters into a cloned
 version of our target model.
 Prerequisites:
 - curl
 - tmux
 Here are the steps:
 - Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
 - Stop the Ollama service: `sudo systemctl stop ollama`
 - Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson 
 'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
 - Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Start an interactive session: `ollama run mistral`
+- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson`
 - In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:
 ```
 FROM mistral
 PARAMETER num_gpu 999
 ```
 - Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson`
 - Run the new model: `ollama run mistral-jetson`
 If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU.
 And that's it!
 # Running Ollama in Docker
 When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -1,51 +0,0 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log"
 	"github.com/ollama/ollama/api"
 )
 // main sends a short, pre-seeded conversation to the "llama2" model through
 // client.Chat and prints the message content of every response handed to the
 // callback. Any error is fatal.
 func main() {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		log.Fatal(err)
 	}
 	// Conversation so far: a system instruction, one completed user/assistant
 	// exchange, and the follow-up question the model is asked to answer.
 	history := []api.Message{
 		{Role: "system", Content: "Provide very brief, concise responses"},
 		{Role: "user", Content: "Name some unusual animals"},
 		{Role: "assistant", Content: "Monotreme, platypus, echidna"},
 		{Role: "user", Content: "which of these is the most dangerous?"},
 	}
 	request := &api.ChatRequest{
 		Model:    "llama2",
 		Messages: history,
 	}
 	// Print without a trailing newline so consecutive responses join up.
 	printContent := func(resp api.ChatResponse) error {
 		fmt.Print(resp.Message.Content)
 		return nil
 	}
 	if err := client.Chat(context.Background(), request, printContent); err != nil {
 		log.Fatal(err)
 	}
 }
--- a/examples/go-generate-streaming/main.go
+++ b/examples/go-generate-streaming/main.go
@@ -1,40 +0,0 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log"
 	"github.com/ollama/ollama/api"
 )
 // main asks the "gemma" model a question through client.Generate and prints
 // the text of every response handed to the callback, followed by a final
 // newline. Any error is fatal.
 func main() {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		log.Fatal(err)
 	}
 	// Stream is left unset: by default, GenerateRequest is streaming.
 	request := &api.GenerateRequest{
 		Model:  "gemma",
 		Prompt: "how many planets are there?",
 	}
 	// GenerateResponse has a number of other interesting fields you may want
 	// to examine; only the response text is printed here. In streaming mode
 	// the responses are partial, so fmt.Print is used (not Println) to avoid
 	// introducing spurious newlines. The model inserts its own newlines if it
 	// wants them.
 	printPartial := func(resp api.GenerateResponse) error {
 		fmt.Print(resp.Response)
 		return nil
 	}
 	if err := client.Generate(context.Background(), request, printPartial); err != nil {
 		log.Fatal(err)
 	}
 	fmt.Println()
 }
--- a/examples/go-generate/main.go
+++ b/examples/go-generate/main.go
@@ -1,37 +0,0 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log"
 	"github.com/ollama/ollama/api"
 )
 // main asks the "gemma" model a question through client.Generate with
 // streaming switched off and prints the response text on its own line.
 // Any error is fatal.
 func main() {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		log.Fatal(err)
 	}
 	streaming := false
 	request := &api.GenerateRequest{
 		Model:  "gemma",
 		Prompt: "how many planets are there?",
 		// A pointer to false sets streaming to false.
 		Stream: &streaming,
 	}
 	// GenerateResponse has a number of other interesting fields you may want
 	// to examine; only the response text is printed here.
 	printResponse := func(resp api.GenerateResponse) error {
 		fmt.Println(resp.Response)
 		return nil
 	}
 	if err := client.Generate(context.Background(), request, printResponse); err != nil {
 		log.Fatal(err)
 	}
 }
--- a/examples/go-multimodal/main.go
+++ b/examples/go-multimodal/main.go
@@ -1,47 +0,0 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log"
 	"os"
 	"github.com/ollama/ollama/api"
 )
 // main reads the image file named by the first command-line argument, sends
 // it to the "llava" model with a "describe this image" prompt through
 // client.Generate, and prints the text of every response handed to the
 // callback, followed by a final newline. Any error is fatal.
 func main() {
 	// The image path is the only (required) argument.
 	if len(os.Args) < 2 {
 		log.Fatal("usage: <image name>")
 	}
 	image, err := os.ReadFile(os.Args[1])
 	if err != nil {
 		log.Fatal(err)
 	}
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		log.Fatal(err)
 	}
 	request := &api.GenerateRequest{
 		Model:  "llava",
 		Prompt: "describe this image",
 		Images: []api.ImageData{image},
 	}
 	// In streaming mode the responses are partial, so fmt.Print is used (not
 	// Println) to avoid introducing spurious newlines. The model inserts its
 	// own newlines if it wants them.
 	printPartial := func(resp api.GenerateResponse) error {
 		fmt.Print(resp.Response)
 		return nil
 	}
 	if err := client.Generate(context.Background(), request, printPartial); err != nil {
 		log.Fatal(err)
 	}
 	fmt.Println()
 }
--- a/examples/go-pull-progress/main.go
+++ b/examples/go-pull-progress/main.go
@@ -1,31 +0,0 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log"
 	"github.com/ollama/ollama/api"
 )
 // main pulls the "mistral" model through client.Pull and prints one line
 // (status, total, completed) for every progress response handed to the
 // callback. Any error is fatal.
 func main() {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		log.Fatal(err)
 	}
 	request := &api.PullRequest{
 		Model: "mistral",
 	}
 	reportProgress := func(resp api.ProgressResponse) error {
 		fmt.Printf("Progress: status=%v, total=%v, completed=%v\n", resp.Status, resp.Total, resp.Completed)
 		return nil
 	}
 	if err := client.Pull(context.Background(), request, reportProgress); err != nil {
 		log.Fatal(err)
 	}
 }
--- a/examples/golang-simplegenerate/README.md
+++ b/examples/golang-simplegenerate/README.md
--- a/examples/golang-simplegenerate/main.go
+++ b/examples/golang-simplegenerate/main.go
--- a/examples/langchain-python-rag-privategpt/README.md
+++ b/examples/langchain-python-rag-privategpt/README.md
@@ -1,6 +1,6 @@
 # PrivateGPT with Llama 2 uncensored
-https://github.com/ollama/ollama/assets/3325447/20cf8ec6-ff25-42c6-bdd8-9be594e3ce1b
+https://github.com/jmorganca/ollama/assets/3325447/20cf8ec6-ff25-42c6-bdd8-9be594e3ce1b
 > Note: this example is a slightly modified version of PrivateGPT using models such as Llama 2 Uncensored. All credit for PrivateGPT goes to Iván Martínez who is the creator of it, and you can find his GitHub repo [here](https://github.com/imartinez/privateGPT).
--- a/examples/modelfile-mario/readme.md
+++ b/examples/modelfile-mario/readme.md
@@ -28,7 +28,7 @@ You are Mario from Super Mario Bros, acting as an assistant.
 What if you want to change its behaviour?
 - Try changing the prompt
- Try changing the parameters [Docs](https://github.com/ollama/ollama/blob/main/docs/modelfile.md)
+- Try changing the parameters [Docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md)
 - Try changing the model (e.g. use an uncensored model with `FROM wizard-vicuna`, which is the wizard-vicuna uncensored model)
 Once the changes are made,
--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -1,6 +1,6 @@
 # JSON Output Example
-![llmjson 2023-11-10 15_31_31](https://github.com/ollama/ollama/assets/633681/e599d986-9b4a-4118-81a4-4cfe7e22da25)
+![llmjson 2023-11-10 15_31_31](https://github.com/jmorganca/ollama/assets/633681/e599d986-9b4a-4118-81a4-4cfe7e22da25)
 There are two python scripts in this example. `randomaddresses.py` generates random addresses from different countries. `predefinedschema.py` sets a template for the model to fill in.
--- a/examples/python-loganalysis/readme.md
+++ b/examples/python-loganalysis/readme.md
@@ -1,6 +1,6 @@
 # Log Analysis example
-![loganalyzer 2023-11-10 08_53_29](https://github.com/ollama/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
+![loganalyzer 2023-11-10 08_53_29](https://github.com/jmorganca/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
 This example shows one possible way to create a log file analyzer. It uses the model **mattw/loganalyzer** which is based on **codebooga**, a 34b parameter model.
--- a/examples/typescript-functioncalling/readme.md
+++ b/examples/typescript-functioncalling/readme.md
@@ -1,6 +1,6 @@
 # Function calling
-![function calling 2023-11-16 16_12_58](https://github.com/ollama/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
+![function calling 2023-11-16 16_12_58](https://github.com/jmorganca/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
 One of the features added to some models is 'function calling'. It's a bit of a confusing name. It's understandable if you think that means the model can call functions, but that's not what it means. Function calling simply means that the output of the model is formatted in JSON, using a preconfigured schema, and uses the expected types. Then your code can use the output of the model and call functions with it. Using the JSON format in Ollama, you can use any model for function calling. 
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -6,15 +6,11 @@ import (
 )
 const (
-	Byte = 1
+	Byte     = 1
 	KiloByte = Byte * 1000
 	MegaByte = KiloByte * 1000
 	GigaByte = MegaByte * 1000
 	TeraByte = GigaByte * 1000
 	KibiByte = Byte * 1024
 	MebiByte = KibiByte * 1024
 )
 func HumanBytes(b int64) string {
@@ -49,14 +45,3 @@ func HumanBytes(b int64) string {
 		return fmt.Sprintf("%d %s", int(value), unit)
 	}
 }
 // HumanBytes2 formats a byte count using binary (1024-based) units.
 //
 // Values of at least MebiByte are rendered as "<x.y> MiB", values of at least
 // KibiByte as "<x.y> KiB" (both with one decimal place), and anything smaller
 // as a whole number of bytes, "<n> B".
 func HumanBytes2(b uint64) string {
 	// Check the largest unit first so each value gets the biggest unit it fills.
 	if b >= MebiByte {
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	}
 	if b >= KibiByte {
 		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
 	}
 	return fmt.Sprintf("%d B", b)
 }
--- a/go.mod
+++ b/go.mod
@@ -1,4 +1,4 @@
-module github.com/ollama/ollama
+module github.com/jmorganca/ollama
 go 1.22
@@ -9,7 +9,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.9.1
-	github.com/golang/protobuf v1.5.0 // indirect
+	github.com/golang/protobuf v1.5.0
 	github.com/google/uuid v1.0.0
 	github.com/mitchellh/mapstructure v1.5.0
 	github.com/olekukonko/tablewriter v0.0.5
@@ -19,10 +19,7 @@ require (
 	golang.org/x/sync v0.3.0
 )
-require (
+require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
 )
 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
@@ -71,7 +68,7 @@ require (
 	golang.org/x/net v0.17.0 // indirect
 	golang.org/x/sys v0.13.0
 	golang.org/x/term v0.13.0
-	golang.org/x/text v0.14.0 // indirect
+	golang.org/x/text v0.13.0 // indirect
 	google.golang.org/protobuf v1.30.0
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -122,8 +122,6 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/nlpodyssey/gopickle v0.3.0 h1:BLUE5gxFLyyNOPzlXxt6GoHEMMxD0qhsE4p0CIQyoLw=
 github.com/nlpodyssey/gopickle v0.3.0/go.mod h1:f070HJ/yR+eLi5WmM1OXJEGaTpuJEUiib19olXgYha0=
 github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
@@ -238,8 +236,8 @@ golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
-golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -40,17 +40,19 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
 	// TODO - does sort order matter?
 	devices := []string{}
 	for i := range ids {
 		slog.Debug(fmt.Sprintf("i=%d", i))
 		if _, skipped := skip[i]; skipped {
 			slog.Debug("skipped")
 			continue
 		}
 		devices = append(devices, strconv.Itoa(i))
 	}
 	slog.Debug(fmt.Sprintf("devices=%v", devices))
 	val := strings.Join(devices, ",")
 	err := os.Setenv("HIP_VISIBLE_DEVICES", val)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to set env: %s", err))
 	} else {
 		slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
 	}
 	slog.Debug("HIP_VISIBLE_DEVICES=" + val)
 }
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -24,9 +24,6 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
 	RocmStandardLocation   = "/opt/rocm/lib"
 	// TODO find a better way to detect iGPU instead of minimum memory
 	IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
 )
 var (
@@ -100,8 +97,6 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 		return
 	}
 	updateLibPath(libDir)
 	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
 	if gfxOverride == "" {
 		supported, err := GetSupportedGFX(libDir)
@@ -115,7 +110,7 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 			if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
 				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
-				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
+				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
 				skip[i] = struct{}{}
 			} else {
 				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
@@ -145,29 +140,14 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 	}
 }
 func updateLibPath(libDir string) {
 	ldPaths := []string{}
 	if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
 		ldPaths = strings.Split(val, ":")
 	}
 	for _, d := range ldPaths {
 		if d == libDir {
 			return
 		}
 	}
 	val := strings.Join(append(ldPaths, libDir), ":")
 	slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
 	os.Setenv("LD_LIBRARY_PATH", val)
 }
 // Walk the sysfs nodes for the available GPUs and gather information from them
 // skipping over any devices in the skip map
 func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 	resp.memInfo.DeviceCount = 0
 	resp.memInfo.TotalMemory = 0
 	resp.memInfo.FreeMemory = 0
 	slog.Debug("discovering VRAM for amdgpu devices")
 	if len(ids) == 0 {
 		slog.Debug("discovering all amdgpu devices")
 		entries, err := os.ReadDir(AMDNodesSysfsDir)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
@@ -185,7 +165,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			ids = append(ids, id)
 		}
 	}
-	slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
+	slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
 	for _, id := range ids {
 		if _, skipped := skip[id]; skipped {
@@ -193,8 +173,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 		}
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
-		// Adjust for sysfs vs HIP ids
+		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
 		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
 		propFiles, err := filepath.Glob(propGlob)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
@@ -226,13 +205,6 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 		}
 		if totalMemory == 0 {
 			slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
 			skip[id] = struct{}{}
 			continue
 		}
 		if totalMemory < IGPUMemLimit {
 			slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
 			skip[id] = struct{}{}
 			continue
 		}
 		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
@@ -260,8 +232,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 			usedMemory += used
 		}
-		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
+		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
-		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %dM", id, (totalMemory-usedMemory)/1024/1024))
+		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %d", id, (totalMemory - usedMemory)))
 		resp.memInfo.DeviceCount++
 		resp.memInfo.TotalMemory += totalMemory
 		resp.memInfo.FreeMemory += (totalMemory - usedMemory)
@@ -310,7 +282,7 @@ func AMDValidateLibDir() (string, error) {
 	}
 	// If we already have a rocm dependency wired, nothing more to do
-	rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
+	rocmTargetDir := filepath.Join(payloadsDir, "rocm")
 	if rocmLibUsable(rocmTargetDir) {
 		return rocmTargetDir, nil
 	}
@@ -386,8 +358,6 @@ func AMDDriverVersion() (string, error) {
 }
 func AMDGFXVersions() map[int]Version {
 	// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	res := map[int]Version{}
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
 	for _, match := range matches {
@@ -403,20 +373,17 @@ func AMDGFXVersions() map[int]Version {
 			continue
 		}
 		if i == 0 {
 			// Skipping the CPU
 			continue
 		}
 		// Align with HIP IDs (zero is first GPU, not CPU)
 		i -= 1
 		scanner := bufio.NewScanner(fp)
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			if strings.HasPrefix(line, "gfx_target_version") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 || len(ver[1]) < 5 {
-					if ver[1] != "0" {
+
 					if ver[1] == "0" {
 						// Silently skip the CPU
 						continue
 					} else {
 						slog.Debug("malformed " + line)
 					}
 					res[i] = Version{
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -1,17 +1,13 @@
 package gpu
 import (
 	"errors"
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"syscall"
 	"time"
 )
 var (
@@ -22,84 +18,24 @@ var (
 func PayloadsDir() (string, error) {
 	lock.Lock()
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		cleanupTmpDirs()
+		tmpDir, err := os.MkdirTemp("", "ollama")
 		tmpDir := os.Getenv("OLLAMA_TMPDIR")
 		if tmpDir == "" {
 			tmpDir, err = os.MkdirTemp("", "ollama")
 			if err != nil {
 				return "", fmt.Errorf("failed to generate tmp dir: %w", err)
 			}
 		} else {
 			err = os.MkdirAll(tmpDir, 0755)
 			if err != nil {
 				return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
 			}
 		}
 		// Track our pid so we can clean up orphaned tmpdirs
 		pidFilePath := filepath.Join(tmpDir, "ollama.pid")
 		pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
 		if err != nil {
-			return "", err
+			return "", fmt.Errorf("failed to generate tmp dir: %w", err)
 		}
-		if _, err := pidFile.Write([]byte(fmt.Sprint(os.Getpid()))); err != nil {
+		payloadsDir = tmpDir
 			return "", err
 		}
 		// We create a distinct subdirectory for payloads within the tmpdir
 		// This will typically look like /tmp/ollama3208993108/runners on linux
 		payloadsDir = filepath.Join(tmpDir, "runners")
 	}
 	return payloadsDir, nil
 }
 // cleanupTmpDirs makes a best-effort pass over directories named "ollama*"
 // under os.TempDir() and removes those left behind by earlier runs.
 //
 // A directory is kept when its "ollama.pid" file holds a pid for which
 // signal 0 does not report os.ErrProcessDone (another ollama is presumably
 // still running). Every other matching directory is removed, including ones
 // whose pid file is missing or unparsable. All failures are only logged at
 // debug level.
 func cleanupTmpDirs() {
 	candidates, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
 	if err != nil {
 		return
 	}
 	for _, dir := range candidates {
 		// Only directories are of interest; skip anything we cannot stat.
 		if fi, statErr := os.Stat(dir); statErr != nil || !fi.IsDir() {
 			continue
 		}
 		pidBytes, readErr := os.ReadFile(filepath.Join(dir, "ollama.pid"))
 		if readErr != nil {
 			slog.Debug("failed to open ollama.pid", "path", dir, "error", readErr)
 		} else if pid, convErr := strconv.Atoi(string(pidBytes)); convErr == nil {
 			proc, findErr := os.FindProcess(pid)
 			if findErr == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
 				// Another running ollama, ignore this tmpdir
 				continue
 			}
 		}
 		if rmErr := os.RemoveAll(dir); rmErr != nil {
 			slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", dir, rmErr))
 		}
 	}
 }
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
 	if payloadsDir != "" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
+		slog.Debug("cleaning up", "dir", payloadsDir)
-		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
+		err := os.RemoveAll(payloadsDir)
 		slog.Debug("cleaning up", "dir", tmpDir)
 		err := os.RemoveAll(tmpDir)
 		if err != nil {
-			// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
+			slog.Warn("failed to clean up", "dir", payloadsDir, "err", err)
 			time.Sleep(1000 * time.Millisecond)
 			err = os.RemoveAll(tmpDir)
 			if err != nil {
 				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
 			}
 		}
 	}
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -20,27 +20,20 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
 	"github.com/ollama/ollama/format"
 )
 type handles struct {
-	nvml   *C.nvml_handle_t
+	cuda *C.cuda_handle_t
 	cudart *C.cudart_handle_t
 }
 // Values that GetGPUInfo stores in GpuInfo.MinimumMemory when it selects the
 // "cuda" library or when AMDGetGPUInfo reports a library, respectively.
 const (
 	cudaMinimumMemory = 457 * format.MebiByte
 	rocmMinimumMemory = 457 * format.MebiByte
 )
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
 // Possible locations for the nvidia-ml library
-var NvmlLinuxGlobs = []string{
+var CudaLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
@@ -48,98 +41,49 @@ var NvmlLinuxGlobs = []string{
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
 	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
 	"/usr/local/lib*/libnvidia-ml.so*",
 	// TODO: are these stubs ever valid?
 	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
-var NvmlWindowsGlobs = []string{
+var CudaWindowsGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }
 // CudartLinuxGlobs lists glob patterns for locating the CUDA runtime library
 // (libcudart) on Linux. initGPUHandles appends these after any pattern it
 // builds from the payloads directory.
 var CudartLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libcudart.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
 	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
 	"/usr/lib/wsl/lib/libcudart.so*",
 	"/usr/lib/wsl/drivers/*/libcudart.so*",
 	"/opt/cuda/lib64/libcudart.so*",
 	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
 	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
 	"/usr/local/cuda/lib*/libcudart.so*",
 	"/usr/lib*/libcudart.so*",
 	"/usr/local/lib*/libcudart.so*",
 }
 // CudartWindowsGlobs lists glob patterns for locating the CUDA runtime DLL on
 // Windows. initGPUHandles appends these after the pattern it builds under
 // %LOCALAPPDATA%\Programs\Ollama.
 var CudartWindowsGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initGPUHandles() {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-	gpuHandles := &handles{nil, nil}
+	gpuHandles = &handles{nil}
-	var nvmlMgmtName string
+	var cudaMgmtName string
-	var nvmlMgmtPatterns []string
+	var cudaMgmtPatterns []string
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
 	case "windows":
-		nvmlMgmtName = "nvml.dll"
+		cudaMgmtName = "nvml.dll"
-		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+		copy(cudaMgmtPatterns, CudaWindowsGlobs)
 		cudartMgmtName = "cudart64_*.dll"
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
 	case "linux":
-		nvmlMgmtName = "libnvidia-ml.so"
+		cudaMgmtName = "libnvidia-ml.so"
-		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
+		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
+		copy(cudaMgmtPatterns, CudaLinuxGlobs)
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
 			// TODO - add "payloads" for subprocess
 			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
 		}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
-		return gpuHandles
+		return
 	}
 	slog.Info("Detecting GPU type")
-	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudartLibPaths) > 0 {
+	if len(cudaLibPaths) > 0 {
-		cudart := LoadCUDARTMgmt(cudartLibPaths)
+		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cudart != nil {
+		if cuda != nil {
-			slog.Info("Nvidia GPU detected via cudart")
+			slog.Info("Nvidia GPU detected")
-			gpuHandles.cudart = cudart
+			gpuHandles.cuda = cuda
-			return gpuHandles
+			return
 		}
 	}
 	// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
 	nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
 	if len(nvmlLibPaths) > 0 {
 		nvml := LoadNVMLMgmt(nvmlLibPaths)
 		if nvml != nil {
 			slog.Info("Nvidia GPU detected via nvidia-ml")
 			gpuHandles.nvml = nvml
 			return gpuHandles
 		}
 	}
 	return gpuHandles
 }
 func GetGPUInfo() GpuInfo {
@@ -147,16 +91,9 @@ func GetGPUInfo() GpuInfo {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-
+	if gpuHandles == nil {
-	gpuHandles := initGPUHandles()
+		initGPUHandles()
-	defer func() {
+	}
 		if gpuHandles.nvml != nil {
 			C.nvml_release(*gpuHandles.nvml)
 		}
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
 	}()
 	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
 	cpuVariant := GetCPUVariant()
@@ -166,50 +103,28 @@ func GetGPUInfo() GpuInfo {
 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+	if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
+		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
+			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
-			var cc C.nvml_compute_capability_t
+			var cc C.cuda_compute_capability_t
-			C.nvml_compute_capability(*gpuHandles.nvml, &cc)
+			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
 			if cc.err != nil {
-				slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
+				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 				resp.MinimumMemory = cudaMinimumMemory
 			} else {
-				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
 	} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
 		C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
 		if memInfo.err != nil {
 			slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
 			var cc C.cudart_compute_capability_t
 			C.cudart_compute_capability(*gpuHandles.cudart, &cc)
 			if cc.err != nil {
 				slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 				resp.MinimumMemory = cudaMinimumMemory
 			} else {
 				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
 			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}
@@ -243,7 +158,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
-func CheckVRAM() (uint64, error) {
+func CheckVRAM() (int64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +166,19 @@ func CheckVRAM() (uint64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return uint64(avail), nil
+		return avail, nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return gpuInfo.FreeMemory, nil
+		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
 		overhead := gpuInfo.FreeMemory / 10
 		gpus := uint64(gpuInfo.DeviceCount)
 		if overhead < gpus*1024*1024*1024 {
 			overhead = gpus * 1024 * 1024 * 1024
 		}
 		avail := int64(gpuInfo.FreeMemory - overhead)
 		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
 		return avail, nil
 	}
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
@@ -315,32 +238,15 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 	return gpuLibPaths
 }
-func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
+func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.nvml_init_resp_t
+	var resp C.cuda_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	for _, libPath := range nvmlLibPaths {
+	for _, libPath := range cudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
-		C.nvml_init(lib, &resp)
+		C.cuda_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return &resp.ch
 		}
 	}
 	return nil
 }
 func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
 	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
 	for _, libPath := range cudartLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.cudart_init(lib, &resp)
 		if resp.err != nil {
 			slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return &resp.ch
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -17,7 +17,7 @@ import (
 )
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (uint64, error) {
+func CheckVRAM() (int64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,15 @@ func CheckVRAM() (uint64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return uint64(avail), nil
+		return avail, nil
 	}
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-
+	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return uint64(C.getRecommendedMaxVRAM()), nil
+	return recommendedMaxVRAM, nil
 }
 func GetGPUInfo() GpuInfo {
@@ -53,8 +53,8 @@ func GetGPUInfo() GpuInfo {
 func getCPUMem() (memInfo, error) {
 	return memInfo{
-		TotalMemory: uint64(C.getPhysicalMemory()),
+		TotalMemory: 0,
 		FreeMemory:  0,
-		DeviceCount: 1,
+		DeviceCount: 0,
 	}, nil
 }
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -52,8 +52,7 @@ void cpu_check_ram(mem_info_t *resp);
 }
 #endif
-#include "gpu_info_nvml.h"
+#include "gpu_info_cuda.h"
 #include "gpu_info_cudart.h"
 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -1,10 +1,10 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 #include "gpu_info_cuda.h"
 #include <string.h>
-#include "gpu_info_nvml.h"
+void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
@@ -30,20 +30,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
      {NULL, NULL},
  };
-  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
-             nvml_lib_path, msg);
+             cuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }
  // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
@@ -82,7 +82,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  }
 }
-void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
+void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
@@ -92,7 +92,7 @@ void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
  int i;
  if (h.handle == NULL) {
-    resp->err = strdup("nvml handle isn't initialized");
+    resp->err = strdup("nvml handle sn't initialized");
    return;
  }
@@ -156,14 +156,14 @@ void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
    }
    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
+    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
 }
-void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
+void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
@@ -211,11 +211,4 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
    }
  }
 }
 void nvml_release(nvml_handle_t h) {
  LOG(h.verbose, "releasing nvml library\n");
  UNLOAD_LIBRARY(h.handle);
  h.handle = NULL;
 }
 #endif  // __APPLE__
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -1,6 +1,6 @@
 #ifndef __APPLE__
-#ifndef __GPU_INFO_NVML_H__
+#ifndef __GPU_INFO_CUDA_H__
-#define __GPU_INFO_NVML_H__
+#define __GPU_INFO_CUDA_H__
 #include "gpu_info.h"
 // Just enough typedef's to dlopen/dlsym for memory information
@@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
    NVML_BRAND_UNKNOWN          = 0,
 } nvmlBrandType_t;
-typedef struct nvml_handle {
+typedef struct cuda_handle {
  void *handle;
  uint16_t verbose;
  nvmlReturn_t (*nvmlInit_v2)(void);
@@ -35,23 +35,22 @@ typedef struct nvml_handle {
  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
-} nvml_handle_t;
+} cuda_handle_t;
-typedef struct nvml_init_resp {
+typedef struct cuda_init_resp {
  char *err;  // If err is non-null handle is invalid
-  nvml_handle_t ch;
+  cuda_handle_t ch;
-} nvml_init_resp_t;
+} cuda_init_resp_t;
-typedef struct nvml_compute_capability {
+typedef struct cuda_compute_capability {
  char *err;
  int major;
  int minor;
-} nvml_compute_capability_t;
+} cuda_compute_capability_t;
-void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
-void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
+void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
-void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
+void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
 void nvml_release(nvml_handle_t ch);
-#endif  // __GPU_INFO_NVML_H__
+#endif  // __GPU_INFO_CUDA_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@@ -1,200 +0,0 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 #include <string.h>
 #include "gpu_info_cudart.h"
 // Loads the cudart shared library at cudart_lib_path, resolves the runtime
 // entry points listed in the lookup table into resp->ch, and selects device 0
 // to confirm the runtime is usable.
 //
 // On failure resp->err is set to a strdup'd message that the caller must free,
 // and resp->ch.handle is NULL. On success resp->err stays NULL and resp->ch
 // holds the library handle plus the resolved function pointers.
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  cudartReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Symbol name -> slot in the handle struct that receives its address
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
      {NULL, NULL},
  };
  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
    snprintf(buf, buflen,
            "Unable to load %s library to query for Nvidia GPUs: %s",
            cudart_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }
  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Check the resolved symbol itself: l[i].p is the address of the slot and
    // is never NULL, so testing it would let a missing symbol go unnoticed and
    // be called through a NULL function pointer later.
    if (*l[i].p == NULL) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
              msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }
  // Selecting device 0 doubles as the "is the runtime/driver usable" probe
  ret = (*resp->ch.cudaSetDevice)(0);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
      resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
      return;
    }
    snprintf(buf, buflen, "cudart init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  int version = 0;
  cudartDriverVersion_t driverVersion;
  driverVersion.major = 0;
  driverVersion.minor = 0;
  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cudaDriverGetVersion)(&version);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  } else {
    driverVersion.major = version / 1000;
    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  }
 }
 // Queries every device reported by cudaGetDeviceCount and stores the device
 // count together with the summed total and free memory in resp. On failure
 // resp->err is set to a strdup'd message and the function returns early; the
 // sums may then be partial.
 void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  cudartMemory_t mem = {0,0,0};
  cudartReturn_t rc;
  int dev;
  // cudaGetDeviceCount takes int type, resp-> count is uint
  int deviceCount;
  resp->err = NULL;
  if (h.handle == NULL) {
    resp->err = strdup("cudart handle isn't initialized");
    return;
  }
  rc = (*h.cudaGetDeviceCount)(&deviceCount);
  if (rc != CUDART_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", rc);
    resp->err = strdup(buf);
    return;
  }
  resp->count = (unsigned int)deviceCount;
  resp->total = 0;
  resp->free = 0;
  for (dev = 0; dev < resp->count; dev++) {
    // cudaMemGetInfo is called without a device argument, so make dev the
    // current device first
    rc = (*h.cudaSetDevice)(dev);
    if (rc != CUDART_SUCCESS) {
      resp->err = strdup("cudart device failed to initialize");
      return;
    }
    rc = (*h.cudaMemGetInfo)(&mem.free, &mem.total);
    if (rc != CUDART_SUCCESS) {
      snprintf(buf, buflen, "cudart device memory info lookup failure %d", rc);
      resp->err = strdup(buf);
      return;
    }
    LOG(h.verbose, "[%d] CUDA totalMem %lu\n", dev, mem.total);
    LOG(h.verbose, "[%d] CUDA freeMem %lu\n", dev, mem.free);
    resp->total += mem.total;
    resp->free += mem.free;
  }
 }
 // Reports in resp the lowest compute capability (major.minor) found across all
 // devices returned by cudaGetDeviceCount. resp->major and resp->minor stay 0
 // when no device is seen. On failure resp->err is set to a strdup'd message
 // and the function returns early.
 void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  cudartReturn_t rc;
  int devMajor = 0;
  int devMinor = 0;
  int deviceCount;
  int dev;
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
  if (h.handle == NULL) {
    resp->err = strdup("cudart handle not initialized");
    return;
  }
  rc = (*h.cudaGetDeviceCount)(&deviceCount);
  if (rc != CUDART_SUCCESS) {
    snprintf(buf, buflen, "unable to get cudart device count: %d", rc);
    resp->err = strdup(buf);
    return;
  }
  for (dev = 0; dev < deviceCount; dev++) {
    rc = (*h.cudaSetDevice)(dev);
    if (rc != CUDART_SUCCESS) {
      resp->err = strdup("cudart device failed to initialize");
      return;
    }
    rc = (*h.cudaDeviceGetAttribute)(&devMajor, cudartDevAttrComputeCapabilityMajor, dev);
    if (rc != CUDART_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", dev, rc);
      resp->err = strdup(buf);
      return;
    }
    rc = (*h.cudaDeviceGetAttribute)(&devMinor, cudartDevAttrComputeCapabilityMinor, dev);
    if (rc != CUDART_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", dev, rc);
      resp->err = strdup(buf);
      return;
    }
    // Report the lowest major.minor we detect as that limits our compatibility
    int lowerMajor = (resp->major == 0) || (devMajor < resp->major);
    int lowerMinor = (devMajor == resp->major) && (devMinor < resp->minor);
    if (lowerMajor) {
      resp->major = devMajor;
      resp->minor = devMinor;
    } else if (lowerMinor) {
      resp->minor = devMinor;
    }
  }
 }
 // Unloads the cudart library referenced by h. The handle struct is passed by
 // value, so the caller's copy is not modified and must not be used again after
 // this call.
 void cudart_release(cudart_handle_t h) {
  LOG(h.verbose, "releasing cudart library\n");
  if (h.handle == NULL) {
    // Nothing was loaded; never hand a NULL handle to the unloader
    return;
  }
  UNLOAD_LIBRARY(h.handle);
 }
 #endif  // __APPLE__
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -1,61 +0,0 @@
 #ifndef __APPLE__
 #ifndef __GPU_INFO_CUDART_H__
 #define __GPU_INFO_CUDART_H__
 #include "gpu_info.h"
 // Just enough typedef's to dlopen/dlsym for memory information
 // Subset of CUDA runtime status codes that the cudart_* helpers compare
 // against.
 typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
  CUDART_UNSUPPORTED = 1,
  // cudart_init turns this one into a "driver is too old or missing" error
  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
  // Other values omitted for now...
 } cudartReturn_t;
 // Attribute selectors handed to cudaDeviceGetAttribute by
 // cudart_compute_capability.
 typedef enum cudartDeviceAttr_enum {
  cudartDevAttrComputeCapabilityMajor = 75,
  cudartDevAttrComputeCapabilityMinor = 76,
 } cudartDeviceAttr_t;
 typedef void *cudartDevice_t;  // Opaque is sufficient
 // Per-device memory figures. cudart_check_vram fills total and free through
 // cudaMemGetInfo; it does not populate used.
 typedef struct cudartMemory_st {
  size_t total;
  size_t free;
  size_t used;
 } cudartMemory_t;
 // Driver version split into parts; cudart_init derives it from the integer
 // returned by cudaDriverGetVersion (major = v / 1000, minor = remainder / 10).
 typedef struct cudartDriverVersion {
  int major;
  int minor;
 } cudartDriverVersion_t;
 // Loaded cudart library plus the entry points cudart_init resolves from it.
 typedef struct cudart_handle {
  void *handle;      // library handle from LOAD_LIBRARY; NULL when not loaded
  uint16_t verbose;  // passed as the first argument to LOG
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
  cudartReturn_t (*cudaGetDeviceCount)(int *);
  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
 } cudart_handle_t;
 // Result of cudart_init.
 typedef struct cudart_init_resp {
  char *err;  // If err is non-null handle is invalid
  cudart_handle_t ch;
 } cudart_init_resp_t;
 // Result of cudart_compute_capability: the lowest major.minor seen across
 // devices, or a non-NULL err on failure.
 typedef struct cudart_compute_capability {
  char *err;
  int major;
  int minor;
 } cudart_compute_capability_t;
 // Loads the library at cudart_lib_path and fills resp; resp->err is non-NULL on failure.
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
 // Stores the device count and the summed total/free memory of all devices in resp.
 void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
 // Stores the lowest compute capability found across all devices in cc.
 void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
 // Unloads the library referenced by ch.
 void cudart_release(cudart_handle_t ch);
 #endif  // __GPU_INFO_CUDART_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
@@ -1,4 +1,3 @@
 #import <Metal/Metal.h>
 #include <stdint.h>
 // Returns recommendedMaxWorkingSetSize of the system default Metal device.
 uint64_t getRecommendedMaxVRAM();
 // Returns the physicalMemory value reported by NSProcessInfo.
 uint64_t getPhysicalMemory();
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
@@ -1,13 +1,11 @@
-// go:build darwin
+//go:build darwin
 #include "gpu_info_darwin.h"
-uint64_t getRecommendedMaxVRAM() {
+uint64_t getRecommendedMaxVRAM()
-  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+{
-  uint64_t result = device.recommendedMaxWorkingSetSize;
+	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-  CFRelease(device);
+	uint64_t result = device.recommendedMaxWorkingSetSize;
-  return result;
+	CFRelease(device);
 	return result;
 }
 // Returns the physicalMemory property of the current NSProcessInfo.
 uint64_t getPhysicalMemory() {
  return [[NSProcessInfo processInfo] physicalMemory];
 }
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -14,9 +14,6 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`
 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
 	// TODO add other useful attributes about the card here for discovery information
 }
--- a/integration/README.md
+++ b/integration/README.md
@@ -1,11 +0,0 @@
 # Integration Tests
 This directory contains integration tests that exercise Ollama end-to-end to verify its behavior.
 By default, these tests are disabled so `go test ./...` will exercise only unit tests.  To run integration tests you must pass the integration tag.  `go test -tags=integration ./...`
 The integration tests have 2 modes of operating.
 1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
 2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote.
--- a/integration/basic_test.go
+++ b/integration/basic_test.go
@@ -1,28 +0,0 @@
 //go:build integration
 package integration
 import (
 	"context"
 	"net/http"
 	"testing"
 	"time"
 	"github.com/ollama/ollama/api"
 )
 // TestOrcaMiniBlueSky issues one generate request against orca-mini with a
 // fixed seed and zero temperature, bounded by a two minute timeout, and hands
 // the request plus a list of expected strings to GenerateTestHelper.
 func TestOrcaMiniBlueSky(t *testing.T) {
 	const timeout = 2 * time.Minute
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 	// Set up the test data
 	options := map[string]interface{}{
 		"temperature": 0,
 		"seed":        123,
 	}
 	expected := []string{"rayleigh", "scattering"}
 	request := api.GenerateRequest{
 		Model:   "orca-mini",
 		Prompt:  "why is the sky blue?",
 		Stream:  &stream,
 		Options: options,
 	}
 	GenerateTestHelper(ctx, t, &http.Client{}, request, expected)
 }
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -1,29 +0,0 @@
 //go:build integration
 package integration
 import (
 	"context"
 	"net/http"
 	"testing"
 	"time"
 	"github.com/ollama/ollama/api"
 )
 // TestContextExhaustion issues one generate request against llama2 with
 // num_ctx set to 128, a fixed seed and zero temperature, bounded by a two
 // minute timeout, and hands the request plus a list of expected strings to
 // GenerateTestHelper.
 func TestContextExhaustion(t *testing.T) {
 	const timeout = 2 * time.Minute // TODO maybe shorter?
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 	// Set up the test data
 	options := map[string]interface{}{
 		"temperature": 0,
 		"seed":        123,
 		"num_ctx":     128,
 	}
 	expected := []string{"once", "upon", "lived"}
 	request := api.GenerateRequest{
 		Model:   "llama2",
 		Prompt:  "Write me a story with a ton of emojis?",
 		Stream:  &stream,
 		Options: options,
 	}
 	GenerateTestHelper(ctx, t, &http.Client{}, request, expected)
 }
--- a/integration/llm_test.go
+++ b/integration/llm_test.go
@@ -1,69 +0,0 @@
 //go:build integration
 package integration
 import (
 	"context"
 	"net/http"
 	"sync"
 	"testing"
 	"time"
 	"github.com/ollama/ollama/api"
 )
 // TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
 //        package to avoid circular dependencies
 // stream is the value every request fixture in this package points its
 // Stream field at.
 var stream = false
 // req holds the two generate requests used by the orca-mini tests; resp holds,
 // at the same index, the strings passed to GenerateTestHelper alongside each
 // request.
 var req = [2]api.GenerateRequest{
 	{
 		Model:  "orca-mini",
 		Prompt: "why is the ocean blue?",
 		Stream: &stream,
 		Options: map[string]interface{}{
 			"seed":        42,
 			"temperature": 0.0,
 		},
 	},
 	{
 		Model:  "orca-mini",
 		Prompt: "what is the origin of the us thanksgiving holiday?",
 		Stream: &stream,
 		Options: map[string]interface{}{
 			"seed":        42,
 			"temperature": 0.0,
 		},
 	},
 }
 var resp = [2][]string{
 	{"sunlight"},
 	{"england", "english", "massachusetts", "pilgrims"},
 }
 // TestIntegrationSimpleOrcaMini runs the first request fixture through
 // GenerateTestHelper with a 120 second timeout.
 func TestIntegrationSimpleOrcaMini(t *testing.T) {
 	const timeout = 120 * time.Second
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 	client := &http.Client{}
 	GenerateTestHelper(ctx, t, client, req[0], resp[0])
 }
 // TODO
 // The server always loads a new runner and closes the old one, which forces serial execution
 // At present this test case fails with concurrency problems.  Eventually we should try to
 // get true concurrency working with n_parallel support in the backend
 //
 // TestIntegrationConcurrentPredictOrcaMini launches one goroutine per request
 // fixture, each calling GenerateTestHelper with its own http.Client under a
 // shared 120 second timeout, and waits for all of them to finish.
 func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
 	defer cancel()
 	var pending sync.WaitGroup
 	for idx := range req {
 		pending.Add(1)
 		go func(n int) {
 			defer pending.Done()
 			GenerateTestHelper(ctx, t, &http.Client{}, req[n], resp[n])
 		}(idx)
 	}
 	pending.Wait()
 }
 // TODO - create a parallel test with 2 different models once we support concurrency
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -1,265 +0,0 @@
 //go:build integration
 package integration
 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"math/rand"
 	"net"
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"testing"
 	"time"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/stretchr/testify/assert"
 )
 func FindPort() string {
 	port := 0
 	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
 		var l *net.TCPListener
 		if l, err = net.ListenTCP("tcp", a); err == nil {
 			port = l.Addr().(*net.TCPAddr).Port
 			l.Close()
 		}
 	}
 	if port == 0 {
 		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 	}
 	return strconv.Itoa(port)
 }
 // GetTestEndpoint derives the (scheme, "host:port") pair the tests should talk
 // to from the OLLAMA_HOST environment variable.  The scheme defaults to "http",
 // the host to 127.0.0.1 and the port to 11434.  When OLLAMA_TEST_EXISTING is
 // unset and the port is still the default, FindPort supplies a replacement.
 func GetTestEndpoint() (string, string) {
 	const defaultPort = "11434"
 	raw := os.Getenv("OLLAMA_HOST")
 	scheme, hostport := "http", raw
 	if before, after, found := strings.Cut(raw, "://"); found {
 		scheme, hostport = before, after
 	}
 	// trim trailing slashes
 	hostport = strings.TrimRight(hostport, "/")
 	host, port, err := net.SplitHostPort(hostport)
 	if err != nil {
 		// No usable host:port pair; treat the whole value as a bare host.
 		port = defaultPort
 		switch ip := net.ParseIP(strings.Trim(hostport, "[]")); {
 		case ip != nil:
 			host = ip.String()
 		case hostport != "":
 			host = hostport
 		default:
 			host = "127.0.0.1"
 		}
 	}
 	if port == defaultPort && os.Getenv("OLLAMA_TEST_EXISTING") == "" {
 		port = FindPort()
 	}
 	url := fmt.Sprintf("%s:%s", host, port)
 	slog.Info("server connection", "url", url)
 	return scheme, url
 }
 // TODO make fancier, grab logs, etc.
 var serverMutex sync.Mutex
 var serverReady bool
 // StartServer spawns the ollama binary via lifecycle.SpawnServer unless
 // serverReady is already set, in which case it returns nil immediately.
 // ollamaHost is exported as OLLAMA_HOST before the spawn.  An error is returned
 // when the binary is missing or the spawn fails.
 func StartServer(ctx context.Context, ollamaHost string) error {
 	// Make sure the server has been built
 	CLIName, err := filepath.Abs("../ollama")
 	if err != nil {
 		return err
 	}
 	if runtime.GOOS == "windows" {
 		CLIName += ".exe"
 	}
 	_, err = os.Stat(CLIName)
 	if err != nil {
 		return fmt.Errorf("CLI missing, did you forget to build first?  %w", err)
 	}
 	// serverMutex guards serverReady for the rest of this function.
 	serverMutex.Lock()
 	defer serverMutex.Unlock()
 	if serverReady {
 		return nil
 	}
 	if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
 		slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
 		os.Setenv("OLLAMA_HOST", ollamaHost)
 	}
 	slog.Info("starting server", "url", ollamaHost)
 	// NOTE(review): the relative literal is passed here rather than CLIName
 	// (which carries the ".exe" suffix on Windows) - confirm SpawnServer
 	// resolves the path itself.
 	done, err := lifecycle.SpawnServer(ctx, "../ollama")
 	if err != nil {
 		return fmt.Errorf("failed to start server: %w", err)
 	}
 	// Once ctx is cancelled, wait for the exit code and clear serverReady.
 	// The mutex is held while blocking on done.
 	go func() {
 		<-ctx.Done()
 		serverMutex.Lock()
 		defer serverMutex.Unlock()
 		exitCode := <-done
 		if exitCode > 0 {
 			slog.Warn("server failure", "exit", exitCode)
 		}
 		serverReady = false
 	}()
 	// TODO wait only long enough for the server to be responsive...
 	time.Sleep(500 * time.Millisecond)
 	serverReady = true
 	return nil
 }
 // PullIfMissing asks the server at scheme://testEndpoint whether modelName is
 // present (POST /api/show) and, when the answer is anything other than HTTP
 // 200, requests a pull (POST /api/pull).
 //
 // It returns nil when the model was already present or the pull answered 200.
 // Otherwise it returns the marshalling/transport error, or an error carrying
 // the pull's HTTP status and response body.
 func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
 	slog.Info("checking status of model", "model", modelName)
 	baseURL := scheme + "://" + testEndpoint
 	// post marshals payload as JSON and sends it to baseURL+path under ctx.
 	post := func(path string, payload any) (*http.Response, error) {
 		data, err := json.Marshal(payload)
 		if err != nil {
 			return nil, err
 		}
 		req, err := http.NewRequestWithContext(ctx, "POST", baseURL+path, bytes.NewReader(data))
 		if err != nil {
 			return nil, err
 		}
 		req.Header.Set("Content-Type", "application/json")
 		return client.Do(req)
 	}
 	response, err := post("/api/show", &api.ShowRequest{Name: modelName})
 	if err != nil {
 		return err
 	}
 	defer response.Body.Close()
 	if response.StatusCode == 200 {
 		slog.Info("model already present", "model", modelName)
 		return nil
 	}
 	slog.Info("model missing", "status", response.StatusCode)
 	slog.Info("pulling", "model", modelName)
 	response, err = post("/api/pull", &api.PullRequest{Name: modelName, Stream: &stream})
 	if err != nil {
 		return err
 	}
 	defer response.Body.Close()
 	if response.StatusCode != 200 {
 		// Best effort: include whatever the server said about the failure.
 		detail, _ := io.ReadAll(response.Body)
 		return fmt.Errorf("failed to pull model %q: status %d: %s",
 			modelName, response.StatusCode, strings.TrimSpace(string(detail)))
 	}
 	slog.Info("model pulled", "model", modelName)
 	return nil
 }
 var serverProcMutex sync.Mutex
 // GenerateTestHelper sends genReq to /api/generate and marks the test failed
 // unless the reply is HTTP 200, decodes as an api.GenerateResponse, and its
 // lowercased Response contains at least one entry of anyResp.
 //
 // When OLLAMA_TEST_EXISTING is unset, the helper takes serverProcMutex, points
 // lifecycle.ServerLogFile at a fresh temp file and starts a server.  On exit it
 // dumps that log to stderr if the test failed, removes the file and releases
 // the mutex.  The model is pulled first if the server does not have it.
 func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
 	// TODO maybe stuff in an init routine?
 	lifecycle.InitLogging()
 	requestJSON, err := json.Marshal(genReq)
 	if err != nil {
 		t.Fatalf("Error serializing request: %v", err)
 	}
 	scheme, testEndpoint := GetTestEndpoint()
 	if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
 		serverProcMutex.Lock()
 		// Registered only after the lock is held so Unlock is always paired.
 		defer func() {
 			defer serverProcMutex.Unlock()
 			if t.Failed() {
 				// os.ReadFile closes the file itself, so no handle is left
 				// open when the file is removed below.
 				data, err := os.ReadFile(lifecycle.ServerLogFile)
 				if err != nil {
 					slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
 				} else {
 					slog.Warn("SERVER LOG FOLLOWS")
 					os.Stderr.Write(data)
 					slog.Warn("END OF SERVER")
 				}
 			}
 			if err := os.Remove(lifecycle.ServerLogFile); err != nil && !os.IsNotExist(err) {
 				slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
 			}
 		}()
 		fp, err := os.CreateTemp("", "ollama-server-*.log")
 		if err != nil {
 			t.Fatalf("failed to generate log file: %s", err)
 		}
 		lifecycle.ServerLogFile = fp.Name()
 		fp.Close()
 		// Nothing below can succeed without a server, so stop right away.
 		if err := StartServer(ctx, testEndpoint); err != nil {
 			t.Fatalf("failed to start server: %v", err)
 		}
 	}
 	err = PullIfMissing(ctx, client, scheme, testEndpoint, genReq.Model)
 	if err != nil {
 		t.Fatalf("Error pulling model: %v", err)
 	}
 	// Make the request and get the response
 	req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/generate", bytes.NewReader(requestJSON))
 	if err != nil {
 		t.Fatalf("Error creating request: %v", err)
 	}
 	// Set the content type for the request
 	req.Header.Set("Content-Type", "application/json")
 	// Make the request with the HTTP client
 	response, err := client.Do(req.WithContext(ctx))
 	if err != nil {
 		t.Fatalf("Error making request: %v", err)
 	}
 	defer response.Body.Close()
 	body, err := io.ReadAll(response.Body)
 	assert.NoError(t, err)
 	// testify expects (expected, actual); the body is attached as context.
 	assert.Equal(t, 200, response.StatusCode, string(body))
 	// Verify the response is valid JSON
 	var payload api.GenerateResponse
 	assert.NoError(t, json.Unmarshal(body, &payload), string(body))
 	// Verify the response contains the expected data
 	atLeastOne := false
 	for _, resp := range anyResp {
 		if strings.Contains(strings.ToLower(payload.Response), resp) {
 			atLeastOne = true
 			break
 		}
 	}
 	assert.True(t, atLeastOne, "none of %v found in %s", anyResp, payload.Response)
 }
--- a/llm/dyn_ext_server.c
+++ b/llm/dyn_ext_server.c
@@ -0,0 +1,142 @@
 #include "dyn_ext_server.h"
 #include <stdio.h>
 #include <string.h>
 // Platform shims for loading a shared library at runtime.
 //   LOAD_LIBRARY / LOAD_SYMBOL / UNLOAD_LIBRARY map to dlopen / dlsym / dlclose,
 //   or to LoadLibrary / GetProcAddress / FreeLibrary on Windows (where the
 //   flags argument is discarded).
 //   LOAD_ERR() evaluates to a strdup()'d copy of the loader's latest error
 //   text; the caller is expected to free() it.
 // NOTE(review): strdup() receives the raw dlerror() / FormatMessageA result
 // with no NULL check - confirm neither can be NULL where LOAD_ERR is used.
 #ifdef __linux__
 #include <dlfcn.h>
 #define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
 #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
 #define LOAD_ERR() strdup(dlerror())
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #elif _WIN32
 #include <windows.h>
 #define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
 #define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
 #define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
 // The Windows variant is written as a ({ ... }) statement expression.
 #define LOAD_ERR() ({\
  LPSTR messageBuffer = NULL; \
  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
  char *resp = strdup(messageBuffer); \
  LocalFree(messageBuffer); \
  resp; \
 })
 #else
 // Any other platform uses the same dlfcn mapping as Linux.
 #include <dlfcn.h>
 #define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
 #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
 #define LOAD_ERR() strdup(dlerror())
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #endif
 // Load the shared library at libPath and resolve every llama_server_* entry
 // point into the matching function pointer of *s.
 //
 //   libPath  path handed to the platform loader
 //   s        receives the library handle and the resolved function pointers
 //   err      on failure err->id is set to -1 and err->msg (capacity
 //            err->msg_len) receives a description; untouched on success
 //
 // If any symbol cannot be resolved the library is unloaded again and
 // s->handle is reset to NULL.
 void dyn_init(const char *libPath, struct dynamic_llama_server *s,
               ext_server_resp_t *err) {
   int i = 0;
   struct lookup {
     const char *s;
     void **p;
   } l[] = {
       {"llama_server_init", (void *)&s->llama_server_init},
       {"llama_server_start", (void *)&s->llama_server_start},
       {"llama_server_stop", (void *)&s->llama_server_stop},
       {"llama_server_completion", (void *)&s->llama_server_completion},
       {"llama_server_completion_next_result",
        (void *)&s->llama_server_completion_next_result},
       {"llama_server_completion_cancel",
        (void *)&s->llama_server_completion_cancel},
       {"llama_server_release_task_result",
        (void *)&s->llama_server_release_task_result},
       {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
       {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
       {"llama_server_embedding", (void *)&s->llama_server_embedding},
       {"llama_server_release_json_resp",
        (void *)&s->llama_server_release_json_resp},
       {"", NULL},  // sentinel: a NULL slot terminates the table
   };
   printf("loading library %s\n", libPath);
   s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
   if (!s->handle) {
     err->id = -1;
     char *msg = LOAD_ERR();
     snprintf(err->msg, err->msg_len,
              "Unable to load dynamic server library: %s", msg);
     free(msg);
     return;
   }
   for (i = 0; l[i].p != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
     // Test the resolved symbol, not the slot address: the slot address is
     // never NULL inside this loop, so checking it would hide missing symbols.
     if (!*l[i].p) {
       // Fetch the loader error before unloading, since the unload call may
       // reset the loader's error state.
       char *msg = LOAD_ERR();
       UNLOAD_LIBRARY(s->handle);
       s->handle = NULL;
       err->id = -1;
       snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
                l[i].s, msg);
       free(msg);
       return;
     }
   }
 }
 // Forwarding wrappers: each dyn_llama_server_* function calls the function
 // pointer of the same name (without the dyn_ prefix) stored in s, passing its
 // remaining arguments through unchanged.  The struct is taken by value.
 inline void dyn_llama_server_init(struct dynamic_llama_server s,
                                            ext_server_params_t *sparams,
                                            ext_server_resp_t *err) {
   s.llama_server_init(sparams, err);
 }
 inline void dyn_llama_server_start(struct dynamic_llama_server s) {
   s.llama_server_start();
 }
 inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
   s.llama_server_stop();
 }
 inline void dyn_llama_server_completion(struct dynamic_llama_server s,
                                                  const char *json_req,
                                                  ext_server_resp_t *resp) {
   s.llama_server_completion(json_req, resp);
 }
 inline void dyn_llama_server_completion_next_result(
     struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result) {
   s.llama_server_completion_next_result(task_id, result);
 }
 inline void dyn_llama_server_completion_cancel(
     struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
   s.llama_server_completion_cancel(task_id, err);
 }
 inline void dyn_llama_server_release_task_result(
     struct dynamic_llama_server s, ext_server_task_result_t *result) {
   s.llama_server_release_task_result(result);
 }
 inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                                                const char *json_req,
                                                char **json_resp,
                                                ext_server_resp_t *err) {
   s.llama_server_tokenize(json_req, json_resp, err);
 }
 inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                                  const char *json_req,
                                                  char **json_resp,
                                                  ext_server_resp_t *err) {
   s.llama_server_detokenize(json_req, json_resp, err);
 }
 inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                                 const char *json_req,
                                                 char **json_resp,
                                                 ext_server_resp_t *err) {
   s.llama_server_embedding(json_req, json_resp, err);
 }
 inline void dyn_llama_server_release_json_resp(
     struct dynamic_llama_server s, char **json_resp) {
   s.llama_server_release_json_resp(json_resp);
 }
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -0,0 +1,368 @@
 package llm
 /*
 #cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
 #cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
 #cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
 #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 #cgo darwin LDFLAGS: -lc++ -framework Accelerate
 #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
 #cgo linux windows LDFLAGS: -lpthread
 #include <stdlib.h>
 #include "dyn_ext_server.h"
 */
 import "C"
 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 	"unsafe"
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
 )
 type dynExtServer struct {
 	s       C.struct_dynamic_llama_server
 	options api.Options
 }
 // Note: current implementation does not support concurrent instantiations
 var mutex sync.Mutex
 // newExtServerResp returns a response struct whose msg points at a C-allocated,
 // zero-filled buffer of len bytes and whose msg_len records that size.
 // Release it with freeExtServerResp.
 func newExtServerResp(len C.size_t) C.ext_server_resp_t {
 	zeroed := make([]byte, len)
 	return C.ext_server_resp_t{
 		msg_len: len,
 		msg:     (*C.char)(C.CBytes(zeroed)),
 	}
 }
 // freeExtServerResp frees the C message buffer held by resp.  A response whose
 // msg_len is zero is left alone.
 func freeExtServerResp(resp C.ext_server_resp_t) {
 	if resp.msg_len != 0 {
 		C.free(unsafe.Pointer(resp.msg))
 	}
 }
 // extServerResponseToErr converts the message carried by resp into a Go error.
 // The text is passed as an argument, not as the format string, so a '%' in the
 // native message is reproduced verbatim instead of being read as a verb.
 func extServerResponseToErr(resp C.ext_server_resp_t) error {
 	return fmt.Errorf("%s", C.GoString(resp.msg))
 }
 // Note: current implementation does not support concurrent instantiations
 var llm *dynExtServer
 // newDynExtServer loads the shared library at path library, initialises the
 // llama server it exports for the given model with opts, starts its main loop
 // and returns the resulting LLM.
 //
 // The package-level mutex is taken here and stays held on success (Close
 // releases it); it is released again on either failure path.  adapters are
 // passed through as a linked list of LoRA adapters, and only projectors[0] is
 // used when projectors is non-empty.
 func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	// TryLock first purely so the wait can be logged before blocking.
 	if !mutex.TryLock() {
 		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
 	}
 	gpu.UpdatePath(filepath.Dir(library))
 	libPath := C.CString(library)
 	defer C.free(unsafe.Pointer(libPath))
 	resp := newExtServerResp(512)
 	defer freeExtServerResp(resp)
 	var srv C.struct_dynamic_llama_server
 	C.dyn_init(libPath, &srv, &resp)
 	if resp.id < 0 {
 		mutex.Unlock()
 		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
 	}
 	// Stored in the package-level llm variable.
 	llm = &dynExtServer{
 		s:       srv,
 		options: opts,
 	}
 	slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
 	// Every C string/struct allocated below is freed by a defer when this
 	// function returns.
 	var sparams C.ext_server_params_t
 	sparams.model = C.CString(model)
 	defer C.free(unsafe.Pointer(sparams.model))
 	sparams.embedding = true
 	sparams.n_ctx = C.uint(opts.NumCtx)
 	sparams.n_batch = C.uint(opts.NumBatch)
 	sparams.n_gpu_layers = C.int(opts.NumGPU)
 	sparams.main_gpu = C.int(opts.MainGPU)
 	sparams.n_parallel = 1 // TODO - wire up concurrency
 	// Always use the value encoded in the model
 	sparams.rope_freq_base = 0.0
 	sparams.rope_freq_scale = 0.0
 	sparams.memory_f16 = C.bool(opts.F16KV)
 	sparams.use_mlock = C.bool(opts.UseMLock)
 	sparams.use_mmap = C.bool(opts.UseMMap)
 	if opts.UseNUMA {
 		sparams.numa = C.int(1)
 	} else {
 		sparams.numa = C.int(0)
 	}
 	// Build a singly linked list of adapters in input order; each new node is
 	// appended by walking to the current tail.
 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
 		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
 		defer C.free(unsafe.Pointer(la))
 		la.adapter = C.CString(adapters[i])
 		defer C.free(unsafe.Pointer(la.adapter))
 		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
 		la.next = nil
 		if i == 0 {
 			sparams.lora_adapters = la
 		} else {
 			tmp := sparams.lora_adapters
 			for ; tmp.next != nil; tmp = tmp.next {
 			}
 			tmp.next = la
 		}
 	}
 	if len(projectors) > 0 {
 		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
 		sparams.mmproj = C.CString(projectors[0])
 		defer C.free(unsafe.Pointer(sparams.mmproj))
 	} else {
 		sparams.mmproj = nil
 	}
 	sparams.n_threads = C.uint(opts.NumThread)
 	// Any non-empty OLLAMA_DEBUG value enables verbose logging.
 	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
 		sparams.verbose_logging = C.bool(true)
 	} else {
 		sparams.verbose_logging = C.bool(false)
 	}
 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
 	initResp := newExtServerResp(128)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {
 		mutex.Unlock()
 		err := extServerResponseToErr(initResp)
 		slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
 		return nil, err
 	}
 	slog.Info("Starting llama main loop")
 	C.dyn_llama_server_start(llm.s)
 	return llm, nil
 }
 // Predict submits a streaming completion request built from predict and calls
 // fn once per non-empty content chunk, then once more with Done set and the
 // timing counters when the server reports a stop.
 //
 // If the server answers with an error result containing "slot unavailable",
 // the request is resubmitted with a doubling delay, up to maxRetries attempts.
 // When ctx is done the task is cancelled, and the return value is nil unless
 // the cancel itself reports an error.
 func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
 	resp := newExtServerResp(128)
 	defer freeExtServerResp(resp)
 	if len(predict.Images) > 0 {
 		slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
 	}
 	request := map[string]any{
 		"prompt":            predict.Prompt,
 		"stream":            true,
 		"n_predict":         predict.Options.NumPredict,
 		"n_keep":            predict.Options.NumKeep,
 		"temperature":       predict.Options.Temperature,
 		"top_k":             predict.Options.TopK,
 		"top_p":             predict.Options.TopP,
 		"tfs_z":             predict.Options.TFSZ,
 		"typical_p":         predict.Options.TypicalP,
 		"repeat_last_n":     predict.Options.RepeatLastN,
 		"repeat_penalty":    predict.Options.RepeatPenalty,
 		"presence_penalty":  predict.Options.PresencePenalty,
 		"frequency_penalty": predict.Options.FrequencyPenalty,
 		"mirostat":          predict.Options.Mirostat,
 		"mirostat_tau":      predict.Options.MirostatTau,
 		"mirostat_eta":      predict.Options.MirostatEta,
 		"penalize_nl":       predict.Options.PenalizeNewline,
 		"seed":              predict.Options.Seed,
 		"stop":              predict.Options.Stop,
 		"image_data":        predict.Images,
 		"cache_prompt":      true,
 	}
 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
 	}
 	retryDelay := 100 * time.Microsecond
 	for retries := 0; retries < maxRetries; retries++ {
 		if retries > 0 {
 			time.Sleep(retryDelay) // wait before retrying
 			retryDelay *= 2        // exponential backoff
 		}
 		// Handling JSON marshaling with special characters unescaped.
 		buffer := &bytes.Buffer{}
 		enc := json.NewEncoder(buffer)
 		enc.SetEscapeHTML(false)
 		if err := enc.Encode(request); err != nil {
 			return fmt.Errorf("failed to marshal data: %w", err)
 		}
 		req := C.CString(buffer.String())
 		// NOTE(review): this defer sits inside the retry loop, so each
 		// attempt's request string is only freed when Predict returns.
 		defer C.free(unsafe.Pointer(req))
 		// On success resp.id is the value passed as the task id to the
 		// next_result and cancel calls below.
 		C.dyn_llama_server_completion(llm.s, req, &resp)
 		if resp.id < 0 {
 			return extServerResponseToErr(resp)
 		}
 		retryNeeded := false
 	out:
 		for {
 			select {
 			case <-ctx.Done():
 				// This handles the request cancellation
 				// resp doubles as the cancel call's error out-parameter.
 				C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
 				if resp.id < 0 {
 					return extServerResponseToErr(resp)
 				} else {
 					return nil
 				}
 			default:
 				var result C.ext_server_task_result_t
 				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
 				// Copy the JSON into Go memory before releasing the result.
 				json_resp := C.GoString(result.json_resp)
 				C.dyn_llama_server_release_task_result(llm.s, &result)
 				var p prediction
 				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
 					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
 					if resp.id < 0 {
 						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
 					} else {
 						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
 					}
 				}
 				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
 					retryNeeded = true
 					// task will already be canceled
 					break out
 				}
 				if p.Content != "" {
 					fn(PredictResult{
 						Content: p.Content,
 					})
 				}
 				if p.Stop || bool(result.stop) {
 					fn(PredictResult{
 						Done:               true,
 						PromptEvalCount:    p.Timings.PromptN,
 						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
 						EvalCount:          p.Timings.PredictedN,
 						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
 					})
 					return nil
 				}
 			}
 		}
 		// The inner loop is only left via "break out", which sets retryNeeded.
 		if !retryNeeded {
 			return nil // success
 		}
 	}
 	// should never reach here ideally
 	return fmt.Errorf("max retries exceeded")
 }
 // Encode asks the loaded server to tokenize prompt and returns the token ids.
 // Failures while marshalling the request, inside the native call, or while
 // decoding its JSON reply are returned as errors.
 func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
 	payload, err := json.Marshal(TokenizeRequest{Content: prompt})
 	if err != nil {
 		return nil, fmt.Errorf("marshaling encode data: %w", err)
 	}
 	cReq := C.CString(string(payload))
 	defer C.free(unsafe.Pointer(cReq))
 	status := newExtServerResp(128)
 	defer freeExtServerResp(status)
 	var cJSON *C.char
 	C.dyn_llama_server_tokenize(llm.s, cReq, &cJSON, &status)
 	if status.id < 0 {
 		return nil, extServerResponseToErr(status)
 	}
 	// The reply buffer is handed back to the library once it has been decoded.
 	defer C.dyn_llama_server_release_json_resp(llm.s, &cJSON)
 	var encoded TokenizeResponse
 	if err := json.Unmarshal([]byte(C.GoString(cJSON)), &encoded); err != nil {
 		return nil, fmt.Errorf("unmarshal encode response: %w", err)
 	}
 	return encoded.Tokens, nil
 }
 // Decode asks the loaded server to turn tokens back into text.  An empty
 // token slice yields "" without calling into the library.  Failures while
 // marshalling the request, inside the native call, or while decoding its JSON
 // reply are returned as errors.
 func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
 	if len(tokens) == 0 {
 		return "", nil
 	}
 	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
 	if err != nil {
 		return "", fmt.Errorf("marshaling decode data: %w", err)
 	}
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))
 	var json_resp *C.char
 	resp := newExtServerResp(128)
 	defer freeExtServerResp(resp)
 	C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
 	if resp.id < 0 {
 		return "", extServerResponseToErr(resp)
 	}
 	// The reply buffer is handed back to the library once it has been decoded.
 	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
 	var decoded DetokenizeResponse
 	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
 		// This is the decode path; the message previously said "encode".
 		return "", fmt.Errorf("unmarshal decode response: %w", err)
 	}
 	return decoded.Content, nil
 }
 // Embedding asks the loaded server for the embedding of input and returns it.
 // The request reuses the TokenizeRequest shape (a single Content field).
 // Failures while marshalling the request, inside the native call, or while
 // decoding its JSON reply are returned as errors.
 func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: input})
 	if err != nil {
 		return nil, fmt.Errorf("error marshaling embed data: %w", err)
 	}
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))
 	var json_resp *C.char
 	resp := newExtServerResp(128)
 	defer freeExtServerResp(resp)
 	C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
 	if resp.id < 0 {
 		return nil, extServerResponseToErr(resp)
 	}
 	// The reply buffer is handed back to the library once it has been decoded.
 	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
 	var embedding EmbeddingResponse
 	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
 		// This is the embedding path; the message previously said "tokenize".
 		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
 	}
 	return embedding.Embedding, nil
 }
 // Close stops the native server and then releases the package-level mutex
 // that newDynExtServer left held.
 func (llm *dynExtServer) Close() {
 	C.dyn_llama_server_stop(llm.s)
 	mutex.Unlock()
 }
--- a/llm/dyn_ext_server.h
+++ b/llm/dyn_ext_server.h
@@ -0,0 +1,74 @@
 // Include guard: the struct below would be redefined if this header were
 // pulled into one translation unit twice.
 #ifndef DYN_EXT_SERVER_H
 #define DYN_EXT_SERVER_H
 #include <stdlib.h>
 #include "ext_server.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Entry points of one dynamically loaded ext_server library.  dyn_init fills
 // in handle and every function pointer.
 struct dynamic_llama_server {
   void *handle;  // library handle returned by the platform loader
   void (*llama_server_init)(ext_server_params_t *sparams,
                             ext_server_resp_t *err);
   void (*llama_server_start)(void);
   void (*llama_server_stop)(void);
   void (*llama_server_completion)(const char *json_req,
                                   ext_server_resp_t *resp);
   void (*llama_server_completion_next_result)(const int task_id,
                                               ext_server_task_result_t *result);
   void (*llama_server_completion_cancel)(const int task_id,
                                          ext_server_resp_t *err);
   void (*llama_server_release_task_result)(ext_server_task_result_t *result);
   void (*llama_server_tokenize)(const char *json_req, char **json_resp,
                                 ext_server_resp_t *err);
   void (*llama_server_detokenize)(const char *json_req, char **json_resp,
                                   ext_server_resp_t *err);
   void (*llama_server_embedding)(const char *json_req, char **json_resp,
                                  ext_server_resp_t *err);
   void (*llama_server_release_json_resp)(char **json_resp);
 };
 // Loads the library at libPath and resolves the entry points into *s.  On
 // failure err->id is set to -1 and err->msg describes the problem.
 void dyn_init(const char *libPath, struct dynamic_llama_server *s,
               ext_server_resp_t *err);
 // No good way to call C function pointers from Go so inline the indirection.
 // Each dyn_llama_server_* function forwards to the same-named pointer in s.
 void dyn_llama_server_init(struct dynamic_llama_server s,
                            ext_server_params_t *sparams,
                            ext_server_resp_t *err);
 void dyn_llama_server_start(struct dynamic_llama_server s);
 void dyn_llama_server_stop(struct dynamic_llama_server s);
 void dyn_llama_server_completion(struct dynamic_llama_server s,
                                  const char *json_req,
                                  ext_server_resp_t *resp);
 void dyn_llama_server_completion_next_result(
     struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result);
 void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
                                         const int task_id,
                                         ext_server_resp_t *err);
 void dyn_llama_server_release_task_result(
     struct dynamic_llama_server s, ext_server_task_result_t *result);
 void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                                const char *json_req, char **json_resp,
                                ext_server_resp_t *err);
 void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                  const char *json_req,
                                  char **json_resp,
                                  ext_server_resp_t *err);
 void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                 const char *json_req, char **json_resp,
                                 ext_server_resp_t *err);
 void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
                                         char **json_resp);
 #ifdef __cplusplus
 }
 #endif
 #endif // DYN_EXT_SERVER_H
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,14 +1,25 @@
 # Ollama specific CMakefile to include in llama.cpp/examples/server
-set(TARGET ollama_llama_server)
+set(TARGET ext_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
-    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+    add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
 else()
    add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
 endif()
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_include_directories(${TARGET} PRIVATE ../../common)
 target_include_directories(${TARGET} PRIVATE ../..)
 target_include_directories(${TARGET} PRIVATE ../../..)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
 target_link_libraries(${TARGET} PRIVATE ggml llava common )
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
 install(TARGETS ext_server LIBRARY)
 if (CUDAToolkit_FOUND)
    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
    if (WIN32)
        target_link_libraries(${TARGET} PRIVATE nvml)
    endif()
 endif()
--- a/llm/ext_server/README.md
+++ b/llm/ext_server/README.md
@@ -0,0 +1,18 @@
 # Extern C Server
 This directory contains a thin facade layered on top of the llama.cpp server
 that exposes `extern "C"` interfaces, so its functionality can be reached
 through direct in-process API calls.  The llama.cpp code uses compile-time
 macros to configure the GPU type along with other settings.  During
 `go generate ./...`, the build produces one or more copies of the llama.cpp
 `extern "C"` server based on which GPU libraries are detected, so that
 multiple GPU types as well as CPU-only operation are supported.  The Ollama Go
 build then embeds these different servers to support different GPUs and
 settings at runtime.
 If you are making changes to the code in this directory, make sure to disable
 caching during your go build to ensure you pick up your changes.  A typical
 iteration cycle from the top of the source tree looks like:
 ```
 go generate ./... && go build -a .
 ```
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -0,0 +1,381 @@
 #include "ext_server.h"
 #include <atomic>
 // Necessary evil since the server types are not defined in a header
 #include "server.cpp"
 // Low level API access to verify GPU access
 #if defined(GGML_USE_CUBLAS)
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
 #define cudaGetDevice hipGetDevice
 #define cudaError_t hipError_t
 #define cudaSuccess hipSuccess
 #define cudaGetErrorString hipGetErrorString
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 #endif // defined(GGML_USE_HIPBLAS)
 #endif // GGML_USE_CUBLAS
 // Expose the llama server as a callable extern "C" API
 // The single server instance used by every extern "C" entry point in this file.
 server_context *llama = NULL;
 // Thread that runs the task-queue main loop started by llama_server_start().
 std::thread ext_server_thread;
 // Written by llama_server_stop() while request threads read it concurrently,
 // so it has to be atomic to avoid a data race.
 std::atomic<bool> shutting_down(false);
 // Number of callers currently inside a queue_results.recv() call.
 std::atomic_int recv_counter;
 // RAII wrapper for tracking in-flight recv calls: the counter is incremented
 // on construction and decremented again on destruction.
 class atomicRecv {
  public:
    explicit atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
      ++this->atomic;
    }
    ~atomicRecv() {
      --this->atomic;
    }
    // Not copyable: a copy would decrement the counter a second time.
    atomicRecv(const atomicRecv &) = delete;
    atomicRecv &operator=(const atomicRecv &) = delete;
  private:
    std::atomic<int> &atomic;
 };
 // Create the global server context and load the model described by sparams.
 // On success err->id is 0 and err->msg is empty; on failure err->id is -1 and
 // err->msg holds a message truncated to err->msg_len.
 // NOTE(review): on the failure paths the server_context allocated below is not
 // freed and `llama` stays non-NULL - confirm whether callers rely on that.
 void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
  recv_counter = 0;
  assert(err != NULL && sparams != NULL);
  log_set_target(stderr);
  // NOTE(review): server_verbose is set to true when verbose logging is NOT
  // requested - looks inverted, confirm the intent before changing it.
  if (!sparams->verbose_logging) {
    server_verbose = true;
    log_disable();
  }
  LOG_TEE("system info: %s\n", llama_print_system_info());
  err->id = 0;
  err->msg[0] = '\0';
  try {
    llama = new server_context;
    // Translate the C parameter struct into llama.cpp's gpt_params
    gpt_params params;
    params.n_ctx = sparams->n_ctx;
    params.n_batch = sparams->n_batch;
    // Only override the thread count when the caller supplied one
    if (sparams->n_threads > 0) {
      params.n_threads = sparams->n_threads;
    }
    params.n_parallel = sparams->n_parallel;
    params.rope_freq_base = sparams->rope_freq_base;
    params.rope_freq_scale = sparams->rope_freq_scale;
    // memory_f16 selects the K/V cache element type
    if (sparams->memory_f16) {
      params.cache_type_k = "f16";
      params.cache_type_v = "f16";
    } else {
      params.cache_type_k = "f32";
      params.cache_type_v = "f32";
    }
    params.n_gpu_layers = sparams->n_gpu_layers;
    params.main_gpu = sparams->main_gpu;
    params.use_mlock = sparams->use_mlock;
    params.use_mmap = sparams->use_mmap;
    params.numa = (ggml_numa_strategy)sparams->numa;
    params.embedding = sparams->embedding;
    if (sparams->model != NULL) {
      params.model = sparams->model;
    }
    // Walk the caller-owned linked list of LoRA adapters; mmap is turned off
    // whenever at least one adapter is supplied
    if (sparams->lora_adapters != NULL) {
      for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
          la = la->next) {
        params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
      }
      params.use_mmap = false;
    }
    if (sparams->mmproj != NULL) {
      params.mmproj = std::string(sparams->mmproj);
    }
 #if defined(GGML_USE_CUBLAS)
    // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
    LOG_TEE("Performing pre-initialization of GPU\n");
    int id;
    cudaError_t cudaErr = cudaGetDevice(&id);
    if (cudaErr != cudaSuccess) {
      err->id = -1;
      snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
      return;
    }
 #endif
    llama_backend_init();
    llama_numa_init(params.numa);
    // load the model
    if (!llama->load_model(params)) {
      // TODO - consider modifying the logging logic or patching load_model so
      // we can capture more detailed error messages and pass them back to the
      // caller for better UX
      err->id = -1;
      snprintf(err->msg, err->msg_len, "error loading model %s",
               params.model.c_str());
      return;
    }
    llama->init();
  } catch (std::exception &e) {
    // Exceptions are reported through err rather than propagated to the caller
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len,
             "Unknown exception initializing llama server");
  }
 }
 // Spawn the background thread that wires the queue callbacks to the server
 // context and then runs the task loop until it is terminated.
 void llama_server_start() {
  assert(llama != NULL);
  // TODO mutex to protect thread creation
  ext_server_thread = std::thread([&]() {
    try {
      LOG_TEE("llama server main loop starting\n");
      ggml_time_init();
      // Register the server_context member functions as queue callbacks
      llama->queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, llama, std::placeholders::_1));
      llama->queue_tasks.on_finish_multitask(std::bind(
        &server_context::on_finish_multitask, llama, std::placeholders::_1));
      llama->queue_tasks.on_run_slots(std::bind(
        &server_context::update_slots, llama));
      // Multitask updates coming from the result queue are forwarded to the
      // task queue
      llama->queue_results.on_multitask_update(std::bind(
          &server_queue::update_multitask,
          &llama->queue_tasks,
          std::placeholders::_1,
          std::placeholders::_2,
          std::placeholders::_3
        ));
      // Does not return until the loop ends or throws
      llama->queue_tasks.start_loop();
    } catch (std::exception &e) {
      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
    } catch (...) {
      LOG_TEE("caught unknown exception in llama server main loop\n");
    }
    // Reached on normal loop exit as well as after a caught exception
    LOG_TEE("\nllama server shutting down\n");
    llama_backend_free();
  });
 }
 // Stop the server: reject new requests, wait for in-flight recv calls to
 // finish, terminate the task loop, join its thread and free the context.
 void llama_server_stop() {
  assert(llama != NULL);
  // Shutdown any in-flight requests and block incoming requests.
  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
  shutting_down = true;
  // Poll until no caller is still inside a queue_results.recv() call
  while (recv_counter.load() > 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }
  // This may take a while for any pending tasks to drain
  // TODO - consider a timeout to cancel tasks if it's taking too long
  llama->queue_tasks.terminate();
  ext_server_thread.join();
  delete llama;
  llama = NULL;
  LOG_TEE("llama server shutdown complete\n");
  // Clear the flag so requests are accepted again after a later init
  shutting_down = false;
 }
 // Queue a completion request described by the JSON text in json_req.
 // On success resp->id holds the new task ID (>= 0) and resp->msg is empty.
 // On failure resp->id is -1 and resp->msg holds the error text, truncated to
 // resp->msg_len.
 void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
  assert(llama != NULL && json_req != NULL && resp != NULL);
  resp->id = -1;
  resp->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    json data = json::parse(json_req);
    resp->id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(resp->id);
    llama->request_completion(resp->id, -1, data, false, false);
  } catch (std::exception &e) {
    // A task ID may already have been stored; reset it so the caller sees the
    // failure instead of a valid-looking ID.
    resp->id = -1;
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
    resp->id = -1;
    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
  }
 }
 // Block until the next result for task_id is available and copy it into resp.
 // resp->json_resp is always set to a newly allocated, NUL-terminated JSON
 // string. When the result is final (error or stop), or the server is shutting
 // down, the task is cancelled and removed from the waiting set.
 void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *resp) {
  assert(llama != NULL && resp != NULL);
  resp->id = -1;
  resp->stop = false;
  resp->error = false;
  resp->json_resp = NULL;
  std::string result_json;
  try {
    // Counted as in-flight so llama_server_stop() waits for this call
    atomicRecv ar(recv_counter);
    server_task_result result = llama->queue_results.recv(task_id);
    result_json =
        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
    resp->id = result.id;
    resp->stop = result.stop;
    resp->error = result.error;
    if (result.error) {
      LOG_TEE("next result cancel on error\n");
      llama->request_cancel(task_id);
      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (result.stop) {
      LOG_TEE("next result cancel on stop\n");
      llama->request_cancel(task_id);
      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (shutting_down) {
      // Report stop to the caller so it stops asking for further results
      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
      llama->request_cancel(task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
      resp->stop = true;
    }
  } catch (std::exception &e) {
    resp->error = true;
    resp->id = -1;
    result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
    LOG_TEE("llama server completion exception %s\n", e.what());
  } catch (...) {
    resp->error = true;
    resp->id = -1;
    result_json = "{\"error\":\"Unknown exception during completion\"}";
    LOG_TEE("llama server completion unknown exception\n");
  }
  // Hand the JSON back as a heap buffer; it is freed by
  // llama_server_release_task_result()
  const std::string::size_type size = result_json.size() + 1;
  resp->json_resp = new char[size];
  snprintf(resp->json_resp, size, "%s", result_json.c_str());
 }
 // Free the JSON buffer held by a task result. A NULL result or a NULL buffer
 // is ignored.
 void llama_server_release_task_result(ext_server_task_result_t *result) {
  if (result == NULL || result->json_resp == NULL) {
    return;
  }
  delete[] result->json_resp;
  // Clear the pointer so that releasing the same result twice is a no-op
  // instead of a double free.
  result->json_resp = NULL;
 }
 // Cancel the task identified by task_id and stop waiting for its results.
 // err->id is 0 on success, or -1 with err->msg filled in when an exception
 // was thrown.
 void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
  assert(llama != NULL && err != NULL);
  err->id = 0;
  err->msg[0] = '\0';
  try {
    llama->request_cancel(task_id);
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len,
             "Unknown exception completion cancel in llama server");
  }
 }
 void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    std::vector<llama_token> tokens;
    if (body.count("content") != 0) {
      tokens = llama->tokenize(body["content"], false);
    }
    const json data = format_tokenizer_response(tokens);
    std::string result_json = data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
  }
 }
 // Free a JSON buffer returned through a json_resp out-parameter. A NULL
 // argument or a NULL buffer is ignored.
 void llama_server_release_json_resp(char **json_resp) {
  if (json_resp == NULL || *json_resp == NULL) {
    return;
  }
  delete[] *json_resp;
  // Clear the caller's pointer so that a repeated release is a no-op instead
  // of a double free.
  *json_resp = NULL;
 }
 void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    std::string content;
    if (body.count("tokens") != 0) {
      const std::vector<llama_token> tokens = body["tokens"];
      content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
    }
    const json data = format_detokenized_response(content);
    std::string result_json = data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
  }
 }
 // Run an embedding request for the "content" field of the JSON request (an
 // empty prompt when the field is missing), block until its result arrives and
 // return the result JSON as a newly allocated string in *json_resp.
 // Errors are reported through err (id = -1 plus a message).
 void llama_server_embedding(const char *json_req, char **json_resp,
                             ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    json prompt;
    if (body.count("content") != 0) {
      prompt = body["content"];
    } else {
      prompt = "";
    }
    const int task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
    try {
      llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
      // Counted as in-flight so llama_server_stop() waits for this call
      atomicRecv ar(recv_counter);
      server_task_result result = llama->queue_results.recv(task_id);
      std::string result_json = result.data.dump();
      const std::string::size_type size = result_json.size() + 1;
      *json_resp = new char[size];
      snprintf(*json_resp, size, "%s", result_json.c_str());
    } catch (...) {
      // Do not leave the task registered as waiting when the request fails
      // part-way; the outer handlers still report the original error.
      llama->queue_results.remove_waiting_task_id(task_id);
      throw;
    }
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
  }
 }
--- a/llm/ext_server/ext_server.h
+++ b/llm/ext_server/ext_server.h
@@ -0,0 +1,95 @@
 #if defined(LLAMA_SERVER_LIBRARY)
 #ifndef LLAMA_SERVER_H
 #define LLAMA_SERVER_H
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 int __main(int argc, char **argv);
 // This exposes extern C entrypoints into the llama_server
 // To enable the server compile with LLAMA_SERVER_LIBRARY
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Status and error message returned by the server entry points. The caller
 // owns the msg buffer.
 typedef struct ext_server_resp {
  int id;          // < 0 on error
  size_t msg_len;  // caller must allocate msg and set msg_len
  char *msg;       // error text written by the server, at most msg_len bytes
 } ext_server_resp_t;
 // One node of a singly linked list of LoRA adapters.
 // Allocated and freed by caller
 typedef struct ext_server_lora_adapter {
  char *adapter;  // adapter identifier passed on to llama.cpp - presumably a file path, confirm
  float scale;    // scale paired with this adapter
  struct ext_server_lora_adapter *next;  // next node, NULL terminates the list
 } ext_server_lora_adapter_t;
 // Server configuration passed to llama_server_init.
 // Allocated and freed by caller
 typedef struct ext_server_params {
  char *model;            // model to load; left at the default when NULL
  uint32_t n_ctx;         // token context window, 0 = from model
  uint32_t n_batch;       // prompt processing maximum batch size
  uint32_t n_threads;     // number of threads to use for generation
  int32_t n_parallel;     // number of parallel sequences to decode
  float rope_freq_base;   // RoPE base frequency, 0 = from model
  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
  bool memory_f16;        // use f16 instead of f32 for memory kv
  int32_t n_gpu_layers;  // number of layers to store in VRAM (-1 - use default)
  int32_t main_gpu;      // the GPU that is used for scratch and small tensors
  bool use_mlock;        // force system to keep model in RAM
  bool use_mmap;         // use mmap if possible
  int numa;              // attempt optimizations that help on some NUMA systems
  bool embedding;        // get only sentence embedding
  ext_server_lora_adapter_t *lora_adapters;  // optional linked list, may be NULL
  char *mmproj;          // optional, may be NULL
  bool verbose_logging;  // Enable verbose logging of the server
 } ext_server_params_t;
 // One result returned by llama_server_completion_next_result.
 typedef struct ext_server_task_result {
  int id;           // id of the result, -1 when an exception was caught
  bool stop;        // true when no further results should be requested
  bool error;       // true when the result reports an error
  char *json_resp;  // null terminated, memory managed by ext_server
 } ext_server_task_result_t;
 // Initialize the server once per process
 // err->id = 0 for success and err->msg[0] = '\0'
 // err->id != 0 for failure, and err->msg contains error message
 void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
 // Run the main loop, called once per init
 void llama_server_start();
 // Stop the main loop and free up resources allocated in init and start.  Init
 // must be called again to reuse
 void llama_server_stop();
 // json_req null terminated string, memory managed by caller
 // resp->id >= 0 on success (task ID)
 // resp->id < 0 on error, and resp->msg contains error message
 void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
 // Caller must call llama_server_release_task_result to free resp->json_resp
 void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *result);
 // Cancel a task; err->id = 0 on success, err->id < 0 on failure
 void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
 // Free the json_resp buffer of a task result; NULL is ignored
 void llama_server_release_task_result(ext_server_task_result_t *result);
 // Caller must call llama_server_release_json_resp to free json_resp, which is
 // allocated when the call succeeds (err->id == 0)
 void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err);
 void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err);
 void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err);
 // Free a json_resp buffer returned by the three functions above; NULL is
 // ignored
 void llama_server_release_json_resp(char **json_resp);
 #ifdef __cplusplus
 }
 #endif
 #endif
 #endif  // LLAMA_SERVER_LIBRARY
--- a/llm/ext_server/httplib.h
+++ b/llm/ext_server/httplib.h
--- a/llm/ext_server/json.hpp
+++ b/llm/ext_server/json.hpp
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@@ -1,655 +0,0 @@
 // MIT License
 // Copyright (c) 2023 Georgi Gerganov
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all
 // copies or substantial portions of the Software.
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 #pragma once
 #include <cassert>
 #include <condition_variable>
 #include <cstdio>
 #include <functional>
 #include <iostream>
 #include <mutex>
 #include <set>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <thread>
 #include <unordered_map>
 #include <vector>
 #include "json.hpp"
 #include "../llava/clip.h"
 using json = nlohmann::json;
 extern bool server_verbose;
 extern bool server_log_json;
 // Verbose logging is compiled in unless SERVER_VERBOSE is defined to a value
 // other than 1.
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
 #endif
 // With verbose logging compiled out LOG_VERBOSE expands to nothing; otherwise
 // it logs at "VERB" level, and only while the runtime flag server_verbose is
 // set.
 #if SERVER_VERBOSE != 1
 #define LOG_VERBOSE(MSG, ...)
 #else
 #define LOG_VERBOSE(MSG, ...)                                            \
    do                                                                   \
    {                                                                    \
        if (server_verbose)                                              \
        {                                                                \
            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
        }                                                                \
    } while (0)
 #endif
 // Unconditional log macros; each forwards the calling function name and line
 // number to server_log.
 #define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 // Lifecycle state of the server with respect to model loading.
 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,          // Server is ready and model is loaded
    SERVER_STATE_ERROR           // An error occurred, load_model failed
 };
 // Kind of work a task_server entry asks for.
 // NOTE(review): the handling of each kind lives outside this header; the
 // descriptions below are inferred from the names - confirm.
 enum task_type {
    TASK_TYPE_COMPLETION,     // presumably: run a completion
    TASK_TYPE_CANCEL,         // presumably: cancel the task given by target_id
    TASK_TYPE_NEXT_RESPONSE,
    TASK_TYPE_METRICS
 };
 // One unit of work placed on llama_server_queue. Every member has a default
 // so a freshly constructed task never carries indeterminate values.
 struct task_server {
    int id = -1; // to be filled by llama_server_queue
    int target_id = -1;                    // -1 until set by the producer
    task_type type = TASK_TYPE_COMPLETION; // first enumerator unless overridden
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
    int multitask_id = -1;                 // -1 when not part of a multitask
 };
 // Result produced for a task and delivered through llama_server_response.
 // Every member has a default so a freshly constructed result never carries
 // indeterminate values.
 struct task_result {
    int id = -1;           // id of the task this result belongs to
    int multitask_id = -1; // -1 when the task is not part of a multitask
    bool stop = false;
    bool error = false;
    json result_json;
 };
 // A task made up of several subtasks. It counts as finished once
 // subtasks_remaining is empty.
 struct task_multi {
    int id;
    std::set<int> subtasks_remaining{};   // ids of subtasks still outstanding
    std::vector<task_result> results{};   // results collected from finished subtasks
 };
 // completion token output with probabilities
 struct completion_token_output {
    // A token paired with a probability value
    struct token_prob
    {
        llama_token tok;
        float prob;
    };
    std::vector<token_prob> probs;   // presumably the candidate tokens for this position - confirm
    llama_token tok;                 // the token this output describes
    std::string text_to_send;
 };
 // Function object that converts a token, or the token held by a
 // completion_token_output, to text via llama_token_to_piece.
 struct token_translator {
    llama_context * ctx;
    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 // Write one log record to stdout. When server_log_json is set the record is a
 // single JSON object; otherwise it is a text line with a fixed-width prefix
 // followed by each field as key=value. Fields from `extra` are merged into
 // the record in both modes.
 static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
    // Every record starts with the calling thread id and the current time
    std::stringstream tid_stream;
    tid_stream << std::this_thread::get_id();
    json record = nlohmann::ordered_json{
        {"tid", tid_stream.str()},
        {"timestamp", time(nullptr)},
    };
    const bool has_extra = !extra.empty();
    if (!server_log_json) {
        // Text mode: level, function and message go into the prefix only
        char prefix[1024];
        snprintf(prefix, sizeof(prefix), "%4s [%24s] %s", level, function, message);
        if (has_extra) {
            record.merge_patch(extra);
        }
        std::stringstream out;
        out << prefix << " |";
        for (const auto& field : record.items())
        {
            const std::string rendered = field.value().dump(-1, ' ', false, json::error_handler_t::replace);
            out << " " << field.key() << "=" << rendered;
        }
        const std::string text = out.str();
        printf("%.*s\n", (int)text.size(), text.data());
        fflush(stdout);
        return;
    }
    // JSON mode: level, function, line and message become fields of the record
    record.merge_patch(
            {
                    {"level",     level},
                    {"function",  function},
                    {"line",      line},
                    {"msg",       message},
            });
    if (has_extra) {
        record.merge_patch(extra);
    }
    std::cout << record.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
 }
 //
 // server utils
 //
 // Read `key` from `body`, returning `default_value` when the key is missing
 // or its value is JSON null.
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value) {
    const bool usable = body.contains(key) && !body.at(key).is_null();
    if (!usable) {
        return default_value;
    }
    return body.value(key, default_value);
 }
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 inline bool verify_custom_template(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}};
    std::vector<char> buf(1);
    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
    return res >= 0;
 }
 // Format given chat. If tmpl is empty, we take the template from model metadata.
 // Each message's "role" and "content" fields are read (missing or null fields
 // become empty strings) and passed to llama_chat_apply_template.
 // Throws std::runtime_error when the template cannot be applied.
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
    size_t alloc_size = 0;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());
    for (size_t i = 0; i < messages.size(); ++i) {
        auto &curr_msg = messages[i];
        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
        alloc_size     += str[i*2 + 1].length();
        // chat only borrows the strings; str is fully sized up front so the
        // pointers stay valid
        chat[i].role    = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
    }
    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
    std::vector<char> buf(alloc_size * 2);
    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    // a negative result signals failure; it must not be cast to size_t and
    // used as a buffer size
    if (res < 0) {
        throw std::runtime_error("failed to apply chat template");
    }
    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
        if (res < 0) {
            throw std::runtime_error("failed to apply chat template");
        }
    }
    std::string formatted_chat(buf.data(), res);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }
 //
 // work queue utils
 //
 // Task queue driving the server main loop. Producers add work with post() or
 // defer(); start_loop() consumes it and invokes the registered callbacks.
 // Queue state is protected by mutex_tasks.
 struct llama_server_queue {
    int id = 0;
    std::mutex mutex_tasks;
    // Initialized so the flag never holds an indeterminate value before
    // start_loop() runs
    bool running = false;
    // queues
    std::vector<task_server> queue_tasks;
    std::vector<task_server> queue_tasks_deferred;
    std::vector<task_multi> queue_multitasks;
    std::condition_variable condition_tasks;
    // callback functions
    std::function<void(task_server&)> callback_new_task;
    std::function<void(task_multi&)> callback_finish_multitask;
    std::function<void(void)> callback_run_slots;
    // Add a new task to the end of the queue. A task whose id is -1 gets the
    // next free id. Returns the id of the queued task.
    int post(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        if (task.id == -1) {
            task.id = id++;
            LOG_VERBOSE("new task id", {{"new_id", task.id}});
        }
        // Remember the id before the task is moved into the queue
        const int task_id = task.id;
        queue_tasks.push_back(std::move(task));
        condition_tasks.notify_one();
        return task_id;
    }
    // Add a new task, but defer until one slot is available
    void defer(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
    }
    // Get the next id for creating a new task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        int new_id = id++;
        LOG_VERBOSE("new task id", {{"new_id", new_id}});
        return new_id;
    }
    // Register function to process a new task
    void on_new_task(std::function<void(task_server&)> callback) {
        callback_new_task = callback;
    }
    // Register function to process a multitask when it is finished
    void on_finish_multitask(std::function<void(task_multi&)> callback) {
        callback_finish_multitask = callback;
    }
    // Register the function to be called when all slots data is ready to be processed
    void on_run_slots(std::function<void(void)> callback) {
        callback_run_slots = callback;
    }
    // Call when the state of one slot is changed
    void notify_slot_changed() {
        // move deferred tasks back to main loop
        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : queue_tasks_deferred) {
            queue_tasks.push_back(std::move(task));
        }
        queue_tasks_deferred.clear();
    }
    // end the start_loop routine
    void terminate() {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();
    }
    /**
     * Main loop consists of these steps:
     * - Wait until a new task arrives
     * - Process the task (i.e. maybe copy data into slot)
     * - Check if multitask is finished
     * - Run all slots
     */
    void start_loop() {
        {
            // Written under the lock because terminate() may run on another
            // thread
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = true;
        }
        while (true) {
            LOG_VERBOSE("new task may arrive", {});
            {
                while (true)
                {
                    std::unique_lock<std::mutex> lock(mutex_tasks);
                    if (queue_tasks.empty()) {
                        lock.unlock();
                        break;
                    }
                    task_server task = queue_tasks.front();
                    queue_tasks.erase(queue_tasks.begin());
                    // Run the callback without holding the lock
                    lock.unlock();
                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                    callback_new_task(task);
                }
                LOG_VERBOSE("update_multitasks", {});
                // check if we have any finished multitasks
                auto queue_iterator = queue_multitasks.begin();
                while (queue_iterator != queue_multitasks.end())
                {
                    if (queue_iterator->subtasks_remaining.empty())
                    {
                        // all subtasks done == multitask is done
                        task_multi current_multitask = *queue_iterator;
                        callback_finish_multitask(current_multitask);
                        // remove this multitask
                        queue_iterator = queue_multitasks.erase(queue_iterator);
                    }
                    else
                    {
                        ++queue_iterator;
                    }
                }
                // all tasks in the current loop is processed, slots data is now ready
                LOG_VERBOSE("callback_run_slots", {});
                callback_run_slots();
            }
            LOG_VERBOSE("wait for new task", {});
            // wait for new task
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                if (queue_tasks.empty()) {
                    if (!running) {
                        LOG_VERBOSE("ending start_loop", {});
                        return;
                    }
                    condition_tasks.wait(lock, [&]{
                        return (!queue_tasks.empty() || !running);
                    });
                }
            }
        }
    }
    //
    // functions to manage multitasks
    //
    // add a multitask by specifying the id of all subtask (subtask is a task_server)
    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_multi multi;
        multi.id = multitask_id;
        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
        queue_multitasks.push_back(multi);
    }
    // update the remaining subtasks, while appending results to multitask
    void update_multitask(int multitask_id, int subtask_id, task_result& result)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        for (auto& multitask : queue_multitasks)
        {
            if (multitask.id == multitask_id)
            {
                multitask.subtasks_remaining.erase(subtask_id);
                multitask.results.push_back(result);
            }
        }
    }
 };
 // Result queue: the server publishes results with send() and request threads
 // block in recv() until the result for their task id arrives. State is
 // protected by mutex_results.
 struct llama_server_response {
    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
    callback_multitask_t callback_update_multitask;
    // for keeping track of all tasks waiting for the result
    std::set<int> waiting_task_ids;
    // the main result queue
    std::vector<task_result> queue_results;
    std::mutex mutex_results;
    std::condition_variable condition_results;
    // add the task_id to the list of tasks waiting for response
    void add_waiting_task_id(int task_id) {
        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
    }
    // when the request is finished, we can remove task associated with it
    void remove_waiting_task_id(int task_id) {
        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
    }
    // This function blocks the thread until there is a response for this task_id.
    // The matching result is removed from the queue and returned.
    task_result recv(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        while (true)
        {
            for (int i = 0; i < (int) queue_results.size(); i++)
            {
                if (queue_results[i].id == task_id)
                {
                    assert(queue_results[i].multitask_id == -1);
                    task_result res = queue_results[i];
                    queue_results.erase(queue_results.begin() + i);
                    return res;
                }
            }
            // Nothing for this task yet. Sleep until send() queues another
            // result instead of spinning while the queue only holds results
            // for other tasks; the scan above runs again after every wake-up.
            condition_results.wait(lock);
        }
        // should never reach here
    }
    // Register the function to update multitask
    void on_multitask_update(callback_multitask_t callback) {
        callback_update_multitask = callback;
    }
    // Send a new result to a waiting task_id. A result whose id is not in the
    // waiting set is not queued.
    void send(task_result result) {
        std::unique_lock<std::mutex> lock(mutex_results);
        LOG_VERBOSE("send new result", {{"task_id", result.id}});
        for (auto& task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }
            if (result.id == task_id)
            {
                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
                queue_results.push_back(result);
                // Wake every receiver; each one re-checks for its own task id
                condition_results.notify_all();
                return;
            }
        }
    }
 };
 //
 // base64 utils (TODO: move to common in the future)
 //
 // Base64 alphabet used by base64_decode: a character's index in this string
 // (looked up with find()) is the 6-bit value it encodes.
 static const std::string base64_chars =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             "abcdefghijklmnopqrstuvwxyz"
             "0123456789+/";
 static inline bool is_base64(uint8_t c)
 {
    // True for letters, digits, '+' and '/'. The '=' padding character is
    // deliberately not accepted here.
    const bool is_symbol = (c == '+') || (c == '/');
    return is_symbol || isalnum(c);
 }
 static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 {
    // Decodes base64 text into raw bytes. Scanning stops at the first '=' or at
    // the first character that is_base64() rejects; whatever was decoded up to
    // that point is returned.
    std::vector<uint8_t> decoded;
    uint8_t sextets[4];
    uint8_t octets[3];

    // Replaces the characters held in sextets by their index in base64_chars,
    // then repacks the four 6-bit values into the three bytes of octets.
    const auto unpack_group = [&]()
    {
        for (int k = 0; k < 4; k++)
        {
            sextets[k] = base64_chars.find(sextets[k]);
        }
        octets[0] = ((sextets[0]      ) << 2) + ((sextets[1] & 0x30) >> 4);
        octets[1] = ((sextets[1] & 0xf) << 4) + ((sextets[2] & 0x3c) >> 2);
        octets[2] = ((sextets[2] & 0x3) << 6) +   sextets[3];
    };

    const int total = encoded_string.size();
    int pending = 0; // characters collected for the current group of four
    for (int pos = 0; pos < total; pos++)
    {
        const char ch = encoded_string[pos];
        if (ch == '=' || !is_base64(ch))
        {
            break;
        }
        sextets[pending++] = ch;
        if (pending == 4)
        {
            unpack_group();
            for (int k = 0; k < 3; k++)
            {
                decoded.push_back(octets[k]);
            }
            pending = 0;
        }
    }

    if (pending)
    {
        // incomplete trailing group: zero the unused slots, unpack, and keep
        // only the pending - 1 bytes that are fully determined by the input
        for (int k = pending; k < 4; k++)
        {
            sextets[k] = 0;
        }
        unpack_group();
        for (int k = 0; k < pending - 1; k++)
        {
            decoded.push_back(octets[k]);
        }
    }
    return decoded;
 }
 //
 // random string / id
 //
 static std::string random_string()
 {
    // Returns 32 characters drawn from [0-9A-Za-z]. A new mt19937 engine is
    // seeded from std::random_device on every call.
    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::string id(32, ' ');
    for (char & slot : id)
    {
        slot = alphabet[engine() % alphabet.size()];
    }
    return id;
 }
 static std::string gen_chatcmplid()
 {
    // "chatcmpl-" followed by the 32-character string from random_string()
    return "chatcmpl-" + random_string();
 }
 //
 // other common utils
 //
 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
 {
    // Number of leading positions at which a and b hold equal tokens
    // (0 when either vector is empty or the first tokens differ).
    const size_t limit = a.size() < b.size() ? a.size() : b.size();
    size_t matched = 0;
    while (matched < limit && a[matched] == b[matched])
    {
        matched++;
    }
    return matched;
 }
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
    // True when the last suffix.size() characters of str equal suffix;
    // an empty suffix always matches.
    if (suffix.size() > str.size())
    {
        return false;
    }
    const size_t tail_start = str.size() - suffix.size();
    return str.compare(tail_start, std::string::npos, suffix) == 0;
 }
 static size_t find_partial_stop_string(const std::string &stop, const std::string &text)
 {
    // Looks for the longest prefix of stop that text ends with and returns the
    // offset in text where that prefix begins; std::string::npos when there is
    // none or when either string is empty.
    if (text.empty() || stop.empty())
    {
        return std::string::npos;
    }

    const char tail = text.back();
    // longest candidate prefix first
    for (size_t prefix_len = stop.size(); prefix_len > 0; prefix_len--)
    {
        // a prefix can only close the text if its last character is the text's last character
        if (stop[prefix_len - 1] != tail)
        {
            continue;
        }
        if (ends_with(text, stop.substr(0, prefix_len)))
        {
            return text.size() - prefix_len;
        }
    }
    return std::string::npos;
 }
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 {
    // Concatenates llama_token_to_piece(ctx, token) for every token in [begin, end).
    std::string text;
    for (Iter it = begin; it != end; ++it)
    {
        text += llama_token_to_piece(ctx, *it);
    }
    return text;
 }
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
    // Text for one token; token == -1 yields an empty string.
    std::string piece;
    if (token != -1)
    {
        piece = llama_token_to_piece(ctx, token);
    }

    // A one-byte piece with its top bit set is treated as a partial multibyte
    // character; anything else is returned unchanged.
    const bool lone_high_byte = piece.size() == 1 && (piece[0] & 0x80) == 0x80;
    if (!lone_high_byte)
    {
        return piece;
    }

    // render the byte as lowercase hex behind a literal "byte: \x" prefix
    std::stringstream hex;
    hex << std::hex << (piece[0] & 0xff);
    return "byte: \\x" + hex.str();
 }
 // convert a vector of completion_token_output to json
 static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
 {
    // Produces a JSON array with one object per entry of probs:
    //   {"content": <entry token text>, "probs": [{"tok_str": ..., "prob": ...}, ...]}
    // Token text comes from tokens_to_output_formatted_string().
    json out = json::array();
    for (const auto &entry : probs)
    {
        json candidates = json::array();
        for (const auto &cand : entry.probs)
        {
            candidates.push_back(json
            {
                {"tok_str", tokens_to_output_formatted_string(ctx, cand.tok)},
                {"prob",    cand.prob},
            });
        }
        out.push_back(json{
            {"content", tokens_to_output_formatted_string(ctx, entry.tok)},
            {"probs",   candidates},
        });
    }
    return out;
 }
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -14,7 +14,7 @@ init_vars() {
    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS=""
-    CMAKE_TARGETS="--target ollama_llama_server"
+    CMAKE_TARGETS="--target ext_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
@@ -39,7 +39,7 @@ init_vars() {
    *)
        ;;
    esac
-    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
+    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then 
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
 }
@@ -61,8 +61,8 @@ git_module_setup() {
 apply_patches() {
    # Wire up our CMakefile
-    if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
+    if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
-        echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
+        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
    fi
    if [ -n "$(ls -A ../patches/*.diff)" ]; then
@@ -76,29 +76,35 @@ apply_patches() {
            (cd ${LLAMACPP_DIR} && git apply ${patch})
        done
    fi
    # Avoid duplicate main symbols when we link into the cgo binary
    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
        mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
 }
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
    mkdir -p ${BUILD_DIR}/lib/
    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
        ${GCC_ARCH} \
        ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
        ${BUILD_DIR}/common/libcommon.a \
        ${BUILD_DIR}/libllama.a \
        -Wl,-rpath,\$ORIGIN \
        -lpthread -ldl -lm \
        ${EXTRA_LIBS}
 }
-compress() {
+compress_libs() {
    echo "Compressing payloads to reduce overall binary size..."
    pids=""
-    rm -rf ${BUILD_DIR}/bin/*.gz
+    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
-    for f in ${BUILD_DIR}/bin/* ; do
+    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
-        gzip -n --best -f ${f} &
+        gzip -n --best -f ${lib} &
        pids+=" $!"
    done
-    # check for lib directory
+    echo 
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
            gzip -n --best -f ${f} &
            pids+=" $!"
        done
    fi
    echo
    for pid in ${pids}; do
        wait $pid
    done
@@ -107,7 +113,7 @@ compress() {
 # Keep the local tree clean after we're done with the build
 cleanup() {
-    (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
+    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
    if [ -n "$(ls -A ../patches/*.diff)" ]; then
        for patch in ../patches/*.diff; do
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,31 +18,34 @@ sign() {
    fi
 }
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
 bundle_metal() {
    grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
    echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
    cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
    cat  "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
    rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
 }
 cleanup_metal() {
    (cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
 }
 COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
 case "${GOARCH}" in
 "amd64")
    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
    # Static build for linking into the Go binary
    init_vars
    CMAKE_TARGETS="--target llama --target ggml"
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="../build/darwin/${ARCH}_static"
    echo "Building static library"
    build
    #
    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
    echo "Building LCD CPU"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
-    compress
+    compress_libs
    #
    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@@ -50,11 +53,11 @@ case "${GOARCH}" in
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
    echo "Building AVX CPU"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
-    compress
+    compress_libs
    #
    # ~2013 CPU Dynamic library
@@ -62,30 +65,22 @@ case "${GOARCH}" in
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
    echo "Building AVX2 CPU"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
-    compress
+    compress_libs
    ;;
 "arm64")
-
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
-    # Static build for linking into the Go binary
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    init_vars
    CMAKE_TARGETS="--target llama --target ggml"
    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="../build/darwin/${ARCH}_static"
    echo "Building static library"
    build
    init_vars
    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
    BUILD_DIR="../build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
    bundle_metal
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
-    compress
+    compress_libs
    cleanup_metal
    ;;
 *)
    echo "GOARCH must be set"
@@ -95,4 +90,3 @@ case "${GOARCH}" in
 esac
 cleanup
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -26,9 +26,6 @@ amdGPUs() {
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx940"
        "gfx941"
        "gfx942"
        "gfx1010"
        "gfx1012"
        "gfx1030"
@@ -57,31 +54,16 @@ init_vars
 git_module_setup
 apply_patches
 init_vars
 if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
    # Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
    # Enables optimized Dockerfile builds using a blanket skip and targeted overrides
    # Static build for linking into the Go binary
    init_vars
    CMAKE_TARGETS="--target llama --target ggml"
    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}_static"
    echo "Building static library"
    build
 fi
 init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
-        compress
+        compress_libs
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@@ -98,43 +80,37 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
-            compress
+            compress_libs
        fi
-        if [ "${ARCH}" == "x86_64" ]; then
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
            #
-            # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
+            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
            # Approximately 400% faster than LCD on same CPU
            #
-            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+            init_vars
-                #
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
-                # Approximately 400% faster than LCD on same CPU
+            echo "Building AVX CPU"
-                #
+            build
-                init_vars
+            compress_libs
-                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        fi
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
                compress
            fi
-            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-                #
+            #
-                # ~2013 CPU Dynamic library
+            # ~2013 CPU Dynamic library
-                # Approximately 10% faster than AVX on same CPU
+            # Approximately 10% faster than AVX on same CPU
-                #
+            #
-                init_vars
+            init_vars
-                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
-                echo "Building AVX2 CPU"
+            echo "Building AVX2 CPU"
-                build
+            build
-                compress
+            compress_libs
            fi
        fi
    fi
 else
@@ -163,38 +139,29 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
-    if [ "${ARCH}" == "arm64" ]; then
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-        echo "ARM CPU detected - disabling unsupported AVX instructions"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
        #
        # CUDA compute < 6.0 lacks proper FP16 support on ARM. 
        # Disabling has minimal performance effect while maintaining compatibility. 
        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
    fi
    CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build
-    # Carry the CUDA libs as payloads to help reduce dependency burden on users
+    # Cary the CUDA libs as payloads to help reduce dependency burden on users
    #
    # TODO - in the future we may shift to packaging these separately and conditionally
    #        downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
+    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
    for lib in libcudart.so libcublas.so libcublasLt.so ; do
        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
+            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
        else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
+            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
        fi
    done
-    compress
+    compress_libs
 fi
@@ -217,24 +184,21 @@ if [ -d "${ROCM_PATH}" ]; then
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build
    # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/bin/deps.txt"
+    rm -f "${BUILD_DIR}/lib/deps.txt"
-    touch "${BUILD_DIR}/bin/deps.txt"
+    touch "${BUILD_DIR}/lib/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
+
-        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
+    # having the execstack bit set on the HIP runtime sometimes causes `ldd` to error
    execstack -c "${ROCM_PATH}/lib/libamdhip64.so*"
    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
    done
-    # bomb out if for some reason we didn't get a few deps
+    compress_libs
    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
        cat "${BUILD_DIR}/bin/deps.txt"
        echo "ERROR: deps file short"
        exit 1
    fi
    compress
 fi
 cleanup
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -13,9 +13,6 @@ function amdGPUs {
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx940"
        "gfx941"
        "gfx942"
        "gfx1010"
        "gfx1012"
        "gfx1030"
@@ -27,13 +24,19 @@ function amdGPUs {
 }
 function init_vars {
    # Verify the environment is a Developer Shell for MSVC 2019
    write-host $env:VSINSTALLDIR
    if (($env:VSINSTALLDIR -eq $null)) {
        Write-Error "`r`nBUILD ERROR - YOUR DEVELOPMENT ENVIRONMENT IS NOT SET UP CORRECTLY`r`nTo build Ollama you must run from an MSVC Developer Shell`r`nSee .\docs\development.md for instructions to set up your dev environment"
        exit 1
    }
    $script:SRC_DIR = $(resolve-path "..\..\")
    $script:llamacppDir = "../llama.cpp"
    $script:cmakeDefs = @(
        "-DBUILD_SHARED_LIBS=on",
        "-DLLAMA_NATIVE=off"
        )
-    $script:cmakeTargets = @("ollama_llama_server")
+    $script:cmakeTargets = @("ext_server")
    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -62,12 +65,8 @@ function init_vars {
    } else {
        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
    }
-    # Note: Windows Kits 10 signtool crashes with GCP's plugin
+    # Note: 10 Windows Kit signtool crashes with GCP's plugin
-    if ($null -eq $env:SIGN_TOOL) {
+    ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
        ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
    } else {
        ${script:SignTool}=${env:SIGN_TOOL}
    }
    if ("${env:KEY_CONTAINER}") {
        ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
    }
@@ -83,8 +82,8 @@ function git_module_setup {
 function apply_patches {
    # Wire up our CMakefile
-    if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
+    if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
-        Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
+        Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
    }
    # Apply temporary patches until fix is upstream
@@ -97,15 +96,22 @@ function apply_patches {
        }
        # Checkout each file
        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {
-            git -C "${script:llamacppDir}" checkout $file
+            git checkout $file
        }
    }
    # Apply each patch
    foreach ($patch in $patches) {
-        git -C "${script:llamacppDir}" apply $patch.FullName
+        Set-Location -Path ${script:llamacppDir}
        git apply $patch.FullName
    }
    # Avoid duplicate main symbols when we link into the cgo binary
    $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
    $content = $content -replace 'int main\(', 'int __main('
    Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
 }
 function build {
@@ -113,20 +119,26 @@ function build {
    & cmake --version
    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
+    write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    # Rearrange output to be consistent between different generators
+}
-    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
+
-        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
+function install {
-        remove-item "${script:buildDir}/bin/${script:config}"
+    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
    md "${script:buildDir}/lib" -ea 0 > $null
    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
    # Display the dll dependencies in the build log
    if ($script:DUMPBIN -ne $null) {
        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
 }
 function sign {
    if ("${env:KEY_CONTAINER}") {
-        write-host "Signing ${script:buildDir}/bin/*.exe  ${script:buildDir}/bin/*.dll"
+        write-host "Signing ${script:buildDir}/lib/*.dll"
-        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
+        foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -134,20 +146,14 @@ function sign {
    }
 }
-function compress {
+function compress_libs {
    if ($script:GZIP -eq $null) {
        write-host "gzip not installed, not compressing files"
        return
    }
    write-host "Compressing binaries..."
    $binaries = dir "${script:buildDir}/bin/*.exe"
    foreach ($file in $binaries) {
        & "$script:GZIP" --best -f $file
    }
    write-host "Compressing dlls..."
-    $dlls = dir "${script:buildDir}/bin/*.dll"
+    $libs = dir "${script:buildDir}/lib/*.dll"
-    foreach ($file in $dlls) {
+    foreach ($file in $libs) {
        & "$script:GZIP" --best -f $file
    }
 }
@@ -162,11 +168,14 @@ function cleanup {
        }
        # Checkout each file
        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {            
-            git -C "${script:llamacppDir}" checkout $file
+            git checkout $file
        }
        git -C "${script:llamacppDir}" checkout CMakeLists.txt
    }
    Set-Location "${script:llamacppDir}/examples/server"
    git checkout CMakeLists.txt server.cpp
 }
 init_vars
@@ -174,64 +183,38 @@ git_module_setup
 apply_patches
 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
 # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
 # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
 if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
 # GCC build for direct linking into the Go binary
 init_vars
-# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
+$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-# as we need this to be compiled by gcc for golang to be able to link with itx
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
-write-host "Checking for MinGW..."
+write-host "Building LCD CPU"
 # error action ensures we exit on failure
 get-command gcc
 get-command mingw32-make
 $script:cmakeTargets = @("llama", "ggml")
 $script:cmakeDefs = @(
    "-G", "MinGW Makefiles"
    "-DCMAKE_C_COMPILER=gcc.exe",
    "-DCMAKE_CXX_COMPILER=g++.exe",
    "-DBUILD_SHARED_LIBS=off",
    "-DLLAMA_NATIVE=off",
    "-DLLAMA_AVX=off",
    "-DLLAMA_AVX2=off",
    "-DLLAMA_AVX512=off",
    "-DLLAMA_F16C=off",
    "-DLLAMA_FMA=off")
 $script:buildDir="../build/windows/${script:ARCH}_static"
 write-host "Building static library"
 build
 install
 sign
 compress_libs
-# remaining llama.cpp builds use MSVC 
+init_vars
-    init_vars
+$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
-    $script:buildDir="../build/windows/${script:ARCH}/cpu"
+write-host "Building AVX CPU"
-    write-host "Building LCD CPU"
+build
-    build
+install
-    sign
+sign
-    compress
+compress_libs
-    init_vars
+init_vars
-    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
-    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
-    write-host "Building AVX CPU"
+write-host "Building AVX2 CPU"
-    build
+build
-    sign
+install
-    compress
+sign
-
+compress_libs
    init_vars
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
    write-host "Building AVX2 CPU"
    build
    sign
    compress
 } else {
    write-host "Skipping CPU generation step as requested"
 }
 if ($null -ne $script:CUDA_LIB_DIR) {
    # Then build cuda as a dynamically loaded library
@@ -241,11 +224,13 @@ if ($null -ne $script:CUDA_LIB_DIR) {
        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }
    init_vars
-    $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
    write-host "Building CUDA"
    build
    install
    sign
-    compress
+    compress_libs
 }
 if ($null -ne $env:HIP_PATH) {
@@ -255,13 +240,12 @@ if ($null -ne $env:HIP_PATH) {
    }
    init_vars
-    $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
    $script:cmakeDefs += @(
        "-G", "Ninja", 
        "-DCMAKE_C_COMPILER=clang.exe",
        "-DCMAKE_CXX_COMPILER=clang++.exe",
        "-DLLAMA_HIPBLAS=on",
        "-DHIP_PLATFORM=amd",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=off",
        "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
@@ -270,7 +254,7 @@ if ($null -ne $env:HIP_PATH) {
        )
    # Make sure the ROCm binary dir is first in the path
-    $env:PATH="$env:HIP_PATH\bin;$env:PATH"
+    $env:PATH="$env:HIP_PATH\bin;$env:VSINSTALLDIR\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja;$env:PATH"
    # We have to clobber the LIB var from the developer shell for clang to work properly
    $env:LIB=""
@@ -279,13 +263,13 @@ if ($null -ne $env:HIP_PATH) {
    build
    # Ninja doesn't prefix with config name
    ${script:config}=""
    install
    if ($null -ne $script:DUMPBIN) {
-        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
    sign
-    compress
+    compress_libs
 }
 cleanup
-write-host "`ngo generate completed.  LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
+write-host "`ngo generate completed"
--- a/llm/generate/generate_darwin.go
+++ b/llm/generate/generate_darwin.go
@@ -1,3 +1,3 @@
 package generate
-//go:generate bash ./gen_darwin.sh
+//go:generate sh ./gen_darwin.sh
--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -7,18 +7,16 @@ import (
 	"slices"
 )
-type containerGGLA struct {
+type ContainerGGLA struct {
 	version uint32
 }
-func (c *containerGGLA) Name() string {
+func (c *ContainerGGLA) Name() string {
 	return "ggla"
 }
-func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
+func (c *ContainerGGLA) Decode(rso *readSeekOffset) (model, error) {
-	if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
+	binary.Read(rso, binary.LittleEndian, &c.version)
 		return nil, err
 	}
 	switch c.version {
 	case 1:
@@ -26,66 +24,58 @@ func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
 		return nil, errors.New("invalid version")
 	}
-	model := newGGLA(c)
+	model := newModelGGLA(c)
-	err := model.decode(rs)
+	err := model.decode(rso)
 	return model, err
 }
-type ggla struct {
+type ModelGGLA struct {
-	*containerGGLA
+	*ContainerGGLA
 	kv      KV
-	tensors []*Tensor
+	tensors []Tensor
 }
-func newGGLA(container *containerGGLA) *ggla {
+func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
-	return &ggla{
+	return &ModelGGLA{
-		containerGGLA: container,
+		ContainerGGLA: container,
 		kv:            make(KV),
 	}
 }
-func (llm *ggla) KV() KV {
+func (m *ModelGGLA) decode(rso *readSeekOffset) error {
 	return llm.kv
 }
 func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 func (llm *ggla) decode(rs io.ReadSeeker) error {
 	var r uint32
-	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
+	if err := binary.Read(rso, binary.LittleEndian, &r); err != nil {
 		return err
 	}
-	llm.kv["r"] = r
+	m.kv["r"] = r
 	var alpha uint32
-	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
+	if err := binary.Read(rso, binary.LittleEndian, &alpha); err != nil {
 		return err
 	}
-	llm.kv["alpha"] = alpha
+	m.kv["alpha"] = alpha
 	for {
 		var dims uint32
-		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
+		if err := binary.Read(rso, binary.LittleEndian, &dims); err != nil {
 			return err
 		}
 		var namesize uint32
-		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
+		if err := binary.Read(rso, binary.LittleEndian, &namesize); err != nil {
 			return err
 		}
 		var t Tensor
-		if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
+		if err := binary.Read(rso, binary.LittleEndian, &t.Kind); err != nil {
 			return err
 		}
 		t.Shape = make([]uint64, dims)
 		for i := 0; uint32(i) < dims; i++ {
 			var shape32 uint32
-			if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
+			if err := binary.Read(rso, binary.LittleEndian, &shape32); err != nil {
 				return err
 			}
@@ -97,32 +87,66 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 		slices.Reverse(t.Shape)
 		name := make([]byte, namesize)
-		if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
+		if err := binary.Read(rso, binary.LittleEndian, &name); err != nil {
 			return err
 		}
 		t.Name = string(name)
-		offset, err := rs.Seek(0, io.SeekCurrent)
+		if _, err := rso.Seek((rso.offset+31)&-32, io.SeekStart); err != nil {
 		if err != nil {
 			return err
 		}
-		if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
+		t.Offset = uint64(rso.offset)
 		if _, err := rso.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
 			return err
 		}
-		offset, err = rs.Seek(0, io.SeekCurrent)
+		m.tensors = append(m.tensors, t)
 		if err != nil {
 			return err
 		}
 		t.Offset = uint64(offset)
 		if _, err := rs.Seek(int64(t.size()), io.SeekCurrent); err != nil {
 			return err
 		}
 		llm.tensors = append(llm.tensors, &t)
 	}
 }
 func (m *ModelGGLA) KV() KV {
 	return m.kv
 }
 func (m *ModelGGLA) Tensor() []Tensor {
 	return m.tensors
 }
 func (*ModelGGLA) ModelFamily() string {
 	return "ggla"
 }
 func (*ModelGGLA) ModelType() string {
 	panic("not implemented")
 }
 func (*ModelGGLA) FileType() string {
 	panic("not implemented")
 }
 func (*ModelGGLA) NumLayers() uint32 {
 	panic("not implemented")
 }
 func (*ModelGGLA) NumGQA() uint32 {
 	panic("not implemented")
 }
 func (*ModelGGLA) NumEmbed() uint32 {
 	panic("not implemented")
 }
 func (*ModelGGLA) NumHead() uint32 {
 	panic("not implemented")
 }
 func (*ModelGGLA) NumHeadKv() uint32 {
 	panic("not implemented")
 }
 func (*ModelGGLA) NumCtx() uint32 {
 	panic("not implemented")
 }
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -3,14 +3,14 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"strings"
 )
 type GGML struct {
 	container
 	model
 	Size int64
 }
 const (
@@ -90,184 +90,20 @@ func fileType(fileType uint32) string {
 }
 type model interface {
-	KV() KV
+	ModelFamily() string
-	Tensors() Tensors
+	ModelType() string
-}
+	FileType() string
-
+	NumLayers() uint32
-type KV map[string]any
+	NumGQA() uint32
-
+	NumEmbed() uint32
-func (kv KV) u64(key string) uint64 {
+	NumHead() uint32
-	switch v := kv[key].(type) {
+	NumHeadKv() uint32
-	case uint64:
+	NumCtx() uint32
 		return v
 	case uint32:
 		return uint64(v)
 	case float64:
 		return uint64(v)
 	default:
 		return 0
 	}
 }
 func (kv KV) Architecture() string {
 	if s, ok := kv["general.architecture"].(string); ok {
 		return s
 	}
 	return "unknown"
 }
 func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
 func (kv KV) FileType() string {
 	if u64 := kv.u64("general.file_type"); u64 > 0 {
 		return fileType(uint32(u64))
 	}
 	return "unknown"
 }
 func (kv KV) BlockCount() uint64 {
 	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
 }
 func (kv KV) HeadCount() uint64 {
 	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
 }
 func (kv KV) HeadCountKV() uint64 {
 	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
 		return headCountKV
 	}
 	return 1
 }
 func (kv KV) GQA() uint64 {
 	return kv.HeadCount() / kv.HeadCountKV()
 }
 func (kv KV) EmbeddingLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
 }
 func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 type Tensors []*Tensor
 func (ts Tensors) Layers() map[string]Layer {
 	layers := make(map[string]Layer)
 	for _, t := range ts {
 		parts := strings.Split(t.Name, ".")
 		if parts[0] == "blk" {
 			// join first and second part, e.g. blk.%d
 			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
 		}
 		if _, ok := layers[parts[0]]; !ok {
 			layers[parts[0]] = make(Layer)
 		}
 		layers[parts[0]][strings.Join(parts[1:], ".")] = t
 	}
 	return layers
 }
 type Layer map[string]*Tensor
 func (l Layer) size() (size uint64) {
 	for _, t := range l {
 		size += t.size()
 	}
 	return size
 }
 type Tensor struct {
 	Name   string `json:"name"`
 	Kind   uint32 `json:"kind"`
 	Offset uint64 `json:"-"`
 	// Shape is the number of elements in each dimension
 	Shape []uint64 `json:"shape"`
 	io.WriterTo `json:"-"`
 }
 func (t Tensor) blockSize() uint64 {
 	switch {
 	case t.Kind < 2:
 		return 1
 	case t.Kind < 10:
 		return 32
 	default:
 		return 256
 	}
 }
 func (t Tensor) typeSize() uint64 {
 	blockSize := t.blockSize()
 	switch t.Kind {
 	case 0: // FP32
 		return 4
 	case 1: // FP16
 		return 2
 	case 2: // Q4_0
 		return 2 + blockSize/2
 	case 3: // Q4_1
 		return 2 + 2 + blockSize/2
 	case 6: // Q5_0
 		return 2 + 4 + blockSize/2
 	case 7: // Q5_1
 		return 2 + 2 + 4 + blockSize/2
 	case 8: // Q8_0
 		return 2 + blockSize
 	case 9: // Q8_1
 		return 4 + 4 + blockSize
 	case 10: // Q2_K
 		return blockSize/16 + blockSize/4 + 2 + 2
 	case 11: // Q3_K
 		return blockSize/8 + blockSize/4 + 12 + 2
 	case 12: // Q4_K
 		return 2 + 2 + 12 + blockSize/2
 	case 13: // Q5_K
 		return 2 + 2 + 12 + blockSize/8 + blockSize/2
 	case 14: // Q6_K
 		return blockSize/2 + blockSize/4 + blockSize/16 + 2
 	case 15: // Q8_K
 		return 2 + blockSize + 2*blockSize/16
 	case 16: // IQ2_XXS
 		return 2 + 2*blockSize/8
 	case 17: // IQ2_XS
 		return 2 + 2*blockSize/8 + blockSize/32
 	case 18: // IQ3_XXS
 		return 2 + 3*blockSize/8
 	default:
 		return 0
 	}
 }
 func (t Tensor) parameters() uint64 {
 	var count uint64 = 1
 	for _, n := range t.Shape {
 		count *= n
 	}
 	return count
 }
 func (t Tensor) size() uint64 {
 	return t.parameters() * t.typeSize() / t.blockSize()
 }
 type container interface {
 	Name() string
-	Decode(io.ReadSeeker) (model, error)
+	Decode(*readSeekOffset) (model, error)
 }
 const (
@@ -286,108 +122,60 @@ const (
 var ErrUnsupportedFormat = errors.New("unsupported model format")
-func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
+func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	ro := readSeekOffset{ReadSeeker: r}
 	var magic uint32
-	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
+	if err := binary.Read(&ro, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
+		return nil, err
 	}
 	var c container
 	switch magic {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
+		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
+		c = &ContainerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian}
+		c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian}
+		c = &ContainerGGUF{ByteOrder: binary.BigEndian}
 	default:
-		return nil, 0, errors.New("invalid file magic")
+		return nil, errors.New("invalid file magic")
 	}
-	model, err := c.Decode(rs)
+	model, err := c.Decode(&ro)
 	if errors.Is(err, io.EOF) {
 		// noop
 	} else if err != nil {
-		return nil, 0, err
+		return nil, err
 	}
 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
 		return nil, 0, err
 	}
 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-	}, offset, nil
+		Size:      ro.offset,
 	}, nil
 }
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+type readSeekOffset struct {
-	embedding := llm.KV().EmbeddingLength()
+	io.ReadSeeker
-	heads := llm.KV().HeadCount()
+	offset int64
-	headsKV := llm.KV().HeadCountKV()
+}
 	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
-	layers := llm.Tensors().Layers()
+func (rso *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
-
+	offset, err := rso.ReadSeeker.Seek(offset, whence)
-	switch llm.KV().Architecture() {
+	if err != nil {
-	case "llama":
+		return 0, err
 		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
 		if ffnGateWeight, ok := layers["0"]["ffn_gate.0.weight"]; ok {
 			ffnGateWeight1 := ffnGateWeight.Shape[1]
 			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
 			partialOffload = max(
 				4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
 				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 			)
 		}
 	case "gemma":
 		fullOffload = 4 * batch * (embedding + vocab)
 		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
 	case "command-r":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(2+4*embedding+context*(1+heads)),
 		)
 		partialOffload = max(
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
 		)
 	case "qwen2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(1+2*embedding+context+context*heads),
 		)
 		partialOffload = max(
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
 		)
 	case "phi2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(1+4*embedding+context+context*heads),
 		)
 		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	case "stablelm":
 		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
 		partialOffload = max(
 			4*batch*(vocab+2*embedding),
 			fullOffload,
 		)
 	}
-	return
+	rso.offset = offset
 	return offset, nil
 }
 func (rso *readSeekOffset) Read(p []byte) (int, error) {
 	n, err := rso.ReadSeeker.Read(p)
 	rso.offset += int64(n)
 	return n, err
 }
--- a/llm/gguf.go
+++ b/llm/gguf.go
--- a/Show More
+++ b/Show More
`@@ -1,3 +1,3 @@`
	`package generate`	`package generate`

	`//go:generate bash ./gen_darwin.sh`	`//go:generate sh ./gen_darwin.sh`