Compare commits

..

6 Commits

Author SHA1 Message Date
Bruce MacDonald
c2b11611a8 Update new_runner_benchmark_test.go 2025-01-29 10:34:09 -08:00
Bruce MacDonald
90698c7d15 benchmark: new Go runner 2025-01-28 14:55:03 -08:00
Jesse Gross
4b4a5a28bf new runner 2025-01-27 13:47:13 -08:00
Jesse Gross
3c95c21ddf tensor loading iface 2025-01-23 16:04:13 -08:00
Michael Yang
8ab13e4d3e next 2025-01-22 22:10:31 -08:00
Michael Yang
144f63e2fb next build 2025-01-22 22:08:29 -08:00
64 changed files with 4211 additions and 1526 deletions

View File

@@ -3,9 +3,7 @@ ollama
app
macapp
dist
build
.env
.cache
test_data
.git
llama/build

View File

@@ -1,62 +1,31 @@
name: release
env:
ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
on:
push:
tags:
- 'v*'
jobs:
setup-environment:
runs-on: ubuntu-latest
environment: release
outputs:
GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }}
steps:
- uses: actions/checkout@v4
- name: Set environment
id: goflags
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
darwin-build:
# Full build of the Mac assets
build-darwin:
runs-on: macos-13
environment: release
needs: setup-environment
strategy:
matrix:
os: [darwin]
arch: [amd64, arm64]
env:
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
- run: |
go build -o dist/ .
- name: Set Version
shell: bash
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: key
env:
GOOS: ${{ matrix.os }}
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: 1
CGO_CPPFLAGS: '-mmacosx-version-min=11.3'
- if: matrix.arch == 'amd64'
MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
run: |
cmake --preset CPU -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64
cmake --build --parallel --preset CPU
cmake --install build --component CPU --strip --parallel 8
- uses: actions/upload-artifact@v4
with:
name: build-${{ matrix.os }}-${{ matrix.arch }}
path: dist/*
darwin-sign:
runs-on: macos-13
environment: release
needs: darwin-build
steps:
- uses: actions/checkout@v4
- run: |
echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
security create-keychain -p password build.keychain
security default-keychain -s build.keychain
@@ -64,20 +33,11 @@ jobs:
security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
security set-keychain-settings -lut 3600 build.keychain
env:
MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
- uses: actions/download-artifact@v4
- uses: actions/setup-go@v5
with:
name: build-darwin-amd64
path: dist/darwin-amd64
- uses: actions/download-artifact@v4
with:
name: build-darwin-arm64
path: dist/darwin-arm64
- run: |
export VERSION=${GITHUB_REF_NAME#v}
./scripts/build_darwin.sh macapp sign
go-version-file: go.mod
cache: true
- name: Build Darwin
env:
APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
@@ -85,238 +45,481 @@ jobs:
APPLE_ID: ${{ vars.APPLE_ID }}
SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
run: |
./scripts/build_darwin.sh
- uses: actions/upload-artifact@v4
with:
name: dist-darwin
path: |
dist/Ollama-darwin.zip
dist/ollama-darwin.tgz
dist/ollama-darwin
windows-depends:
strategy:
matrix:
os: [windows]
arch: [amd64]
preset: ['CPU']
include:
- os: windows
arch: amd64
preset: 'CUDA 11'
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
cuda-version: '11.3'
- os: windows
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
cuda-version: '12.4'
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
rocm-version: '6.1'
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
# Windows builds take a long time to both install the dependencies and build, so parallelize
# CPU generation step
generate-windows-cpu:
environment: release
runs-on: windows
env:
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- name: Install system dependencies
run: |
choco install -y --no-progress ccache ninja
ccache -o cache_dir=${{ github.workspace }}\.ccache
- if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ')
id: cache-install
uses: actions/cache/restore@v4
with:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
key: ${{ matrix.install }}
- if: startsWith(matrix.preset, 'CUDA ')
name: Install CUDA ${{ matrix.cuda-version }}
run: |
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- if: startsWith(matrix.preset, 'ROCm')
name: Install ROCm ${{ matrix.rocm-version }}
run: |
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList '-install' -NoNewWindow -Wait
}
$hipPath = (Resolve-Path "C:\Program Files\AMD\ROCm\*").path
echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
uses: actions/cache/save@v4
with:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
key: ${{ matrix.install }}
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
- name: Build target "${{ matrix.preset }}"
- name: Set make jobs default
run: |
Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}"
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
CMAKE_GENERATOR: Ninja
- uses: actions/upload-artifact@v4
with:
name: depends-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
path: dist\*
windows-build:
strategy:
matrix:
os: [windows]
arch: [amd64, arm64]
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: [setup-environment]
env:
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
steps:
- name: Install system dependencies
echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: Add msys paths
run: |
$ErrorActionPreference = "Stop"
if ("${{ matrix.arch }}" -eq 'amd64') {
Start-Process "C:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
echo "C:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
} elseif ("${{ matrix.arch }}" -eq 'arm64') {
Set-ExecutionPolicy Bypass -Scope Process -Force
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
choco install -y --no-progress git gzip
echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip"
Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip -DestinationPath "C:\Program Files\"
$installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt-aarch64").path
echo $installPath\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
}
- uses: actions/checkout@v4
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: |
go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
- run: |
$env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1'
& .\scripts\build_windows.ps1 buildApp
env:
VCToolsRedistDir: stub
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make dist
name: make
- uses: actions/upload-artifact@v4
with:
name: build-${{ matrix.os }}-${{ matrix.arch }}
name: generate-windows-cpu
path: |
dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe
dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
dist/windows-amd64/**
windows-sign:
runs-on: windows
# ROCm generation step
generate-windows-rocm:
environment: release
needs: [windows-depends, windows-build]
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- uses: google-github-actions/auth@v2
- name: Set make jobs default
run: |
echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: Add msys paths
run: |
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- uses: actions/setup-go@v5
with:
project_id: ollama
credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}
- run: |
go-version-file: go.mod
cache: true
# ROCM installation steps
- name: 'Cache ROCm installer'
id: cache-rocm
uses: actions/cache@v4
with:
path: rocm-install.exe
key: ${{ env.ROCM_WINDOWS_URL }}
- name: 'Conditionally Download ROCm'
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe"
Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip"
Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\"
& "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet
echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt
- uses: actions/download-artifact@v4
Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
- name: 'Install ROCm'
run: |
Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
- name: 'Verify ROCm'
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: make rocm runner
run: |
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make help-runners
make dist_rocm
- uses: actions/upload-artifact@v4
with:
name: build-windows-*
path: dist\
merge-multiple: true
- uses: actions/download-artifact@v4
name: generate-windows-rocm
path: |
dist/windows-amd64/**
# CUDA generation step
generate-windows-cuda:
environment: release
runs-on: windows
strategy:
matrix:
cuda:
- version: "11.3"
url: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
- version: "12.4"
url: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set make jobs default
run: |
echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- uses: actions/setup-go@v5
with:
name: depends-windows-amd64-*
path: dist\windows-amd64\
merge-multiple: true
go-version-file: go.mod
cache: true
# CUDA installation steps
- name: 'Cache CUDA installer'
id: cache-cuda
uses: actions/cache@v4
with:
path: cuda-install.exe
key: ${{ matrix.cuda.url }}
- name: 'Conditionally Download CUDA'
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "cuda-install.exe"
- name: 'Install CUDA'
run: |
$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ matrix.cuda.version }}"}
Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
- name: 'Verify CUDA'
run: |
& (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
$cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
$cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: make cuda runner
run: |
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cuda-${{ matrix.cuda.version }}
path: |
dist/windows-amd64/**
# windows arm64 generate, go build, and zip file (no installer)
# Output of this build is aggregated into the final x86 build
# for a unified windows installer
windows-arm64:
runs-on: windows-arm64
environment: release
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
# The current Windows arm64 beta image has effectively zero dev tools installed...
- name: Install git and gzip
run: |
Set-ExecutionPolicy Bypass -Scope Process -Force
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
choco install -y --no-progress git gzip
echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
# pacman is buggy on win arm64, so we avoid using it, but rely on the binary artifacts
# we download the sfx (7zip bundle) which isn't fully set up, but the binaries we need to build work
- name: Install msys2 x64
run: |
$url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe"
write-host "Downloading MSYS2"
Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @(
'-y', '-oC:\'
) -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
# since pacman isn't reliable, we just download the tar file and extract directly
- name: Downloading and extracting msys2 make tar file
run: |
$url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst"
write-host "Downloading make"
Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst
cd c:\msys64; tar -xf make.tar.zst
rm c:\msys64\make.tar.zst
- name: Verify Make works properly
run: |
echo $env:PATH
make --version
- name: Install Visual Studio 2022
run: |
$components = @(
"Microsoft.VisualStudio.Component.CoreEditor",
"Microsoft.VisualStudio.Workload.CoreEditor",
"Microsoft.VisualStudio.Component.Roslyn.Compiler",
"Microsoft.Component.MSBuild",
"Microsoft.VisualStudio.Component.TextTemplating",
"Microsoft.VisualStudio.Component.Debugger.JustInTime",
"Microsoft.VisualStudio.Component.VC.CoreIde",
"Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
"Microsoft.VisualStudio.Component.Windows11SDK.22621",
"Microsoft.VisualStudio.Component.VC.Tools.ARM64EC",
"Microsoft.VisualStudio.Component.VC.Tools.ARM64",
"Microsoft.VisualStudio.Component.VC.ATL",
"Microsoft.VisualStudio.Component.VC.ATL.ARM64",
"Microsoft.VisualStudio.Component.Graphics",
"Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
"Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
"Microsoft.VisualStudio.Component.Windows11Sdk.WindowsPerformanceToolkit",
"Microsoft.VisualStudio.Component.CppBuildInsights",
"Microsoft.VisualStudio.Component.VC.DiagnosticTools",
"Microsoft.VisualStudio.ComponentGroup.WebToolsExtensions.CMake",
"Microsoft.VisualStudio.Component.VC.CMake.Project",
"Microsoft.VisualStudio.Component.VC.ASAN",
"Microsoft.VisualStudio.Component.Vcpkg",
"Microsoft.VisualStudio.Workload.NativeDesktop"
)
$config = @{
"version" = "1.0"
"components" = $components
"extensions" = @()
}
$configPath = "${env:RUNNER_TEMP}\vsconfig"
$config | ConvertTo-Json | Out-File -FilePath $configPath
$bootstrapperFilePath = "${env:RUNNER_TEMP}\vs_community.exe"
write-host "Downloading Visual Studio 2022"
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_community.exe" -outfile $bootstrapperFilePath
$bootstrapperArgumentList = ('/c', $bootstrapperFilePath, '--config', $configPath, '--quiet', '--wait' )
write-host "Installing Visual Studio 2022"
$process = Start-Process -FilePath cmd.exe -ArgumentList $bootstrapperArgumentList -Wait -PassThru
$exitCode = $process.ExitCode
write-host $exitCode
# pacman in mingw/msys2 is ~broken on windows arm right now - hangs consistently during attempts to install
# so we'll use this alternative GCC binary
- name: Install llvm-mingw GCC
run: |
$gcc_url="https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip"
write-host "Downloading llvm-mingw"
Invoke-WebRequest -Uri "${gcc_url}" -OutFile "${env:RUNNER_TEMP}\gcc.zip"
write-host "Unpacking llvm-mingw"
expand-archive -path "${env:RUNNER_TEMP}\gcc.zip" -destinationpath "c:\"
mv c:\llvm-mingw-* c:\llvm-mingw
echo "c:\llvm-mingw\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Verify GCC
run: |
echo $env:PATH
gcc --version
- uses: actions/checkout@v4
- name: Set Version
run: |
$ver=${env:GITHUB_REF_NAME}.trim("v")
echo VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
& .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip
$gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
$env:ARCH="arm64"
.\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies sign distZip
name: 'Windows Build'
- uses: actions/upload-artifact@v4
with:
name: windows-arm64
path: |
dist/windows-arm64/**
dist/windows-arm64-app.exe
dist/ollama-windows-arm64.zip
# Import the prior generation steps plus the full arm64 build, and build the final windows assets
build-windows:
environment: release
runs-on: windows
needs:
- generate-windows-cuda
- generate-windows-rocm
- generate-windows-cpu
- windows-arm64
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- name: Install msys2
run: |
$msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
write-host "Downloading msys2"
Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
write-host "Installing msys2"
Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: verify tools
run: |
get-command gcc
gcc --version
get-command make
make --version
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: go get
- uses: actions/download-artifact@v4
with:
name: generate-windows-cpu
path: dist/windows-amd64/
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda-11.3
path: dist/windows-amd64/
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda-12.4
path: dist/windows-amd64/
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
path: dist/windows-amd64/
- uses: actions/download-artifact@v4
with:
name: windows-arm64
path: dist
- run: |
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:OLLAMA_SKIP_GENERATE="1"
$env:ARCH="amd64"
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
& .\scripts\build_windows.ps1
- uses: actions/upload-artifact@v4
with:
name: dist-windows
path: |
dist\OllamaSetup.exe
dist\ollama-windows-*.zip
dist/OllamaSetup.exe
dist/ollama-windows-*.zip
linux-build:
build-linux:
environment: release
runs-on: linux
strategy:
matrix:
include:
- os: linux
arch: amd64
targets: 'archive rocm'
targets: [archive, rocm]
- os: linux
arch: arm64
targets: archive
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: setup-environment
env:
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
targets: [archive]
steps:
- uses: actions/checkout@v4
- uses: docker/setup-qemu-action@v3
- uses: docker/setup-buildx-action@v3
- run: |
apt-get update && apt-get install pigz
for TARGET in ${{ matrix.targets }}; do docker buildx build --platform $PLATFORM --target $TARGET --output type=local,dest=dist/$PLATFORM .; done
tar c -C dist/$PLATFORM . | pigz -9cv >dist/ollama-${PLATFORM//\//-}.tgz
tar c -C dist/$PLATFORM . | pigz -9cv >dist/ollama-${PLATFORM//\//-}.tar.gz
env:
PLATFORM: ${{ matrix.os }}/${{ matrix.arch }}
- uses: actions/upload-artifact@v4
with:
name: dist-${{ matrix.os }}-${{ matrix.arch }}
path: |
dist/ollama-${{ matrix.os }}-${{ matrix.arch }}.tgz
dist/ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.gz
docker-build:
build-docker:
environment: release
runs-on: linux
strategy:
matrix:
include:
- flavor: 'latest=false'
- flavor: |
latest=auto
platforms: linux/amd64,linux/arm64
build-args: |
GOFLAGS=${{ needs.setup-environment.outputs.GOFLAGS }}
- flavor: 'latest=false,suffix=rocm'
build-args: [GOFLAGS]
- flavor: |
suffix=-rocm,onlatest=false
platforms: linux/amd64
build-args: |
GOFLAGS=${{ needs.setup-environment.outputs.GOFLAGS }}
FLAVOR=rocm
runs-on: linux
environment: release
needs: setup-environment
build-args: [GOFLAGS, FLAVOR=rocm]
steps:
- uses: actions/checkout@v4
- uses: docker/setup-qemu-action@v2
@@ -344,10 +547,16 @@ jobs:
cache-from: type=registry,ref=ollama/ollama:latest
cache-to: type=inline
provenance: false
env:
GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ steps.metadata.outputs.version }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# Aggregate all the assets and ship a release
release:
needs: [darwin-sign, windows-sign, linux-build]
needs:
- build-darwin
- build-windows
- build-linux-amd64
- build-linux-arm64
runs-on: linux
environment: release
permissions:
@@ -359,22 +568,14 @@ jobs:
- name: Set Version
shell: bash
run: |
- uses: actions/download-artifact@v4
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: Retrieve built artifact
uses: actions/download-artifact@v4
with:
path: dist
pattern: dist-darwin
- uses: actions/download-artifact@v4
with:
path: dist
pattern: dist-windows
- uses: actions/download-artifact@v4
with:
path: dist
pattern: dist-linux-*
- uses: actions/download-artifact@v4
with:
path: dist
pattern: dist-windows
pattern: dist-*
merge-multiple: true
- run: |
ls -lh dist/
(cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
@@ -382,17 +583,15 @@ jobs:
cat dist/sha256sum.txt
- name: Create or update Release
run: |
RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
echo "Looking for existing release for ${RELEASE_VERSION}"
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName")
echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
if [ -n "$OLD_TAG" ]; then
echo "Updating release ${RELEASE_VERSION} to point to new tag ${GITHUB_REF_NAME}"
echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
else
echo "Creating new release ${RELEASE_VERSION} pointing to tag ${GITHUB_REF_NAME}"
echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
gh release create ${GITHUB_REF_NAME} \
--title ${RELEASE_VERSION} \
--title ${{ env.RELEASE_VERSION }} \
--draft \
--generate-notes \
--prerelease

View File

@@ -40,105 +40,32 @@ jobs:
linux:
needs: [changes]
if: needs.changes.outputs.changed == 'True'
if: ${{ needs.changes.outputs.changed == 'True' }}
strategy:
matrix:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:11.8.0-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
- container: nvidia/cuda:11.8.0-devel-ubuntu22.04
preset: CUDA
- container: rocm/dev-ubuntu-22.04:6.1.2
preset: ROCm
extra-packages: rocm-libs
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm'
runs-on: linux
runs-on: ubuntu-latest
container: ${{ matrix.container }}
steps:
- uses: actions/checkout@v4
- run: |
[ -n "${{ matrix.container }}" ] || sudo=sudo
$sudo apt-get update
$sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }}
apt-get update
apt-get install -y cmake pkg-config ccache ${{ matrix.extra-packages }}
ccache -o cache_dir=${{ github.workspace }}\.ccache
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/cache@v4
with:
path: /github/home/.cache/ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
- run: |
cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
cmake --build --preset ${{ matrix.preset }} --parallel
windows:
needs: [changes]
if: needs.changes.outputs.changed == 'True'
strategy:
matrix:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010'
runs-on: windows
steps:
- run: |
choco install -y --no-progress ccache ninja
ccache -o cache_dir=${{ github.workspace }}\.ccache
- if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm'
id: cache-install
uses: actions/cache/restore@v4
with:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
key: ${{ matrix.install }}
- if: matrix.preset == 'CUDA'
name: Install CUDA ${{ matrix.cuda-version }}
run: |
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.8", "nvcc_11.8", "cublas_11.8", "cublas_dev_11.8")) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- if: matrix.preset == 'ROCm'
name: Install ROCm ${{ matrix.rocm-version }}
run: |
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList '-install' -NoNewWindow -Wait
}
$hipPath = (Resolve-Path "C:\Program Files\AMD\ROCm\*").path
echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
uses: actions/cache/save@v4
with:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
key: ${{ matrix.install }}
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
- run: |
Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --build --parallel --preset "${{ matrix.preset }}"
env:
CMAKE_GENERATOR: Ninja
cmake --preset ${{ matrix.preset }}
cmake --build --preset ${{ matrix.preset }} --parallel
test:
strategy:
@@ -163,5 +90,5 @@ jobs:
- uses: actions/checkout@v4
- name: Verify patches apply cleanly and do not change files
run: |
make -f Makefile.sync clean checkout sync
make -f Makefile2 clean checkout sync
git diff --compact-summary --exit-code

5
.gitignore vendored
View File

@@ -4,13 +4,12 @@
.venv
.swp
dist
build
ollama
.cache
*.exe
.idea
test_data
*.crt
__debug_bin*
llama/build
llama/vendor
__debug_bin*
llama/vendor

View File

@@ -21,23 +21,12 @@ set(GGML_BACKEND_SHARED ON)
set(GGML_SCHED_MAX_COPIES 4)
set(GGML_LLAMAFILE ON)
set(GGML_CPU_ALL_VARIANTS ON)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_CUDA_GRAPHS ON)
if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
set(GGML_CPU_ALL_VARIANTS ON)
endif()
set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
@@ -48,65 +37,18 @@ set(GGML_CPU ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
get_target_property(CPU_VARIANTS ggml-cpu MANUALLY_ADDED_DEPENDENCIES)
if(NOT CPU_VARIANTS)
set(CPU_VARIANTS "ggml-cpu")
endif()
install(TARGETS ggml-base ${CPU_VARIANTS}
RUNTIME_DEPENDENCIES
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "native")
endif()
find_package(CUDAToolkit)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
)
endif()
check_language(HIP)
if(CMAKE_HIP_COMPILER)
set(HIP_PLATFORM "amd")
find_package(hip REQUIRED)
if(NOT AMDGPU_TARGETS)
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$")
endif()
if(AMDGPU_TARGETS)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
install(TARGETS ggml-hip
RUNTIME_DEPENDENCIES
DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
PRE_INCLUDE_REGEXES amdhip64 hipblas rocblas amd_comgr hsa_runtime64 rocprofiler-register drm_amdgpu drm numa
PRE_EXCLUDE_REGEXES ".*"
POST_EXCLUDE_REGEXES "system32"
RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
)
foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
break()
endif()
endforeach()
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
endif()

View File

@@ -4,7 +4,6 @@
{
"name": "Default",
"binaryDir": "${sourceDir}/build",
"installDir": "${sourceDir}/dist",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release"
}

View File

@@ -33,33 +33,25 @@ FROM base AS cpu
RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi
ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CPU' \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8
cmake --preset 'CPU' && cmake --build --parallel --preset 'CPU'
FROM base AS cuda-11
ARG CUDA11VERSION=11.3
RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8
cmake --preset 'CUDA 11' && cmake --build --parallel --preset 'CUDA 11'
FROM base AS cuda-12
ARG CUDA12VERSION=12.4
RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 12' \
&& cmake --build --parallel --preset 'CUDA 12' \
&& cmake --install build --component CUDA --strip --parallel 8
cmake --preset 'CUDA 12' && cmake --build --parallel --preset 'CUDA 12'
FROM base AS rocm-6
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'ROCm 6' \
&& cmake --build --parallel --preset 'ROCm 6' \
&& cmake --install build --component HIP --strip --parallel 8
cmake --preset 'ROCm 6' && cmake --build --parallel --preset 'ROCm 6'
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
ARG CMAKEVERSION
@@ -68,9 +60,7 @@ RUN apt-get update && apt-get install -y curl ccache \
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'JetPack 5' \
&& cmake --build --parallel --preset 'JetPack 5' \
&& cmake --install build --component CUDA --strip --parallel 8
cmake --preset 'JetPack 5' && cmake --build --parallel --preset 'JetPack 5'
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
ARG CMAKEVERSION
@@ -79,9 +69,7 @@ RUN apt-get update && apt-get install -y curl ccache \
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'JetPack 6' \
&& cmake --build --parallel --preset 'JetPack 6' \
&& cmake --install build --component CUDA --strip --parallel 8
cmake --preset 'JetPack 6' && cmake --build --parallel --preset 'JetPack 6'
FROM base AS build
ARG GOVERSION=1.23.4
@@ -95,20 +83,65 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
COPY --from=cuda-11 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.11 \
/usr/local/cuda/lib64/libcublasLt.so.11 \
/usr/local/cuda/lib64/libcudart.so.11.0 \
/lib/ollama/cuda_v11/
COPY --from=cuda-12 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.12 \
/usr/local/cuda/lib64/libcublasLt.so.12 \
/usr/local/cuda/lib64/libcudart.so.12 \
/lib/ollama/cuda_v12/
FROM --platform=linux/arm64 scratch AS arm64
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 lib/ollama/cuda_jetpack6
COPY --from=cuda-11 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.11 \
/usr/local/cuda/lib64/libcublasLt.so.11 \
/usr/local/cuda/lib64/libcudart.so.11.0 \
/lib/ollama/cuda_v11/
COPY --from=cuda-12 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.12 \
/usr/local/cuda/lib64/libcublasLt.so.12 \
/usr/local/cuda/lib64/libcudart.so.12 \
/lib/ollama/cuda_v12/
COPY --from=jetpack-5 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.11 \
/usr/local/cuda/lib64/libcublasLt.so.11 \
/usr/local/cuda/lib64/libcudart.so.11.0 \
/lib/ollama/cuda_jetpack5/
COPY --from=jetpack-6 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.12 \
/usr/local/cuda/lib64/libcublasLt.so.12 \
/usr/local/cuda/lib64/libcudart.so.12 \
/lib/ollama/cuda_jetpack6/
FROM --platform=linux/arm64 scratch AS rocm
COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
COPY --from=rocm-6 --chmod=644 \
build/lib/libggml-hip.so \
/opt/rocm/lib/libamdhip64.so.6 \
/opt/rocm/lib/libhipblas.so.2 \
/opt/rocm/lib/librocblas.so.4 \
/opt/rocm/lib/libamd_comgr.so.2 \
/opt/rocm/lib/libhsa-runtime64.so.1 \
/opt/rocm/lib/librocprofiler-register.so.0 \
/opt/amdgpu/lib64/libdrm_amdgpu.so.1 \
/opt/amdgpu/lib64/libdrm.so.2 \
/usr/lib64/libnuma.so.1 \
/lib/ollama/rocm/
COPY --from=rocm-6 /opt/rocm/lib/rocblas/ /lib/ollama/rocm/rocblas/
FROM ${FLAVOR} AS archive
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=cpu --chmod=644 \
build/lib/libggml-base.so \
build/lib/libggml-cpu-*.so \
/lib/ollama/
COPY --from=build /bin/ollama /bin/ollama
FROM ubuntu:20.04
@@ -116,10 +149,10 @@ RUN apt-get update \
&& apt-get install -y ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
COPY --from=archive /bin /usr/bin
COPY --from=archive /bin/ /usr/bin/
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
COPY --from=archive /lib/ollama /usr/lib/ollama
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
COPY --from=archive /lib/ollama/ /usr/lib/ollama/
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/ollama
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENV OLLAMA_HOST=0.0.0.0:11434

View File

@@ -2,34 +2,24 @@ UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
.PHONY: help
help:
@echo "Available targets:"
@echo " sync Sync with upstream repositories"
@echo " checkout Checkout upstream repository"
@echo " apply-patches Apply patches to local repository"
@echo " format-patches Format patches from local repository"
@echo " clean Clean local repository"
@echo
@echo "Example:"
@echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"
all: sync
.PHONY: sync
sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
sync: llama/llama.cpp ml/backend/ggml/ggml
.PHONY: llama/llama.cpp
llama/llama.cpp: llama/vendor/ apply-patches
llama/llama.cpp: llama/vendor/ apply_patches
rsync -arvzc -f "merge $@/.rsync-filter" $< $@
.PHONY: ml/backend/ggml/ggml apply-patches
ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
.PHONY: ml/backend/ggml/ggml apply_patches
ml/backend/ggml/ggml: llama/vendor/ggml/ apply_patches
rsync -arvzc -f "merge $@/.rsync-filter" $< $@
PATCHES=$(wildcard llama/patches/*.patch)
.PHONY: apply-patches
.PHONY: apply_patches
.NOTPARALLEL:
apply-patches: $(addsuffix ed, $(PATCHES))
apply_patches: $(addsuffix ed, $(PATCHES))
%.patched: %.patch
@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
@@ -42,8 +32,8 @@ checkout: $(WORKDIR)
$(WORKDIR):
git clone $(UPSTREAM) $(WORKDIR)
.PHONE: format-patches
format-patches: llama/patches
.PHONE: format_patches
format_patches: llama/patches
git -C $(WORKDIR) format-patch \
--no-signature \
--no-numbered \

View File

@@ -369,7 +369,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
### Cloud
@@ -482,7 +481,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
### Mobile

25
benchmark/README.md Normal file
View File

@@ -0,0 +1,25 @@
# Benchmark
Performance benchmarking for Ollama.
## Prerequisites
- Ollama server running locally (`127.0.0.1:11434`)
- Desired models pre-downloaded (e.g., `llama3.2:1b`)
## Run Benchmark
```bash
# Run all tests
go test -bench=. -timeout 30m ./...
```
## New Runner Benchmark
```bash
go test -bench=Runner
```
or to test multiple models:
```bash
# run this from within the benchmark directory
# requires: llama3.2:1b, llama3.1:8b, llama3.3:70b
bash new_runner.sh
```

72
benchmark/new_runner.sh Normal file
View File

@@ -0,0 +1,72 @@
#!/bin/bash
# kill_process_tree PID
#
# Sends SIGKILL to PID and to every descendant reported by `pgrep -P`,
# descending depth-first so children are killed before their parent.
# Failures from kill (for example, the process already exited) are ignored.
kill_process_tree() {
    local pid=$1
    # Declared separately from the assignment so `local` does not mask the
    # exit status of pgrep, and so the loop variable stays function-scoped.
    local children child

    # Get all child processes using pgrep
    children=$(pgrep -P "$pid")

    # Kill children first ($children is intentionally unquoted: one PID per word)
    for child in $children; do
        kill_process_tree "$child"
    done

    # Kill the parent process
    kill -9 "$pid" 2>/dev/null || true
}
# run_benchmark MODEL_PATH
#
# Starts the runner in the background for the given model file, runs the
# Runner benchmarks against it, then tears the runner down again. A failing
# benchmark is reported as a warning; it does not abort the script.
run_benchmark() {
    local model=$1
    # Keep bookkeeping variables out of the global scope.
    local runner_pid test_exit_code

    echo "Starting runner with model: $model"
    # Start the runner in background and save its PID
    go run ../cmd/runner/main.go --new-runner -model "$model" &
    runner_pid=$!

    # Wait for the runner to initialize (adjust sleep time as needed)
    sleep 5

    echo "Running benchmark..."
    # Run test and wait for it to complete
    go test -bench=Runner
    test_exit_code=$?

    echo "Stopping runner process..."
    # Kill the runner process and all its children
    kill_process_tree "$runner_pid"
    # Wait for the process to fully terminate
    wait "$runner_pid" 2>/dev/null || true

    # Make sure no processes are still listening on port 8080
    lsof -t -i:8080 | xargs kill -9 2>/dev/null || true
    # Additional sleep to ensure port is freed
    sleep 2

    # Check if test failed
    if [ "$test_exit_code" -ne 0 ]; then
        echo "Warning: Benchmark test failed with exit code $test_exit_code"
    fi

    echo "Benchmark complete for model: $model"
    echo "----------------------------------------"
}
# Model files to benchmark, given as blob paths under $HOME/.ollama/models/blobs.
# The mapping from model name to blob digest is listed below.
HOME_DIR="$HOME"
# llama3.2:1b: ~/.ollama/models/blobs/sha256-74701a8c35f6c8d9a4b91f3f3497643001d63e0c7a84e085bed452548fa88d45
# llama3.1:8b: ~/.ollama/models/blobs/sha256-667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29
# llama3.3:70b: ~/.ollama/models/blobs/sha256-4824460d29f2058aaf6e1118a63a7a197a09bed509f0e7d4e2efb1ee273b447d
# The last entry (the llama3.3:70b blob) is commented out, so it is not run.
models=(
    "${HOME_DIR}/.ollama/models/blobs/sha256-74701a8c35f6c8d9a4b91f3f3497643001d63e0c7a84e085bed452548fa88d45"
    "${HOME_DIR}/.ollama/models/blobs/sha256-667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"
    # "${HOME_DIR}/.ollama/models/blobs/sha256-4824460d29f2058aaf6e1118a63a7a197a09bed509f0e7d4e2efb1ee273b447d"
)
# Run benchmarks for each model, one runner process at a time
for model in "${models[@]}"; do
    run_benchmark "$model"
done
echo "All benchmarks completed!"

View File

@@ -0,0 +1,175 @@
package benchmark
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"testing"
"time"
)
const (
	// runnerURL is the base URL that the runner benchmark sends its
	// /health and /completion requests to.
	runnerURL     = "http://localhost:8080"
	warmupPrompts = 2  // Number of warm-up requests per test case
	warmupTokens  = 50 // Smaller token count for warm-up requests
)

// runnerMetrics is a package-level slice of BenchmarkMetrics for the runner
// benchmark.
var runnerMetrics []BenchmarkMetrics
// CompletionRequest represents the request body for the completion endpoint
type CompletionRequest struct {
	// Prompt is the full prompt text sent to the runner.
	Prompt string `json:"prompt"`
	// NumPredict is filled from TestCase.maxTokens (maximum tokens to generate).
	NumPredict int `json:"n_predict"`
	// Temperature is the sampling temperature sent with the request.
	Temperature float32 `json:"temperature"`
}
// CompletionResponse represents a single response chunk from the streaming API
type CompletionResponse struct {
	// Content is the text carried by this chunk; the benchmark counts every
	// chunk with non-empty Content as one token.
	Content string `json:"content"`
	// Stop marks the chunk after which the benchmark stops reading the stream.
	Stop bool `json:"stop"`
	// Timings is decoded from the "timings" object but is not read by the
	// benchmark. Presumably these are server-side token counts and durations
	// in milliseconds — TODO confirm against the runner's response schema.
	Timings struct {
		PredictedN  int `json:"predicted_n"`
		PredictedMs int `json:"predicted_ms"`
		PromptN     int `json:"prompt_n"`
		PromptMs    int `json:"prompt_ms"`
	} `json:"timings"`
}
// warmUp sends warmupPrompts completion requests for tt ahead of the measured
// runs. Each request reuses tt's prompt but is limited to warmupTokens
// generated tokens, and the returned metrics are discarded.
func warmUp(b *testing.B, tt TestCase) {
	b.Logf("Warming up for test case %s", tt.name)

	warm := TestCase{
		maxTokens: warmupTokens,
		prompt:    tt.prompt,
		name:      tt.name + "_warmup",
	}

	for remaining := warmupPrompts; remaining > 0; remaining-- {
		runCompletion(context.Background(), warm, b)
		// Brief pause between warm-up requests
		time.Sleep(100 * time.Millisecond)
	}

	b.Logf("Warm-up complete")
}
// BenchmarkRunnerInference benchmarks streaming completions against the
// runner listening at runnerURL. It first checks that the runner answers on
// /health, then runs one sub-benchmark per test case. Every sub-benchmark
// performs warm-up requests before its measured iterations, and the collected
// metrics are reported once the benchmark finishes.
func BenchmarkRunnerInference(b *testing.B) {
	b.Logf("Starting benchmark suite")

	// Verify server availability
	resp, err := http.Get(runnerURL + "/health")
	if err != nil {
		b.Fatalf("Runner unavailable: %v", err)
	}
	// Only reachability matters here; close the body so the connection is
	// not leaked.
	resp.Body.Close()
	b.Log("Runner available")

	tests := []TestCase{
		{
			name:      "short_prompt",
			prompt:    formatPrompt("Write a long story"),
			maxTokens: 100,
		},
		{
			name:      "medium_prompt",
			prompt:    formatPrompt("Write a detailed economic analysis"),
			maxTokens: 500,
		},
		{
			name:      "long_prompt",
			prompt:    formatPrompt("Write a comprehensive AI research paper"),
			maxTokens: 1000,
		},
	}

	// Register cleanup handler for results reporting. The runner results are
	// kept in runnerMetrics so they are not mixed with (or reported a second
	// time by) other benchmarks that use the shared metrics slice.
	b.Cleanup(func() { reportMetrics(runnerMetrics) })

	// Main benchmark loop
	for _, tt := range tests {
		b.Run(tt.name, func(b *testing.B) {
			// Perform warm-up requests
			warmUp(b, tt)

			// Wait a bit after warm-up before starting the actual benchmark
			time.Sleep(500 * time.Millisecond)

			m := make([]BenchmarkMetrics, b.N)

			// Reset once, after setup and before the measured loop. Resetting
			// inside the loop would throw away every iteration but the last.
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				m[i] = runCompletion(context.Background(), tt, b)
			}
			runnerMetrics = append(runnerMetrics, m...)
		})
	}
}
// formatPrompt places text in the user turn of a fixed chat template: a
// system header containing a knowledge-cutoff line, the user message, and an
// opening assistant header for the model to continue from.
func formatPrompt(text string) string {
	const template = "<|start_header_id|>system<|end_header_id|>\n\n" +
		"Cutting Knowledge Date: December 2023\n\n" +
		"<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" +
		"%s" +
		"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

	return fmt.Sprintf(template, text)
}
// runCompletion sends one streaming completion request for tt to the runner
// and measures it.
//
// It POSTs a CompletionRequest to runnerURL+"/completion" and decodes the
// response body as a stream of CompletionResponse JSON values until EOF or a
// chunk with Stop set. Every chunk with non-empty Content counts as one
// token. The returned metrics contain the time to the first such chunk
// (ttft), the time from the start of the call to the last such chunk
// (totalTime), the token count and the resulting tokens per second. The
// model and scenario fields of the result are left empty.
//
// Any failure (marshalling, request creation, transport, a non-200 status or
// a decode error) aborts the benchmark through b.Fatalf.
func runCompletion(ctx context.Context, tt TestCase, b *testing.B) BenchmarkMetrics {
	start := time.Now()
	var ttft time.Duration
	var tokens int
	lastToken := start

	// Create request body
	reqBody := CompletionRequest{
		Prompt:      tt.prompt,
		NumPredict:  tt.maxTokens,
		Temperature: 0.1,
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		b.Fatalf("Failed to marshal request: %v", err)
	}

	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "POST", runnerURL+"/completion", bytes.NewBuffer(jsonData))
	if err != nil {
		b.Fatalf("Failed to create request: %v", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// Execute request
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		b.Fatalf("Request failed: %v", err)
	}
	defer resp.Body.Close()

	// Fail with the real cause instead of a confusing JSON decode error when
	// the runner rejects the request.
	if resp.StatusCode != http.StatusOK {
		b.Fatalf("Unexpected response status: %s", resp.Status)
	}

	// Process streaming response
	decoder := json.NewDecoder(resp.Body)
	for {
		var chunk CompletionResponse
		if err := decoder.Decode(&chunk); err != nil {
			if err == io.EOF {
				break
			}
			b.Fatalf("Failed to decode response: %v", err)
		}

		if chunk.Content != "" {
			if ttft == 0 {
				ttft = time.Since(start)
			}
			tokens++
			lastToken = time.Now()
		}

		if chunk.Stop {
			break
		}
	}

	totalTime := lastToken.Sub(start)

	// When no token was received totalTime is zero; report 0 tokens/sec
	// rather than the NaN that 0/0 would produce.
	var tokensPerSecond float64
	if totalTime > 0 {
		tokensPerSecond = float64(tokens) / totalTime.Seconds()
	}

	return BenchmarkMetrics{
		testName:        tt.name,
		ttft:            ttft,
		totalTime:       totalTime,
		totalTokens:     tokens,
		tokensPerSecond: tokensPerSecond,
	}
}

View File

@@ -0,0 +1,293 @@
// Package benchmark provides tools for performance testing of Ollama inference server and supported models.
package benchmark
import (
"context"
"fmt"
"net/http"
"net/url"
"os"
"testing"
"text/tabwriter"
"time"
"github.com/ollama/ollama/api"
)
// serverURL is the default Ollama server URL for benchmarking
const serverURL = "http://127.0.0.1:11434"

// metrics collects all benchmark results for final reporting
var metrics []BenchmarkMetrics

// models contains the list of model names to benchmark.
// Entries that are commented out are skipped.
var models = []string{
	"llama3.2:1b",
	// "qwen2.5:7b",
	// "llama3.3:70b",
}
// TestCase defines a benchmark test scenario with prompt characteristics.
// The same type is used by the server benchmark and the runner benchmark.
type TestCase struct {
	name      string // Human-readable test name
	prompt    string // Input prompt text
	maxTokens int    // Maximum tokens to generate
}
// BenchmarkMetrics contains performance measurements for a single test run.
// reportMetrics groups results by the model, scenario and testName fields.
type BenchmarkMetrics struct {
	model           string        // Model being tested
	scenario        string        // cold_start or warm_start
	testName        string        // Name of the test case
	ttft            time.Duration // Time To First Token (TTFT)
	totalTime       time.Duration // Total time for complete response
	totalTokens     int           // Total generated tokens
	tokensPerSecond float64       // Calculated throughput
}
// ScenarioType defines the initialization state for benchmarking
type ScenarioType int

const (
	ColdStart ScenarioType = iota // Model is loaded from cold state
	WarmStart                     // Model is already loaded in memory
)

// String implements fmt.Stringer for ScenarioType. The two defined scenarios
// map to "cold_start" and "warm_start"; any other value is rendered as
// "ScenarioType(n)" instead of panicking on an out-of-range index.
func (s ScenarioType) String() string {
	switch s {
	case ColdStart:
		return "cold_start"
	case WarmStart:
		return "warm_start"
	default:
		return fmt.Sprintf("ScenarioType(%d)", int(s))
	}
}
// BenchmarkServerInference is the main entry point for benchmarking Ollama inference performance.
// It tests all configured models with different prompt lengths and start scenarios.
//
// The benchmark aborts if the server at serverURL is unreachable or if a
// configured model cannot be shown. For every model, all cold-start
// sub-benchmarks run first, followed by all warm-start sub-benchmarks.
// Collected metrics are reported once the benchmark finishes.
func BenchmarkServerInference(b *testing.B) {
	b.Logf("Starting benchmark suite with %d models", len(models))

	// Verify server availability
	resp, err := http.Get(serverURL + "/api/version")
	if err != nil {
		b.Fatalf("Server unavailable: %v", err)
	}
	// Only reachability matters here; close the body so the connection is
	// not leaked.
	resp.Body.Close()
	b.Log("Server available")

	tests := []TestCase{
		{name: "short_prompt", prompt: "Write a long story", maxTokens: 100},
		{name: "medium_prompt", prompt: "Write a detailed economic analysis", maxTokens: 500},
		{name: "long_prompt", prompt: "Write a comprehensive AI research paper", maxTokens: 1000},
	}

	// Register cleanup handler for results reporting
	b.Cleanup(func() { reportMetrics(metrics) })

	// The client does not depend on the model, so build it once.
	client := api.NewClient(mustParse(serverURL), http.DefaultClient)

	// Main benchmark loop
	for _, model := range models {
		// Verify model availability
		if _, err := client.Show(context.Background(), &api.ShowRequest{Model: model}); err != nil {
			b.Fatalf("Model unavailable: %v", err)
		}

		for _, scenario := range []ScenarioType{ColdStart, WarmStart} {
			for _, tt := range tests {
				testName := fmt.Sprintf("%s/%s/%s", model, scenario, tt.name)
				b.Run(testName, func(b *testing.B) {
					m := runBenchmark(b, tt, model, scenario, client)
					metrics = append(metrics, m...)
				})
			}
		}
	}
}
// runBenchmark executes multiple iterations of a specific test case and scenario.
// Returns collected metrics for all iterations.
//
// Before every measured iteration the model is put into the requested state:
// for WarmStart two full generations are run first, for ColdStart the model
// is unloaded. That preparation runs with the benchmark timer stopped, so the
// time reported by the testing package covers the measured requests of all
// b.N iterations and nothing else.
func runBenchmark(b *testing.B, tt TestCase, model string, scenario ScenarioType, client *api.Client) []BenchmarkMetrics {
	results := make([]BenchmarkMetrics, b.N)

	// Run benchmark iterations
	for i := 0; i < b.N; i++ {
		// Pause timing during per-iteration setup. b.ResetTimer here would
		// also wipe the time of all previous iterations.
		b.StopTimer()
		switch scenario {
		case WarmStart:
			// Pre-warm the model by generating some tokens
			for w := 0; w < 2; w++ {
				err := client.Generate(
					context.Background(),
					&api.GenerateRequest{
						Model:   model,
						Prompt:  tt.prompt,
						Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
					},
					func(api.GenerateResponse) error { return nil },
				)
				if err != nil {
					// Best effort: a failed warm-up is logged, not fatal.
					b.Logf("Warm-up error: %v", err)
				}
			}
		case ColdStart:
			unloadModel(client, model, b)
		}
		b.StartTimer()

		results[i] = runSingleIteration(context.Background(), client, tt, model, b)
		results[i].scenario = scenario.String()
	}
	return results
}
// unloadModel requests that the server unload model by issuing a generate
// request with no prompt and a KeepAlive of zero duration. A failing request
// is logged on b and otherwise ignored. The function then sleeps for 100ms
// to give the unload time to complete before the next test.
func unloadModel(client *api.Client, model string, b *testing.B) {
	discard := func(api.GenerateResponse) error { return nil }

	unloadReq := api.GenerateRequest{
		KeepAlive: &api.Duration{Duration: 0},
		Model:     model,
	}

	err := client.Generate(context.Background(), &unloadReq, discard)
	if err != nil {
		b.Logf("Unload error: %v", err)
	}

	time.Sleep(100 * time.Millisecond)
}
// runSingleIteration measures performance metrics for a single inference request.
// Captures TTFT, total generation time, and calculates tokens/second.
//
// TTFT is the time until the first streamed response arrives, whether or not
// it carries text. Every response with a non-empty Response field counts as
// one token, and totalTime runs from the start of the call to the last such
// response. The scenario field of the result is left for the caller to fill.
//
// b may be nil, in which case nothing is logged. If the generate call fails
// and b is non-nil, the benchmark is marked as failed; the metrics gathered
// up to that point are still returned.
func runSingleIteration(ctx context.Context, client *api.Client, tt TestCase, model string, b *testing.B) BenchmarkMetrics {
	start := time.Now()
	var ttft time.Duration
	var tokens int
	lastToken := start

	req := &api.GenerateRequest{
		Model:   model,
		Prompt:  tt.prompt,
		Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
	}

	if b != nil {
		b.Logf("Prompt length: %d chars", len(tt.prompt))
	}

	// Execute generation request with metrics collection
	err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
		if ttft == 0 {
			ttft = time.Since(start)
		}
		if resp.Response != "" {
			tokens++
			lastToken = time.Now()
		}
		return nil
	})
	if err != nil && b != nil {
		// Metrics from a failed request are not meaningful; surface the error
		// instead of silently recording them.
		b.Errorf("Generate error: %v", err)
	}

	totalTime := lastToken.Sub(start)

	// When no token was received totalTime is zero; report 0 tokens/sec
	// rather than the NaN that 0/0 would produce.
	var tokensPerSecond float64
	if totalTime > 0 {
		tokensPerSecond = float64(tokens) / totalTime.Seconds()
	}

	return BenchmarkMetrics{
		model:           model,
		testName:        tt.name,
		ttft:            ttft,
		totalTime:       totalTime,
		totalTokens:     tokens,
		tokensPerSecond: tokensPerSecond,
	}
}
// reportMetrics processes collected metrics and prints formatted results.
// Generates both human-readable tables and CSV output with averaged statistics.
//
// Results are grouped by (model, scenario, testName). For each group the
// TTFT, total time and token count are averaged over its iterations, and
// tokens/second is the group's total tokens divided by its total time.
// Groups are printed in the order in which they first appear in results, so
// the output is stable between runs. Nothing is printed for empty input.
func reportMetrics(results []BenchmarkMetrics) {
	if len(results) == 0 {
		return
	}

	// Aggregate results by test case
	type statsKey struct {
		model    string
		scenario string
		testName string
	}
	type statsSum struct {
		ttftSum      time.Duration
		totalTimeSum time.Duration
		tokensSum    int
		iterations   int
	}

	stats := make(map[statsKey]*statsSum)
	// order remembers first-seen keys; ranging over the map directly would
	// print the rows in a different order on every run.
	var order []statsKey

	for _, m := range results {
		key := statsKey{m.model, m.scenario, m.testName}
		sum, exists := stats[key]
		if !exists {
			sum = &statsSum{}
			stats[key] = sum
			order = append(order, key)
		}

		sum.ttftSum += m.ttft
		sum.totalTimeSum += m.totalTime
		sum.tokensSum += m.totalTokens
		sum.iterations++
	}

	// Calculate averages
	averaged := make([]BenchmarkMetrics, 0, len(order))
	for _, key := range order {
		data := stats[key]
		count := data.iterations

		// A group whose requests produced no tokens has zero total time;
		// report 0 tokens/sec rather than NaN.
		var tokensPerSecond float64
		if data.totalTimeSum > 0 {
			tokensPerSecond = float64(data.tokensSum) / data.totalTimeSum.Seconds()
		}

		averaged = append(averaged, BenchmarkMetrics{
			model:           key.model,
			scenario:        key.scenario,
			testName:        key.testName,
			ttft:            data.ttftSum / time.Duration(count),
			totalTime:       data.totalTimeSum / time.Duration(count),
			totalTokens:     data.tokensSum / count,
			tokensPerSecond: tokensPerSecond,
		})
	}

	// Print formatted results
	printTableResults(averaged)
	printCSVResults(averaged)
}
// printTableResults displays averaged metrics in a formatted table.
//
// The table is written to standard output with tab-aligned columns, one row
// per entry in averaged. Durations are shown in milliseconds with two
// decimals; the conversion keeps the sub-millisecond part instead of
// truncating to whole milliseconds first.
func printTableResults(averaged []BenchmarkMetrics) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(w, "\nAVERAGED BENCHMARK RESULTS")
	fmt.Fprintln(w, "Model\tScenario\tTest Name\tTTFT (ms)\tTotal Time (ms)\tTokens\tTokens/sec")
	for _, m := range averaged {
		fmt.Fprintf(w, "%s\t%s\t%s\t%.2f\t%.2f\t%d\t%.2f\n",
			m.model,
			m.scenario,
			m.testName,
			float64(m.ttft)/float64(time.Millisecond),
			float64(m.totalTime)/float64(time.Millisecond),
			m.totalTokens,
			m.tokensPerSecond,
		)
	}
	w.Flush()
}
// printCSVResults writes the averaged metrics to stdout as CSV, one row per
// test case, preceded by a header row.
//
// Durations are printed as fractional milliseconds. Duration.Milliseconds
// truncates to a whole number, so using it with "%.2f" always printed ".00"
// and lost sub-millisecond precision.
//
// NOTE(review): fields are written verbatim; a model, scenario or test name
// containing a comma or quote would produce a malformed row.
func printCSVResults(averaged []BenchmarkMetrics) {
	fmt.Println("\nCSV OUTPUT")
	fmt.Println("model,scenario,test_name,ttft_ms,total_ms,tokens,tokens_per_sec")
	for _, m := range averaged {
		fmt.Printf("%s,%s,%s,%.2f,%.2f,%d,%.2f\n",
			m.model,
			m.scenario,
			m.testName,
			float64(m.ttft)/float64(time.Millisecond),
			float64(m.totalTime)/float64(time.Millisecond),
			m.totalTokens,
			m.tokensPerSecond,
		)
	}
}
// mustParse parses rawURL and returns the resulting URL. It panics with the
// parse error if rawURL is not a valid URL, so it is only suitable for
// inputs that are known to be well-formed.
func mustParse(rawURL string) *url.URL {
	parsed, parseErr := url.Parse(rawURL)
	if parseErr == nil {
		return parsed
	}
	panic(parseErr)
}

417
cache/cache.go vendored
View File

@@ -1,63 +1,420 @@
package cache
import (
"errors"
"fmt"
"log/slog"
"math"
"slices"
"github.com/ollama/ollama/ml"
)
type Options struct {
Position int
}
var ErrNotSupported = errors.New("model does not support operation")
type Cache interface {
// ** used by model implementations **
// Returns an instance of the cache for layer 'i'
Sub(i int) Cache
Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
// Returns the history of key and value tensors plus a mask
//
// The tensors are of shape embed dim, kv heads, batch size
// The mask is of shape history size, batch size
Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)
// Stores a batch of key and value in the cache
//
// The tensors must be of shape embed dim, kv heads, batch size
Put(ctx ml.Context, key, value ml.Tensor)
// ** cache management **
// Closes the cache and frees resources associated with it
Close()
// Called before the start of the model's forward pass. For each
// token in the coming batch, there must be a corresponding entry
// in positions and seqs.
StartForward(ctx ml.Context, positions []int32, seqs []int) error
// Copies tokens in the range [0, len) from srcSeq to dstSeq
CopyPrefix(srcSeq, dstSeq int, len int32)
// Removes tokens in the range [beginIndex, endIndex) from seq. Set
// endIndex to math.MaxInt32 to remove everything starting at beginIndex
Remove(seq int, beginIndex, endIndex int32) error
}
type Simple struct {
type Causal struct {
DType ml.DType
Capacity int
Capacity int32
// current forward pass
curLayer int
curLoc int
curBatchSize int
curMask ml.Tensor
curCellRange cellRange
// metadata
cells []cacheCell
cellRanges map[int]cellRange
// cache data storage
backend ml.Backend
cacheCtx ml.Context
keys, values []ml.Tensor
}
func (c *Simple) Sub(i int) Cache {
type (
	// seqCell records that a cache cell belongs to sequence seq, holding
	// the token at position pos within that sequence.
	seqCell struct {
		seq int
		pos int32
	}

	// cacheCell is the metadata for one cache slot. A slot may be shared
	// by several sequences; a slot with no sequences is free.
	cacheCell struct {
		sequences []seqCell
	}

	// cellRange is an inclusive range of cell indexes.
	cellRange struct {
		min int
		max int
	}
)

// findSeq returns a pointer to the cell's entry for seq, or nil if the cell
// does not belong to seq. The pointer refers to the element inside
// cell.sequences, so writes through it update the cell.
func (cell cacheCell) findSeq(seq int) *seqCell {
	for idx := 0; idx < len(cell.sequences); idx++ {
		if entry := &cell.sequences[idx]; entry.seq == seq {
			return entry
		}
	}

	return nil
}
// NewCausalCache creates a causal cache with room for capacity cells whose
// tensors are stored with the given dtype. It obtains a dedicated context
// from backend; the caller must call Close on the returned cache to close
// that context.
func NewCausalCache(backend ml.Backend, dtype ml.DType, capacity int32) Cache {
	cache := new(Causal)

	cache.Capacity = capacity
	cache.DType = dtype

	// All cells start out free and no sequence has a range yet.
	cache.cells = make([]cacheCell, capacity)
	cache.cellRanges = make(map[int]cellRange)

	cache.backend = backend
	cache.cacheCtx = backend.NewContext()

	return cache
}
// Close closes cacheCtx, the context from which Put allocates the per-layer
// key and value tensors.
func (c *Causal) Close() {
c.cacheCtx.Close()
}
// ErrKvCacheFull is returned (wrapped) by findStartLoc when the cache has no
// run of consecutive free cells large enough for the current batch.
var ErrKvCacheFull = errors.New("could not find a kv cache slot")
// StartForward prepares the cache for a forward pass over a batch.
//
// positions[i] and seqs[i] describe the i-th token of the batch, so the two
// slices must have the same, non-zero length. The method reserves one
// consecutive cell per token (defragmenting once if no free run is found),
// records the cell metadata, widens the per-sequence and current cell ranges,
// and builds the attention mask for the pass.
//
// It returns an error if the slices are inconsistent, if no slot can be found
// even after defragmenting, or if the mask cannot be created.
func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
	if len(positions) != len(seqs) {
		return fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(positions), len(seqs))
	}

	c.curBatchSize = len(positions)
	if c.curBatchSize < 1 {
		return errors.New("batch size cannot be less than 1")
	}

	loc, err := c.findStartLoc()
	if errors.Is(err, ErrKvCacheFull) {
		// Compact the cache and retry once before giving up.
		c.defrag()
		loc, err = c.findStartLoc()
	}
	c.curLoc = loc
	if err != nil {
		return err
	}

	c.curCellRange = newRange()
	for i, seq := range seqs {
		cellIdx := c.curLoc + i
		c.cells[cellIdx] = cacheCell{sequences: []seqCell{{seq: seq, pos: positions[i]}}}

		seqRange, known := c.cellRanges[seq]
		if !known {
			seqRange = newRange()
		}

		// Grow the sequence's range to include the new cell, then grow the
		// range visited by this pass to include the whole sequence.
		seqRange.max = max(seqRange.max, cellIdx)
		c.curCellRange.max = max(c.curCellRange.max, seqRange.max)

		seqRange.min = min(seqRange.min, cellIdx)
		c.curCellRange.min = min(c.curCellRange.min, seqRange.min)

		c.cellRanges[seq] = seqRange
	}

	mask, err := c.buildMask(ctx, positions, seqs)
	c.curMask = mask

	return err
}
// newRange returns an empty cell range: min starts at the largest int and max
// at zero so that the first cell index folded in replaces min, and max only
// ever grows.
func newRange() cellRange {
	var empty cellRange
	empty.min = math.MaxInt
	empty.max = 0

	return empty
}
// findStartLoc returns the index of the first run of at least curBatchSize
// consecutive free cells. If there is no such run it returns an error that
// wraps ErrKvCacheFull.
func (c *Causal) findStartLoc() (int, error) {
	runStart, runLen := 0, 0

	for i := range c.cells {
		if len(c.cells[i].sequences) != 0 {
			// Occupied cell: any run has to start after it.
			runStart, runLen = i+1, 0
			continue
		}

		runLen++
		if runLen >= c.curBatchSize {
			return runStart, nil
		}
	}

	return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
}
// buildMask creates the attention mask for the current batch.
//
// The mask has one row per batch token and one column per cell in
// curCellRange. An entry is negative infinity when the cell does not belong
// to the token's sequence or holds a later position than the token; all other
// entries are zero.
func (c *Causal) buildMask(ctx ml.Context, positions []int32, seqs []int) (ml.Tensor, error) {
	// TODO(jessegross): This makes a number of simplifications such as no padding,
	// which could be an issue for CUDA graphs and/or flash attention
	first, last := c.curCellRange.min, c.curCellRange.max
	width := last - first + 1
	masked := float32(math.Inf(-1))

	mask := make([]float32, c.curBatchSize*width)
	for row := range c.curBatchSize {
		for cellIdx := first; cellIdx <= last; cellIdx++ {
			entry := c.cells[cellIdx].findSeq(seqs[row])
			if entry != nil && entry.pos <= positions[row] {
				// Visible: same sequence and not in the token's future.
				continue
			}

			mask[row*width+(cellIdx-first)] = masked
		}
	}

	return ctx.FromFloatSlice(mask, width, c.curBatchSize)
}
// moveCell queues, for every tensor in objs, a copy of len consecutive cells
// starting at cell src to the cells starting at cell dst. The copies are only
// added to ctx's graph; the caller is responsible for computing it.
//
// Entries of objs that are nil are skipped: Sub pads the per-layer slices
// with nil entries and Put only allocates a layer on first use, so a layer
// that has not stored anything yet has no data to move (and calling a method
// on it would panic).
//
// NOTE(review): offsets are obj.Stride(2) per cell and sizes are
// Dim(0)*Dim(1) elements per cell, mirroring how Put addresses the cache;
// confirm the units against ml.Tensor.View.
func moveCell(ctx ml.Context, objs []ml.Tensor, src, dst, len int) {
	for _, obj := range objs {
		if obj == nil {
			continue
		}

		cellStride := int(obj.Stride(2))
		size := int(obj.Dim(0)*obj.Dim(1)) * len

		srcView := obj.View(ctx, cellStride*src, size)
		dstView := obj.View(ctx, cellStride*dst, size)

		ctx.Forward(srcView.Copy(ctx, dstView))
	}
}
// defrag compacts the cache by moving occupied cells from the end of the
// cache into free cells at the beginning, then recomputes the cell range of
// every known sequence.
//
// The position of a cell inside the cache carries no meaning (visibility is
// decided from the cell metadata), so cells may be placed in any order as
// long as each cell's tensor data and metadata end up at the same index.
func (c *Causal) defrag() {
	slog.Debug("defragmenting kv cache")

	// Defrag strategy:
	// - Search for empty holes at the beginning of the cache,
	//   filling them with active data starting at the end
	// - If there are contiguous elements that need to be moved,
	//   combine them into a single operation by holding new moves
	//   until we see the next one is non-contiguous
	// - Fill up the context with the maximum number of operations it
	//   can hold then compute that and continue with a new context
	//
	// We could try to optimize placement by grouping blocks from
	// the same sequences together but most likely the next forward
	// pass will disrupt this anyways, so the real world benefit
	// seems limited at this time.

	ctx := c.backend.NewContext()

	// For every move, 6 tensors are required per layer (2 views and a
	// copy for each of k and v). For efficiency, we try to group
	// multiple contiguous blocks into a single move. However, if we
	// exceed the maximum number of tensors then we need to compute
	// what we have and start a new batch.
	//
	// max(..., 1) avoids an integer division by zero when no layer has been
	// registered yet (e.g. the very first batch is larger than the cache).
	maxMoves := ctx.MaxTensors() / (6 * max(len(c.keys), 1))
	moves := 0

	// The pending block describes one not-yet-queued tensor copy: pendingLen
	// cells starting at pendingSrc go to the cells starting at pendingDst.
	// Invariant: cells[pendingDst+k] holds the metadata that used to be at
	// pendingSrc+k, i.e. metadata is laid out in the same order in which the
	// tensor copy will lay out the data.
	var pendingSrc, pendingDst, pendingLen int

	// flushPending queues the tensor copies for the pending block, if any.
	// Without layers there is no tensor data, so only metadata is compacted.
	flushPending := func() {
		if pendingLen == 0 || len(c.keys) == 0 {
			return
		}

		moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
		moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
		moves++
	}

	for dst := range c.cells {
		if len(c.cells[dst].sequences) == 0 {
			for src := len(c.cells) - 1; src > dst; src-- {
				if len(c.cells[src].sequences) == 0 {
					continue
				}

				if pendingLen > 0 && src == pendingSrc-1 && dst == pendingDst+pendingLen {
					// The source block grows downwards while the destination
					// block grows upwards. A single block copy keeps the
					// source order, so the new (lowest) source cell has to
					// become the first destination cell: shift the metadata
					// already placed one slot up into the hole at dst and put
					// the new cell in front of it.
					copy(c.cells[pendingDst+1:dst+1], c.cells[pendingDst:dst])
					c.cells[pendingDst] = c.cells[src]
					c.cells[src] = cacheCell{}

					pendingSrc = src
					pendingLen++

					break
				}

				// Not contiguous with the pending block: queue it and start
				// a new one with this cell.
				flushPending()

				c.cells[dst] = c.cells[src]
				c.cells[src] = cacheCell{}

				pendingSrc = src
				pendingDst = dst
				pendingLen = 1

				break
			}
		}

		if moves >= maxMoves {
			ctx.Compute(nil)
			ctx.Close()
			ctx = c.backend.NewContext()

			moves = 0
		}
	}

	flushPending()

	if moves > 0 {
		ctx.Compute(nil)
	}
	ctx.Close()

	// Cells have moved, so every sequence's range has to be rebuilt from the
	// metadata.
	for seq := range c.cellRanges {
		seqRange := newRange()

		for i, cell := range c.cells {
			if cell.findSeq(seq) != nil {
				if i < seqRange.min {
					seqRange.min = i
				}
				if i > seqRange.max {
					seqRange.max = i
				}
			}
		}

		c.cellRanges[seq] = seqRange
	}
}
func (c *Causal) Sub(i int) Cache {
if i >= len(c.keys) {
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
}
return &Simple{
keys: c.keys[i : i+1],
values: c.values[i : i+1],
Capacity: c.Capacity,
DType: c.DType,
}
c.curLayer = i
return c
}
func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
if c.keys[0] == nil || c.values[0] == nil {
c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
}
func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
key := c.keys[c.curLayer]
value := c.values[c.curLayer]
ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
n := min(c.Capacity, int(key.Dim(2))+opts.Position)
key = c.keys[0].View(ctx, 0,
key = key.View(ctx, int(key.Stride(2))*c.curCellRange.min,
int(key.Dim(0)), int(key.Stride(1)),
int(key.Dim(1)), int(key.Stride(2)),
n,
int(c.curMask.Dim(0)),
)
value = c.values[0].View(ctx, 0,
value = value.View(ctx, int(key.Stride(2))*c.curCellRange.min,
int(value.Dim(0)), int(value.Stride(1)),
int(value.Dim(1)), int(value.Stride(2)),
n,
int(c.curMask.Dim(0)),
)
// TODO shift context if necessary
return key, value
return key, value, c.curMask
}
func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
if c.curBatchSize != int(key.Dim(2)) {
panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, int(key.Dim(2))))
}
if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
c.keys[c.curLayer] = c.cacheCtx.Zeros(c.DType, key.Dim(0), key.Dim(1), int64(c.Capacity))
c.values[c.curLayer] = c.cacheCtx.Zeros(c.DType, value.Dim(0), value.Dim(1), int64(c.Capacity))
}
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, int(key.Stride(2))*c.curLoc, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, int(value.Stride(2))*c.curLoc, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
}
// CopyPrefix makes dstSeq share the cells of srcSeq whose position is in
// [0, len). Any cells previously owned by dstSeq are released from it first,
// so afterwards dstSeq consists of exactly the copied prefix. No tensor data
// is copied; only the cell metadata and dstSeq's cell range are updated.
func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
	seqRange := newRange()

	for i := range c.cells {
		cell := &c.cells[i]

		// Take a snapshot of the source position before touching the slice.
		// findSeq returns a pointer into cell.sequences, and
		// slices.DeleteFunc compacts that slice in place (and zeroes the
		// vacated tail), so a pointer obtained earlier could refer to a
		// different or zeroed element after the deletion below.
		var srcPos int32
		srcCellSeq := cell.findSeq(srcSeq)
		hasSrc := srcCellSeq != nil
		if hasSrc {
			srcPos = srcCellSeq.pos
		}

		if cell.findSeq(dstSeq) != nil {
			cell.sequences = slices.DeleteFunc(cell.sequences, func(s seqCell) bool { return s.seq == dstSeq })
		}

		if hasSrc && srcPos < len {
			cell.sequences = append(cell.sequences, seqCell{seq: dstSeq, pos: srcPos})
			if i < seqRange.min {
				seqRange.min = i
			}
			if i > seqRange.max {
				seqRange.max = i
			}
		}
	}

	c.cellRanges[dstSeq] = seqRange
}
// shift is a placeholder that is not implemented yet: it always panics and
// never returns an error. Remove calls it whenever endIndex is not
// math.MaxInt32, after it has already updated the cell metadata, so removing
// anything other than a suffix of a sequence currently panics.
func (c *Causal) shift(seq int, beginIndex, offset int32) error {
panic("Shift not yet implemented")
}
// Remove drops the tokens of seq whose position is in [beginIndex, endIndex)
// from the cache metadata and rebuilds the cell range of seq.
//
// Passing math.MaxInt32 as endIndex removes everything from beginIndex
// onwards. For any other endIndex the positions at or after endIndex are
// moved down by the size of the removed span and shift is called for the
// sequence; an error from shift is returned and the stored cell range is left
// unchanged in that case.
func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
	needsShift := endIndex != math.MaxInt32

	var offset int32
	if needsShift {
		offset = beginIndex - endIndex
	}

	seqRange := newRange()
	for i := range c.cells {
		cellSeq := c.cells[i].findSeq(seq)

		switch {
		case cellSeq == nil:
			// Cell is not part of this sequence.
			continue
		case cellSeq.pos >= beginIndex && cellSeq.pos < endIndex:
			c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == seq })
			continue
		case cellSeq.pos >= endIndex:
			cellSeq.pos += offset
		}

		// The cell stays in the sequence, so it counts towards the range.
		seqRange.min = min(seqRange.min, i)
		seqRange.max = max(seqRange.max, i)
	}

	if needsShift {
		if err := c.shift(seq, endIndex, offset); err != nil {
			return err
		}
	}

	c.cellRanges[seq] = seqRange

	return nil
}

47
cache/tensor.go vendored Normal file
View File

@@ -0,0 +1,47 @@
package cache
import (
"github.com/ollama/ollama/ml"
)
// TensorCache stores one key tensor and one value tensor per layer. Unlike
// Causal it keeps no per-token metadata: Put overwrites the layer's stored
// tensors and Get returns them as they are, without a mask.
type TensorCache struct {
	// layer selected by the most recent call to Sub
	curLayer int

	// context the stored tensors are allocated from
	cacheCtx ml.Context

	// per-layer storage, indexed by layer; nil until the first Put
	keys, values []ml.Tensor
}

// NewTensorCache creates an empty cache that allocates its tensors from a
// new context of backend. Call Close to close that context.
func NewTensorCache(backend ml.Backend) *TensorCache {
	cache := new(TensorCache)
	cache.cacheCtx = backend.NewContext()

	return cache
}

// Close closes the context the cached tensors are allocated from.
func (c *TensorCache) Close() {
	c.cacheCtx.Close()
}

// Sub selects layer i for subsequent Get and Put calls, growing the
// per-layer storage if i has not been seen before. It returns c itself, not
// a separate per-layer object.
func (c *TensorCache) Sub(i int) *TensorCache {
	if missing := i - len(c.keys) + 1; missing > 0 {
		c.keys = append(c.keys, make([]ml.Tensor, missing)...)
		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
	}

	c.curLayer = i

	return c
}

// Get returns the key and value tensors stored for the current layer. The
// third result (the mask) is always nil. ctx is unused.
func (c *TensorCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	layer := c.curLayer

	return c.keys[layer], c.values[layer], nil
}

// Put copies key and value into the storage of the current layer. The
// storage is allocated on first use with the dtype and shape of the given
// tensors.
func (c *TensorCache) Put(ctx ml.Context, key, value ml.Tensor) {
	layer := c.curLayer

	if c.keys[layer] == nil || c.values[layer] == nil {
		c.keys[layer] = c.cacheCtx.Zeros(key.DType(), key.Shape()...)
		c.values[layer] = c.cacheCtx.Zeros(value.DType(), value.Shape()...)
	}

	keyCopy := key.Copy(ctx, c.keys[layer])
	ctx.Forward(keyCopy)

	valueCopy := value.Copy(ctx, c.values[layer])
	ctx.Forward(valueCopy)
}

View File

@@ -35,9 +35,9 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/llama/runner"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/runner"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
@@ -338,7 +338,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return err
}
opts.MultiModal = len(info.ProjectorInfo) != 0
// TODO(jessegross): We should either find another way to know if this is
// a vision model or remove the logic. Also consider that other modalities will
// need different behavior anyways.
opts.MultiModal = true
opts.ParentModel = info.Details.ParentModel
if interactive {

View File

@@ -4,7 +4,7 @@ import (
"fmt"
"os"
"github.com/ollama/ollama/llama/runner"
"github.com/ollama/ollama/runner"
)
func main() {

View File

@@ -2,6 +2,7 @@ package convert
import "github.com/ollama/ollama/fs/ggml"
type qwen2Model struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`

View File

@@ -9,6 +9,8 @@ import (
"path/filepath"
"runtime"
"strings"
"github.com/ollama/ollama/envconfig"
)
// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -39,10 +41,13 @@ func commonAMDValidateLibDir() (string, error) {
// Favor our bundled version
// Installer payload location if we're running the installed binary
rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
// Prefer explicit HIP env var

View File

@@ -77,7 +77,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
gfxOverride := envconfig.HsaOverrideGfxVersion()
var supported []string
var libDir string
depPaths := LibraryDirs()
libDir := ""
// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
@@ -352,8 +353,9 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
})
return nil, err
}
depPaths = append(depPaths, libDir)
}
gpuInfo.DependencyPath = []string{libDir}
gpuInfo.DependencyPath = depPaths
if gfxOverride == "" {
// Only load supported list once

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"slices"
"strconv"
@@ -49,13 +50,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
slog.Info(err.Error())
return nil, err
}
depPaths := LibraryDirs()
libDir, err := AMDValidateLibDir()
if err != nil {
err = fmt.Errorf("unable to verify rocm library: %w", err)
slog.Warn(err.Error())
return nil, err
}
depPaths = append(depPaths, libDir)
var supported []string
gfxOverride := envconfig.HsaOverrideGfxVersion()
@@ -111,7 +113,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
UnreliableFreeMemory: true,
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
DependencyPath: []string{libDir},
DependencyPath: depPaths,
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
@@ -162,7 +164,9 @@ func AMDValidateLibDir() (string, error) {
}
// Installer payload (if we're running from some other location)
rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil

View File

@@ -23,6 +23,7 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/runners"
)
type cudaHandles struct {
@@ -100,7 +101,15 @@ func initCudaHandles() *cudaHandles {
// Aligned with driver, we can't carry as payloads
nvcudaMgmtPatterns := NvcudaGlobs
cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
if runtime.GOOS == "windows" {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
}
libDirs := LibraryDirs()
for _, d := range libDirs {
cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName))
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
if len(NvmlGlobs) > 0 {
@@ -231,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
depPaths := LibraryDirs()
details, err := GetCPUDetails()
if err != nil {
slog.Warn("failed to lookup CPU details", "error", err)
@@ -239,9 +248,11 @@ func GetGPUInfo() GpuInfoList {
cpus = []CPUInfo{
{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
ID: "0",
memInfo: mem,
Library: "cpu",
Variant: runners.GetCPUCapability().String(),
ID: "0",
DependencyPath: depPaths,
},
CPUs: details,
},
@@ -283,13 +294,17 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
variant := cudaVariant(gpuInfo)
// Start with our bundled libraries
if variant != "" {
variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
if _, err := os.Stat(variantPath); err == nil {
// Put the variant directory first in the search path to avoid runtime linking to the wrong library
gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
if depPaths != nil {
gpuInfo.DependencyPath = depPaths
// Check for variant specific directory
if variant != "" {
for _, d := range depPaths {
if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil {
// Put the variant directory first in the search path to avoid runtime linking to the wrong library
gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...)
break
}
}
}
}
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
@@ -361,7 +376,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = []string{LibOllamaPath}
gpuInfo.DependencyPath = depPaths
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}
@@ -497,30 +512,33 @@ func GetGPUInfo() GpuInfoList {
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
gpuLibPaths := []string{}
slog.Debug("Searching for GPU library", "name", baseLibName)
// search our bundled libraries first
patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}
var ldPaths []string
switch runtime.GOOS {
case "windows":
ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator))
case "linux":
ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator))
// Start with our bundled libraries
patterns := []string{}
for _, d := range LibraryDirs() {
patterns = append(patterns, filepath.Join(d, baseLibName))
}
// then search the system's LD_LIBRARY_PATH
for _, p := range ldPaths {
p, err := filepath.Abs(p)
switch runtime.GOOS {
case "windows":
ldPaths = strings.Split(os.Getenv("PATH"), ";")
case "linux":
ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
default:
return gpuLibPaths
}
// Then with whatever we find in the PATH/LD_LIBRARY_PATH
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
patterns = append(patterns, filepath.Join(p, baseLibName))
patterns = append(patterns, filepath.Join(d, baseLibName))
}
// finally, search the default patterns provided by the caller
patterns = append(patterns, defaultPatterns...)
slog.Debug("gpu library search", "globs", patterns)
for _, pattern := range patterns {
@@ -697,6 +715,23 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
}
}
// LibraryDirs returns the directories in which bundled dependency libraries
// are looked up: currently only the "lib/ollama" directory located relative
// to the running executable, and only if it exists. It returns nil when the
// executable path cannot be determined (a warning is logged) or when the
// directory is missing.
func LibraryDirs() []string {
	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
	// This can be simplified once we no longer carry runners as payloads
	exePath, lookupErr := os.Executable()
	if lookupErr != nil {
		slog.Warn("failed to lookup executable path", "error", lookupErr)
		return nil
	}

	candidate := filepath.Join(filepath.Dir(exePath), envconfig.LibRelativeToExe(), "lib", "ollama")
	if _, statErr := os.Stat(candidate); statErr == nil {
		return []string{candidate}
	}

	return nil
}
func GetSystemInfo() SystemInfo {
gpus := GetGPUInfo()
gpuMutex.Lock()

View File

@@ -15,6 +15,7 @@ import (
"syscall"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/runners"
)
const (
@@ -27,6 +28,7 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
Variant: runners.GetCPUCapability().String(),
memInfo: mem,
},
}
@@ -49,6 +51,7 @@ func GetCPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
Variant: runners.GetCPUCapability().String(),
memInfo: mem,
},
}

View File

@@ -1,53 +0,0 @@
package discover
import (
"os"
"path/filepath"
"runtime"
)
// LibOllamaPath is the directory in which dynamic libraries are looked up.
// It is computed once, when the package is initialized.
//
// A development build directory wins if one exists: 'build/lib/ollama' next
// to the (symlink-resolved) executable or under the current working
// directory, checked in that order. Otherwise the distribution layout is
// used: 'lib/ollama' next to the executable on Windows, '../lib/ollama' on
// Linux, and the executable's own directory on every other OS (e.g. macOS).
//
// The value is the empty string if the executable path, its symlink
// resolution, or the working directory cannot be determined.
//
// Note: in distribution builds, additional GPU-specific libraries are
// found in subdirectories of the returned path, such as
// 'cuda_v11', 'cuda_v12', 'rocm', etc.
var LibOllamaPath string = func() string {
exe, err := os.Executable()
if err != nil {
return ""
}
exe, err = filepath.EvalSymlinks(exe)
if err != nil {
return ""
}
// default: the directory containing the executable
libPath := filepath.Dir(exe)
switch runtime.GOOS {
case "windows":
libPath = filepath.Join(filepath.Dir(exe), "lib", "ollama")
case "linux":
libPath = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
}
cwd, err := os.Getwd()
if err != nil {
return ""
}
// build paths for development
buildPaths := []string{
filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"),
filepath.Join(cwd, "build", "lib", "ollama"),
}
// the first build path that exists takes precedence over libPath
for _, p := range buildPaths {
if _, err := os.Stat(p); err == nil {
return p
}
}
return libPath
}()

View File

@@ -5,6 +5,7 @@ import (
"log/slog"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/runners"
)
type memInfo struct {
@@ -106,7 +107,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
if info.Variant != runners.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
for i, lib := range libs {

View File

@@ -1,120 +1,165 @@
# Development
Install prerequisites:
Install required tools:
- [Go](https://go.dev/doc/install)
- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://jmeubank.github.io/tdm-gcc/download/) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.
- go version 1.22 or higher
- OS specific C/C++ compiler (see below)
- GNU Make
Then build and run Ollama from the root directory of the repository:
```
go run . serve
## Overview
Ollama uses a mix of Go and C/C++ code to interface with GPUs. The C/C++ code is compiled with both CGO and GPU library specific compilers. A set of GNU Makefiles are used to compile the project. GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary. The default make target will build the runners and primary Go Ollama application that will run within the repo directory. Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build. You can adjust the job count based on your CPU Core count to reduce build times. If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location. To learn more about the other make targets use `make help`
Once you have built the GPU/CPU runners, you can compile the main application with `go build .`
### MacOS
[Download Go](https://go.dev/dl/)
```bash
make -j 5
```
## macOS (Apple Silicon)
Now you can run `ollama`:
macOS Apple Silicon supports Metal which is built-in to the Ollama binary. No additional steps are required.
## macOS (Intel)
Install prerequisites:
- [CMake](https://cmake.org/download/) or `brew install cmake`
Then, configure and build the project:
```
cmake -B build
cmake --build build
```bash
./ollama
```
Lastly, run Ollama:
#### Xcode 15 warnings
If you are using Xcode newer than version 14, you may see a warning during `go build` about `ld: warning: ignoring duplicate libraries: '-lobjc'` due to Golang issue https://github.com/golang/go/issues/67799 which can be safely ignored. You can suppress the warning with `export CGO_LDFLAGS="-Wl,-no_warn_duplicate_libraries"`
### Linux
#### Linux CUDA (NVIDIA)
_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
development and runtime packages.
Typically the makefile will auto-detect CUDA, however, if your Linux distro
or installation approach uses alternative paths, you can specify the location by
overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES=50;60;70`)
```
go run . serve
make -j 5
```
## Windows
If both v11 and v12 toolkits are detected, runners for both major versions will be built by default. You can build just v12 with `make cuda_v12`
Install prerequisites:
#### Older Linux CUDA (NVIDIA)
- [CMake](https://cmake.org/download/)
- [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/) including the Native Desktop Workload
- (Optional) AMD GPU support
- [ROCm](https://rocm.github.io/install.html)
- [Ninja](https://github.com/ninja-build/ninja/releases)
- (Optional) NVIDIA GPU support
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11). When you build Ollama, you will need to set two make variables to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`. To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
> [!IMPORTANT]
> Ensure prerequisites are in `PATH` before running CMake.
#### Linux ROCm (AMD)
> [!IMPORTANT]
> ROCm is not compatible with Visual Studio CMake generators. Use `-GNinja` when configuring the project.
_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
> [!IMPORTANT]
> CUDA is only compatible with Visual Studio CMake generators.
Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
Then, configure and build the project:
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `HIP_PATH` to the location of the ROCm
install (typically `/opt/rocm`). You can also customize
the AMD GPU targets by setting HIP_ARCHS (e.g. `HIP_ARCHS=gfx1101;gfx1102`)
```
cmake -B build
cmake --build build --config Release
make -j 5
```
Lastly, run Ollama:
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Containerized Linux Build
If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist` and by default the script builds both arm64 and amd64 binaries. If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
### Windows
The following tools are required as a minimal development environment to build CPU inference support.
- Go version 1.22 or higher
- https://go.dev/dl/
- Git
- https://git-scm.com/download/win
- clang with gcc compat and Make. There are multiple options on how to go about installing these tools on Windows. We have verified the following, but others may work as well:
- [MSYS2](https://www.msys2.org/)
- After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
- Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
> [!NOTE]
> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.
```
go run . serve
make -j 5
```
## Windows (ARM)
#### GPU Support
Windows ARM does not support additional acceleration libraries at this time.
The GPU tools require the Microsoft native build tools. To build either CUDA or ROCm, you must first install MSVC via Visual Studio:
## Linux
- Make sure to select `Desktop development with C++` as a Workload during the Visual Studio install
- You must complete the Visual Studio install and run it once **BEFORE** installing CUDA or ROCm for the tools to properly register
- Add the location of the **64 bit (x64)** compiler (`cl.exe`) to your `PATH`
- Note: the default Developer Shell may configure the 32 bit (x86) compiler which will lead to build failures. Ollama requires a 64 bit toolchain.
Install prerequisites:
#### Windows CUDA (NVIDIA)
- [CMake](https://cmake.org/download/) or `sudo apt install cmake` or `sudo dnf install cmake`
- (Optional) AMD GPU support
- [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
- (Optional) NVIDIA GPU support
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
In addition to the common Windows development tools and MSVC described above:
> [!IMPORTANT]
> Ensure prerequisites are in `PATH` before running CMake.
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
#### Windows ROCm (AMD Radeon)
Then, configure and build the project:
In addition to the common Windows development tools and MSVC described above:
```
cmake -B build
cmake --build build
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
#### Windows arm64
The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
```powershell
import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
```
Lastly, run Ollama:
You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
```
go run . serve
pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
```
## Docker
You will need to ensure your PATH includes go, cmake, gcc, clang and mingw32-make to build ollama from source (typically `C:\msys64\clangarm64\bin\`).
## Advanced CPU Vector Settings
On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load. If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled. This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility. Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build.
To build without any vector flags:
```
docker build .
make CUSTOM_CPU_FLAGS=""
```
### ROCm
To build with both AVX and AVX2:
```
docker build --build-arg FLAVOR=rocm .
make CUSTOM_CPU_FLAGS=avx,avx2
```
## Running tests
To run tests, use `go test`:
To build with AVX512 features turned on:
```
go test ./...
make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
```
> [!NOTE]
> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags

View File

@@ -67,6 +67,8 @@ To use this:
3. `ollama run choose-a-model-name`
4. Start using the model!
More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
```bash

View File

@@ -165,6 +165,8 @@ var (
IntelGPU = Bool("OLLAMA_INTEL_GPU")
// MultiUserCache optimizes prompt caching for multi-user scenarios
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
// Enable the new Ollama engine
NewRunners = Bool("OLLAMA_NEW_RUNNERS")
)
func String(s string) func() string {
@@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_NEW_RUNNERS": {"OLLAMA_NEW_RUNNERS", NewRunners(), "Enable the new Ollama engine"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
@@ -288,3 +291,12 @@ func Values() map[string]string {
// Var returns the value of the environment variable named by key after
// cleaning it up: surrounding whitespace is removed first, then any leading
// or trailing double-quote or single-quote characters are stripped.
func Var(key string) string {
	raw := os.Getenv(key)
	unpadded := strings.TrimSpace(raw)
	return strings.Trim(unpadded, "\"'")
}
// LibRelativeToExe returns the relative path to use when locating libraries
// from the directory that holds the executable. On Windows the binary is kept
// at the top directory, so the result is "."; other platforms place the binary
// in a "bin" directory, so the result is "..".
func LibRelativeToExe() string {
	switch runtime.GOOS {
	case "windows":
		return "."
	default:
		return ".."
	}
}

View File

@@ -1,53 +1,157 @@
# `llama`
This package provides Go bindings to [llama.cpp](https://github.com/ggerganov/llama.cpp).
This package integrates the [llama.cpp](https://github.com/ggerganov/llama.cpp) library as a Go package and makes it easy to build it with tags for different CPU and GPU processors.
Supported:
- [x] CPU
- [x] avx, avx2
- [x] macOS Metal
- [x] Windows CUDA
- [x] Windows ROCm
- [x] Linux CUDA
- [x] Linux ROCm
- [x] Llava
Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these, shared libraries are created:
- `ggml_cuda.dll` on Windows or `ggml_cuda.so` on Linux
- `ggml_hipblas.dll` on Windows or `ggml_hipblas.so` on Linux
> Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a crash. In a future change the same runtime should be used in both cases to avoid crashes.
## Building
```
go build .
```
### AVX
```shell
go build -tags avx .
```
### AVX2
```shell
# go doesn't recognize `-mfma` as a valid compiler flag
# see https://github.com/golang/go/issues/17895
go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
go build -tags=avx,avx2 .
```
## Linux
### CUDA
Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
```shell
make ggml_cuda.so
go build -tags avx,cuda .
```
### ROCm
Install [ROCm](https://rocm.docs.amd.com/en/latest/).
```shell
make ggml_hipblas.so
go build -tags avx,rocm .
```
## Windows
Download [w64devkit](https://github.com/skeeto/w64devkit/releases/latest) for a simple MinGW development environment.
### CUDA
Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build the cuda code:
```shell
make ggml_cuda.dll
go build -tags avx,cuda .
```
### ROCm
Install [ROCm](https://rocm.docs.amd.com/en/latest/).
```shell
make ggml_hipblas.dll
go build -tags avx,rocm .
```
## Building runners
```shell
# build all runners for this platform
make -j
```
## Vendoring
Ollama vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/llama.cpp/tree/master/ggml/src). While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit.
Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit. A set of make targets is available to aid developers in updating to a newer tracking commit, or to work on changes.
If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
```
make -f Makefile.sync apply-patches
make apply-patches
```
### Updating Base Commit
**Pin to new base commit**
To change the base commit, update `FETCH_HEAD` in Makefile.sync.
To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
#### Applying patches
When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
```
make -f Makefile.sync apply-patches
make apply-patches
```
If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
Once all patches are applied, commit the changes to the tracking repository.
If you see an error message about a conflict, go into the `./vendor/` directory and resolve the merge conflicts in the patch commit which failed, using your preferred tool. Save the file(s) and continue the patch series with `git am --continue`. If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
```
make -f Makefile.sync format-patches sync
make create-patches sync
```
Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
### Generating Patches
When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
```
make -f Makefile.sync clean apply-patches
make apply-patches
```
Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
```
make sync
make -j 8
go build .
```
> [!IMPORTANT]
> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
```
make -f Makefile.sync format-patches
make create-patches
```
> [!IMPORTANT]
> Once you have completed this step, it is safe to run `apply-patches` since your change is preserved in the patches.
In your `./vendor/` directory, create a branch, and cherry-pick the new commit to that branch, then submit a PR upstream to llama.cpp.
Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches.

View File

@@ -90,8 +90,6 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
// NewLlamaServer will run a server for the given GPUs
// The gpu list must be a single family.
func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var err error
systemInfo := discover.GetSystemInfo()
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
@@ -103,8 +101,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
gpus = discover.GetCPUInfo()
}
estimate := EstimateGPULayers(gpus, f, projectors, opts)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
var estimate MemoryEstimate
if len(gpus) == 1 && gpus[0].Library == "cpu" {
estimate = EstimateGPULayers(gpus, f, projectors, opts)
} else {
estimate = EstimateGPULayers(gpus, f, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
@@ -231,209 +233,152 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
params = append(params, "--multiuser-cache")
}
// get available libraries
exe, err := os.Executable()
if err != nil {
return nil, fmt.Errorf("could not get libollama dir: %w", err)
return nil, err
}
entries, err := os.ReadDir(discover.LibOllamaPath)
if err != nil {
return nil, fmt.Errorf("could not read libollama dir: %w", err)
}
libs := make(map[string]string)
for _, entry := range entries {
if entry.IsDir() {
libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed ", "error", err)
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
finalParams := []string{"runner"}
if envconfig.NewRunners() {
finalParams = append(finalParams, "--new-runner")
}
finalParams = append(finalParams, params...)
finalParams = append(finalParams, "--port", strconv.Itoa(port))
lib := gpus[0].RunnerName()
requested := envconfig.LLMLibrary()
if libs[requested] != "" {
slog.Info("using requested gpu library", "requested", requested)
lib = requested
pathEnv := "LD_LIBRARY_PATH"
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// Start with the server directory for the LD_LIBRARY_PATH/PATH
libraryPaths := []string{filepath.Dir(exe)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// favor our bundled library dependencies over system libraries
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
var compatible []string
for k := range libs {
// exact match first
if k == lib {
compatible = append([]string{k}, compatible...)
continue
}
// then match the family (e.g. 'cuda')
if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
compatible = append(compatible, k)
}
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
// assume gpus from the same library have the same dependency path
libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
}
slog.Debug("compatible gpu libraries", "compatible", compatible)
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed ", "error", err)
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
finalParams := []string{"runner"}
finalParams = append(finalParams, params...)
finalParams = append(finalParams, "--port", strconv.Itoa(port))
// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
s := &llmServer{
port: port,
cmd: exec.Command(exe, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
modelPath: model,
estimate: estimate,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
gpus: gpus,
done: make(chan error, 1),
}
pathEnv := "LD_LIBRARY_PATH"
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
var libraryPaths []string
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append(libraryPaths, libpath)
}
}
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
// assume gpus from the same library have the same dependency path
libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
}
// finally, add the root library path
libraryPaths = append(libraryPaths, discover.LibOllamaPath)
exe, err := os.Executable()
if err != nil {
return nil, fmt.Errorf("unable to lookup executable path: %w", err)
}
exe, err = filepath.EvalSymlinks(exe)
if err != nil {
return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
}
// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
s := &llmServer{
port: port,
cmd: exec.Command(exe, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
modelPath: model,
estimate: estimate,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
gpus: gpus,
done: make(chan error, 1),
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path and visible devices variable with our adjusted version
pathNeeded := true
devicesNeeded := visibleDevicesEnv != ""
for i := range s.cmd.Env {
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
} else if len(envWorkarounds) != 0 {
for _, kv := range envWorkarounds {
if strings.EqualFold(cmp[0], kv[0]) {
s.cmd.Env[i] = kv[0] + "=" + kv[1]
}
// Update or add the path and visible devices variable with our adjusted version
pathNeeded := true
devicesNeeded := visibleDevicesEnv != ""
for i := range s.cmd.Env {
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
} else if len(envWorkarounds) != 0 {
for _, kv := range envWorkarounds {
if strings.EqualFold(cmp[0], kv[0]) {
s.cmd.Env[i] = kv[0] + "=" + kv[1]
}
}
}
if pathNeeded {
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
}
if devicesNeeded {
s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
}
slog.Info("starting llama server", "cmd", s.cmd.String())
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "GPU_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
filteredEnv = append(filteredEnv, ev)
}
}
// Log at debug as the environment is inherited and might contain sensitive information
slog.Debug("subprocess", "environment", filteredEnv)
}
if err = s.cmd.Start(); err != nil {
var msg string
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
err := fmt.Errorf("error starting runner: %v %s", err, msg)
if len(compatible) == 0 {
return nil, err
}
slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
compatible = compatible[1:]
continue
}
// reap subprocess when it exits
go func() {
err := s.cmd.Wait()
// Favor a more detailed message over the process exit status
if err != nil && s.status != nil && s.status.LastErrMsg != "" {
slog.Error("llama runner terminated", "error", err)
if strings.Contains(s.status.LastErrMsg, "unknown model") {
s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
}
s.done <- errors.New(s.status.LastErrMsg)
} else {
s.done <- err
}
}()
return s, nil
}
if pathNeeded {
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
}
if devicesNeeded {
s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
}
slog.Info("starting llama server", "cmd", s.cmd.String())
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "GPU_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
filteredEnv = append(filteredEnv, ev)
}
}
// Log at debug as the environment is inherited and might contain sensitive information
slog.Debug("subprocess", "environment", filteredEnv)
}
if err = s.cmd.Start(); err != nil {
// Detect permission denied and augment the message about noexec
if errors.Is(err, os.ErrPermission) {
return nil, fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, exe)
}
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return nil, fmt.Errorf("error starting the external llama server: %v %s", err, msg)
}
// reap subprocess when it exits
go func() {
err := s.cmd.Wait()
// Favor a more detailed message over the process exit status
if err != nil && s.status != nil && s.status.LastErrMsg != "" {
slog.Debug("llama runner terminated", "error", err)
if strings.Contains(s.status.LastErrMsg, "unknown model") {
s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
}
s.done <- errors.New(s.status.LastErrMsg)
} else {
s.done <- err
}
}()
return s, nil
}
type ServerStatus int

View File

@@ -18,8 +18,8 @@ const config: ForgeConfig = {
asar: true,
icon: './assets/icon.icns',
extraResource: [
path.join(__dirname, '../dist/darwin/ollama'),
...fs.readdirSync(path.join(__dirname, '../dist/darwin/amd64')).map(f => path.join(__dirname, '../dist/darwin/amd64', f)),
'../dist/ollama',
'../dist/darwin-amd64/lib',
path.join(__dirname, './assets/iconTemplate.png'),
path.join(__dirname, './assets/iconTemplate@2x.png'),
path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -43,7 +43,7 @@ const config: ForgeConfig = {
}
: {}),
osxUniversal: {
x64ArchFiles: '*',
x64ArchFiles: '**/ollama*',
},
},
rebuildConfig: {},

View File

@@ -43,12 +43,13 @@ func NewBackend(f *os.File) (Backend, error) {
}
type Context interface {
Zeros(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int64) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
FromIntSlice(s []int32, shape ...int) (Tensor, error)
Forward(Tensor)
Compute(Tensor) Tensor
MaxTensors() int
Close() error
}

View File

@@ -23,7 +23,7 @@ import (
"github.com/ollama/ollama/ml"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/ml/backend/ggml/ggml/src"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
)
type device struct {
@@ -198,10 +198,9 @@ func (b *Backend) Get(name string) ml.Tensor {
func (b *Backend) NewContext() ml.Context {
nodes := max(8192, len(b.meta.Tensors().Items())*5)
bts := make([]byte, C.size_t(nodes)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(nodes), false))
c := C.ggml_init(C.struct_ggml_init_params{
mem_buffer: unsafe.Pointer(&bts[0]),
mem_size: C.size_t(len(bts)),
mem_buffer: nil,
mem_size: C.size_t(nodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(nodes), false),
no_alloc: true,
})
@@ -244,17 +243,23 @@ func (c *Context) Forward(t ml.Tensor) {
}
func (c *Context) Compute(t ml.Tensor) ml.Tensor {
c.Forward(t)
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
if t != nil && C.ggml_nbytes(t.(*Tensor).t) != 0 {
backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
return t
}
func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
// MaxTensors returns the value stored in the context's nodes field.
// NOTE(review): presumably this is the node budget the ggml context was sized
// for when it was created — confirm where c.nodes is assigned.
func (c *Context) MaxTensors() int {
return c.nodes
}
func (c Context) Zeros(dtype ml.DType, shape ...int64) ml.Tensor {
if len(shape) < 1 || len(shape) > 4 {
panic("unsupported number of dimensions")
}
@@ -283,6 +288,13 @@ func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
n := len(s)
if n == 0 {
shape := 0
t := C.ggml_new_tensor(ctx.ctx, dtype, 1, (*C.int64_t)(unsafe.Pointer(&shape)))
return &Tensor{t: t}, nil
}
for _, v := range shape {
n /= v
}

View File

@@ -1,9 +1,7 @@
protect *.go
protect *-embed.*
protect **/*.go
protect **/*-embed.*
include include/
include src/
include src/CMakeLists.txt
include src/**/CMakeLists.txt
include src/ggml-blas/
include src/ggml-cpu/
include src/ggml-cpu/amx/
@@ -12,11 +10,12 @@ include src/ggml-cuda/
include src/ggml-cuda/template-instances/
include src/ggml-hip/
include src/ggml-metal/
include *.c
include *.h
include *.cpp
include *.cu
include *.cuh
include *.m
include *.metal
include **/CMakeLists.txt
include **/*.c
include **/*.h
include **/*.cpp
include **/*.cu
include **/*.cuh
include **/*.m
include **/*.metal
exclude *

View File

@@ -0,0 +1,262 @@
# Top-level CMake script for the "ggml" project: project setup, default
# selection for shared vs. static libraries, and the two library-type options.
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("ggml" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Default to a Release build when no build type was specified. The default is
# not applied when XCODE or MSVC is set.
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
# GGML_STANDALONE is ON when this directory is the top-level source directory
# (ggml is being built on its own) and OFF when it is included from a parent
# project. Standalone builds also place runtime outputs under <build>/bin.
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(GGML_STANDALONE ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# configure project version
# TODO
else()
set(GGML_STANDALONE OFF)
endif()
# Pick the default for BUILD_SHARED_LIBS: OFF under Emscripten and MinGW,
# ON everywhere else. The Emscripten branch also declares the WASM
# single-file option.
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
else()
set(BUILD_SHARED_LIBS_DEFAULT ON)
endif()
endif()
# remove the lib prefix on win32 mingw
# NOTE: the condition below tests WIN32 only, so the empty prefixes apply to
# every Windows toolchain, not just MinGW.
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
# User-facing library-type options; BUILD_SHARED_LIBS defaults to the value
# computed above.
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
#
# option list
#
# This section only computes *_DEFAULT variables; the option() calls that
# consume them appear further down in the file.
# TODO: mark all options as advanced when not GGML_STANDALONE
# On Apple platforms Metal and BLAS default to ON with the "Apple" BLAS
# vendor; elsewhere both default to OFF with the "Generic" vendor.
if (APPLE)
set(GGML_METAL_DEFAULT ON)
set(GGML_BLAS_DEFAULT ON)
set(GGML_BLAS_VENDOR_DEFAULT "Apple")
else()
set(GGML_METAL_DEFAULT OFF)
set(GGML_BLAS_DEFAULT OFF)
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
# Native (current-system) optimization defaults to OFF when cross-compiling.
if (CMAKE_CROSSCOMPILING)
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
endif()
# defaults
# These two defaults are set to OFF only when they are not already set to a
# true value, so a value defined before this point is kept.
if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
# debug
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF "ggml: enable gprof" OFF)
# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
# sanitizers
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific
# INS_ENB is the default used by the AVX, AVX2, FMA and F16C options below.
# It is ON only when GGML_NATIVE is off while GGML_NATIVE_DEFAULT is on;
# in every other case it is OFF.
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
# The FMA, F16C and AMX options are only declared for non-MSVC compilers.
if (NOT MSVC)
# in MSVC F16C and FMA is implied with AVX2/AVX512
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
# MSVC does not seem to support AMX
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
# Windows-only cache variable holding the Windows version value ("0x602" by default).
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends
# Options whose default is a ${..._DEFAULT} variable take the platform-dependent
# value computed earlier in this file; all others have a fixed default.
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
# The embed-library default follows whatever GGML_METAL is set to.
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# extra artifacts
# Tests and examples default to ON only for standalone builds.
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
#
# dependencies
#
# Require C11 and C++17, and require a threads library (preferring the
# pthread flag where applicable).
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
#
# build the library
#
# The library targets referenced by the install rules below (ggml, ggml-base)
# are expected to be defined by the src subdirectory.
add_subdirectory(src)
#
# tests and examples
#
# Each subdirectory is only added when its option is enabled.
if (GGML_BUILD_TESTS)
enable_testing()
add_subdirectory(tests)
endif ()
if (GGML_BUILD_EXAMPLES)
add_subdirectory(examples)
endif ()
#
# install
#
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-cpu.h
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
include/ggml-cann.h
include/ggml-cuda.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h)
# Attach the header list to the ggml target so that installing the target with
# PUBLIC_HEADER also installs the headers.
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)
# Standalone builds additionally generate ggml.pc from ggml.pc.in (replacing
# only @VAR@ references) and install it under share/pkgconfig.
if (GGML_STANDALONE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
@ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
DESTINATION share/pkgconfig)
endif()

View File

@@ -1,6 +1,5 @@
package cpu
// #cgo CFLAGS: -Wno-implicit-function-declaration
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE

View File

@@ -58,24 +58,11 @@ var OnceLoad = sync.OnceFunc(func() {
paths = lib.defaultValue
}
if runtime.GOOS == "darwin" {
if _, ok := os.LookupEnv("DYLD_LIBRARY_PATH"); !ok {
os.Setenv("DYLD_LIBRARY_PATH", paths)
}
}
split := filepath.SplitList(paths)
visited := make(map[string]struct{}, len(split))
for _, path := range split {
abspath, _ := filepath.Abs(path)
if _, ok := visited[abspath]; !ok {
func() {
cpath := C.CString(path)
defer C.free(unsafe.Pointer(cpath))
C.ggml_backend_load_all_from_path(cpath)
}()
visited[abspath] = struct{}{}
}
for _, path := range filepath.SplitList(paths) {
func() {
cpath := C.CString(path)
defer C.free(unsafe.Pointer(cpath))
C.ggml_backend_load_all_from_path(cpath)
}()
}
})

View File

@@ -1,160 +0,0 @@
package main
import (
"errors"
"flag"
"fmt"
"image"
"io"
"log/slog"
"os"
"path/filepath"
"strings"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
_ "github.com/ollama/ollama/model/llama"
_ "github.com/ollama/ollama/model/mllama"
"github.com/ollama/ollama/sample"
)
// args holds the command-line flags registered and parsed by temp.
var args struct {
// n is the value of -n: the maximum number of sampling iterations.
n int
// debug is the value of -debug: selects debug-level logging.
debug bool
// image is the value of -image: path to an image file, empty when unset.
image string
// cache is the value of -cache: enables the KV cache for the forward pass.
cache bool
}
// temp parses the command-line flags, loads the model named by the first
// positional argument, and runs up to args.n greedy sampling steps, printing
// each decoded piece to stdout.
//
// With exactly one positional argument the prompt is read from stdin; with
// more, the arguments after the model path are joined with spaces; with none,
// a usage error is returned. Any failure while loading, encoding, running or
// decoding is returned to the caller.
func temp() error {
	flag.IntVar(&args.n, "n", 10, "number of samples")
	flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
	flag.StringVar(&args.image, "image", "", "path to image file")
	flag.BoolVar(&args.cache, "cache", false, "enable KV cache")

	flag.Parse()

	var prompt string
	switch n := flag.NArg(); {
	case n == 1:
		bts, err := io.ReadAll(os.Stdin)
		if err != nil {
			return err
		}

		prompt = string(bts)
	case n > 1:
		prompt = strings.Join(flag.Args()[1:], " ")
	default:
		// Error strings carry no trailing newline; the caller adds its own.
		return fmt.Errorf("usage: %s path/to/file <prompt", filepath.Base(os.Args[0]))
	}

	level := slog.LevelInfo
	if args.debug {
		level = slog.LevelDebug
	}

	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level:     level,
		AddSource: true,
		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
			// Shorten the source file to its base name. The checked assertion
			// keeps a user attribute that happens to share the key from panicking.
			if attr.Key == slog.SourceKey {
				if source, ok := attr.Value.Any().(*slog.Source); ok {
					source.File = filepath.Base(source.File)
				}
			}

			return attr
		},
	})))

	m, err := model.New(flag.Arg(0))
	if err != nil {
		return err
	}

	// Assert once, and fail with an error instead of a panic when the loaded
	// model cannot encode/decode text.
	tp, ok := m.(model.TextProcessor)
	if !ok {
		return fmt.Errorf("model %s does not support text processing", flag.Arg(0))
	}

	inputIDs, err := tp.Encode(prompt)
	if err != nil {
		return err
	}

	var opts []model.OptionsFunc
	if args.cache {
		opts = append(opts, model.WithCache(&cache.Simple{
			Capacity: 2048,
			DType:    ml.DTypeF32,
		}))
	}

	if args.image != "" {
		// Scoped in a closure so the file is closed as soon as it is decoded.
		// NOTE(review): image.Decode only knows formats registered by a
		// side-effect import (image/png, image/jpeg, ...); none is imported
		// here - confirm one is pulled in elsewhere.
		if err := func() error {
			f, err := os.Open(args.image)
			if err != nil {
				return err
			}
			defer f.Close()

			img, _, err := image.Decode(f)
			if err != nil {
				return err
			}

			opts = append(opts, model.WithImage(img))
			return nil
		}(); err != nil {
			return err
		}
	}

	var offset int
	for range args.n {
		logit, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
		if err != nil {
			return err
		}

		// The sampler works on float64, so widen the logits first.
		f32s := logit.Floats()
		f64s := make([]float64, len(f32s))
		for i, f32 := range f32s {
			f64s[i] = float64(f32)
		}

		// do sampling
		f64s, err = sample.Sample(f64s, sample.Greedy())
		if err != nil {
			return err
		}

		// Keep every sampled ID that is not the end-of-sequence token.
		var outputIDs []int32
		for _, f64 := range f64s {
			if !tp.Is(uint32(f64), model.SpecialEOS) {
				outputIDs = append(outputIDs, int32(f64))
			}
		}

		// Nothing but EOS was sampled: generation is finished.
		if len(outputIDs) == 0 {
			break
		}

		s, err := tp.Decode(outputIDs)
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			return err
		}

		fmt.Print(s)

		inputIDs = append(inputIDs, outputIDs...)
		if args.cache {
			// With the cache enabled the next pass starts at the last input.
			offset = len(inputIDs) - 1
		}
	}

	return nil
}
// main runs temp and exits with status 1 if it returns an error.
func main() {
	if err := temp(); err != nil {
		// Report the failure on stderr so it does not mix with the generated
		// text that temp prints on stdout.
		fmt.Fprintln(os.Stderr, "err", err)
		os.Exit(1)
	}
}

View File

@@ -3,6 +3,7 @@ package llama
import (
"math"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
@@ -59,7 +60,7 @@ type SelfAttention struct {
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache cache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
@@ -74,7 +75,8 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k, v = cache.Put(ctx, k, v, cache.Options)
cache.Put(ctx, k, v)
k, v, mask := cache.Get(ctx)
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -82,6 +84,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
kq := k.Mulmat(ctx, q)
kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
kq = kq.Add(ctx, mask)
kq = kq.Softmax(ctx)
kqv := v.Mulmat(ctx, kq)
@@ -109,7 +112,7 @@ type Layer struct {
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache cache.Cache, opts *Options) ml.Tensor {
residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -142,7 +145,7 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = m.Output.Forward(ctx, hiddenState)
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
outputs, err := ctx.FromIntSlice(opts.Outputs(), len(opts.Outputs()))
if err != nil {
return nil, err
}

View File

@@ -1,6 +1,9 @@
package mllama
import (
"sync"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
@@ -16,6 +19,9 @@ type Model struct {
ImageProcessor
TextProcessor
start sync.Once
tCache *cache.TensorCache
}
func New(c ml.Config) (model.Model, error) {
@@ -28,6 +34,10 @@ func New(c ml.Config) (model.Model, error) {
}
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
m.start.Do(func() {
m.tCache = cache.NewTensorCache(m.Backend())
})
var crossAttentionStates ml.Tensor
if opts.Images != nil {
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
@@ -75,9 +85,9 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
}
// TODO: attention mask, cross attention mask
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache, m.tCache)
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
outputs, err := ctx.FromIntSlice(opts.Outputs(), len(opts.Outputs()))
if err != nil {
return nil, err
}

View File

@@ -4,9 +4,9 @@ import (
"math"
"slices"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type TextSelfAttention struct {
@@ -16,7 +16,7 @@ type TextSelfAttention struct {
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache cache.Cache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
@@ -31,7 +31,8 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mas
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key, value = cache.Put(ctx, key, value, cache.Options)
cache.Put(ctx, key, value)
key, value, mask := cache.Get(ctx)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -39,11 +40,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mas
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
if mask != nil {
scores = scores.Add(ctx, mask)
}
scores = scores.Add(ctx, mask)
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
@@ -72,7 +69,7 @@ type TextSelfAttentionDecoderLayer struct {
MLP *TextMLP
}
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache cache.Cache, _ *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -85,6 +82,10 @@ func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, pos
return hiddenState.Add(ctx, residual)
}
func (d *TextSelfAttentionDecoderLayer) Run() bool {
return true
}
type TextCrossAttention struct {
QueryNorm *nn.RMSNorm `gguf:"cross_attn_q_norm"`
Query *nn.Linear `gguf:"cross_attn_q_proj"`
@@ -94,23 +95,29 @@ type TextCrossAttention struct {
Output *nn.Linear `gguf:"cross_attn_o_proj"`
}
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, _ cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
query := ca.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = ca.QueryNorm.Forward(ctx, query, opts.eps)
key := ca.Key.Forward(ctx, crossAttentionStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
var key, value ml.Tensor
if crossAttentionStates != nil {
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
value := ca.Value.Forward(ctx, crossAttentionStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.Key.Forward(ctx, crossAttentionStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
// TODO cache key, value
value = ca.Value.Forward(ctx, crossAttentionStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
tCache.Put(ctx, key, value)
} else {
key, value, _ = tCache.Get(ctx)
}
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -135,13 +142,17 @@ type TextCrossAttentionDecoderLayer struct {
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *TextMLP
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
run bool
}
func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
d.run = true
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, tCache, opts)
hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
@@ -152,18 +163,23 @@ func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
return hiddenState.Add(ctx, residual)
}
func (d *TextCrossAttentionDecoderLayer) Run() bool {
return d.run
}
type TextDecoderLayer interface {
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor
Run() bool
}
type TextDecoder struct {
Layers []TextDecoderLayer
}
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers {
if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
if layer.Run() || crossAttentionStates != nil {
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), tCache.Sub(i), opts)
}
}
@@ -189,9 +205,9 @@ type TextModel struct {
*TextModelOptions
}
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, tCache, m.TextModelOptions)
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState)
}

View File

@@ -20,51 +20,28 @@ import (
_ "github.com/ollama/ollama/ml/backend"
)
type Cache struct {
cache.Cache
cache.Options
}
func (c Cache) Sub(i int) Cache {
if c.Cache != nil {
return Cache{
Cache: c.Cache.Sub(i),
Options: c.Options,
}
}
return c
}
func (c Cache) Put(ctx ml.Context, key, value ml.Tensor, opts cache.Options) (ml.Tensor, ml.Tensor) {
if c.Cache != nil {
return c.Cache.Put(ctx, key, value, opts)
}
return key, value
}
type Options struct {
inputs []int32
inputs []int32
positions []int32
outputs []int32
Offset int
sequences []int
Images []image.Image
Cache
cache.Cache
}
func (opts Options) Inputs() []int32 {
return opts.inputs[opts.Offset:]
return opts.inputs
}
func (opts Options) Positions() []int32 {
positions := make([]int32, len(opts.inputs)-opts.Offset)
for i := range positions {
positions[i] = int32(opts.Offset + i)
}
return opts.positions
}
return positions
func (opts Options) Outputs() []int32 {
return opts.outputs
}
type OptionsFunc func(Model, *Options)
@@ -75,10 +52,21 @@ func WithInputIDs(ids []int32) OptionsFunc {
}
}
func WithOffset(offset int) OptionsFunc {
func WithPositions(pos []int32) OptionsFunc {
return func(m Model, opts *Options) {
opts.Offset = offset
opts.Cache.Position = offset
opts.positions = pos
}
}
func WithOutputs(outputs []int32) OptionsFunc {
return func(m Model, opts *Options) {
opts.outputs = outputs
}
}
func WithSequences(seqs []int) OptionsFunc {
return func(m Model, opts *Options) {
opts.sequences = seqs
}
}
@@ -90,12 +78,7 @@ func WithImage(img image.Image) OptionsFunc {
func WithCache(c cache.Cache) OptionsFunc {
return func(m Model, opts *Options) {
opts.Cache = Cache{
Cache: c,
Options: cache.Options{
Position: opts.Offset,
},
}
opts.Cache = c
}
}
@@ -152,6 +135,12 @@ func New(s string) (Model, error) {
}
func populateFields(b ml.Backend, v reflect.Value, tags ...Tag) reflect.Value {
var iface bool
if v.Kind() == reflect.Interface {
iface = true
v = v.Elem()
}
t := v.Type()
if t.Kind() == reflect.Pointer {
t, v = t.Elem(), v.Elem()
@@ -230,6 +219,10 @@ func populateFields(b ml.Backend, v reflect.Value, tags ...Tag) reflect.Value {
}
}
if iface {
return v.Addr()
}
return v
}
@@ -262,18 +255,22 @@ func canNil(t reflect.Type) bool {
t.Kind() == reflect.Slice
}
func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
func Forward(ctx ml.Context, m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
var opts Options
for _, optsFunc := range optsFuncs {
optsFunc(m, &opts)
}
ctx := m.Backend().NewContext()
err := opts.Cache.StartForward(ctx, opts.positions, opts.sequences)
if err != nil {
return nil, err
}
t, err := m.Forward(ctx, opts)
if err != nil {
return nil, err
}
defer ctx.Close()
ctx.Forward(t)
return ctx.Compute(t), nil
}

View File

@@ -1,228 +0,0 @@
package model
import (
"reflect"
"testing"
)
// TestBytePairEncoding checks that BytePairEncoding.Encode maps each input to
// the expected vocabulary indexes for a small hand-built vocabulary, and that
// decoding the result and encoding it again yields the same indexes.
func TestBytePairEncoding(t *testing.T) {
// Create a simple test vocabulary
vocab := &Vocabulary{
Values: []string{
"Hello",
"World",
"!",
"How",
"are",
"you",
"t",
"o",
"d",
"a",
"y",
"to",
"tod",
"toda",
"today",
" ",
},
Types: []uint32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3}, // 3 for special token (space)
Merges: []string{
"to",
"tod",
"toda",
"today",
},
BOS: 0,
EOS: 1,
}
bpe := BytePairEncoding{
Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
Vocabulary: vocab,
}
tests := []struct {
name string
input string
want []int32
wantErr bool
}{
{
name: "simple hello world",
input: "Hello World!",
want: []int32{0, 15, 1, 2}, // indexes in the vocabulary
wantErr: false,
},
{
// want is left nil here, so reflect.DeepEqual below only matches a nil
// result, not an empty non-nil slice.
name: "empty string",
input: "",
wantErr: false,
},
{
// NOTE(review): three space tokens are expected but the input shows a
// single space - runs of whitespace may have been collapsed; confirm
// against the original file.
name: "just spaces",
input: " ",
want: []int32{15, 15, 15}, // space token repeated
wantErr: false,
},
{
name: "today with merges",
input: "today",
want: []int32{14}, // should merge
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := bpe.Encode(tt.input)
if (err != nil) != tt.wantErr {
t.Errorf("BytePairEncoding.Encode() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("BytePairEncoding.Encode() = %v, want %v", got, tt.want)
}
// Test round trip if encoding succeeded
if err == nil {
decoded, err := bpe.Decode(got)
if err != nil {
t.Errorf("BytePairEncoding.Decode() error = %v", err)
return
}
// Note: The decoded string might not exactly match the input due to
// tokenization/normalization, so we re-encode it to compare
reEncoded, err := bpe.Encode(decoded)
if err != nil {
t.Errorf("BytePairEncoding.Encode() error on round trip = %v", err)
return
}
if !reflect.DeepEqual(reEncoded, got) {
t.Errorf("Round trip failed: original tokens = %v, after round trip = %v", got, reEncoded)
}
}
})
}
}
// TestBytePairEncodingSpecialTokens checks that BytePairEncoding.Encode emits
// the vocabulary index of a special token (type 3 in this vocabulary) when it
// appears at the start, end, or middle of the input text.
func TestBytePairEncodingSpecialTokens(t *testing.T) {
vocab := &Vocabulary{
Values: []string{
"<s>",
"</s>",
"<pad>",
"Hello",
"World",
},
Types: []uint32{3, 3, 3, 1, 1}, // 3 for special tokens
BOS: 0,
EOS: 1,
}
bpe := BytePairEncoding{
Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
Vocabulary: vocab,
}
tests := []struct {
name string
input string
want []int32
wantErr bool
}{
{
name: "text with special token at start",
input: "<s>Hello",
want: []int32{0, 3},
wantErr: false,
},
{
name: "text with special token at end",
input: "World</s>",
want: []int32{4, 1},
wantErr: false,
},
{
name: "special token in middle",
input: "Hello<pad>World",
want: []int32{3, 2, 4},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := bpe.Encode(tt.input)
// An error mismatch makes the value comparison meaningless, so stop here.
if (err != nil) != tt.wantErr {
t.Errorf("BytePairEncoding.Encode() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("BytePairEncoding.Encode() = %v, want %v", got, tt.want)
}
})
}
}
// TestBytePairEncodingSplit checks the pieces that BytePairEncoding.split
// produces for the configured Pretokenizer pattern, without a vocabulary.
func TestBytePairEncodingSplit(t *testing.T) {
bpe := BytePairEncoding{
Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
}
// wantErr is never set below, so every case expects split to succeed.
tests := []struct {
name string
input string
want []string
wantErr bool
}{
{
name: "basic splitting",
input: "Hello World!",
want: []string{"Hello", " World", "!"},
},
{
name: "contractions",
input: "I'm don't won't",
want: []string{"I", "'m", " don", "'t", " won", "'t"},
},
{
// The pattern takes digits in groups of at most three, hence "202", "4".
name: "numbers",
input: "In 2024 there are 365 days",
want: []string{"In", " ", "202", "4", " there", " are", " ", "365", " days"},
},
{
name: "special characters",
input: "Hello!! ...world",
want: []string{"Hello", "!!", " ...", "world"},
},
{
// NOTE(review): the expected pieces contain two spaces but the input
// shows one - runs of whitespace may have been collapsed; confirm
// against the original file.
name: "multiple spaces",
input: "Hello World",
want: []string{"Hello", " ", " World"},
},
{
name: "newlines",
input: "Hello\nWorld",
want: []string{"Hello", "\n", "World"},
},
{
name: "mixed case and punctuation",
input: "Hello, WORLD!! How's it going?",
want: []string{"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := bpe.split(tt.input)
if (err != nil) != tt.wantErr {
t.Errorf("BytePairEncoding.split() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("BytePairEncoding.split() = %v, want %v", got, tt.want)
}
})
}
}

View File

@@ -1,10 +1,10 @@
package runner
package common
import (
"strings"
)
func findStop(sequence string, stops []string) (bool, string) {
func FindStop(sequence string, stops []string) (bool, string) {
for _, stop := range stops {
if strings.Contains(sequence, stop) {
return true, stop
@@ -14,7 +14,7 @@ func findStop(sequence string, stops []string) (bool, string) {
return false, ""
}
func containsStopSuffix(sequence string, stops []string) bool {
func ContainsStopSuffix(sequence string, stops []string) bool {
for _, stop := range stops {
for i := 1; i <= len(stop); i++ {
if strings.HasSuffix(sequence, stop[:i]) {
@@ -29,7 +29,7 @@ func containsStopSuffix(sequence string, stops []string) bool {
// truncateStop removes the provided stop string from pieces,
// returning the partial pieces with stop removed, including truncating
// the last piece if required (and signalling if this was the case)
func truncateStop(pieces []string, stop string) ([]string, bool) {
func TruncateStop(pieces []string, stop string) ([]string, bool) {
joined := strings.Join(pieces, "")
index := strings.Index(joined, stop)
@@ -65,7 +65,7 @@ func truncateStop(pieces []string, stop string) ([]string, bool) {
return result, tokenTruncated
}
func incompleteUnicode(token string) bool {
func IncompleteUnicode(token string) bool {
incomplete := false
// check if there is incomplete UTF-8 character at the end

View File

@@ -1,4 +1,4 @@
package runner
package common
import (
"reflect"
@@ -52,7 +52,7 @@ func TestTruncateStop(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, resultTrunc := truncateStop(tt.pieces, tt.stop)
result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
}
@@ -120,7 +120,7 @@ func TestIncompleteUnicode(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := incompleteUnicode(tt.input)
result := IncompleteUnicode(tt.input)
if result != tt.expected {
t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
}

261
runner/newrunner/cache.go Normal file
View File

@@ -0,0 +1,261 @@
package newrunner
import (
"errors"
"fmt"
"log/slog"
"math"
"reflect"
"time"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
)
// InputCache records, per slot, which inputs are held in the backing KV cache
// so that a new prompt can be matched against an existing slot's prefix.
type InputCache struct {
// context window size (per slot)
numCtx int32
// individual KV caches
slots []InputCacheSlot
// optimize cache eviction for multiple users
multiUserCache bool
// cache is the backing KV cache; slot Ids are passed to its Remove and
// CopyPrefix calls. It is left nil in unit tests.
cache cache.Cache
}
// NewInputCache creates an InputCache with numSlots empty slots backed by a
// causal KV cache of kvSize entries on the given backend. Each slot gets an
// equal share (kvSize / numSlots) of the context window.
//
// It returns an error when numSlots is not positive or when kvSize is too
// small to give every slot at least one entry.
func NewInputCache(backend ml.Backend, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
	// Check numSlots first: dividing by a zero slot count would panic.
	if numSlots < 1 || kvSize/int32(numSlots) < 1 {
		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
	}

	slots := make([]InputCacheSlot, numSlots)
	for i := range slots {
		slots[i] = InputCacheSlot{
			Id:     i,
			Inputs: make([]input, 0),
		}
	}

	return &InputCache{
		numCtx:         kvSize / int32(numSlots),
		slots:          slots,
		multiUserCache: multiUserCache,
		cache:          cache.NewCausalCache(backend, kvCacheTypeFromStr(kvCacheType), kvSize),
	}, nil
}
// kvCacheTypeFromStr maps a KV cache type name to the tensor data type used
// for the cache. The quantized names "q8_0" and "q4_0" are not implemented
// and panic; every other string selects ml.DTypeF32.
func kvCacheTypeFromStr(s string) ml.DType {
	if s == "q8_0" || s == "q4_0" {
		panic("kv cache quantization not yet implemented")
	}

	return ml.DTypeF32
}
// InputCacheSlot describes one sequence's share of the KV cache.
//
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {
// Index in the KV cache
Id int
// Inputs that are stored in the KV cache
Inputs []input
// is this cache actively being processed as part of a sequence?
InUse bool
// last time this cache was used (as of start of processing)
lastUsed time.Time
}
// LoadCacheSlot picks a free cache slot for prompt, trims the KV cache for
// that slot down to the prefix it shares with prompt, and marks the slot as
// in use.
//
// It returns the slot and the part of prompt that still has to be processed.
// When cachePrompt is false no cached prefix is reused. An error is returned
// when no slot is available or the KV cache entries cannot be removed; in
// that case no slot is left marked as in use.
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
	var slot *InputCacheSlot
	var numPast int32
	var err error

	// In single-user scenarios, the longest cache slot works fine for getting good input
	// cache hit rates and it keeps the footprint of the cache small, which improves throughput.
	// For multiple users, the "best" cache slot produces better input cache hit rates
	// at the cost of worse performance when we miss the input cache.
	if c.multiUserCache {
		slot, numPast, err = c.findBestCacheSlot(prompt)
	} else {
		slot, numPast, err = c.findLongestCacheSlot(prompt)
	}
	if err != nil {
		return nil, nil, err
	}

	if !cachePrompt {
		numPast = 0
	}

	// Leave one input to sample so we can get a response. The numPast > 0
	// guard keeps an empty prompt from producing a negative index below.
	if numPast > 0 && numPast == int32(len(prompt)) {
		numPast--
	}

	if err = c.cache.Remove(slot.Id, numPast, math.MaxInt32); err != nil {
		// Some models don't support partial erasure
		if err = c.cache.Remove(slot.Id, 0, math.MaxInt32); err != nil {
			return nil, nil, err
		}

		numPast = 0
	}

	// Claim the slot only once the cache has been trimmed; claiming it before
	// a failing Remove would leave it marked busy with nobody to release it.
	slot.InUse = true
	slot.lastUsed = time.Now()

	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
		"used", numPast, "remaining", int32(len(prompt))-numPast)

	prompt = prompt[numPast:]
	slot.Inputs = slot.Inputs[:numPast]

	return slot, prompt, nil
}
// findLongestCacheSlot returns the free slot whose stored inputs share the
// longest prefix with prompt, together with the length of that prefix. On a
// tie the earliest slot wins. It fails when every slot is in use.
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
	var best *InputCacheSlot
	bestLen := int32(-1)

	for i := range c.slots {
		candidate := &c.slots[i]
		if candidate.InUse {
			continue
		}

		// Starting from -1 means the first free slot is always accepted,
		// even with no inputs in common.
		if n := countCommonPrefix(candidate.Inputs, prompt); n > bestLen {
			best, bestLen = candidate, n
		}
	}

	if best == nil {
		return nil, 0, errors.New("no available cache slots")
	}

	return best, bestLen, nil
}
// findBestCacheSlot chooses a slot for prompt in multi-user mode.
//
// If the slot sharing the longest prefix with prompt is free and its whole
// content is part of that prefix, it is reused directly. Otherwise the least
// recently used free slot is taken, and the shared prefix is copied into it
// from the longest-matching slot so that slot's history is kept.
//
// It returns the chosen slot and the number of leading prompt inputs already
// present in it, or an error when no slot is free.
func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
	var oldestSlot *InputCacheSlot

	longest := int32(-1)
	var longestSlot *InputCacheSlot

	for i := range c.slots {
		s := &c.slots[i]

		// The longest match may be a busy slot: it can still be forked from.
		count := countCommonPrefix(s.Inputs, prompt)
		if count > longest {
			longest = count
			longestSlot = s
		}

		// Track the least recently used free slot. Comparing against the
		// current candidate rather than the wall clock still finds a slot
		// when a timestamp is not strictly in the past.
		if !s.InUse && (oldestSlot == nil || s.lastUsed.Before(oldestSlot.lastUsed)) {
			oldestSlot = s
		}
	}

	if longestSlot != nil && longest == int32(len(longestSlot.Inputs)) && !longestSlot.InUse {
		return longestSlot, longest, nil
	}

	// oldestSlot is nil when every slot is in use or there are no slots.
	if oldestSlot == nil {
		return nil, 0, errors.New("no available cache slots")
	}

	if len(oldestSlot.Inputs) != 0 {
		slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
			"used", oldestSlot.lastUsed)
	}

	if longest > 0 && longestSlot != oldestSlot {
		slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
			len(longestSlot.Inputs))
		oldestSlot.Inputs = make([]input, longest)
		copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
		// This is only nil for unit tests
		if c.cache != nil {
			c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
		}
	}

	return oldestSlot, longest, nil
}
// countCommonPrefix returns how many leading elements of a and b are equal,
// comparing element by element with reflect.DeepEqual.
func countCommonPrefix(a []input, b []input) int32 {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}

	for i := 0; i < limit; i++ {
		if !reflect.DeepEqual(a[i], b[i]) {
			return int32(i)
		}
	}

	// The shorter slice is entirely a prefix of the longer one.
	return int32(limit)
}
// ShiftDiscard returns how many inputs must be dropped from a slot holding
// inputLen inputs so that half of the window beyond the first numKeep entries
// (and never less than one entry) is free. It returns 0 when that much space
// is already free.
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
	wantFree := max((c.numCtx-numKeep)/2, 1)
	haveFree := c.numCtx - inputLen

	return max(wantFree-haveFree, 0)
}
// ShiftCacheSlot makes room in a slot that has reached the context limit. It
// keeps the first numKeep inputs, removes the number of inputs computed by
// ShiftDiscard right after them, and moves the remaining newer inputs down
// into the gap, in both the KV cache and slot.Inputs.
//
// numKeep must be smaller than the per-slot context size, otherwise nothing
// could be freed and an error is returned. A failure to remove the KV cache
// entries is also returned, leaving slot.Inputs untouched.
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
	if numKeep >= c.numCtx {
		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
	}

	total := int32(len(slot.Inputs))
	discard := c.ShiftDiscard(total, numKeep)
	if discard <= 0 {
		// Enough space is free already.
		return nil
	}

	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
		"keep", numKeep, "discard", discard)

	// TODO (jessegross): KV cache removal can fail for certain types of models
	if err := c.cache.Remove(slot.Id, numKeep, numKeep+discard); err != nil {
		return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v): %w", slot.Id, numKeep, discard, err)
	}

	// Slide everything after the discarded range down by discard positions.
	for dst := numKeep; dst+discard < total; dst++ {
		slot.Inputs[dst] = slot.Inputs[dst+discard]
	}
	slot.Inputs = slot.Inputs[:total-discard]

	return nil
}

View File

@@ -0,0 +1,291 @@
package newrunner
import (
"image"
"testing"
"time"
)
// TestCountCommon checks countCommonPrefix on token inputs, image inputs, a
// mix of both, and empty slices.
func TestCountCommon(t *testing.T) {
	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))

	type testCase struct {
		name     string
		t1       []input
		t2       []input
		expected int32
	}

	cases := []testCase{
		{
			name:     "Equal",
			t1:       []input{{token: 1}, {token: 2}, {token: 3}},
			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
			expected: 3,
		},
		{
			name:     "Prefix",
			t1:       []input{{token: 1}},
			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
			expected: 1,
		},
		{
			name:     "Image Prefix",
			t1:       []input{{image: imgA}},
			t2:       []input{{image: imgA}, {image: imgB}, {image: imgC}},
			expected: 1,
		},
		{
			name:     "Mixed",
			t1:       []input{{token: 1}, {image: imgA}},
			t2:       []input{{token: 1}, {image: imgA}, {token: 5}},
			expected: 2,
		},
		{
			name:     "Empty",
			t1:       []input{},
			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
			expected: 0,
		},
		{
			name:     "Both Empty",
			t1:       []input{},
			t2:       []input{},
			expected: 0,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := countCommonPrefix(tc.t1, tc.t2); got != tc.expected {
				t.Errorf("countCommonPrefix(%v, %v): have %v; want %v", tc.t1, tc.t2, got, tc.expected)
			}
		})
	}
}
// TestFindCacheSlot runs the same cache layouts through findLongestCacheSlot
// and findBestCacheSlot and checks which slot Id and prefix length each one
// reports for a given prompt.
func TestFindCacheSlot(t *testing.T) {
	// want is the expected slot Id together with the expected length value
	// returned alongside it.
	type want struct {
		slot   int
		prefix int32
	}

	cases := []struct {
		name    string
		cache   InputCache
		prompt  []input
		longest want
		best    want
	}{
		{
			name: "Empty",
			cache: InputCache{slots: []InputCacheSlot{
				{Id: 0, Inputs: []input{}},
				{Id: 1, Inputs: []input{}},
			}},
			prompt:  []input{{token: 1}},
			longest: want{slot: 0, prefix: 0},
			best:    want{slot: 0, prefix: 0},
		},
		{
			name: "Extend",
			cache: InputCache{slots: []InputCacheSlot{
				{Id: 0, Inputs: []input{{token: 1}}, lastUsed: time.Now().Add(-time.Second)},
				{Id: 1, Inputs: []input{{token: 1}, {token: 2}}, lastUsed: time.Now().Add(-2 * time.Second)},
			}},
			prompt:  []input{{token: 1}, {token: 2}},
			longest: want{slot: 1, prefix: 2},
			best:    want{slot: 1, prefix: 2},
		},
		{
			name: "New",
			cache: InputCache{slots: []InputCacheSlot{
				{Id: 0, Inputs: []input{{token: 1}, {token: 2}}, lastUsed: time.Now().Add(-time.Second)},
				{Id: 1, Inputs: []input{}},
			}},
			prompt:  []input{{token: 2}},
			longest: want{slot: 0, prefix: 0},
			best:    want{slot: 1, prefix: 0},
		},
		{
			name: "Fork",
			cache: InputCache{slots: []InputCacheSlot{
				{Id: 0, Inputs: []input{{token: 1}, {token: 2}}, lastUsed: time.Now().Add(-time.Second)},
				{Id: 1, Inputs: []input{}},
			}},
			prompt:  []input{{token: 1}},
			longest: want{slot: 0, prefix: 1},
			best:    want{slot: 1, prefix: 1},
		},
		{
			name: "Evict",
			cache: InputCache{slots: []InputCacheSlot{
				{Id: 0, Inputs: []input{{token: 1}}, lastUsed: time.Now().Add(-time.Second)},
				{Id: 1, Inputs: []input{{token: 1}, {token: 2}}, lastUsed: time.Now().Add(-2 * time.Second)},
			}},
			prompt:  []input{{token: 2}, {token: 3}},
			longest: want{slot: 0, prefix: 0},
			best:    want{slot: 1, prefix: 0},
		},
		{
			name: "In use",
			cache: InputCache{slots: []InputCacheSlot{
				{Id: 0, Inputs: []input{{token: 1}, {token: 2}}, InUse: true, lastUsed: time.Now().Add(-time.Second)},
				{Id: 1, Inputs: []input{{token: 1}}, lastUsed: time.Now().Add(-2 * time.Second)},
			}},
			prompt:  []input{{token: 1}, {token: 2}},
			longest: want{slot: 1, prefix: 1},
			best:    want{slot: 1, prefix: 2},
		},
	}

	// All "Longest" subtests run before any "Best" subtest; the two passes
	// operate on the same slot data, so the order is kept as-is.
	for _, tc := range cases {
		t.Run("Longest-"+tc.name, func(t *testing.T) {
			slot, n, err := tc.cache.findLongestCacheSlot(tc.prompt)
			switch {
			case err != nil:
				t.Errorf("findLongestCacheSlot: err %v", err)
			case slot.Id != tc.longest.slot || n != tc.longest.prefix:
				t.Errorf("findLongestCacheSlot: slot have %v, want %v len have %v, want %v",
					slot.Id, tc.longest.slot, n, tc.longest.prefix)
			}
		})
	}

	for _, tc := range cases {
		t.Run("Best-"+tc.name, func(t *testing.T) {
			slot, n, err := tc.cache.findBestCacheSlot(tc.prompt)
			switch {
			case err != nil:
				t.Errorf("findBestCacheSlot: err %v", err)
			case slot.Id != tc.best.slot || n != tc.best.prefix:
				t.Errorf("findBestCacheSlot: slot have %v, want %v len have %v, want %v",
					slot.Id, tc.best.slot, n, tc.best.prefix)
			}
		})
	}
}
// TestShiftDiscard checks the number of inputs InputCache.ShiftDiscard reports
// should be dropped for various combinations of context size, keep count and
// current input length.
func TestShiftDiscard(t *testing.T) {
	type testCase struct {
		name     string
		numCtx   int32
		numKeep  int32
		inputLen int32
		expected int32
	}

	cases := []testCase{
		{name: "Shift", numCtx: 2048, numKeep: 5, inputLen: 2048, expected: 1021},
		{name: "Max Keep", numCtx: 2048, numKeep: 2047, inputLen: 2048, expected: 1},
		{name: "No Keep", numCtx: 2048, numKeep: 0, inputLen: 2048, expected: 1024},
		{name: "Truncate", numCtx: 2048, numKeep: 5, inputLen: 5000, expected: 3973},
		{name: "Truncate Keep", numCtx: 2048, numKeep: 2047, inputLen: 5000, expected: 2953},
		{name: "No Op", numCtx: 2048, numKeep: 5, inputLen: 512, expected: 0},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			cache := InputCache{numCtx: tc.numCtx}
			if got := cache.ShiftDiscard(tc.inputLen, tc.numKeep); got != tc.expected {
				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tc.numCtx, tc.numKeep, tc.inputLen, got, tc.expected)
			}
		})
	}
}

941
runner/newrunner/runner.go Normal file
View File

@@ -0,0 +1,941 @@
package newrunner
import (
"bytes"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"image"
"io"
"log"
"log/slog"
"net"
"net/http"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/runner/common"
"github.com/ollama/ollama/sample"
_ "github.com/ollama/ollama/model/llama"
_ "github.com/ollama/ollama/model/mllama"
)
// input is an element of the prompt to process, either a token or an image.
// Consumers in this file treat an element as an image when image is non-nil
// and as a token otherwise.
type input struct {
// token is the token ID; used when image is nil
token int32
// image is the decoded image; nil for plain token inputs
image image.Image
}
// Sequence holds the state of a single in-flight completion or embedding
// request while it is being evaluated by the server's batch loop.
type Sequence struct {
// batch index
iBatch int
// prompt inputs left to evaluate
inputs []input
// inputs that have been added to a batch but not yet submitted to Forward
pendingInputs []input
// tokens that have been generated but not returned yet (e.g. for stop sequences)
pendingResponses []string
// input cache being used by this sequence
cache *InputCacheSlot
// channel to send responses over
responses chan string
// channel to stop decoding (such as if the remote connection is closed)
quit chan bool
// number of tokens to predict
numPredict int
// set of samplers to run on generated logits
samplers []sample.Sampler
// channel to send back the embedding if embedding only
embedding chan []float32
// stop sequences
stop []string
// number of inputs to keep at the beginning when shifting context window
numKeep int32
// true if an embedding is to be returned instead of text generation
embeddingOnly bool
// reason the sequence was removed (e.g. "limit", "stop", "connection");
// set by removeSequence before the response channels are closed
doneReason string
// Metrics
// time at which NewSequence started building this sequence
startProcessingTime time.Time
// time at which the first prediction for this sequence was counted
startGenerationTime time.Time
// number of predictions counted so far
numPredicted int
// number of prompt inputs after any truncation done by NewSequence
numPromptInputs int
}
// NewSequenceParams carries the per-request settings that NewSequence copies
// into the Sequence it creates.
type NewSequenceParams struct {
// number of tokens to predict
numPredict int
// stop sequences
stop []string
// number of inputs to keep when shifting/truncating; a negative value is
// replaced by the prompt length in NewSequence
numKeep int32
// samplers to run on generated logits
samplers []sample.Sampler
// true if the sequence should produce an embedding instead of text
embedding bool
}
// NewSequence waits for the model to be loaded, converts the prompt and
// images into inputs and wraps them in a Sequence ready to be scheduled.
//
// A negative params.numKeep means "keep the whole prompt"; in every case the
// keep count is capped at one less than the context size so that a context
// shift can always discard at least one input. Prompts longer than the
// context are truncated by dropping inputs immediately after the kept prefix.
//
// An error is returned when input processing fails or yields no inputs.
func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
	s.ready.Wait()

	began := time.Now()

	inputs, err := s.inputs(prompt, images)
	switch {
	case err != nil:
		return nil, fmt.Errorf("failed to process inputs: %w", err)
	case len(inputs) == 0:
		return nil, errors.New("no input provided")
	}

	keep := params.numKeep
	if keep < 0 {
		keep = int32(len(inputs))
	}

	// Ensure that at least 1 input can be discarded during shift
	keep = min(keep, s.cache.numCtx-1)

	if promptLen := int32(len(inputs)); promptLen > s.cache.numCtx {
		discard := promptLen - s.cache.numCtx
		truncated := append(inputs[:keep], inputs[keep+discard:]...)

		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", keep, "new", len(truncated))
		inputs = truncated
	}

	// TODO(jessegross): Ingest cached history for grammar

	seq := &Sequence{
		inputs:              inputs,
		numPromptInputs:     len(inputs),
		startProcessingTime: began,
		numPredict:          params.numPredict,
		numKeep:             keep,
		stop:                params.stop,
		samplers:            params.samplers,
		embeddingOnly:       params.embedding,
		pendingResponses:    make([]string, 0),
		responses:           make(chan string, 100),
		quit:                make(chan bool, 1),
		embedding:           make(chan []float32, 1),
	}
	return seq, nil
}
// imageTagPattern matches the [img-<n>] placeholders that mark where an image
// belongs in a prompt. It is compiled once instead of on every request.
var imageTagPattern = regexp.MustCompile(`\[img-(\d+)\]`)

// inputs processes the prompt and images into a list of inputs
// by splitting the prompt on [img-<n>] tags, tokenizing text and
// decoding images.
//
// Each text part is tokenized in order; after every part that is followed by
// a tag, the image whose ID equals <n> is decoded and appended. An error is
// returned if the model cannot tokenize text, if a tag's number cannot be
// parsed, if no image carries the referenced ID, or if tokenizing or image
// decoding fails.
func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
	textProcessor, ok := s.model.(model.TextProcessor)
	if !ok {
		return nil, errors.New("model does not support text processing")
	}

	// TODO(jessegross): This can sometimes trigger for matching text in the
	// user's prompt. We previously tried to avoid it by only looking for images
	// on image models. We don't have a clear indication now but it would be better
	// to properly escape it in any case.
	parts := imageTagPattern.Split(prompt, -1)
	matches := imageTagPattern.FindAllStringSubmatch(prompt, -1)

	var inputs []input
	for i, part := range parts {
		// text - tokenize
		tokens, err := textProcessor.Encode(part)
		if err != nil {
			return nil, err
		}
		for _, t := range tokens {
			inputs = append(inputs, input{token: t})
		}

		// The last part has no tag after it.
		if i >= len(matches) {
			continue
		}

		// image - decode and store
		// Atoi can still fail on an out-of-range number even though the
		// pattern only admits digits, so report it rather than ignore it.
		n, err := strconv.Atoi(matches[i][1])
		if err != nil {
			return nil, fmt.Errorf("invalid image index %q: %w", matches[i][1], err)
		}

		imageIndex := -1
		for j := range images {
			if images[j].ID == n {
				imageIndex = j
				break
			}
		}
		if imageIndex < 0 {
			return nil, fmt.Errorf("invalid image index: %d", n)
		}

		// Named img so the image package is not shadowed.
		img, _, err := image.Decode(bytes.NewReader(images[imageIndex].Data))
		if err != nil {
			return nil, err
		}
		inputs = append(inputs, input{image: img})
	}

	return inputs, nil
}
// Server holds the loaded model, the scheduling state for the sequences being
// evaluated, and the input cache shared by the HTTP handlers and the batch
// processing loop.
type Server struct {
// is the server ready to process requests?
// protects access to model and image
ready sync.WaitGroup
// loaded model
model model.Model
// status for external health reporting - loading, ready to serve, etc.
status ServerStatus
// current progress on loading the model
progress float32
// number of simultaneous requests to handle
parallel int
// maximum number of elements in a batch (per sequence)
// TODO (jmorganca): make this n_batch
batchSize int
// protects access to everything below this line
// this is context state needed for decoding
mu sync.Mutex
// indicates that data is ready for processing
cond *sync.Cond
// the list of simultaneous sequences being evaluated
seqs []*Sequence
// seqs can have a maximum of parallel entries, which
// is enforced by seqsSem
seqsSem *semaphore.Weighted
// KV cache
cache *InputCache
}
// allNil reports whether every entry of s.seqs is nil, i.e. there is no
// sequence currently scheduled.
func (s *Server) allNil() bool {
	for i := range s.seqs {
		if s.seqs[i] != nil {
			return false
		}
	}
	return true
}
// flushPending concatenates the sequence's pending response pieces, clears
// the pending list and sends the text on seq.responses.
//
// It returns true when the text was sent or there was nothing to send, and
// false when seq.quit fired before the text could be delivered.
func flushPending(seq *Sequence) bool {
	text := strings.Join(seq.pendingResponses, "")
	seq.pendingResponses = []string{}

	// Trailing bytes are dropped one at a time until what remains is valid
	// UTF-8. Partial characters are normally held back during generation, but
	// they can still arrive here when the sequence is ending (e.g. the
	// generation limit was hit) or when invalid bytes sit in the middle of
	// the text. This stricter check guarantees no invalid Unicode is emitted.
	end := len(text)
	for !utf8.ValidString(text[:end]) {
		end--
	}
	text = text[:end]

	if text == "" {
		return true
	}

	select {
	case <-seq.quit:
		return false
	case seq.responses <- text:
		return true
	}
}
// removeSequence finishes the sequence stored at s.seqs[seqIndex]: it flushes
// any pending response text, records reason as the sequence's doneReason,
// closes the response and embedding channels, marks the cache slot as no
// longer in use, clears the seqs entry and releases one unit of seqsSem.
//
// The slot at seqIndex must be non-nil.
func (s *Server) removeSequence(seqIndex int, reason string) {
seq := s.seqs[seqIndex]
// flush before closing so the remaining text is delivered on the channel
flushPending(seq)
// set before close so a reader that observes the close sees the reason
seq.doneReason = reason
close(seq.responses)
close(seq.embedding)
seq.cache.InUse = false
s.seqs[seqIndex] = nil
// frees the place taken by the handler that scheduled this sequence
s.seqsSem.Release(1)
}
// run waits for the model to be ready and then processes batches in a loop
// until ctx is cancelled. Cancellation is only checked between batches. Any
// error returned by processBatch causes a panic.
func (s *Server) run(ctx context.Context) {
	s.ready.Wait()

	for {
		// Non-blocking cancellation check before each batch.
		if ctx.Err() != nil {
			return
		}

		if err := s.processBatch(); err != nil {
			panic(err)
		}
	}
}
// processBatch builds one batch from the inputs of all scheduled sequences,
// runs it through the model and then, for every sequence whose prompt has
// been fully consumed, samples the next token(s) and queues or sends the
// decoded text.
//
// It blocks until at least one sequence is scheduled and holds s.mu for the
// rest of the call. Sequences are removed when they reach their prediction
// limit, produce an end-of-sequence token, hit a stop sequence, or when their
// client has gone away. A non-nil error is returned when shifting the cache,
// running the model, sampling or decoding fails.
func (s *Server) processBatch() error {
	s.mu.Lock()
	for s.allNil() {
		s.cond.Wait() // Wait until an item is added
	}
	defer s.mu.Unlock()

	var inputIDs []int32
	var pos []int32
	var outputs []int32
	var seqs []int
	var image image.Image

	for i, seq := range s.seqs {
		if seq == nil {
			continue
		}

		// if past the num predict limit
		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
			s.removeSequence(i, "limit")
			continue
		}

		for j, input := range seq.inputs {
			// The context is full: shift it if nothing is pending for this
			// sequence, otherwise stop and let the pending inputs go first.
			if int32(len(seq.cache.Inputs)+len(seq.pendingInputs)+1) > s.cache.numCtx {
				if len(seq.pendingInputs) == 0 {
					err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
					if err != nil {
						return err
					}
				} else {
					break
				}
			}

			if j >= s.batchSize {
				break
			}

			// Only one image is carried per batch.
			if input.image != nil {
				if image != nil {
					break
				}
				image = input.image
				seq.pendingInputs = append(seq.pendingInputs, input)
				continue
			}

			inputIDs = append(inputIDs, input.token)
			pos = append(pos, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
			seqs = append(seqs, seq.cache.Id)

			seq.iBatch = len(outputs)
			// Only the last prompt input of a sequence requests an output.
			if j+1 == len(seq.inputs) {
				outputs = append(outputs, int32(len(inputIDs)-1))
			}

			seq.pendingInputs = append(seq.pendingInputs, input)
		}

		seq.inputs = seq.inputs[len(seq.pendingInputs):]
	}

	if len(inputIDs) == 0 {
		return nil
	}

	var options []model.OptionsFunc
	if image != nil {
		options = append(options, model.WithImage(image))
	}

	ctx := s.model.Backend().NewContext()
	defer ctx.Close()

	logit, err := model.Forward(ctx, s.model, append(options, model.WithCache(s.cache.cache), model.WithInputIDs(inputIDs), model.WithPositions(pos), model.WithOutputs(outputs), model.WithSequences(seqs))...)
	if err != nil {
		return err
	}

	f32s := logit.Floats()

nextSeq:
	for i, seq := range s.seqs {
		if seq == nil {
			continue
		}

		// After calling Forward, pending inputs are now in the cache
		if len(seq.pendingInputs) > 0 {
			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
			seq.pendingInputs = []input{}
		}

		// don't sample prompt processing
		if len(seq.inputs) != 0 {
			continue
		}

		seq.numPredicted++
		if seq.numPredicted == 1 {
			seq.startGenerationTime = time.Now()
		}

		// if done processing the prompt, generate an embedding and return
		if seq.embeddingOnly {
			/*embed := s.lc.GetEmbeddingsSeq(seq.cache.Id)
			if embed == nil {
				embed = s.lc.GetEmbeddingsIth(seq.iBatch)
			}
			seq.embedding <- embed*/
			s.removeSequence(i, "")
			continue
		}

		vocabSize := len(f32s) / len(outputs)
		seqLogits := f32s[seq.iBatch*vocabSize : (seq.iBatch+1)*vocabSize]

		// TODO(jessegross): The data type and number of outputs for the samplers seem inconsistent
		f64s := make([]float64, vocabSize)
		for j, f32 := range seqLogits {
			f64s[j] = float64(f32)
		}

		// do sampling
		f64s, err = sample.Sample(f64s, seq.samplers...)
		if err != nil {
			return err
		}

		var outputIDs []int32
		for _, f64 := range f64s {
			if s.model.(model.TextProcessor).Is(uint32(f64), model.SpecialEOS) {
				// The sequence is gone after removeSequence, so skip to the
				// next sequence. A plain continue here would only advance
				// this inner loop and the code below would then keep working
				// on the removed sequence.
				s.removeSequence(i, "stop")
				continue nextSeq
			}
			outputIDs = append(outputIDs, int32(f64))
		}

		if len(outputIDs) == 0 {
			continue
		}

		piece, err := s.model.(model.TextProcessor).Decode(outputIDs)
		if errors.Is(err, io.EOF) {
			continue
		} else if err != nil {
			return err
		}

		// The generated tokens become the inputs for the next batch.
		for _, id := range outputIDs {
			seq.inputs = append(seq.inputs, input{token: id})
		}

		seq.pendingResponses = append(seq.pendingResponses, piece)
		sequence := strings.Join(seq.pendingResponses, "")

		if ok, stop := common.FindStop(sequence, seq.stop); ok {
			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)

			var tokenTruncated bool
			origLen := len(seq.pendingResponses)
			seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
			newLen := len(seq.pendingResponses)

			// Update the cache based on the tokens that will be returned:
			// - We have more tokens than are currently in the cache because
			// the last ones generated weren't submitted to Forward
			// - Remove any stop sequences that we stripped out
			// - If truncateStop removed a portion of a token, drop that
			// - As defense-in-depth, if truncatedToken didn't find a stop token
			// remove the extra ones that we added to the cache len
			tokenLen := len(seq.cache.Inputs) + len(outputIDs)
			tokenLen -= origLen - newLen
			if tokenTruncated {
				tokenLen--
			}
			if origLen == newLen {
				tokenLen = len(seq.cache.Inputs)
			}
			seq.cache.Inputs = seq.cache.Inputs[:tokenLen]

			s.removeSequence(i, "stop")
			continue
		}

		// Hold the text back while it could still turn into a stop sequence.
		if common.ContainsStopSuffix(sequence, seq.stop) {
			continue
		}

		// Hold the text back until the trailing character is complete.
		if common.IncompleteUnicode(sequence) {
			continue
		}

		if !flushPending(seq) {
			s.removeSequence(i, "connection")
		}
	}

	return nil
}
// TODO (jmorganca): use structs from the api package to avoid duplication
// this way the api acts as a proxy instead of using a different api for the
// runner

// Options are the generation settings accepted in the JSON body of a
// completion request; it embeds api.Runner. Of these fields, the completion
// handler in this file currently reads NumKeep, NumPredict and Stop; the
// sampling-related fields are only referenced from the commented-out code in
// getSamplers.
type Options struct {
api.Runner
NumKeep int `json:"n_keep"`
Seed int `json:"seed"`
NumPredict int `json:"n_predict"`
TopK int `json:"top_k"`
TopP float32 `json:"top_p"`
MinP float32 `json:"min_p"`
TypicalP float32 `json:"typical_p"`
RepeatLastN int `json:"repeat_last_n"`
Temperature float32 `json:"temperature"`
RepeatPenalty float32 `json:"repeat_penalty"`
PresencePenalty float32 `json:"presence_penalty"`
FrequencyPenalty float32 `json:"frequency_penalty"`
Mirostat int `json:"mirostat"`
MirostatTau float32 `json:"mirostat_tau"`
MirostatEta float32 `json:"mirostat_eta"`
Stop []string `json:"stop"`
}
// ImageData is one image attached to a completion request.
type ImageData struct {
// encoded image bytes; decoded with image.Decode when the prompt is parsed
Data []byte `json:"data"`
// identifier matched against the number in an [img-<n>] prompt tag
ID int `json:"id"`
// NOTE(review): not read anywhere in the visible code - confirm intended use
AspectRatioID int `json:"aspect_ratio_id"`
}
// CompletionRequest is the JSON body accepted by the completion handler.
type CompletionRequest struct {
// prompt text; may contain [img-<n>] tags referring to entries of Images
Prompt string `json:"prompt"`
// images referenced by the prompt
Images []ImageData `json:"image_data"`
// only referenced from the commented-out sampling code in getSamplers
Grammar string `json:"grammar"`
// passed through to the input cache when loading a cache slot
CachePrompt bool `json:"cache_prompt"`
Options
}
// Timings reports the counters and durations sent with the final completion
// response.
type Timings struct {
// number of predictions counted for the sequence
PredictedN int `json:"predicted_n"`
// milliseconds elapsed since generation started
PredictedMS float64 `json:"predicted_ms"`
// number of prompt inputs of the sequence
PromptN int `json:"prompt_n"`
// milliseconds between the start of processing and the start of generation
PromptMS float64 `json:"prompt_ms"`
}
// CompletionResponse is one JSON object written to the completion stream.
// The completion handler in this file sends Content for intermediate chunks
// and Stop, StoppedLimit and Timings for the final one; the remaining fields
// are not populated by the visible code.
type CompletionResponse struct {
// generated text for this chunk
Content string `json:"content"`
// true on the final response of the stream
Stop bool `json:"stop"`
Model string `json:"model,omitempty"`
Prompt string `json:"prompt,omitempty"`
// true when the sequence ended because of the prediction limit
StoppedLimit bool `json:"stopped_limit,omitempty"`
PredictedN int `json:"predicted_n,omitempty"`
PredictedMS float64 `json:"predicted_ms,omitempty"`
PromptN int `json:"prompt_n,omitempty"`
PromptMS float64 `json:"prompt_ms,omitempty"`
Timings Timings `json:"timings"`
}
// getSamplers returns the samplers to use for a completion request. The
// request is currently ignored and a single greedy sampler is always
// returned; the commented-out block records the request fields that are
// meant to be honoured once the sampling code is available.
func getSamplers(_ CompletionRequest) []sample.Sampler {
	// TODO(jessegross): Waiting for sampling code

	/*var samplingParams llama.SamplingParams
	samplingParams.TopK = req.TopK
	samplingParams.TopP = req.TopP
	samplingParams.MinP = req.MinP
	samplingParams.TypicalP = req.TypicalP
	samplingParams.Temp = req.Temperature
	samplingParams.RepeatLastN = req.RepeatLastN
	samplingParams.PenaltyRepeat = req.RepeatPenalty
	samplingParams.PenaltyFreq = req.FrequencyPenalty
	samplingParams.PenaltyPresent = req.PresencePenalty
	samplingParams.Mirostat = req.Mirostat
	samplingParams.MirostatTau = req.MirostatTau
	samplingParams.MirostatEta = req.MirostatEta
	samplingParams.Seed = uint32(req.Seed)
	samplingParams.Grammar = req.Grammar*/

	samplers := make([]sample.Sampler, 0, 1)
	samplers = append(samplers, sample.Greedy())
	return samplers
}
// completion handles a streaming completion request.
//
// It decodes a CompletionRequest, builds a Sequence from it, waits for a free
// place among the server's parallel sequences, schedules the sequence and
// then streams every piece of generated text as a JSON CompletionResponse.
// When the sequence finishes, a final response with Stop set and the timing
// information is written. If the client disconnects, the sequence's quit
// channel is closed so that decoding stops.
func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
	var req CompletionRequest
	req.Options = Options(api.DefaultOptions())
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "Bad request", http.StatusBadRequest)
		return
	}

	// Set the headers to indicate streaming
	w.Header().Set("Content-Type", "application/json")
	w.Header().Set("Transfer-Encoding", "chunked")

	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "Streaming not supported", http.StatusInternalServerError)
		return
	}

	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
		numPredict: req.NumPredict,
		stop:       req.Stop,
		numKeep:    int32(req.NumKeep),
		samplers:   getSamplers(req),
		embedding:  false,
	})
	if err != nil {
		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
		return
	}

	// Ensure there is a place to put the sequence, released when removed from s.seqs
	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
		if errors.Is(err, context.Canceled) {
			slog.Info("aborting completion request due to client closing the connection")
		} else {
			slog.Error("Failed to acquire semaphore", "error", err)
		}
		return
	}

	s.mu.Lock()
	found := false
	for i, sq := range s.seqs {
		if sq == nil {
			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
			if err != nil {
				s.mu.Unlock()
				// The sequence never made it into s.seqs, so removeSequence
				// will not run for it; give the place back here.
				s.seqsSem.Release(1)
				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
				return
			}

			s.seqs[i] = seq
			s.cond.Signal()
			found = true
			break
		}
	}
	s.mu.Unlock()

	if !found {
		// Same as above: nothing was scheduled, so release the place.
		s.seqsSem.Release(1)
		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
		return
	}

	enc := json.NewEncoder(w)
	for {
		select {
		case <-r.Context().Done():
			close(seq.quit)
			return
		case content, ok := <-seq.responses:
			if ok {
				if err := enc.Encode(&CompletionResponse{
					Content: content,
				}); err != nil {
					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
					close(seq.quit)
					return
				}

				flusher.Flush()
			} else {
				// Send the final response
				if err := enc.Encode(&CompletionResponse{
					Stop:         true,
					StoppedLimit: seq.doneReason == "limit",
					Timings: Timings{
						PromptN:     seq.numPromptInputs,
						PromptMS:    float64(seq.startGenerationTime.Sub(seq.startProcessingTime).Milliseconds()),
						PredictedN:  seq.numPredicted,
						PredictedMS: float64(time.Since(seq.startGenerationTime).Milliseconds()),
					},
				}); err != nil {
					http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
				}

				return
			}
		}
	}
}
// EmbeddingRequest is the JSON body accepted by the embeddings handler.
type EmbeddingRequest struct {
// text to embed
Content string `json:"content"`
// passed through to the input cache when loading a cache slot
CachePrompt bool `json:"cache_prompt"`
}
// EmbeddingResponse is the JSON body written by the embeddings handler.
type EmbeddingResponse struct {
// value received from the sequence's embedding channel
Embedding []float32 `json:"embedding"`
}
// embeddings handles an embedding request.
//
// It decodes an EmbeddingRequest, builds an embedding-only Sequence, waits
// for a free place among the server's parallel sequences, schedules the
// sequence and writes whatever is received from the sequence's embedding
// channel as a JSON EmbeddingResponse.
func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
	var req EmbeddingRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
		return
	}

	w.Header().Set("Content-Type", "application/json")

	slog.Debug("embedding request", "content", req.Content)

	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
	if err != nil {
		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
		return
	}

	// Ensure there is a place to put the sequence, released when removed from s.seqs
	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
		if errors.Is(err, context.Canceled) {
			slog.Info("aborting embeddings request due to client closing the connection")
		} else {
			slog.Error("Failed to acquire semaphore", "error", err)
		}
		return
	}

	s.mu.Lock()
	found := false
	for i, sq := range s.seqs {
		if sq == nil {
			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
			if err != nil {
				s.mu.Unlock()
				// The sequence never made it into s.seqs, so removeSequence
				// will not run for it; give the place back here.
				s.seqsSem.Release(1)
				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
				return
			}

			s.seqs[i] = seq
			s.cond.Signal()
			found = true
			break
		}
	}
	s.mu.Unlock()

	if !found {
		// Same as above: nothing was scheduled, so release the place.
		s.seqsSem.Release(1)
		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
		return
	}

	// Blocks until a value is sent or the channel is closed.
	embedding := <-seq.embedding

	if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
		Embedding: embedding,
	}); err != nil {
		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
	}
}
// HealthResponse is the JSON body written by the health handler.
type HealthResponse struct {
// textual form of the server status, as produced by ServerStatus.ToString
Status string `json:"status"`
// value of the server's progress field
Progress float32 `json:"progress"`
}
// ServerStatus is the coarse state of the server used for health reporting.
type ServerStatus int

const (
	// ServerStatusReady means the server can serve requests.
	ServerStatusReady ServerStatus = iota
	// ServerStatusLoadingModel means the model is still being loaded.
	ServerStatusLoadingModel
	// ServerStatusError means the server is in an error state.
	ServerStatusError
)

// ToString returns the text reported for the status: "ok" when ready,
// "loading model" while loading, and "server error" for anything else.
func (s ServerStatus) ToString() string {
	if s == ServerStatusReady {
		return "ok"
	}
	if s == ServerStatusLoadingModel {
		return "loading model"
	}
	return "server error"
}
// health writes the server's current status and progress as a JSON
// HealthResponse.
func (s *Server) health(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")

	resp := HealthResponse{
		Status:   s.status.ToString(),
		Progress: s.progress,
	}

	err := json.NewEncoder(w).Encode(&resp)
	if err == nil {
		return
	}
	http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
// multiLPath collects the values of a flag that may be given several times.
// Its Set and String methods make it usable with flag.FlagSet.Var.
type multiLPath []string

// Set appends value to the list. It never fails.
func (m *multiLPath) Set(value string) error {
	paths := append(*m, value)
	*m = paths
	return nil
}

// String returns the collected values separated by ", ".
func (m *multiLPath) String() string {
	return strings.Join([]string(*m), ", ")
}
// loadModel loads the model at mpath, creates the input cache for it and then
// marks the server as ready, releasing everything waiting on s.ready.
//
// It panics if the model cannot be loaded, if any LoRA path was supplied
// (not yet implemented), or if the input cache cannot be created.
func (s *Server) loadModel(
	mpath string,
	lpath multiLPath,
	kvCacheType string,
	kvSize int,
	multiUserCache bool,
) {
	var err error

	if s.model, err = model.New(mpath); err != nil {
		panic(err)
	}

	// TODO(jessegross): LoRA loading
	if loras := lpath.String(); loras != "" {
		panic("loras are not yet implemented")
	}

	if s.cache, err = NewInputCache(s.model.Backend(), kvCacheType, int32(kvSize), s.parallel, multiUserCache); err != nil {
		panic(err)
	}

	s.status = ServerStatusReady
	s.ready.Done()
}
// Execute parses the runner command-line flags in args, starts loading the
// model in the background, starts the batch processing loop and serves the
// /embedding, /completion and /health endpoints on 127.0.0.1 at the
// requested port.
//
// It returns an error if the flags cannot be parsed, if the listening socket
// cannot be opened, or when the HTTP server stops serving. The batch loop's
// context is cancelled and the listener is closed on every return path.
func Execute(args []string) error {
	fs := flag.NewFlagSet("runner", flag.ExitOnError)
	mpath := fs.String("model", "", "Path to model binary file")
	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
	batchSize := fs.Int("batch-size", 512, "Batch size")
	_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
	_ = fs.Int("main-gpu", 0, "Main GPU")
	_ = fs.Bool("flash-attn", false, "Enable flash attention")
	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
	port := fs.Int("port", 8080, "Port to expose the server on")
	_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
	_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")

	var lpaths multiLPath
	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")

	fs.Usage = func() {
		fmt.Fprintf(fs.Output(), "Runner usage\n")
		fs.PrintDefaults()
	}
	if err := fs.Parse(args); err != nil {
		return err
	}

	level := slog.LevelInfo
	if *verbose {
		level = slog.LevelDebug
	}
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level:     level,
		AddSource: true,
		// Log only the file name of the source location, not its full path.
		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
			if attr.Key == slog.SourceKey {
				source := attr.Value.Any().(*slog.Source)
				source.File = filepath.Base(source.File)
			}
			return attr
		},
	})
	slog.SetDefault(slog.New(handler))
	slog.Info("starting ollama engine")
	// TODO(jessegross): Some system info would be useful

	server := &Server{
		batchSize: *batchSize,
		parallel:  *parallel,
		seqs:      make([]*Sequence, *parallel),
		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
		status:    ServerStatusLoadingModel,
	}

	// TODO(jessegross): Parameters that need to be implemented:
	// n-gpu-layers
	// main-gpu
	// flash-attn
	// threads
	// no-mmap
	// mlock
	// tensor-split

	/*var tensorSplitFloats []float32
	if *tensorSplit != "" {
		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)

		tensorSplitFloats = make([]float32, 0, len(stringFloats))
		for _, s := range stringFloats {
			f, _ := strconv.ParseFloat(s, 32)
			tensorSplitFloats = append(tensorSplitFloats, float32(f))
		}
	}*/

	// The handlers and the batch loop wait on server.ready until loadModel
	// signals that the model is available.
	server.ready.Add(1)
	go server.loadModel(*mpath, lpaths, *kvCacheType, *kvSize, *multiUserCache)

	server.cond = sync.NewCond(&server.mu)

	// Deferred so the batch loop's context is cancelled on every return path.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go server.run(ctx)

	addr := "127.0.0.1:" + strconv.Itoa(*port)
	listener, err := net.Listen("tcp", addr)
	if err != nil {
		fmt.Println("Listen error:", err)
		return err
	}
	defer listener.Close()

	mux := http.NewServeMux()
	mux.HandleFunc("/embedding", server.embeddings)
	mux.HandleFunc("/completion", server.completion)
	mux.HandleFunc("/health", server.health)

	httpServer := http.Server{
		Handler: mux,
	}

	log.Println("Server listening on", addr)
	// Serve always returns a non-nil error. Returning it (instead of exiting
	// the process from here) lets the deferred cleanup run and leaves the
	// decision about how to terminate to the caller.
	if err := httpServer.Serve(listener); err != nil {
		return fmt.Errorf("server error: %w", err)
	}

	return nil
}

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"errors"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"testing"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"errors"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"reflect"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"context"
@@ -24,6 +24,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/runner/common"
)
// input is an element of the prompt to process, either
@@ -498,12 +499,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
seq.pendingResponses = append(seq.pendingResponses, piece)
sequence := strings.Join(seq.pendingResponses, "")
if ok, stop := findStop(sequence, seq.stop); ok {
if ok, stop := common.FindStop(sequence, seq.stop); ok {
slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
var tokenTruncated bool
origLen := len(seq.pendingResponses)
seq.pendingResponses, tokenTruncated = truncateStop(seq.pendingResponses, stop)
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
newLen := len(seq.pendingResponses)
// Update the cache based on the tokens that will be returned:
@@ -524,11 +525,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
continue
}
if containsStopSuffix(sequence, seq.stop) {
if common.ContainsStopSuffix(sequence, seq.stop) {
continue
}
if incompleteUnicode(sequence) {
if common.IncompleteUnicode(sequence) {
continue
}
@@ -885,9 +886,6 @@ func (s *Server) loadModel(
}
func Execute(args []string) error {
if args[0] == "runner" {
args = args[1:]
}
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
ppath := fs.String("mmproj", "", "Path to projector binary file")

24
runner/runner.go Normal file
View File

@@ -0,0 +1,24 @@
package runner
import (
"github.com/ollama/ollama/runner/newrunner"
"github.com/ollama/ollama/runner/oldrunner"
)
// Execute dispatches to one of the runner implementations.
//
// A leading "runner" argument, if present, is dropped. If the next argument
// is "--new-runner" it is dropped as well and the remaining arguments are
// passed to newrunner.Execute; otherwise the arguments are passed to
// oldrunner.Execute. An empty argument list is handed to the old runner
// unchanged instead of causing an index-out-of-range panic.
func Execute(args []string) error {
	if len(args) > 0 && args[0] == "runner" {
		args = args[1:]
	}

	if len(args) > 0 && args[0] == "--new-runner" {
		return newrunner.Execute(args[1:])
	}

	return oldrunner.Execute(args)
}

39
runners/common.go Normal file
View File

@@ -0,0 +1,39 @@
package runners
import (
"golang.org/x/sys/cpu"
)
// CPUCapability identifies the level of vector extensions a CPU offers.
type CPUCapability uint32

// Override at build time when building base GPU runners
// var GPURunnerCPUCapability = CPUCapabilityAVX

const (
	// CPUCapabilityNone means no vector extensions.
	CPUCapabilityNone CPUCapability = iota
	// CPUCapabilityAVX means AVX.
	CPUCapabilityAVX
	// CPUCapabilityAVX2 means AVX2.
	CPUCapabilityAVX2
	// TODO AVX512
)

// String returns "avx" or "avx2" for the corresponding capabilities and
// "no vector extensions" for every other value.
func (c CPUCapability) String() string {
	if c == CPUCapabilityAVX2 {
		return "avx2"
	}
	if c == CPUCapabilityAVX {
		return "avx"
	}
	return "no vector extensions"
}
// GetCPUCapability returns the highest capability reported by the cpu
// package's X86 feature flags: AVX2 if available, otherwise AVX, otherwise
// none.
func GetCPUCapability() CPUCapability {
	switch {
	case cpu.X86.HasAVX2:
		return CPUCapabilityAVX2
	case cpu.X86.HasAVX:
		return CPUCapabilityAVX
	default:
		// else LCD
		return CPUCapabilityNone
	}
}

21
scripts/build.sh Normal file
View File

@@ -0,0 +1,21 @@
#!/bin/sh

# Runs the macOS, Linux and Docker build scripts that live next to this
# script, exporting the given VERSION for them.

set -eu

# usage prints the expected invocation and exits with a failure status.
usage() {
    echo "usage: $(basename "$0") VERSION"
    exit 1
}

[ "$#" -eq 1 ] || usage

export VERSION="$1"

# Directory containing this script. Every expansion is quoted so the script
# keeps working when its path contains spaces.
SCRIPT_DIR="$(dirname "$0")"

# build universal MacOS binary
sh "$SCRIPT_DIR/build_darwin.sh"

# build arm64 and amd64 Linux binaries
sh "$SCRIPT_DIR/build_linux.sh"

# build arm64 and amd64 Docker images
sh "$SCRIPT_DIR/build_docker.sh"

View File

@@ -2,92 +2,53 @@
set -e
status() { echo >&2 ">>> $@"; }
usage() {
echo "usage: $(basename $0) [build [sign]]"
exit 1
}
. $(dirname $0)/env.sh
export VERSION=${VERSION:-$(git describe --tags --dirty)}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${VERSION#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
export CGO_CPPFLAGS='-mmacosx-version-min=11.3'
mkdir -p dist
ARCHS="arm64 amd64"
while getopts "a:h" OPTION; do
case $OPTION in
a) ARCHS=$OPTARG ;;
h) usage ;;
esac
done
# These require Xcode v13 or older to target MacOS v11
# If installed to an alternate location use the following to enable
# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
export CGO_CPPFLAGS=-mmacosx-version-min=11.3
shift $(( $OPTIND - 1 ))
rm -rf llama/build dist/darwin-*
_build_darwin() {
for ARCH in $ARCHS; do
status "Building darwin $ARCH"
INSTALL_PREFIX=dist/darwin-$ARCH/
GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 go build -o $INSTALL_PREFIX .
# Generate the universal ollama binary for stand-alone usage: metal + avx
echo "Building binary"
echo "Building darwin arm64"
GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
echo "Building darwin amd64 with AVX enabled"
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
if [ "$ARCH" = "amd64" ]; then
status "Building darwin $ARCH dynamic backends"
cmake -B build/darwin-$ARCH \
-DCMAKE_OSX_ARCHITECTURES=x86_64 \
-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3
cmake --build build/darwin-$ARCH --target ggml-cpu -j
install build/darwin-$ARCH/lib/ollama/*.{dylib,so} $INSTALL_PREFIX
fi
done
}
_sign_darwin() {
status "Creating universal binary..."
lipo -create -output dist/darwin/ollama dist/darwin/*/ollama
if [ -z "$APPLE_IDENTITY" ]; then
status "No APPLE_IDENTITY set, skipping code signing"
return
fi
for F in dist/darwin/ollama dist/darwin/amd64/lib*; do
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime $F
done
# create a temporary zip for notarization
TEMP=$(mktemp -u).zip
ditto -c -k --keepParent dist/darwin/ollama "$TEMP"
xcrun notarytool submit dist/darwin/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
rm -f "$TEMP"
# create a universal tarball
tar -cf dist/ollama-darwin.tar --strip-components 2 dist/darwin/ollama
tar -rf dist/ollama-darwin.tar --strip-components 3 dist/darwin/amd64/lib*
gzip -9vc <dist/ollama-darwin.tar >dist/ollama-darwin.tgz
}
_build_macapp() {
# build and optionally sign the mac app
npm install --prefix macapp
if [ -n "$APPLE_IDENTITY" ]; then
npm run --prefix macapp make:sign
else
npm run --prefix macapp make
fi
mv ./macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
}
if [ "$#" -eq 0 ]; then
_build_darwin
_sign_darwin
_build_macapp
exit 0
# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
else
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
rm -f dist/temp.zip
# Build the app bundle
echo "Building app"
echo "Building darwin amd64 with runners"
rm dist/darwin-amd64/bin/ollama
GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
# Generate the universal ollama binary for the app bundle: metal + no-avx
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
# build and optionally sign the mac app
npm install --prefix macapp
if [ -n "$APPLE_IDENTITY" ]; then
npm run --prefix macapp make:sign
else
npm run --prefix macapp make
fi
cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
for CMD in "$@"; do
case $CMD in
build) _build_darwin ;;
sign) _sign_darwin ;;
macapp) _build_macapp ;;
*) usage ;;
esac
done

View File

@@ -80,61 +80,18 @@ function checkEnv() {
function buildOllama() {
if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
write-host "Building ollama runners"
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
# Default first, then conditionally ROCm and CUDA v11
write-host "Building Default native backend libraries"
$env:CMAKE_GENERATOR="ninja"
& cmake --preset Default
& make -j 12 dist
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset Default -j 12
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build -j 12
# TODO - add steps for v11 and ROCm
#
# if ("$script:CUDA_DIRS".Contains("v11") -and "$script:CUDA_DIRS".Contains("v12")) {
# # We assume the default is v12, so override for v11
# $origCUDA_PATH=$env:CUDA_PATH
# $hashEnv = @{}
# Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
# $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
# write-host "$v11"
# # $env:CUDA_PATH=$hashEnv[$v11]
# # $env:CUDACXX=$hashEnv[$v11]+"\bin\nvcc.exe"
# $env:CUDAToolkit_ROOT=$hashEnv[$v11]
# # ls env:
# write-host "Building CUDA v11 backend libraries"
# & cmake --preset "CUDA 11"
# $env:CUDA_PATH=$origCUDA_PATH
# exit(1)
# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# # & cmake --build --preset "CUDA 11" -j 12
# # if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# }
# if ($env:HIP_PATH) {
# write-host "Building ROCm backend libraries"
# $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
# $env:HIP_PLATFORM="amd"
# $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
# & cmake --preset "ROCm"
# $env:HIPCXX=""
# $env:HIP_PLATFORM=""
# $env:CMAKE_PREFIX_PATH=""
# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# & cmake --build --preset "ROCm" -j 12
# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# }
} else {
write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
}
write-host "Building ollama CLI"
& go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
cp .\ollama.exe "${script:DIST_DIR}\"
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
}
function buildApp() {

20
scripts/fast.sh Executable file
View File

@@ -0,0 +1,20 @@
#!/bin/sh
# Wrapper script to speed up builds by disabling some permutations and
# reducing the compatibility matrix.
# Don't use for release builds, but suitable for local developer iteration.
#
# usage: ./scripts/fast.sh <build_script>

# Only build cuda v12
export OLLAMA_SKIP_CUDA_11_GENERATE=1
# Major versions only
export CUDA_V12_ARCHITECTURES="60;70;80;90"
# Skip ROCm
export OLLAMA_SKIP_ROCM_GENERATE=1
# Disable various less common quants and fattn
export OLLAMA_FAST_BUILD=1

# Exactly one argument (the build script to run) is required.
if [ $# -ne 1 ] ; then
    echo "Usage: ./scripts/fast.sh <build_script>"
    exit 1
fi

# Replace this shell with the build script so it inherits the exports above.
# NOTE(review): $1 is left unquoted exactly as before, so an argument that
# contains spaces is split into a command plus arguments — confirm whether
# callers rely on that before quoting it.
exec $1

25
scripts/publish.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/sh
# Builds the artifacts for the host OS, tags v$VERSION, and publishes a
# GitHub pre-release with everything found in ./dist.
#
# Expects VERSION in the environment and uses cwd-relative paths
# (./scripts, ./dist), so run it from the repository root.

# Set your variables here.
# NOTE(review): REPO is not referenced anywhere else in this script.
REPO="jmorganca/ollama"

# Check if VERSION is set
if [ -z "${VERSION:-}" ]; then
    echo "VERSION is not set. Please set the VERSION environment variable."
    exit 1
fi

OS=$(go env GOOS)

# The build scripts live in scripts/ (not script/). Abort when the build
# fails so a broken or stale ./dist is never tagged and uploaded.
"./scripts/build_${OS}.sh" || exit 1

# Create a new tag if it doesn't exist.
if ! git rev-parse "v$VERSION" >/dev/null 2>&1; then
    git tag "v$VERSION"
fi

git push origin "v$VERSION"

# Create a new release.
gh release create -p "v$VERSION" -t "v$VERSION"

# Upload the zip file.
gh release upload "v$VERSION" ./dist/* --clobber

78
scripts/rh_linux_deps.sh Normal file
View File

@@ -0,0 +1,78 @@
#!/bin/sh
# Script for common Dockerfile dependency installation in redhat linux based images
#
# Optional environment variables read below:
#   CMAKE_VERSION  - when non-empty, that CMake release is downloaded and unpacked into /usr
#   GOLANG_VERSION - when non-empty, that Go release is downloaded and unpacked into /usr/local
# -e: abort on the first failing command; -x: echo each command as it runs.
set -ex
# Fail a pipeline when any stage fails, not only the last one.
# NOTE(review): pipefail is not accepted by every /bin/sh implementation — confirm for the target images.
set -o pipefail
# Hardware name from uname; compared against "x86_64" in the branches below.
MACHINE=$(uname -m)
# Distro detection is a case-insensitive match on /etc/system-release.
if grep -i "centos" /etc/system-release >/dev/null; then
# As of 7/1/2024 mirrorlist.centos.org has been taken offline, so adjust accordingly
# Rewrite the yum repo files: swap mirror.centos.org for vault.centos.org,
# uncomment the baseurl lines, and comment out the mirrorlist lines.
sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
# Centos 7 derivatives have too old of a git version to run our generate script
# uninstall and ignore failures
yum remove -y git
yum -y install epel-release centos-release-scl
# The release packages reinstate the mirrors, undo that again
sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
yum -y install dnf
# Replacement git: git236 via the IUS release repo on x86_64; otherwise the
# rh-git227 package, symlinked into /usr/local/bin.
if [ "${MACHINE}" = "x86_64" ]; then
yum -y install https://repo.ius.io/ius-release-el7.rpm
dnf install -y git236
else
dnf install -y rh-git227-git
ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
fi
# Compiler toolchain and archive utilities for the CentOS branch.
dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz findutils
elif grep -i "rocky" /etc/system-release >/dev/null; then
# Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
# Write a repo definition pointing at the Rocky 8.5 AppStream vault; \$basearch
# is escaped so the literal text $basearch lands in the repo file.
cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
[vault]
name=Rocky Vault
baseurl=https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
EOF
# The gcc-toolset-10 packages are pinned to an exact version-release.
dnf install -y git \
gcc-toolset-10-gcc-10.2.1-8.2.el8 \
gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
findutils \
yum-utils \
pigz
else
# Neither "centos" nor "rocky" matched: refuse to continue.
echo "ERROR Unexpected distro"
exit 1
fi
# ccache: on x86_64 unpack the upstream v4.10.2 release tarball into /tmp and
# move the binary into /usr/local/bin; on other machines install it with yum
# after installing epel-release.
if [ "${MACHINE}" = "x86_64" ] ; then
curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
mv /tmp/ccache /usr/local/bin/
else
yum -y install epel-release
yum install -y ccache
fi
# Optional CMake: unpack the requested release for this machine type over /usr.
if [ -n "${CMAKE_VERSION}" ]; then
curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
fi
# Optional Go toolchain: x86_64 selects the amd64 archive and any other
# machine type selects arm64; the archive is unpacked into /usr/local and
# go/gofmt are symlinked into /usr/local/bin.
if [ -n "${GOLANG_VERSION}" ]; then
if [ "${MACHINE}" = "x86_64" ]; then
GO_ARCH="amd64"
else
GO_ARCH="arm64"
fi
mkdir -p /usr/local
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local
ln -s /usr/local/go/bin/go /usr/local/bin/go
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt
fi

View File

@@ -10,6 +10,7 @@ import (
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/model/mllama"
"github.com/ollama/ollama/template"
@@ -92,26 +93,33 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
var imgData llm.ImageData
if isMllama {
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
if envconfig.NewRunners() {
imgData = llm.ImageData{
ID: len(images),
Data: i,
}
} else {
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
return "", nil, err
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
return "", nil, err
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: ar,
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: ar,
}
}
imgPrompt = "<|image|>"
} else {

View File

@@ -203,7 +203,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
if isMllama {
if isMllama && !envconfig.NewRunners() {
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})