Compare commits
28 Commits
brucemacd/... → v0.9.3-rc4

| Author | SHA1 | Date |
|---|---|---|
|  | ad118d8b13 |  |
|  | f08534137b |  |
|  | 4b4a90f233 |  |
|  | 03274a6b2f |  |
|  | cc6463ebca |  |
|  | 405d2f628f |  |
|  | a3f7dd3e98 |  |
|  | c85c0ebf89 |  |
|  | 10a8e04a8d |  |
|  | 1c6669e64c |  |
|  | b2b270ad5d |  |
|  | 2bb69b40c7 |  |
|  | 65bff664cb |  |
|  | c088ac0e79 |  |
|  | 0a066cfd91 |  |
|  | 87b7af6cee |  |
|  | f2527b08fb |  |
|  | 8bcb3125c1 |  |
|  | 6baf1e31e2 |  |
|  | ed567ef43b |  |
|  | a6e64fbdf2 |  |
|  | 60cfa2a203 |  |
|  | 55bbf3b4a1 |  |
|  | 6bda1d2479 |  |
|  | 9e125d884c |  |
|  | 20c5fd39c8 |  |
|  | d2ee599dcf |  |
|  | 6ed8898590 |  |
.github/workflows/release.yaml (vendored) — 51 changed lines
@@ -103,21 +103,18 @@ jobs:
arch: [amd64]
preset: ['CPU']
include:
- os: windows
arch: amd64
preset: 'CUDA 11'
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
cuda-version: '11.3'
- os: windows
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
cuda-version: '12.8'
flags: ''
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -160,6 +157,9 @@ jobs:
echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: matrix.preset == 'CPU'
run: |
echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
@@ -178,9 +178,9 @@ jobs:
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
- name: Build target "${{ matrix.preset }}"
run: |
Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}"
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
@@ -246,7 +246,7 @@ jobs:
dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe

windows-sign:
runs-on: windows-2022
runs-on: windows
environment: release
needs: [windows-depends, windows-build]
steps:
@@ -322,16 +322,21 @@ jobs:
- run: |
for COMPONENT in bin/* lib/ollama/*; do
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
esac
done
working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
- run: |
echo "Manifests"
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
echo $ARCHIVE
cat $ARCHIVE
done
- run: |
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
@@ -470,8 +475,18 @@ jobs:
- uses: actions/download-artifact@v4
with:
pattern: dist-linux-*
path: dist
merge-multiple: true
path: stage
merge-multiple: false
- name: Merge linux amd64 payload
working-directory: stage/dist-linux-amd64-archive
run: |
tar zxf ollama-linux-amd64.tgz
tar zxf ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz
rm -f ollama-linux-amd64.tgz ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz
tar -c -f- --owner 0 --group 0 . | pigz -9vc > ../ollama-linux-amd64.tgz
- name: Cleanup linux payloads
run: |
find stage -name ollama-linux\*.tgz -exec mv {} dist/ \;
- run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt
working-directory: dist
- name: Create or update Release
.github/workflows/test.yaml (vendored) — 17 changed lines
@@ -36,7 +36,7 @@ jobs:
| xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
}

echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT
echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT

linux:
needs: [changes]
@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:11.8.0-devel-ubuntu22.04
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,11 +78,11 @@ jobs:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010'
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runs-on: windows
steps:
- run: |
@@ -102,7 +102,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
}

$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -120,6 +120,9 @@ jobs:
echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
uses: actions/cache/save@v4
with:
@@ -133,8 +136,8 @@ jobs:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
- run: |
Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --build --parallel --preset "${{ matrix.preset }}"
env:
@@ -78,14 +78,13 @@ if(CMAKE_CUDA_COMPILER)

find_package(CUDAToolkit)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
)
endif()

@@ -116,7 +115,11 @@ if(CMAKE_HIP_COMPILER)

set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
install(TARGETS ggml-hip
RUNTIME_DEPENDENCIES
RUNTIME_DEPENDENCY_SET rocm
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
)
install(RUNTIME_DEPENDENCY_SET rocm
DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
PRE_EXCLUDE_REGEXES ".*"
@@ -17,20 +17,12 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
}
},
{
@@ -58,6 +50,7 @@
"name": "ROCm 6",
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_FLAGS": "-parallel-jobs=4",
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
}
}
@@ -78,11 +71,6 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
Dockerfile — 24 changed lines
@@ -7,12 +7,13 @@ ARG JETPACK5VERSION=r35.4.1
ARG JETPACK6VERSION=r36.4.0
ARG CMAKEVERSION=3.31.2

# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
# We require gcc v10 minimum. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
RUN yum install -y yum-utils \
&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
&& dnf install -y ccache \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

@@ -38,15 +39,6 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8

FROM base AS cuda-11
ARG CUDA11VERSION=11.3
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8

FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -98,17 +90,15 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .

FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
COPY --from=cuda-12 dist/lib/ollama /lib/ollama

FROM --platform=linux/arm64 scratch AS arm64
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6

FROM scratch AS rocm
COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
COPY --from=rocm-6 dist/lib/ollama /lib/ollama

FROM ${FLAVOR} AS archive
COPY --from=cpu dist/lib/ollama /lib/ollama
@@ -409,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)

### Cloud
@@ -1,178 +0,0 @@
|
||||
package benchmark
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
// Command line flags
|
||||
var modelFlag string
|
||||
|
||||
func init() {
|
||||
flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
|
||||
flag.Lookup("m").DefValue = "model"
|
||||
}
|
||||
|
||||
// modelName returns the model name from flags, failing the test if not set
|
||||
func modelName(b *testing.B) string {
|
||||
if modelFlag == "" {
|
||||
b.Fatal("Error: -m flag is required for benchmark tests")
|
||||
}
|
||||
return modelFlag
|
||||
}
|
||||
|
||||
type TestCase struct {
|
||||
name string
|
||||
prompt string
|
||||
maxTokens int
|
||||
}
|
||||
|
||||
// runGenerateBenchmark contains the common generate and metrics logic
|
||||
func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
|
||||
start := time.Now()
|
||||
var ttft time.Duration
|
||||
var metrics api.Metrics
|
||||
|
||||
err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
|
||||
if ttft == 0 && resp.Response != "" {
|
||||
ttft = time.Since(start)
|
||||
}
|
||||
if resp.Done {
|
||||
metrics = resp.Metrics
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Report custom metrics as part of the benchmark results
|
||||
b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
|
||||
b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
|
||||
|
||||
// Token throughput metrics
|
||||
promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds()
|
||||
genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds()
|
||||
b.ReportMetric(promptThroughput, "prompt_tok/s")
|
||||
b.ReportMetric(genThroughput, "gen_tok/s")
|
||||
|
||||
// Token counts
|
||||
b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
|
||||
b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkColdStart runs benchmarks with model loading from cold state
|
||||
func BenchmarkColdStart(b *testing.B) {
|
||||
client := setup(b)
|
||||
tests := []TestCase{
|
||||
{"short_prompt", "Write a long story", 100},
|
||||
{"medium_prompt", "Write a detailed economic analysis", 500},
|
||||
{"long_prompt", "Write a comprehensive AI research paper", 1000},
|
||||
}
|
||||
m := modelName(b)
|
||||
|
||||
for _, tt := range tests {
|
||||
b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
|
||||
ctx := b.Context()
|
||||
|
||||
// Set number of tokens as our throughput metric
|
||||
b.SetBytes(int64(tt.maxTokens))
|
||||
|
||||
for b.Loop() {
|
||||
b.StopTimer()
|
||||
// Ensure model is unloaded before each iteration
|
||||
unload(client, m, b)
|
||||
b.StartTimer()
|
||||
|
||||
req := &api.GenerateRequest{
|
||||
Model: m,
|
||||
Prompt: tt.prompt,
|
||||
Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
|
||||
}
|
||||
|
||||
runGenerateBenchmark(b, ctx, client, req)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkWarmStart runs benchmarks with pre-loaded model
|
||||
func BenchmarkWarmStart(b *testing.B) {
|
||||
client := setup(b)
|
||||
tests := []TestCase{
|
||||
{"short_prompt", "Write a long story", 100},
|
||||
{"medium_prompt", "Write a detailed economic analysis", 500},
|
||||
{"long_prompt", "Write a comprehensive AI research paper", 1000},
|
||||
}
|
||||
m := modelName(b)
|
||||
|
||||
for _, tt := range tests {
|
||||
b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
|
||||
ctx := b.Context()
|
||||
|
||||
// Pre-warm the model
|
||||
warmup(client, m, tt.prompt, b)
|
||||
|
||||
// Set number of tokens as our throughput metric
|
||||
b.SetBytes(int64(tt.maxTokens))
|
||||
|
||||
for b.Loop() {
|
||||
req := &api.GenerateRequest{
|
||||
Model: m,
|
||||
Prompt: tt.prompt,
|
||||
Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
|
||||
}
|
||||
|
||||
runGenerateBenchmark(b, ctx, client, req)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// setup verifies server and model availability
|
||||
func setup(b *testing.B) *api.Client {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
|
||||
b.Fatalf("Model unavailable: %v", err)
|
||||
}
|
||||
|
||||
return client
|
||||
}
|
||||
|
||||
// warmup ensures the model is loaded and warmed up
|
||||
func warmup(client *api.Client, model string, prompt string, b *testing.B) {
|
||||
for range 3 {
|
||||
err := client.Generate(
|
||||
context.Background(),
|
||||
&api.GenerateRequest{
|
||||
Model: model,
|
||||
Prompt: prompt,
|
||||
Options: map[string]any{"num_predict": 50, "temperature": 0.1},
|
||||
},
|
||||
func(api.GenerateResponse) error { return nil },
|
||||
)
|
||||
if err != nil {
|
||||
b.Logf("Error during model warm-up: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// unload forces model unloading using KeepAlive: 0 parameter
|
||||
func unload(client *api.Client, model string, b *testing.B) {
|
||||
req := &api.GenerateRequest{
|
||||
Model: model,
|
||||
KeepAlive: &api.Duration{Duration: 0},
|
||||
}
|
||||
if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
|
||||
b.Logf("Unload error: %v", err)
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
@@ -2,9 +2,6 @@ package convert

import (
"fmt"
"io"
"slices"
"strings"

"github.com/ollama/ollama/fs/ggml"
)
@@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
}

func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
oldnew := []string{
"model.layers", "blk",
"w1", "ffn_gate_exps",
"w2", "ffn_down_exps",
"w3", "ffn_up_exps",
}

for i := range p.NumLocalExperts {
oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
}

// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
namer := strings.NewReplacer(oldnew...)
experts := make(map[string]experts)

// merge experts into a single tensor while removing them from ts
ts = slices.DeleteFunc(ts, func(t Tensor) bool {
if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
return false
}

name := namer.Replace(t.Name())
experts[name] = append(experts[name], t)
return true
})

var out []*ggml.Tensor
for n, e := range experts {
// TODO(mxyng): sanity check experts
out = append(out, &ggml.Tensor{
Name: n,
Kind: e[0].Kind(),
Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
WriterTo: e,
merges := make([]merge, 0, p.NumHiddenLayers*6)
for i := range p.NumHiddenLayers {
merges = append(merges, merge{
fmt.Sprintf("blk.%d.*.w1.weight", i),
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.*.w1.bias", i),
fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
}, merge{
fmt.Sprintf("blk.%d.*.w2.weight", i),
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.*.w2.bias", i),
fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
}, merge{
fmt.Sprintf("blk.%d.*.w3.weight", i),
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.*.w3.bias", i),
fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
})
}

out, ts := mergeTensors(ts, merges...)
return append(out, p.llamaModel.Tensors(ts)...)
}

func (p *mixtralModel) Replacements() []string {
return append(
p.llamaModel.Replacements(),
"model.layers", "blk",
"block_sparse_moe.gate", "ffn_gate_inp",
"block_sparse_moe.experts.", ".",
)
}

type experts []Tensor

func (e experts) WriteTo(w io.Writer) (int64, error) {
// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
for _, t := range e {
// the canonical merged experts tensor stacks all experts along a new, 0 axis,
// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
// this accomplishes the same thing by writing each expert tensor in sequence
if _, err := t.WriteTo(w); err != nil {
return 0, err
}
}

return 0, nil
}
@@ -2,7 +2,9 @@ package convert

import (
"cmp"
"io"
"iter"
"path"
"slices"
"strings"

@@ -74,3 +76,54 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
}
}
}

type merge struct {
pattern, name string
}

// mergeTensors merges tensors that match a given pattern into a single tensor.
func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
var matched []Tensor
for i := range merges {
matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
matched, _ := path.Match(merges[i].pattern, t.Name())
return matched
})

if len(matched) > 0 {
out = append(out, &ggml.Tensor{
Name: merges[i].name,
Kind: matched[0].Kind(),
Shape: append([]uint64{uint64(len(matched))}, matched[0].Shape()...),
WriterTo: mergeGroup(matched),
})
}
}

return out, unmatched
}

// slicesSplitFunc splits a slice into two slices based on a predicate function.
func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) {
for _, e := range s {
if fn(e) {
matched = append(matched, e)
} else {
unmatched = append(unmatched, e)
}
}

return matched, unmatched
}

type mergeGroup []Tensor

func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
for _, t := range g {
if _, err := t.WriteTo(w); err != nil {
return 0, err
}
}

return 0, nil
}
@@ -9,6 +9,8 @@ import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
)
|
||||
|
||||
@@ -302,3 +304,99 @@ func TestSplitDim(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestMerge(t *testing.T) {
|
||||
unmatched := []Tensor{
|
||||
&fakeTensor{
|
||||
name: "a.0.b",
|
||||
shape: []uint64{5, 2},
|
||||
data: []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
|
||||
},
|
||||
&fakeTensor{
|
||||
name: "a.1.b",
|
||||
shape: []uint64{5, 2},
|
||||
data: []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
|
||||
},
|
||||
&fakeTensor{
|
||||
name: "c.0.d",
|
||||
shape: []uint64{5, 2},
|
||||
data: []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
|
||||
},
|
||||
&fakeTensor{
|
||||
name: "c.1.d",
|
||||
shape: []uint64{5, 2},
|
||||
data: []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
|
||||
},
|
||||
&fakeTensor{
|
||||
name: "e.0.f",
|
||||
shape: []uint64{5, 2},
|
||||
data: []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
|
||||
},
|
||||
}
|
||||
|
||||
checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
|
||||
for i := range n {
|
||||
got := matched[i]
|
||||
if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
|
||||
t.Errorf("unexpected (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := got.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, 20)
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
offset := 10 + (i * 20)
|
||||
want := make([]float32, 20)
|
||||
for j := range 20 {
|
||||
want[j] = float32(offset + j)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(want, f32s); diff != "" {
|
||||
t.Errorf("unexpected data (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("single merge", func(t *testing.T) {
|
||||
matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
|
||||
if len(unmatched) != 3 {
|
||||
t.Error("expected 3 remaining tensors, got", len(unmatched))
|
||||
}
|
||||
|
||||
if len(matched) != 1 {
|
||||
t.Error("expected 1 merged tensor, got", len(matched))
|
||||
}
|
||||
|
||||
checkMatched(t, 1, matched)
|
||||
})
|
||||
|
||||
t.Run("multiple merges", func(t *testing.T) {
|
||||
matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
|
||||
if len(unmatched) != 1 {
|
||||
t.Error("expected 1 remaining tensors, got", len(unmatched))
|
||||
}
|
||||
|
||||
if len(matched) != 2 {
|
||||
t.Error("expected 2 merged tensor, got", len(matched))
|
||||
}
|
||||
|
||||
checkMatched(t, 2, matched)
|
||||
})
|
||||
|
||||
t.Run("no match", func(t *testing.T) {
|
||||
matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
|
||||
if len(unmatched) != 5 {
|
||||
t.Error("expected 5 remaining tensors, got", len(unmatched))
|
||||
}
|
||||
|
||||
if len(matched) != 0 {
|
||||
t.Error("expected no merged tensors, got", len(matched))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
package discover

import (
"fmt"
"log/slog"
"os"
"regexp"
@@ -55,10 +56,13 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
}
return "sbsa"
}

// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
// The detected driver is older than Feb 2023
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
return "v11"
}
return "v12"
@@ -12,7 +12,7 @@ import (
// '../lib/ollama' on Linux and the executable's directory on macOS
// note: distribution builds, additional GPU-specific libraries are
// found in subdirectories of the returned path, such as
// 'cuda_v11', 'cuda_v12', 'rocm', etc.
// 'cuda_v12', 'rocm', etc.
var LibOllamaPath string = func() string {
exe, err := os.Executable()
if err != nil {
@@ -1,59 +0,0 @@
# Benchmark

Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.

## When to use

Run these benchmarks when:
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups

## Prerequisites
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
## Usage and Examples

>[!NOTE]
>All commands must be run from the root directory of the Ollama project.

Basic syntax:
```bash
go test -bench=. ./benchmark/... -m $MODEL_NAME
```

Required flags:
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark

Optional flags:
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)

Common usage patterns:

Single benchmark run with a model specified:
```bash
go test -bench=. ./benchmark/... -m llama3.3
```

## Output metrics

The benchmark reports several key metrics:

- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed

Each benchmark runs two scenarios:
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory

Three prompt lengths are tested for each scenario:
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)
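The optional flags above compose with the basic command; a hypothetical multi-run invocation for statistical analysis (the model name, repetition count, and timeout are illustrative, not from the original document):

```bash
# Illustrative: repeat the benchmark 5 times with a longer timeout
go test -bench=. -count 5 -timeout 30m ./benchmark/... -m llama3.3
```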
@@ -1,6 +1,6 @@
# GPU
## Nvidia
Ollama supports Nvidia GPUs with compute capability 5.0+.
Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

Check your compute compatibility to see if your card is supported:
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
In the server log, you will see a message that looks something like this (varies from release to release):

```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
```

**Experimental LLM Library Override**
fs/ggml/ggml.go — 103 changed lines
@@ -34,7 +34,8 @@ func (kv KV) Kind() string {
|
||||
}
|
||||
|
||||
func (kv KV) ParameterCount() uint64 {
|
||||
return keyValue(kv, "general.parameter_count", uint64(0))
|
||||
val, _ := keyValue(kv, "general.parameter_count", uint64(0))
|
||||
return val
|
||||
}
|
||||
|
||||
func (kv KV) FileType() FileType {
|
||||
@@ -53,16 +54,27 @@ func (kv KV) EmbeddingLength() uint64 {
|
||||
return uint64(kv.Uint("embedding_length"))
|
||||
}
|
||||
|
||||
func (kv KV) HeadCount() uint64 {
|
||||
return uint64(kv.Uint("attention.head_count"))
|
||||
func (kv KV) HeadCountMax() uint64 {
|
||||
// TODO(drifkin): using the max value can cause an overestimation. In the
|
||||
// future if array values become more popular, we can adapt the more invasive
|
||||
// <https://github.com/ollama/ollama/pull/10225>
|
||||
return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
|
||||
}
|
||||
|
||||
func (kv KV) HeadCountKV() uint64 {
|
||||
return uint64(kv.Uint("attention.head_count_kv", 1))
|
||||
func (kv KV) HeadCountMin() uint64 {
|
||||
return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCount() uint64 {
|
||||
if heads := kv.HeadCount(); heads > 0 {
|
||||
func (kv KV) HeadCountKVMax() uint64 {
|
||||
return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
|
||||
}
|
||||
|
||||
func (kv KV) HeadCountKVMin() uint64 {
|
||||
return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountMax() uint64 {
|
||||
if heads := kv.HeadCountMin(); heads > 0 {
|
||||
return kv.EmbeddingLength() / heads
|
||||
}
|
||||
|
||||
@@ -70,15 +82,11 @@ func (kv KV) EmbeddingHeadCount() uint64 {
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountK() uint64 {
|
||||
return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
|
||||
return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountV() uint64 {
|
||||
return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
|
||||
}
|
||||
|
||||
func (kv KV) GQA() uint64 {
|
||||
return kv.HeadCount() / kv.HeadCountKV()
|
||||
return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
|
||||
}
|
||||
|
||||
func (kv KV) ContextLength() uint64 {
|
||||
@@ -90,35 +98,72 @@ func (kv KV) ChatTemplate() string {
|
||||
}
|
||||
|
||||
func (kv KV) String(key string, defaultValue ...string) string {
|
||||
return keyValue(kv, key, append(defaultValue, "")...)
|
||||
val, _ := keyValue(kv, key, append(defaultValue, "")...)
|
||||
return val
|
||||
}
|
||||
|
||||
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
|
||||
return keyValue(kv, key, append(defaultValue, 0)...)
|
||||
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
|
||||
return val
|
||||
}
|
||||
|
||||
func (kv KV) Float(key string, defaultValue ...float32) float32 {
|
||||
return keyValue(kv, key, append(defaultValue, 0)...)
|
||||
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
|
||||
return val
|
||||
}
|
||||
|
||||
func (kv KV) Bool(key string, defaultValue ...bool) bool {
|
||||
return keyValue(kv, key, append(defaultValue, false)...)
|
||||
val, _ := keyValue(kv, key, append(defaultValue, false)...)
|
||||
return val
|
||||
}
|
||||
|
||||
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
|
||||
_, max := kv.UintOrArrayValue(key, defaultValue)
|
||||
return max
|
||||
}
|
||||
|
||||
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
|
||||
min, _ := kv.UintOrArrayValue(key, defaultValue)
|
||||
return min
|
||||
}
|
||||
|
||||
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
|
||||
if u32, ok := keyValue(kv, key, uint32(0)); ok {
|
||||
return u32, u32
|
||||
} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
|
||||
min := slices.Min(u32s.values)
|
||||
max := slices.Max(u32s.values)
|
||||
return min, max
|
||||
} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
|
||||
min := slices.Min(i32s.values)
|
||||
max := slices.Max(i32s.values)
|
||||
if min < 0 || max < 0 {
|
||||
slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
|
||||
}
|
||||
return uint32(min), uint32(max)
|
||||
}
|
||||
|
||||
return defaultValue, defaultValue
|
||||
}
|
||||
|
||||
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
|
||||
return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
|
||||
val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
|
||||
return val.values
|
||||
}
|
||||
|
||||
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
|
||||
return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
|
||||
val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
|
||||
return val.values
|
||||
}
|
||||
|
||||
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
|
||||
return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
|
||||
val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
|
||||
return val.values
|
||||
}
|
||||
|
||||
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
|
||||
return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
|
||||
val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
|
||||
return val.values
|
||||
}
|
||||
|
||||
func (kv KV) OllamaEngineRequired() bool {
|
||||
@@ -143,17 +188,17 @@ type arrayValueTypes interface {
|
||||
*array[string] | *array[float32] | *array[float64] | *array[bool]
|
||||
}
|
||||
|
||||
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
|
||||
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
|
||||
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
|
||||
key = kv.Architecture() + "." + key
|
||||
}
|
||||
|
||||
if val, ok := kv[key]; ok {
|
||||
return val.(T)
|
||||
if val, ok := kv[key].(T); ok {
|
||||
return val, true
|
||||
}
|
||||
|
||||
slog.Debug("key not found", "key", key, "default", defaultValue[0])
|
||||
return defaultValue[0]
|
||||
slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
|
||||
return defaultValue[0], false
|
||||
}
|
||||
|
||||
type Tensors struct {
|
||||
@@ -425,11 +470,11 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
||||
|
||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
embedding := f.KV().EmbeddingLength()
|
||||
heads := f.KV().HeadCount()
|
||||
headsKV := f.KV().HeadCountKV()
|
||||
heads := f.KV().HeadCountMax()
|
||||
headsKV := f.KV().HeadCountKVMax()
|
||||
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
|
||||
|
||||
embeddingHeads := f.KV().EmbeddingHeadCount()
|
||||
embeddingHeads := f.KV().EmbeddingHeadCountMax()
|
||||
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
|
||||
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
|
||||
|
||||
|
||||
@@ -269,3 +269,33 @@ func TestKeyValue(t *testing.T) {
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
}
}

func TestHeadCount(t *testing.T) {
valuesArray := []int32{1, 5, 3, 4}
cases := []struct {
kv KV
want uint64
}{
{
kv: KV{
"general.architecture": "abc",
"abc.attention.head_count": &array[int32]{values: valuesArray, size: len(valuesArray)},
},
want: uint64(5),
},
{
kv: KV{
"general.architecture": "abc",
"abc.attention.head_count": uint32(3),
},
want: uint64(3),
},
}

for _, tt := range cases {
got := tt.kv.HeadCountMax()
if got != tt.want {
t.Errorf("unexpected max value: got=%d want=%d", got, tt.want)
}
}
}
@@ -65,7 +65,7 @@ func Open(path string) (f *File, err error) {
return nil, err
}

if f.Version != 3 {
if f.Version < 2 {
return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
}
@@ -45,6 +45,8 @@ var (
"qwen2.5-coder:latest",
"qwen:latest",
"solar-pro:latest",
"codellama:latest",
"nous-hermes:latest",
}
)
@@ -0,0 +1,32 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 22 Jun 2025 09:22:05 -0700
Subject: [PATCH] temporary prevent rocm+cuda mixed loading

---
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 4e67d243..8f49f084 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {

ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
+
+ // Avoid mixed hip+cuda configurations
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+ if (!hip_devices && !rocr_devices) {
+ ggml_backend_load_best("cuda", silent, dir_path);
+ } else {
+ ggml_backend_load_best("hip", silent, dir_path);
+ }
+
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
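A practical note on the patch above: on a machine with both NVIDIA and AMD GPUs, the loader now prefers the CUDA backend unless one of the ROCm device-selection variables it checks is set before the server starts. A hypothetical way to steer it toward the HIP backend (device index is illustrative):

```bash
# Illustrative: setting HIP_VISIBLE_DEVICES makes the loader take the hip branch
HIP_VISIBLE_DEVICES=0 ollama serve
```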
@@ -151,7 +151,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}

if graphPartialOffload == 0 {
graphPartialOffload = f.KV().GQA() * kvTotal / 6
headsKV := f.KV().HeadCountKVMin()
if headsKV == 0 {
headsKV = 1
}
gqa := f.KV().HeadCountMax() / headsKV
graphPartialOffload = gqa * kvTotal / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
@@ -139,6 +139,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
gpus = discover.GetCPUInfo()
}

// Verify the requested context size is <= the model training size
trainCtx := f.KV().ContextLength()
if opts.NumCtx/numParallel > int(trainCtx) && trainCtx > 0 {
slog.Warn("requested context size too large for model", "num_ctx", opts.NumCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx)
opts.NumCtx = int(trainCtx) * numParallel
}

estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
@@ -311,7 +318,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0])
}

// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
@@ -602,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
}

func (c *Context) Compute(tensors ...ml.Tensor) {
C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
panic(fmt.Errorf("error computing ggml graph: %v", status))
}
C.ggml_backend_sched_reset(c.b.sched)

needSync := true
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp (vendored) — 12 changed lines
@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {

ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);

// Avoid mixed hip+cuda configurations
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
if (!hip_devices && !rocr_devices) {
ggml_backend_load_best("cuda", silent, dir_path);
} else {
ggml_backend_load_best("hip", silent, dir_path);
}

ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
@@ -87,7 +87,7 @@ func (v *Vocabulary) Decode(id int32) string {
func (v *Vocabulary) SpecialVocabulary() []string {
v.specialOnce.Do(func() {
for i := range v.Values {
if v.Types[i] == TOKEN_TYPE_CONTROL {
if v.Types[i] == TOKEN_TYPE_CONTROL || v.Types[i] == TOKEN_TYPE_USER_DEFINED {
v.special = append(v.special, v.Values[i])
}
}
model/vocabulary_test.go (new file) — 16 lines
@@ -0,0 +1,16 @@
package model

import "testing"

func TestVocabulary_SpecialVocabulary(t *testing.T) {
vocab := &Vocabulary{
Values: []string{"<|startoftext|>", "<|endoftext|>", "<|tool_call_start|>", "<|tool_call_end|>", "hi"},
Types: []int32{TOKEN_TYPE_CONTROL, TOKEN_TYPE_CONTROL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_NORMAL},
}

specialVocab := vocab.SpecialVocabulary()

if len(specialVocab) != 4 {
t.Errorf("expected 4 special tokens, got %d", len(specialVocab))
}
}
@@ -27,7 +27,6 @@ function checkEnv() {
$env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
}
# Locate CUDA versions
# Note: this assumes every version found will be built
$cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
if ($cudaList.length -eq 0) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -94,19 +93,6 @@ function buildOllama() {

$hashEnv = @{}
Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
if ("$script:CUDA_DIRS".Contains("v11")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
$env:CUDAToolkit_ROOT=$hashEnv[$v11]
write-host "Building CUDA v11 backend libraries"
# Note: cuda v11 requires msvc 2019 so force the older generator
# to avoid 2022 (or newer) from being used as the default
& cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
if ("$script:CUDA_DIRS".Contains("v12")) {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
$env:CUDAToolkit_ROOT=$hashEnv[$v12]
@@ -127,12 +113,17 @@ function buildOllama() {
$env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
$env:HIP_PLATFORM="amd"
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
& cmake --fresh --preset "ROCm 6" -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ --install-prefix $script:DIST_DIR
& cmake --fresh --preset "ROCm 6" -G Ninja `
-DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
-DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
--install-prefix $script:DIST_DIR
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
$env:HIPCXX=""
$env:HIP_PLATFORM=""
$env:CMAKE_PREFIX_PATH=""
& cmake --build --preset "ROCm" --config Release --parallel $script:JOBS
& cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "HIP" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
--build-arg=GOFLAGS \
--build-arg=OLLAMA_CUSTOM_CPU_DEFS \
--build-arg=OLLAMA_SKIP_CUDA_GENERATE \
--build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \
--build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \
--build-arg=CUDA_V11_ARCHITECTURES \
--build-arg=CUDA_V12_ARCHITECTURES \
--build-arg=OLLAMA_SKIP_ROCM_GENERATE \
--build-arg=OLLAMA_FAST_BUILD \
server/cache/capabilities.go (vendored) — 115 changed lines
@@ -1,115 +0,0 @@
|
||||
package cache
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
    "slices"
    "sync"
    "time"

    "github.com/ollama/ollama/fs/ggml"
    "github.com/ollama/ollama/template"
    "github.com/ollama/ollama/thinking"
    "github.com/ollama/ollama/types/model"
)

// cacheEntry stores capabilities and the modification time of the model file
type cacheEntry struct {
    capabilities []model.Capability
    modTime      time.Time
}

// ggufCapabilities is a cache for gguf model capabilities
var ggufCapabilities = &sync.Map{}

// ModelInfo contains the minimal information needed to determine capabilities
type ModelInfo struct {
    ModelPath      string
    ProjectorPaths []string
    Template       *template.Template
}

// Capabilities returns the capabilities that the model supports
func Capabilities(info ModelInfo) []model.Capability {
    capabilities, err := ggufCapabilties(info.ModelPath)
    if err != nil {
        slog.Error("could not determine gguf capabilities", "error", err)
    }

    if info.Template == nil {
        return capabilities
    }

    // Check for tools capability
    if slices.Contains(info.Template.Vars(), "tools") {
        capabilities = append(capabilities, model.CapabilityTools)
    }

    // Check for insert capability
    if slices.Contains(info.Template.Vars(), "suffix") {
        capabilities = append(capabilities, model.CapabilityInsert)
    }

    // Check for vision capability in projector-based models
    if len(info.ProjectorPaths) > 0 {
        capabilities = append(capabilities, model.CapabilityVision)
    }

    // Check for thinking capability
    openingTag, closingTag := thinking.InferTags(info.Template.Template)
    if openingTag != "" && closingTag != "" {
        capabilities = append(capabilities, model.CapabilityThinking)
    }

    return capabilities
}

func ggufCapabilties(modelPath string) ([]model.Capability, error) {
    // Get file info to check modification time
    fileInfo, err := os.Stat(modelPath)
    if err != nil {
        return nil, err
    }
    currentModTime := fileInfo.ModTime()

    // Check if we have a cached entry
    if cached, ok := ggufCapabilities.Load(modelPath); ok {
        entry := cached.(cacheEntry)
        // If the file hasn't been modified since we cached it, return the cached capabilities
        if entry.modTime.Equal(currentModTime) {
            return entry.capabilities, nil
        }
    }

    // If not cached or file was modified, read the model file to determine capabilities
    capabilities := []model.Capability{}

    r, err := os.Open(modelPath)
    if err != nil {
        return nil, err
    }
    defer r.Close()

    f, err := ggml.Decode(r, 1024)
    if err != nil {
        return nil, err
    }

    if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
        capabilities = append(capabilities, model.CapabilityEmbedding)
    } else {
        capabilities = append(capabilities, model.CapabilityCompletion)
    }
    if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok {
        capabilities = append(capabilities, model.CapabilityVision)
    }

    // Cache the capabilities with the modification time
    ggufCapabilities.Store(modelPath, cacheEntry{
        capabilities: capabilities,
        modTime:      currentModTime,
    })

    return capabilities, nil
}
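The cache above keys on the model path and invalidates itself when the file's modification time changes. A minimal, self-contained sketch of that same idea follows; the `statCache` package name, the `cachedOrCompute` helper, and the `compute` callback are illustrative only and are not part of this change:

```go
package statcache

import (
	"os"
	"sync"
	"time"
)

// entry pairs a computed value with the file's mod time at compute time.
type entry struct {
	value   []string
	modTime time.Time
}

var cache sync.Map

// cachedOrCompute returns the cached value for path unless the file has been
// modified since it was cached, in which case compute runs again and the
// cache entry is refreshed with the new modification time.
func cachedOrCompute(path string, compute func() []string) ([]string, error) {
	fi, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	if v, ok := cache.Load(path); ok {
		if e := v.(entry); e.modTime.Equal(fi.ModTime()) {
			return e.value, nil
		}
	}
	value := compute()
	cache.Store(path, entry{value: value, modTime: fi.ModTime()})
	return value, nil
}
```

Touching the file (for example with `os.Chtimes`) changes its mod time, so the next lookup recomputes, which is the behavior the cache-invalidation test below relies on.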
211
server/cache/capabilities_test.go
vendored
211
server/cache/capabilities_test.go
vendored
@@ -1,211 +0,0 @@
|
||||
package cache
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"maps"
|
||||
"os"
|
||||
"slices"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
// testGGUF creates a temporary GGUF model file for testing with custom key-value pairs
|
||||
func testGGUF(tb testing.TB, customKV ggml.KV) string {
|
||||
tb.Helper()
|
||||
f, err := os.CreateTemp(tb.TempDir(), "test*.gguf")
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
kv := ggml.KV{}
|
||||
maps.Copy(kv, customKV)
|
||||
|
||||
tensors := []*ggml.Tensor{
|
||||
{
|
||||
Name: "token_embd.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{1, 1},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4)),
|
||||
},
|
||||
}
|
||||
|
||||
if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
|
||||
return f.Name()
|
||||
}
|
||||
|
||||
func TestCapabilities(t *testing.T) {
|
||||
ggufCapabilities.Range(func(key, value any) bool {
|
||||
ggufCapabilities.Delete(key)
|
||||
return true
|
||||
})
|
||||
|
||||
// Create test model paths
|
||||
completionModelPath := testGGUF(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
})
|
||||
|
||||
visionModelPath := testGGUF(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.vision.block_count": uint32(1),
|
||||
})
|
||||
|
||||
embeddingModelPath := testGGUF(t, ggml.KV{
|
||||
"general.architecture": "bert",
|
||||
"bert.pooling_type": uint32(1),
|
||||
})
|
||||
|
||||
// Create templates
|
||||
toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse template: %v", err)
|
||||
}
|
||||
|
||||
chatTemplate, err := template.Parse("{{ .prompt }}")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse template: %v", err)
|
||||
}
|
||||
|
||||
toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse template: %v", err)
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
model ModelInfo
|
||||
expectedCaps []model.Capability
|
||||
}{
|
||||
{
|
||||
name: "model with completion capability",
|
||||
model: ModelInfo{
|
||||
ModelPath: completionModelPath,
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion},
|
||||
},
|
||||
{
|
||||
name: "model with completion, tools, and insert capability",
|
||||
model: ModelInfo{
|
||||
ModelPath: completionModelPath,
|
||||
Template: toolsInsertTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert},
|
||||
},
|
||||
{
|
||||
name: "model with tools capability",
|
||||
model: ModelInfo{
|
||||
ModelPath: completionModelPath,
|
||||
Template: toolsTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools},
|
||||
},
|
||||
{
|
||||
name: "model with vision capability from gguf",
|
||||
model: ModelInfo{
|
||||
ModelPath: visionModelPath,
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityVision},
|
||||
},
|
||||
{
|
||||
name: "model with vision capability from projector",
|
||||
model: ModelInfo{
|
||||
ModelPath: completionModelPath,
|
||||
ProjectorPaths: []string{"/path/to/projector"},
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityVision},
|
||||
},
|
||||
{
|
||||
name: "model with vision, tools, and insert capability",
|
||||
model: ModelInfo{
|
||||
ModelPath: visionModelPath,
|
||||
Template: toolsInsertTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityVision, model.CapabilityTools, model.CapabilityInsert},
|
||||
},
|
||||
{
|
||||
name: "model with embedding capability",
|
||||
model: ModelInfo{
|
||||
ModelPath: embeddingModelPath,
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityEmbedding},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// First call - should read from file
|
||||
caps := Capabilities(tc.model)
|
||||
slices.Sort(caps)
|
||||
slices.Sort(tc.expectedCaps)
|
||||
if !slices.Equal(caps, tc.expectedCaps) {
|
||||
t.Errorf("Expected capabilities %v, got %v", tc.expectedCaps, caps)
|
||||
}
|
||||
|
||||
// Verify caching for models that read from GGUF
|
||||
if tc.model.ModelPath != "" {
|
||||
// Check that entry is cached
|
||||
_, ok := ggufCapabilities.Load(tc.model.ModelPath)
|
||||
if !ok {
|
||||
t.Error("Expected capabilities to be cached")
|
||||
}
|
||||
|
||||
// Second call - should use cache
|
||||
caps2 := Capabilities(tc.model)
|
||||
slices.Sort(caps2)
|
||||
if !slices.Equal(caps, caps2) {
|
||||
t.Errorf("Cached capabilities don't match original: expected %v, got %v", caps, caps2)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Test cache invalidation on file modification
|
||||
t.Run("cache invalidation", func(t *testing.T) {
|
||||
// Use completion model for this test
|
||||
info := ModelInfo{
|
||||
ModelPath: completionModelPath,
|
||||
Template: chatTemplate,
|
||||
}
|
||||
|
||||
// Get initial cached entry
|
||||
cached, ok := ggufCapabilities.Load(completionModelPath)
|
||||
if !ok {
|
||||
t.Fatal("Expected model to be cached from previous tests")
|
||||
}
|
||||
entry := cached.(cacheEntry)
|
||||
|
||||
// Modify the file's timestamp to the future
|
||||
future := time.Now().Add(time.Hour)
|
||||
err := os.Chtimes(completionModelPath, future, future)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to update file timestamp: %v", err)
|
||||
}
|
||||
|
||||
// Call should re-read from file due to changed modtime
|
||||
caps := Capabilities(info)
|
||||
if len(caps) != 1 || caps[0] != model.CapabilityCompletion {
|
||||
t.Errorf("Expected [CapabilityCompletion], got %v", caps)
|
||||
}
|
||||
|
||||
// Check that cache was updated with new modtime
|
||||
cached2, ok := ggufCapabilities.Load(completionModelPath)
|
||||
if !ok {
|
||||
t.Error("Expected capabilities to be cached after re-read")
|
||||
}
|
||||
entry2 := cached2.(cacheEntry)
|
||||
if entry2.modTime.Equal(entry.modTime) {
|
||||
t.Error("Expected cache entry to have updated modTime")
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -23,9 +23,10 @@ import (

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/fs/gguf"
    "github.com/ollama/ollama/parser"
    "github.com/ollama/ollama/server/cache"
    "github.com/ollama/ollama/template"
    "github.com/ollama/ollama/thinking"
    "github.com/ollama/ollama/types/model"
    "github.com/ollama/ollama/version"
)
@@ -67,14 +68,60 @@ type Model struct {
    Template       *template.Template
}

// Capabilities returns the capabilities that the model supports
func (m *Model) Capabilities() []model.Capability {
    capabilities := []model.Capability{}

    // Check for completion capability
    f, err := gguf.Open(m.ModelPath)
    if err == nil {
        defer f.Close()

        if f.KeyValue("pooling_type").Valid() {
            capabilities = append(capabilities, model.CapabilityEmbedding)
        } else {
            // If no embedding is specified, we assume the model supports completion
            capabilities = append(capabilities, model.CapabilityCompletion)
        }
        if f.KeyValue("vision.block_count").Valid() {
            capabilities = append(capabilities, model.CapabilityVision)
        }
    } else {
        slog.Error("couldn't open model file", "error", err)
    }

    if m.Template == nil {
        return capabilities
    }

    // Check for tools capability
    if slices.Contains(m.Template.Vars(), "tools") {
        capabilities = append(capabilities, model.CapabilityTools)
    }

    // Check for insert capability
    if slices.Contains(m.Template.Vars(), "suffix") {
        capabilities = append(capabilities, model.CapabilityInsert)
    }

    // Check for vision capability in projector-based models
    if len(m.ProjectorPaths) > 0 {
        capabilities = append(capabilities, model.CapabilityVision)
    }

    // Check for thinking capability
    openingTag, closingTag := thinking.InferTags(m.Template.Template)
    if openingTag != "" && closingTag != "" {
        capabilities = append(capabilities, model.CapabilityThinking)
    }

    return capabilities
}

// CheckCapabilities checks if the model has the specified capabilities returning an error describing
// any missing or unknown capabilities
func (m *Model) CheckCapabilities(want ...model.Capability) error {
    available := cache.Capabilities(cache.ModelInfo{
        ModelPath:      m.ModelPath,
        ProjectorPaths: m.ProjectorPaths,
        Template:       m.Template,
    })
    available := m.Capabilities()
    var errs []error

    // Map capabilities to their corresponding error

@@ -9,6 +9,131 @@ import (
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
func TestModelCapabilities(t *testing.T) {
|
||||
// Create completion model (llama architecture without vision)
|
||||
completionModelPath, _ := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
}, []*ggml.Tensor{})
|
||||
|
||||
// Create vision model (llama architecture with vision block count)
|
||||
visionModelPath, _ := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.vision.block_count": uint32(1),
|
||||
}, []*ggml.Tensor{})
|
||||
|
||||
// Create embedding model (bert architecture with pooling type)
|
||||
embeddingModelPath, _ := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "bert",
|
||||
"bert.pooling_type": uint32(1),
|
||||
}, []*ggml.Tensor{})
|
||||
|
||||
toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse template: %v", err)
|
||||
}
|
||||
|
||||
chatTemplate, err := template.Parse("{{ .prompt }}")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse template: %v", err)
|
||||
}
|
||||
|
||||
toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse template: %v", err)
|
||||
}
|
||||
|
||||
testModels := []struct {
|
||||
name string
|
||||
model Model
|
||||
expectedCaps []model.Capability
|
||||
}{
|
||||
{
|
||||
name: "model with completion capability",
|
||||
model: Model{
|
||||
ModelPath: completionModelPath,
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion},
|
||||
},
|
||||
|
||||
{
|
||||
name: "model with completion, tools, and insert capability",
|
||||
model: Model{
|
||||
ModelPath: completionModelPath,
|
||||
Template: toolsInsertTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert},
|
||||
},
|
||||
{
|
||||
name: "model with tools capability",
|
||||
model: Model{
|
||||
ModelPath: completionModelPath,
|
||||
Template: toolsTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools},
|
||||
},
|
||||
{
|
||||
name: "model with vision capability",
|
||||
model: Model{
|
||||
ModelPath: visionModelPath,
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityVision},
|
||||
},
|
||||
{
|
||||
name: "model with vision, tools, and insert capability",
|
||||
model: Model{
|
||||
ModelPath: visionModelPath,
|
||||
Template: toolsInsertTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityVision, model.CapabilityTools, model.CapabilityInsert},
|
||||
},
|
||||
{
|
||||
name: "model with embedding capability",
|
||||
model: Model{
|
||||
ModelPath: embeddingModelPath,
|
||||
Template: chatTemplate,
|
||||
},
|
||||
expectedCaps: []model.Capability{model.CapabilityEmbedding},
|
||||
},
|
||||
}
|
||||
|
||||
// compare two slices of model.Capability regardless of order
|
||||
compareCapabilities := func(a, b []model.Capability) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
|
||||
aCount := make(map[model.Capability]int)
|
||||
for _, cap := range a {
|
||||
aCount[cap]++
|
||||
}
|
||||
|
||||
bCount := make(map[model.Capability]int)
|
||||
for _, cap := range b {
|
||||
bCount[cap]++
|
||||
}
|
||||
|
||||
for cap, count := range aCount {
|
||||
if bCount[cap] != count {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
for _, tt := range testModels {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Test Capabilities method
|
||||
caps := tt.model.Capabilities()
|
||||
if !compareCapabilities(caps, tt.expectedCaps) {
|
||||
t.Errorf("Expected capabilities %v, got %v", tt.expectedCaps, caps)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestModelCheckCapabilities(t *testing.T) {
|
||||
// Create simple model file for tests that don't depend on GGUF content
|
||||
completionModelPath, _ := createBinFile(t, ggml.KV{
|
||||
|
||||
2
server/internal/cache/blob/cache.go
vendored
2
server/internal/cache/blob/cache.go
vendored
@@ -59,7 +59,7 @@ type DiskCache struct {
    testHookBeforeFinalWrite func(f *os.File)
}

// PutString is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))).
// PutBytes is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))).
func PutBytes[S string | []byte](c *DiskCache, d Digest, data S) error {
    return c.Put(d, bytes.NewReader([]byte(data)), int64(len(data)))
}

@@ -34,7 +34,6 @@ import (
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/logutil"
    "github.com/ollama/ollama/openai"
    "github.com/ollama/ollama/server/cache"
    "github.com/ollama/ollama/server/internal/client/ollama"
    "github.com/ollama/ollama/server/internal/registry"
    "github.com/ollama/ollama/template"
@@ -820,17 +819,13 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
    }

    resp := &api.ShowResponse{
        License:      strings.Join(m.License, "\n"),
        System:       m.System,
        Template:     m.Template.String(),
        Details:      modelDetails,
        Messages:     msgs,
        Capabilities: cache.Capabilities(cache.ModelInfo{
            ModelPath:      m.ModelPath,
            Template:       m.Template,
            ProjectorPaths: m.ProjectorPaths,
        }),
        ModifiedAt:   manifest.fi.ModTime(),
        License:      strings.Join(m.License, "\n"),
        System:       m.System,
        Template:     m.Template.String(),
        Details:      modelDetails,
        Messages:     msgs,
        Capabilities: m.Capabilities(),
        ModifiedAt:   manifest.fi.ModTime(),
    }

    var params []string

@@ -191,7 +191,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
    }

    // Load model for fitting
    ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
    ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
    if err != nil {
        pending.errCh <- err
        break

@@ -18,9 +18,8 @@ const (
)

type Parser struct {
    tag        string
    names      []string
    properties []string
    tag        string
    tools      []api.Tool

    state  toolsState
    buffer []byte
@@ -34,15 +33,10 @@ func NewParser(tmpl *template.Template, tools []api.Tool) *Parser {
    }

func NewParserWithTag(tools []api.Tool, tag string) *Parser {
    var p Parser
    for _, t := range tools {
        p.names = append(p.names, t.Function.Name)
        for r := range t.Function.Parameters.Properties {
            p.properties = append(p.properties, r)
        }
    return &Parser{
        tag:   tag,
        tools: tools,
    }
    p.tag = tag
    return &p
}

// Add processes a string input to parse tool calls and content that
@@ -121,36 +115,40 @@ func (p *Parser) findTag() (int, bool) {
// parseToolCall finds the next complete tool call in the buffer
// incrementing n and advancing the buffer.
func (p *Parser) parseToolCall() *api.ToolCall {
    var name string
    var args map[string]any
    var tool *api.Tool
    var end int = len(p.buffer)
    var i int

    // find tool name
    var i int
    for _, n := range p.names {
    for _, t := range p.tools {
        n := t.Function.Name
        if i = bytes.Index(p.buffer, []byte(n)); i != -1 {
            if i+len(n) < end {
                name = n
                tool = &t
                end = i + len(n)
            }
        }
    }

    if name == "" {
    if tool == nil {
        return nil
    }

    if args, i = p.findArguments(); args == nil {
        return nil
    }
    // only look for arguments if the tool has parameters
    args := map[string]any{}
    if len(tool.Function.Parameters.Properties) > 0 {
        if args, i = p.findArguments(*tool); args == nil {
            return nil
        }

        if i > end {
            end = i
        if i > end {
            end = i
        }
    }

    tc := &api.ToolCall{
        Function: api.ToolCallFunction{
            Name:      name,
            Name:      tool.Function.Name,
            Arguments: args,
            Index:     p.n,
        },
@@ -162,13 +160,17 @@ func (p *Parser) parseToolCall() *api.ToolCall {
}

// findArguments returns the first object that appears to be
// arguments and the position where the arguments end, returning nil and 0 if
// an invalid JSON object or non-arguments object is found first
func (p *Parser) findArguments() (map[string]any, int) {
// arguments for the provided tool, returning nil
func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
    if len(p.buffer) == 0 {
        return nil, 0
    }

    // no arguments to parse
    if len(tool.Function.Parameters.Properties) == 0 {
        return nil, 0
    }

    var braces int
    var start int = -1
    var end int
@@ -184,11 +186,13 @@ func (p *Parser) findArguments() (map[string]any, int) {
        }

        if c == '}' {
            braces--
            if braces == 0 && start != -1 {
                end = i + 1
                object = p.buffer[start:end]
                break
            if start != -1 {
                braces--
                if braces == 0 {
                    end = i + 1
                    object = p.buffer[start:end]
                    break
                }
            }
        }
    }
@@ -206,24 +210,27 @@ func (p *Parser) findArguments() (map[string]any, int) {

    var find func(obj any) map[string]any
    find = func(obj any) map[string]any {
        switch v := obj.(type) {
        switch obj := obj.(type) {
        case map[string]any:
            // check if the object keys are valid tool properties
            // TODO (jmorganca): check only sets of properties that
            // go together instead of the entire set
            for _, prop := range p.properties {
                if _, exists := v[prop]; exists {
                    return v
            found := true
            for key := range obj {
                if _, exists := tool.Function.Parameters.Properties[key]; !exists {
                    found = false
                    break
                }
            }

            for _, value := range v {
            if found {
                return obj
            }

            for _, value := range obj {
                if result := find(value); result != nil {
                    return result
                }
            }
        case []any:
            for _, item := range v {
            for _, item := range obj {
                if result := find(item); result != nil {
                    return result
                }

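The rewritten lookup accepts a candidate JSON object only when every key it contains is declared among the tool's parameter properties, instead of accepting any object that shares at least one key. A standalone sketch of that check, with the property set reduced to a plain map rather than the parser's actual types (the `keysMatch` name is illustrative, not from this change):

```go
package toolsexample

// keysMatch reports whether every key in candidate is a declared property.
// properties stands in for tool.Function.Parameters.Properties.
func keysMatch(candidate map[string]any, properties map[string]struct{}) bool {
	for key := range candidate {
		if _, ok := properties[key]; !ok {
			return false
		}
	}
	return true
}
```

Under this rule, `{"city": "Tokyo"}` matches a tool that declares a `city` property, while a wrapper object such as `{"name": "get_conditions", "arguments": {...}}` is rejected because `name` is not a declared property, so the recursive search keeps descending until it reaches the inner arguments object.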
@@ -104,6 +104,13 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "say_hello",
|
||||
Description: "Say hello",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
@@ -144,6 +151,20 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "invalid arguments",
|
||||
inputs: []string{`<tool_call>{"name": "get_conditions", "arguments": {"city": "San Francisco"}}</tool_call>`},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "missing args",
|
||||
inputs: []string{`<tool_call>{"name": "get_conditions"}</tool_call>`},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "text before tool call",
|
||||
inputs: []string{`Let me check the weather. <tool_call>{"name": "get_temperature", "arguments": {"city": "New York"}}</tool_call>`},
|
||||
@@ -161,6 +182,28 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "qwen no args tool call",
|
||||
inputs: []string{`Let me say hello to the user. I'll use the say_hello tool <tool_call>{"name": "say_hello"}</tool_call>`},
|
||||
content: "Let me say hello to the user. I'll use the say_hello tool ",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "qwen no args with text",
|
||||
inputs: []string{"Let me say hello to the user. I'll use the say_hello tool. "},
|
||||
content: "Let me say hello to the user. I'll use the say_hello tool. ",
|
||||
tmpl: qwen,
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "two tool calls in a list",
|
||||
inputs: []string{`[TOOL_CALLS] [{"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}, {"name": "get_conditions", "arguments": {"location": "Tokyo"}}][/TOOL_CALLS]`},
|
||||
@@ -189,7 +232,7 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "two tool calls",
|
||||
name: "qwen two tool calls",
|
||||
inputs: []string{`Okay, let's call both tools! <tool_call>{"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}</tool_call><tool_call>{"name": "get_conditions", "arguments": {"location": "Tokyo"}}</tool_call>`},
|
||||
content: "Okay, let's call both tools! ",
|
||||
tmpl: qwen,
|
||||
@@ -215,6 +258,30 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "qwen two tool calls one with no args",
|
||||
inputs: []string{`Let me check the weather. <tool_call>{"name": "say_hello"}</tool_call><tool_call>{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`},
|
||||
content: "Let me check the weather. ",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 1,
|
||||
Name: "get_conditions",
|
||||
Arguments: api.ToolCallFunctionArguments{
|
||||
"location": "Tokyo",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "deepseek",
|
||||
inputs: []string{"<think>Wait, I need to call a tool</think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_temperature\n```json\n{\"city\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"},
|
||||
@@ -338,6 +405,52 @@ func TestParser(t *testing.T) {
|
||||
content: "for { fmt.Println(\"hello\") }",
|
||||
tmpl: json,
|
||||
},
|
||||
{
|
||||
name: "json no args tool call",
|
||||
inputs: []string{
|
||||
"{\"name\": \"say_hello\"}",
|
||||
},
|
||||
content: "",
|
||||
tmpl: json,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "json no args no tool call",
|
||||
inputs: []string{
|
||||
"I'll use the say_hello tool to say hello to the user.",
|
||||
},
|
||||
content: "I'll use the say_hello tool to say hello to the user.",
|
||||
tmpl: json,
|
||||
calls: nil,
|
||||
},
|
||||
|
||||
// TODO (jmorganca): this is a false positive, we should
|
||||
// not be parsing this as a tool call
|
||||
{
|
||||
name: "json no args false positive",
|
||||
inputs: []string{
|
||||
`{say_hello!!!}`,
|
||||
},
|
||||
content: "",
|
||||
tmpl: json,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "list multiple",
|
||||
inputs: []string{
|
||||
@@ -380,6 +493,30 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "list partial",
|
||||
inputs: []string{
|
||||
"[{",
|
||||
"\"name\": \"get_conditions\", ",
|
||||
"\"arguments\": {",
|
||||
"\"location\": \"Tokyo\"",
|
||||
"}",
|
||||
"}",
|
||||
},
|
||||
content: "",
|
||||
tmpl: list,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "get_conditions",
|
||||
Arguments: api.ToolCallFunctionArguments{
|
||||
"location": "Tokyo",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "list invalid",
|
||||
inputs: []string{
|
||||
"[",
|
||||
"{",
|
||||
@@ -393,6 +530,33 @@ func TestParser(t *testing.T) {
|
||||
tmpl: list,
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "list trailing ]",
|
||||
inputs: []string{
|
||||
"[",
|
||||
"{",
|
||||
"\"name\": \"get_conditions\", ",
|
||||
"\"arguments\": {",
|
||||
"\"location\": \"Tokyo\"",
|
||||
"}",
|
||||
"}",
|
||||
"]",
|
||||
"]",
|
||||
},
|
||||
content: "",
|
||||
tmpl: list,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "get_conditions",
|
||||
Arguments: api.ToolCallFunctionArguments{
|
||||
"location": "Tokyo",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "list not a tool call",
|
||||
inputs: []string{
|
||||
@@ -404,6 +568,26 @@ func TestParser(t *testing.T) {
|
||||
tmpl: list,
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "list with no arguments",
|
||||
inputs: []string{
|
||||
"[",
|
||||
"{",
|
||||
"\"name\": \"say_hello\"",
|
||||
"}",
|
||||
},
|
||||
content: "",
|
||||
tmpl: list,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -700,25 +884,75 @@ func TestFindTag(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestFindArguments(t *testing.T) {
|
||||
tool := api.Tool{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "get_temperature",
|
||||
Description: "Retrieve the temperature for a given location",
|
||||
Parameters: struct {
|
||||
Type string `json:"type"`
|
||||
Defs any `json:"$defs,omitempty"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Required []string `json:"required"`
|
||||
Properties map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Description string `json:"description"`
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
} `json:"properties"`
|
||||
}{
|
||||
Type: "object",
|
||||
Properties: map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Description string `json:"description"`
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
}{
|
||||
"format": {
|
||||
Type: api.PropertyType{"string"},
|
||||
Description: "The format to return the temperature in",
|
||||
Enum: []any{"fahrenheit", "celsius"},
|
||||
},
|
||||
"location": {
|
||||
Type: api.PropertyType{"string"},
|
||||
Description: "The location to get the temperature for",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
tool2 := api.Tool{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "say_hello",
|
||||
Description: "Say hello to the user",
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
buffer []byte
|
||||
want map[string]any
|
||||
tool api.Tool
|
||||
}{
|
||||
{
|
||||
name: "empty string",
|
||||
buffer: []byte{},
|
||||
want: nil,
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "whitespace only",
|
||||
buffer: []byte(" \n\t "),
|
||||
want: nil,
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "unbalanced braces - missing closing",
|
||||
buffer: []byte(`{"format": "fahrenheit", "location": "San Francisco"`),
|
||||
want: nil,
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "unbalanced braces - extra closing",
|
||||
@@ -726,11 +960,13 @@ func TestFindArguments(t *testing.T) {
|
||||
want: map[string]any{
|
||||
"format": "fahrenheit",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "invalid JSON",
|
||||
buffer: []byte(`{format: fahrenheit, location: "San Francisco"}`),
|
||||
want: nil,
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "valid json",
|
||||
@@ -739,6 +975,7 @@ func TestFindArguments(t *testing.T) {
|
||||
"format": "fahrenheit",
|
||||
"location": "San Francisco, CA",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "valid arguments with special tokens",
|
||||
@@ -747,6 +984,7 @@ func TestFindArguments(t *testing.T) {
|
||||
"format": "fahrenheit",
|
||||
"location": "San Francisco, CA",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "valid arguments in array",
|
||||
@@ -755,6 +993,7 @@ func TestFindArguments(t *testing.T) {
|
||||
"format": "fahrenheit",
|
||||
"location": "San Francisco, CA",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "nested deep",
|
||||
@@ -763,39 +1002,49 @@ func TestFindArguments(t *testing.T) {
|
||||
"format": "fahrenheit",
|
||||
"location": "San Francisco, CA",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "one arg",
|
||||
buffer: []byte(`get_weather({"location": "San Francisco, CA"})`),
|
||||
buffer: []byte(`get_temperature({"location": "San Francisco, CA"})`),
|
||||
want: map[string]any{
|
||||
"location": "San Francisco, CA",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "two args",
|
||||
buffer: []byte(`[{"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}, {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}]`),
|
||||
buffer: []byte(`[{"name": "get_temperature", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}, {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}]`),
|
||||
want: map[string]any{
|
||||
"location": "San Francisco, CA",
|
||||
"format": "fahrenheit",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "no args",
|
||||
buffer: []byte(`{"name": "say_hello"}`),
|
||||
want: nil,
|
||||
tool: tool2,
|
||||
},
|
||||
{
|
||||
name: "deepseek",
|
||||
buffer: []byte("<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"),
|
||||
buffer: []byte("<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_temperature\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"),
|
||||
want: map[string]any{
|
||||
"location": "Tokyo",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
parser := &Parser{
|
||||
buffer: tt.buffer,
|
||||
properties: []string{"format", "location"},
|
||||
buffer: tt.buffer,
|
||||
tools: []api.Tool{tool, tool2},
|
||||
}
|
||||
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, _ := parser.findArguments()
|
||||
got, _ := parser.findArguments(tool)
|
||||
|
||||
if diff := cmp.Diff(got, tt.want); diff != "" {
|
||||
t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff)
|
||||
|
||||