Compare commits
40 Commits
v0.9.3-rc5
...
v0.10.0-rc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3bac5cba60 | ||
|
|
4151ef8cf7 | ||
|
|
82da19c634 | ||
|
|
bdd9d22dfd | ||
|
|
5fc38d042f | ||
|
|
191d94289d | ||
|
|
802ad16ce4 | ||
|
|
5e67f4f90e | ||
|
|
e840ccb523 | ||
|
|
b4fe3adc0a | ||
|
|
d73f8aa8c3 | ||
|
|
92c2e8a56c | ||
|
|
2e3fd86d48 | ||
|
|
4261a3b0b2 | ||
|
|
acef9b4c1b | ||
|
|
9a43994c45 | ||
|
|
f8a6e88819 | ||
|
|
35fda7b4af | ||
|
|
66fb8575ce | ||
|
|
20c3266e94 | ||
|
|
34088dbcfb | ||
|
|
43107b15b9 | ||
|
|
1f91cb0c8c | ||
|
|
12d8ad0d38 | ||
|
|
592d21e7db | ||
|
|
5a08b01f5b | ||
|
|
4f473e224c | ||
|
|
9d60bb44cf | ||
|
|
f371260e75 | ||
|
|
c9e6d7719e | ||
|
|
2c4ce40334 | ||
|
|
5d8c173529 | ||
|
|
44b17d2bfa | ||
|
|
3b8b692218 | ||
|
|
4129af9205 | ||
|
|
45f216a9c7 | ||
|
|
d0b32def60 | ||
|
|
11ffc36157 | ||
|
|
ba04902670 | ||
|
|
3944602f51 |
163
.github/workflows/release.yaml
vendored
163
.github/workflows/release.yaml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
|
||||
|
||||
darwin-build:
|
||||
runs-on: macos-13
|
||||
runs-on: macos-13-xlarge
|
||||
environment: release
|
||||
needs: setup-environment
|
||||
strategy:
|
||||
@@ -54,48 +54,6 @@ jobs:
|
||||
name: build-${{ matrix.os }}-${{ matrix.arch }}
|
||||
path: dist/*
|
||||
|
||||
darwin-sign:
|
||||
runs-on: macos-13
|
||||
environment: release
|
||||
needs: darwin-build
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- run: |
|
||||
echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
|
||||
security create-keychain -p password build.keychain
|
||||
security default-keychain -s build.keychain
|
||||
security unlock-keychain -p password build.keychain
|
||||
security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
|
||||
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
|
||||
security set-keychain-settings -lut 3600 build.keychain
|
||||
env:
|
||||
MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
|
||||
MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: build-darwin-amd64
|
||||
path: dist/darwin-amd64
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: build-darwin-arm64
|
||||
path: dist/darwin-arm64
|
||||
- run: |
|
||||
export VERSION=${GITHUB_REF_NAME#v}
|
||||
./scripts/build_darwin.sh sign macapp
|
||||
env:
|
||||
APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
|
||||
APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
|
||||
APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
|
||||
APPLE_ID: ${{ vars.APPLE_ID }}
|
||||
SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
|
||||
DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-darwin
|
||||
path: |
|
||||
dist/Ollama-darwin.zip
|
||||
dist/ollama-darwin.tgz
|
||||
|
||||
windows-depends:
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -230,61 +188,11 @@ jobs:
|
||||
go-version-file: go.mod
|
||||
- run: |
|
||||
go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
|
||||
- if: matrix.arch == 'arm64'
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vc_redist.arm64.exe" -OutFile "dist\windows-arm64\vc_redist.arm64.exe"
|
||||
- run: |
|
||||
$env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1'
|
||||
& .\scripts\build_windows.ps1 buildApp
|
||||
env:
|
||||
VCToolsRedistDir: stub
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: build-${{ matrix.os }}-${{ matrix.arch }}
|
||||
path: |
|
||||
dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe
|
||||
dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
|
||||
|
||||
windows-sign:
|
||||
runs-on: windows
|
||||
environment: release
|
||||
needs: [windows-depends, windows-build]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: google-github-actions/auth@v2
|
||||
with:
|
||||
project_id: ollama
|
||||
credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}
|
||||
- run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe"
|
||||
Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
|
||||
|
||||
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip"
|
||||
Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\"
|
||||
& "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet
|
||||
|
||||
echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: build-windows-*
|
||||
path: dist\
|
||||
merge-multiple: true
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: depends-windows-amd64-*
|
||||
path: dist\windows-amd64\
|
||||
merge-multiple: true
|
||||
- run: |
|
||||
& .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip
|
||||
env:
|
||||
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-windows
|
||||
path: |
|
||||
dist\OllamaSetup.exe
|
||||
dist\ollama-windows-*.zip
|
||||
|
||||
linux-build:
|
||||
strategy:
|
||||
@@ -317,7 +225,7 @@ jobs:
|
||||
CGO_CFLAGS=${{ env.CGO_CFLAGS }}
|
||||
CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
|
||||
outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
|
||||
cache-from: type=registry,ref=ollama/ollama:latest
|
||||
cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
|
||||
cache-to: type=inline
|
||||
- run: |
|
||||
for COMPONENT in bin/* lib/ollama/*; do
|
||||
@@ -390,8 +298,8 @@ jobs:
|
||||
context: .
|
||||
platforms: ${{ matrix.os }}/${{ matrix.arch }}
|
||||
build-args: ${{ matrix.build-args }}
|
||||
outputs: type=image,name=ollama/ollama,push-by-digest=true,name-canonical=true,push=true
|
||||
cache-from: type=registry,ref=ollama/ollama:latest
|
||||
outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
|
||||
cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
|
||||
cache-to: type=inline
|
||||
- run: |
|
||||
mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
|
||||
@@ -423,7 +331,7 @@ jobs:
|
||||
latest=false
|
||||
suffix=${{ matrix.suffix }}
|
||||
images: |
|
||||
ollama/ollama
|
||||
${{ vars.DOCKER_REPO }}
|
||||
tags: |
|
||||
type=ref,enable=true,priority=600,prefix=pr-,event=pr
|
||||
type=semver,pattern={{version}}
|
||||
@@ -433,66 +341,24 @@ jobs:
|
||||
path: ${{ runner.temp }}
|
||||
merge-multiple: true
|
||||
- run: |
|
||||
docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf 'ollama/ollama@%s ')
|
||||
docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
|
||||
docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf '${{ vars.DOCKER_REPO }}@%s ')
|
||||
docker buildx imagetools inspect ${{ vars.DOCKER_REPO }}:${{ steps.metadata.outputs.version }}
|
||||
working-directory: ${{ runner.temp }}
|
||||
|
||||
# Trigger downstream release process
|
||||
trigger:
|
||||
runs-on: ubuntu-latest
|
||||
environment: release
|
||||
needs: [darwin-build, windows-build, windows-depends]
|
||||
steps:
|
||||
- name: Trigger downstream release process
|
||||
run: |
|
||||
curl -L \
|
||||
-X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
|
||||
-d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"
|
||||
|
||||
# Aggregate all the assets and ship a release
|
||||
release:
|
||||
needs: [darwin-sign, windows-sign, linux-build]
|
||||
runs-on: linux
|
||||
environment: release
|
||||
needs: [darwin-build, windows-build, windows-depends, linux-build]
|
||||
permissions:
|
||||
contents: write
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: dist-darwin
|
||||
path: dist
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: dist-windows
|
||||
path: dist
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: dist-linux-*
|
||||
path: stage
|
||||
merge-multiple: false
|
||||
- name: Merge linux amd64 payload
|
||||
working-directory: stage/dist-linux-amd64-archive
|
||||
run: |
|
||||
tar zxf ollama-linux-amd64.tgz
|
||||
tar zxf ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz
|
||||
rm -f ollama-linux-amd64.tgz ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz
|
||||
tar -c -f- --owner 0 --group 0 . | pigz -9vc > ../ollama-linux-amd64.tgz
|
||||
- name: Cleanup linux payloads
|
||||
run: |
|
||||
find stage -name ollama-linux\*.tgz -exec mv {} dist/ \;
|
||||
- run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt
|
||||
working-directory: dist
|
||||
- name: Create or update Release
|
||||
- name: Create or update Release for tag
|
||||
run: |
|
||||
RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
|
||||
|
||||
echo "Looking for existing release for ${RELEASE_VERSION}"
|
||||
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName")
|
||||
if [ -n "$OLD_TAG" ]; then
|
||||
@@ -506,5 +372,12 @@ jobs:
|
||||
--generate-notes \
|
||||
--prerelease
|
||||
fi
|
||||
echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
|
||||
gh release upload ${GITHUB_REF_NAME} dist/* --clobber
|
||||
- name: Trigger downstream release process
|
||||
run: |
|
||||
curl -L \
|
||||
-X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
|
||||
-d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"origin\": \"${GITHUB_REPOSITORY}\", \"publish\": \"1\"}}"
|
||||
|
||||
@@ -104,7 +104,7 @@ FROM ${FLAVOR} AS archive
|
||||
COPY --from=cpu dist/lib/ollama /lib/ollama
|
||||
COPY --from=build /bin/ollama /bin/ollama
|
||||
|
||||
FROM ubuntu:20.04
|
||||
FROM ubuntu:24.04
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y ca-certificates \
|
||||
&& apt-get clean \
|
||||
|
||||
11
README.md
11
README.md
@@ -1,6 +1,6 @@
|
||||
<div align="center">
|
||||
<a href="https://ollama.com">
|
||||
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
|
||||
<img alt="ollama" width="240" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
@@ -10,7 +10,7 @@ Get up and running with large language models.
|
||||
|
||||
### macOS
|
||||
|
||||
[Download](https://ollama.com/download/Ollama-darwin.zip)
|
||||
[Download](https://ollama.com/download/Ollama.dmg)
|
||||
|
||||
### Windows
|
||||
|
||||
@@ -360,7 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
|
||||
- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
|
||||
- [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
|
||||
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
|
||||
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG and deep research on Mac/Windows/Linux)
|
||||
- [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
|
||||
- [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
|
||||
- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
|
||||
@@ -455,6 +455,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
|
||||
- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
|
||||
- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
|
||||
- [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama.
|
||||
|
||||
### Apple Vision Pro
|
||||
|
||||
@@ -593,10 +594,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
|
||||
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
|
||||
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
|
||||
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
|
||||
- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
|
||||
|
||||
### Supported backends
|
||||
|
||||
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
|
||||
- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.
|
||||
|
||||
### Observability
|
||||
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
|
||||
|
||||
@@ -222,10 +222,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
|
||||
return fmt.Errorf("unmarshal: %w", err)
|
||||
}
|
||||
|
||||
if errorResponse.Error != "" {
|
||||
return errors.New(errorResponse.Error)
|
||||
}
|
||||
|
||||
if response.StatusCode >= http.StatusBadRequest {
|
||||
return StatusError{
|
||||
StatusCode: response.StatusCode,
|
||||
@@ -234,6 +230,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
|
||||
}
|
||||
}
|
||||
|
||||
if errorResponse.Error != "" {
|
||||
return errors.New(errorResponse.Error)
|
||||
}
|
||||
|
||||
if err := fn(bts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -89,6 +89,16 @@ func TestClientStream(t *testing.T) {
|
||||
},
|
||||
wantErr: "mid-stream error",
|
||||
},
|
||||
{
|
||||
name: "http status error takes precedence over general error",
|
||||
responses: []any{
|
||||
testError{
|
||||
message: "custom error message",
|
||||
statusCode: http.StatusInternalServerError,
|
||||
},
|
||||
},
|
||||
wantErr: "500",
|
||||
},
|
||||
{
|
||||
name: "successful stream completion",
|
||||
responses: []any{
|
||||
|
||||
16
api/types.go
16
api/types.go
@@ -143,6 +143,7 @@ type Message struct {
|
||||
Thinking string `json:"thinking,omitempty"`
|
||||
Images []ImageData `json:"images,omitempty"`
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
|
||||
ToolName string `json:"tool_name,omitempty"`
|
||||
}
|
||||
|
||||
func (m *Message) UnmarshalJSON(b []byte) error {
|
||||
@@ -467,13 +468,14 @@ type ListModelResponse struct {
|
||||
|
||||
// ProcessModelResponse is a single model description in [ProcessResponse].
|
||||
type ProcessModelResponse struct {
|
||||
Name string `json:"name"`
|
||||
Model string `json:"model"`
|
||||
Size int64 `json:"size"`
|
||||
Digest string `json:"digest"`
|
||||
Details ModelDetails `json:"details,omitempty"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
SizeVRAM int64 `json:"size_vram"`
|
||||
Name string `json:"name"`
|
||||
Model string `json:"model"`
|
||||
Size int64 `json:"size"`
|
||||
Digest string `json:"digest"`
|
||||
Details ModelDetails `json:"details,omitempty"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
SizeVRAM int64 `json:"size_vram"`
|
||||
ContextLength int `json:"context_length"`
|
||||
}
|
||||
|
||||
type TokenResponse struct {
|
||||
|
||||
12
cmd/cmd.go
12
cmd/cmd.go
@@ -583,12 +583,13 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
|
||||
} else {
|
||||
until = format.HumanTime(m.ExpiresAt, "Never")
|
||||
}
|
||||
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
|
||||
ctxStr := strconv.Itoa(m.ContextLength)
|
||||
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, ctxStr, until})
|
||||
}
|
||||
}
|
||||
|
||||
table := tablewriter.NewWriter(os.Stdout)
|
||||
table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
|
||||
table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "CONTEXT", "UNTIL"})
|
||||
table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
|
||||
table.SetAlignment(tablewriter.ALIGN_LEFT)
|
||||
table.SetHeaderLine(false)
|
||||
@@ -1079,10 +1080,11 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
var state *displayResponseState = &displayResponseState{}
|
||||
var latest api.ChatResponse
|
||||
var fullResponse strings.Builder
|
||||
var role string
|
||||
var thinkTagOpened bool = false
|
||||
var thinkTagClosed bool = false
|
||||
|
||||
role := "assistant"
|
||||
|
||||
fn := func(response api.ChatResponse) error {
|
||||
if response.Message.Content != "" || !opts.HideThinking {
|
||||
p.StopAndClear()
|
||||
@@ -1416,13 +1418,13 @@ func NewCLI() *cobra.Command {
|
||||
|
||||
createCmd := &cobra.Command{
|
||||
Use: "create MODEL",
|
||||
Short: "Create a model from a Modelfile",
|
||||
Short: "Create a model",
|
||||
Args: cobra.ExactArgs(1),
|
||||
PreRunE: checkServerHeartbeat,
|
||||
RunE: CreateHandler,
|
||||
}
|
||||
|
||||
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
||||
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\")")
|
||||
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
|
||||
|
||||
showCmd := &cobra.Command{
|
||||
|
||||
@@ -385,18 +385,21 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
case "modelfile":
|
||||
fmt.Println(resp.Modelfile)
|
||||
case "parameters":
|
||||
fmt.Println("Model defined parameters:")
|
||||
if resp.Parameters == "" {
|
||||
fmt.Println("No parameters were specified for this model.")
|
||||
fmt.Println(" No additional parameters were specified for this model.")
|
||||
} else {
|
||||
if len(opts.Options) > 0 {
|
||||
fmt.Println("User defined parameters:")
|
||||
for k, v := range opts.Options {
|
||||
fmt.Printf("%-*s %v\n", 30, k, v)
|
||||
}
|
||||
fmt.Println()
|
||||
for _, l := range strings.Split(resp.Parameters, "\n") {
|
||||
fmt.Printf(" %s\n", l)
|
||||
}
|
||||
fmt.Println("Model defined parameters:")
|
||||
fmt.Println(resp.Parameters)
|
||||
}
|
||||
fmt.Println()
|
||||
if len(opts.Options) > 0 {
|
||||
fmt.Println("User defined parameters:")
|
||||
for k, v := range opts.Options {
|
||||
fmt.Printf(" %-*s %v\n", 30, k, v)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
case "system":
|
||||
switch {
|
||||
|
||||
@@ -24,7 +24,6 @@ type gemma3nModel struct {
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenSizePerLayerInput uint32 `json:"hidden_size_per_layer_input"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
LaurelRank uint32 `json:"laurel_rank"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
@@ -72,8 +71,6 @@ func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv["gemma3n.embedding_length"] = m.TextModel.HiddenSize
|
||||
kv["gemma3n.feed_forward_length"] = m.TextModel.IntermediateSize
|
||||
kv["gemma3n.head_dim"] = m.TextModel.HeadDim
|
||||
kv["gemma3n.laurel_rank"] = m.TextModel.LaurelRank
|
||||
kv["gemma3n.num_kv_shared_layers"] = m.TextModel.NumKVSharedLayers
|
||||
kv["gemma3n.rope.freq_base_local"] = m.TextModel.RopeLocalBaseFreq
|
||||
kv["gemma3n.rope.freq_base"] = m.TextModel.RopeTheta
|
||||
return kv
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
* [Quickstart](../README.md#quickstart)
|
||||
* [Examples](./examples.md)
|
||||
* [Importing models](./import.md)
|
||||
* [MacOS Documentation](./macos.md)
|
||||
* [Linux Documentation](./linux.md)
|
||||
* [Windows Documentation](./windows.md)
|
||||
* [Docker Documentation](./docker.md)
|
||||
|
||||
242
docs/api.md
242
docs/api.md
@@ -500,6 +500,7 @@ The `message` object has the following fields:
|
||||
- `thinking`: (for thinking models) the model's thinking process
|
||||
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
|
||||
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
|
||||
- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result
|
||||
|
||||
Advanced parameters (optional):
|
||||
|
||||
@@ -508,13 +509,21 @@ Advanced parameters (optional):
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
### Tool calling
|
||||
|
||||
Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
|
||||
|
||||
Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
|
||||
|
||||
[See models with tool calling capabilities](https://ollama.com/search?c=tool).
|
||||
|
||||
### Structured outputs
|
||||
|
||||
Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
|
||||
|
||||
### Examples
|
||||
|
||||
#### Chat Request (Streaming)
|
||||
#### Chat request (Streaming)
|
||||
|
||||
##### Request
|
||||
|
||||
@@ -569,6 +578,88 @@ Final response:
|
||||
}
|
||||
```
|
||||
|
||||
#### Chat request (Streaming with tools)
|
||||
|
||||
##### Request
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3.2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what is the weather in tokyo?"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the weather in a given city",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to get the weather for"
|
||||
}
|
||||
},
|
||||
"required": ["city"]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
}'
|
||||
```
|
||||
|
||||
##### Response
|
||||
|
||||
A stream of JSON objects is returned:
|
||||
```json
|
||||
{
|
||||
"model": "llama3.2",
|
||||
"created_at": "2025-07-07T20:22:19.184789Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "",
|
||||
"tool_calls": [
|
||||
{
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"arguments": {
|
||||
"city": "Tokyo"
|
||||
}
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
"done": false
|
||||
}
|
||||
```
|
||||
|
||||
Final response:
|
||||
|
||||
```json
|
||||
{
|
||||
"model":"llama3.2",
|
||||
"created_at":"2025-07-07T20:22:19.19314Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": ""
|
||||
},
|
||||
"done_reason": "stop",
|
||||
"done": true,
|
||||
"total_duration": 182242375,
|
||||
"load_duration": 41295167,
|
||||
"prompt_eval_count": 169,
|
||||
"prompt_eval_duration": 24573166,
|
||||
"eval_count": 15,
|
||||
"eval_duration": 115959084
|
||||
}
|
||||
```
|
||||
|
||||
#### Chat request (No streaming)
|
||||
|
||||
##### Request
|
||||
@@ -606,6 +697,74 @@ curl http://localhost:11434/api/chat -d '{
|
||||
}
|
||||
```
|
||||
|
||||
#### Chat request (No streaming, with tools)
|
||||
|
||||
##### Request
|
||||
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3.2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what is the weather in tokyo?"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the weather in a given city",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to get the weather for"
|
||||
}
|
||||
},
|
||||
"required": ["city"]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"stream": false
|
||||
}'
|
||||
```
|
||||
|
||||
##### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama3.2",
|
||||
"created_at": "2025-07-07T20:32:53.844124Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "",
|
||||
"tool_calls": [
|
||||
{
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"arguments": {
|
||||
"city": "Tokyo"
|
||||
}
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
"done_reason": "stop",
|
||||
"done": true,
|
||||
"total_duration": 3244883583,
|
||||
"load_duration": 2969184542,
|
||||
"prompt_eval_count": 169,
|
||||
"prompt_eval_duration": 141656333,
|
||||
"eval_count": 18,
|
||||
"eval_duration": 133293625
|
||||
}
|
||||
```
|
||||
|
||||
#### Chat request (Structured outputs)
|
||||
|
||||
##### Request
|
||||
@@ -712,6 +871,87 @@ Final response:
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
#### Chat request (With history, with tools)
|
||||
|
||||
##### Request
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3.2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what is the weather in Toronto?"
|
||||
},
|
||||
// the message from the model appended to history
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "",
|
||||
"tool_calls": [
|
||||
{
|
||||
"function": {
|
||||
"name": "get_temperature",
|
||||
"arguments": {
|
||||
"city": "Toronto"
|
||||
}
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
// the tool call result appended to history
|
||||
{
|
||||
"role": "tool",
|
||||
"content": "11 degrees celsius",
|
||||
"tool_name": "get_temperature",
|
||||
}
|
||||
],
|
||||
"stream": false,
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the weather in a given city",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to get the weather for"
|
||||
}
|
||||
},
|
||||
"required": ["city"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
##### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama3.2",
|
||||
"created_at": "2025-07-07T20:43:37.688511Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "The current temperature in Toronto is 11°C."
|
||||
},
|
||||
"done_reason": "stop",
|
||||
"done": true,
|
||||
"total_duration": 890771750,
|
||||
"load_duration": 707634750,
|
||||
"prompt_eval_count": 94,
|
||||
"prompt_eval_duration": 91703208,
|
||||
"eval_count": 11,
|
||||
"eval_duration": 90282125
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
#### Chat request (with images)
|
||||
|
||||
##### Request
|
||||
|
||||
17
docs/faq.md
17
docs/faq.md
@@ -292,7 +292,7 @@ If too many requests are sent to the server, it will respond with a 503 error in
|
||||
|
||||
## How does Ollama handle concurrent requests?
|
||||
|
||||
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
|
||||
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.
|
||||
|
||||
If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
|
||||
|
||||
@@ -301,7 +301,7 @@ Parallel request processing for a given model results in increasing the context
|
||||
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
|
||||
|
||||
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 * the number of GPUs or 3 for CPU inference.
|
||||
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
|
||||
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default is 1, and will handle 1 request per model at a time.
|
||||
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
|
||||
|
||||
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
|
||||
@@ -333,3 +333,16 @@ The currently available K/V cache quantization types are:
|
||||
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
|
||||
|
||||
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
|
||||
|
||||
## How can I stop Ollama from starting when I login to my computer
|
||||
|
||||
Ollama for Windows and macOS register as a login item during installation. You can disable this if you prefer not to have Ollama automatically start. Ollama will respect this setting across upgrades, unless you uninstall the application.
|
||||
|
||||
**Windows**
|
||||
- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
|
||||
|
||||
**MacOS Monterey (v12)**
|
||||
- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
|
||||
|
||||
**MacOS Ventura (v13) and later**
|
||||
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
|
||||
@@ -7,6 +7,8 @@ Check your compute compatibility to see if your card is supported:
|
||||
|
||||
| Compute Capability | Family | Cards |
|
||||
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| 12.0 | GeForce RTX 50xx | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090` |
|
||||
| | NVIDIA Professioal | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell` |
|
||||
| 9.0 | NVIDIA | `H200` `H100` |
|
||||
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
|
||||
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
|
||||
|
||||
@@ -53,6 +53,8 @@ FROM /path/to/safetensors/directory
|
||||
|
||||
If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
|
||||
|
||||
If you do not create the Modelfile, ollama will act as if there was a Modelfile with the command `FROM .`.
|
||||
|
||||
Now run the `ollama create` command from the directory where you created the `Modelfile`:
|
||||
|
||||
```shell
|
||||
|
||||
@@ -16,7 +16,7 @@ curl -fsSL https://ollama.com/install.sh | sh
|
||||
Download and extract the package:
|
||||
|
||||
```shell
|
||||
curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
|
||||
curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
|
||||
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
|
||||
```
|
||||
|
||||
|
||||
42
docs/macos.md
Normal file
42
docs/macos.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Ollama for macOS
|
||||
|
||||
## System Requirements
|
||||
|
||||
* MacOS Monterey (v12) or newer
|
||||
* Apple M series (CPU and GPU support) or x86 (CPU only)
|
||||
|
||||
|
||||
## Filesystem Requirements
|
||||
|
||||
The preferred method of installation is to mount the `ollama.dmg` and drag-and-drop the Ollama application to the system-wide `Applications` folder. Upon startup, the Ollama app will verify the `ollama` CLI is present in your PATH, and if not detected, will prompt for permission to create a link in `/usr/local/bin`
|
||||
|
||||
Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size. If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
|
||||
|
||||
### Changing Install Location
|
||||
|
||||
To install the Ollama application somewhere other than `Applications`, place the Ollama application in the desired location, and ensure the CLI `Ollama.app/Contents/Resources/ollama` or a sym-link to the CLI can be found in your path. Upon first start decline the "Move to Applications?" request.
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
Ollama on MacOS stores files in a few different locations.
|
||||
- `~/.ollama` contains models and configuration
|
||||
- `~/.ollama/logs` contains logs
|
||||
- *app.log* contains most recent logs from the GUI application
|
||||
- *server.log* contains the most recent server logs
|
||||
- `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
|
||||
|
||||
## Uninstall
|
||||
|
||||
To fully remove Ollama from your system, remove the following files and folders:
|
||||
|
||||
```
|
||||
sudo rm -rf /Applications/Ollama.app
|
||||
sudo rm /usr/local/bin/ollama
|
||||
rm -rf "~/Library/Application Support/Ollama"
|
||||
rm -rf "~/Library/Saved Application State/com.electron.ollama.savedState"
|
||||
rm -rf ~/Library/Caches/com.electron.ollama/
|
||||
rm -rf ~/Library/Caches/ollama
|
||||
rm -rf ~/Library/WebKit/com.electron.ollama
|
||||
rm -rf ~/.ollama
|
||||
```
|
||||
@@ -150,7 +150,7 @@ PARAMETER <parameter> <parametervalue>
|
||||
|
||||
| Parameter | Description | Value Type | Example Usage |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
|
||||
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
|
||||
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 4096) | int | num_ctx 4096 |
|
||||
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
|
||||
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
|
||||
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
|
||||
|
||||
@@ -30,20 +30,6 @@ To install the Ollama application in a location different than your home directo
|
||||
OllamaSetup.exe /DIR="d:\some\location"
|
||||
```
|
||||
|
||||
### Changing Model Location
|
||||
|
||||
To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
|
||||
|
||||
1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
|
||||
|
||||
2. Click on _Edit environment variables for your account_.
|
||||
|
||||
3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
|
||||
|
||||
4. Click OK/Apply to save.
|
||||
|
||||
If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
|
||||
|
||||
## API Access
|
||||
|
||||
Here's a quick example showing API access from `powershell`
|
||||
|
||||
@@ -219,7 +219,7 @@ func Uint(key string, defaultValue uint) func() uint {
|
||||
|
||||
var (
|
||||
// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
|
||||
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
|
||||
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 1)
|
||||
// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
|
||||
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
||||
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
||||
|
||||
@@ -174,6 +174,7 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
|
||||
func (kv KV) OllamaEngineRequired() bool {
|
||||
return slices.Contains([]string{
|
||||
"gemma3",
|
||||
"gemma3n",
|
||||
"mistral3",
|
||||
"llama4",
|
||||
"mllama",
|
||||
@@ -554,7 +555,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
||||
// vocab graph
|
||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||
)
|
||||
case "gemma", "gemma2", "gemma3":
|
||||
case "gemma", "gemma2", "gemma3", "gemma3n":
|
||||
fullOffload = max(
|
||||
4*batch*(embedding+vocab),
|
||||
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
|
||||
@@ -567,6 +568,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
||||
embedding*embeddingHeadsK*heads*9/16,
|
||||
)
|
||||
|
||||
if f.KV().Architecture() == "gemma3n" {
|
||||
fullOffload *= 4
|
||||
partialOffload *= 4
|
||||
}
|
||||
|
||||
// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
|
||||
// engine. Gemma3 always uses the Ollama engine.
|
||||
if f.KV().Architecture() == "gemma3" {
|
||||
|
||||
57
integration/library_models_test.go
Normal file
57
integration/library_models_test.go
Normal file
@@ -0,0 +1,57 @@
|
||||
//go:build integration && library
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
// First run of this scenario on a target system will take a long time to download
|
||||
// ~1.5TB of models. Set a sufficiently large -timeout for your network speed
|
||||
func TestLibraryModelsGenerate(t *testing.T) {
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
chatModels := libraryChatModels
|
||||
for _, model := range chatModels {
|
||||
t.Run(model, func(t *testing.T) {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
t.Skip("skipping remaining tests to avoid excessive runtime")
|
||||
}
|
||||
if err := PullIfMissing(ctx, client, model); err != nil {
|
||||
t.Fatalf("pull failed %s", err)
|
||||
}
|
||||
req := api.GenerateRequest{
|
||||
Model: model,
|
||||
Prompt: "why is the sky blue?",
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]interface{}{
|
||||
"temperature": 0.1,
|
||||
"seed": 123,
|
||||
},
|
||||
}
|
||||
anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
|
||||
// Special cases
|
||||
if model == "duckdb-nsql" {
|
||||
anyResp = []string{"select", "from"}
|
||||
} else if model == "granite3-guardian" || model == "shieldgemma" || model == "llama-guard3" || model == "bespoke-minicheck" {
|
||||
anyResp = []string{"yes", "no", "safe", "unsafe"}
|
||||
} else if model == "openthinker" || model == "nexusraven" {
|
||||
anyResp = []string{"plugin", "im_sep", "components", "function call"}
|
||||
} else if model == "starcoder" || model == "starcoder2" || model == "magicoder" || model == "deepseek-coder" {
|
||||
req.Prompt = "def fibonacci():"
|
||||
anyResp = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
|
||||
}
|
||||
DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -19,37 +19,6 @@ import (
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
var (
|
||||
started = time.Now()
|
||||
chatModels = []string{
|
||||
"granite3-moe:latest",
|
||||
"granite-code:latest",
|
||||
"nemotron-mini:latest",
|
||||
"command-r:latest",
|
||||
"gemma2:latest",
|
||||
"gemma:latest",
|
||||
"internlm2:latest",
|
||||
"phi3.5:latest",
|
||||
"phi3:latest",
|
||||
// "phi:latest", // flaky, sometimes generates no response on first query
|
||||
"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
|
||||
"falcon:latest",
|
||||
"falcon2:latest",
|
||||
"minicpm-v:latest",
|
||||
"mistral:latest",
|
||||
"orca-mini:latest",
|
||||
"llama2:latest",
|
||||
"llama3.1:latest",
|
||||
"llama3.2:latest",
|
||||
"llama3.2-vision:latest",
|
||||
"qwen2.5-coder:latest",
|
||||
"qwen:latest",
|
||||
"solar-pro:latest",
|
||||
"codellama:latest",
|
||||
"nous-hermes:latest",
|
||||
}
|
||||
)
|
||||
|
||||
func TestModelsGenerate(t *testing.T) {
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
|
||||
@@ -70,6 +39,13 @@ func TestModelsGenerate(t *testing.T) {
|
||||
slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
|
||||
}
|
||||
|
||||
var chatModels []string
|
||||
if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
|
||||
chatModels = ollamaEngineChatModels
|
||||
} else {
|
||||
chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
|
||||
}
|
||||
|
||||
for _, model := range chatModels {
|
||||
t.Run(model, func(t *testing.T) {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
|
||||
266
integration/model_perf_test.go
Normal file
266
integration/model_perf_test.go
Normal file
@@ -0,0 +1,266 @@
|
||||
//go:build integration && perf
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log/slog"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
var (
|
||||
// Models that don't work reliably with the large context prompt in this test case
|
||||
longContextFlakes = []string{
|
||||
"granite-code:latest",
|
||||
"nemotron-mini:latest",
|
||||
"falcon:latest", // 2k model
|
||||
"falcon2:latest", // 2k model
|
||||
"minicpm-v:latest",
|
||||
"qwen:latest",
|
||||
"solar-pro:latest",
|
||||
}
|
||||
)
|
||||
|
||||
// Note: this test case can take a long time to run, particularly on models with
|
||||
// large contexts. Run with -timeout set to a large value to get reasonable coverage
|
||||
// Example usage:
|
||||
//
|
||||
// go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf 2>&1 | tee int.log
|
||||
// cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
|
||||
// cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
|
||||
func TestModelsPerf(t *testing.T) {
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
// TODO use info API eventually
|
||||
var maxVram uint64
|
||||
var err error
|
||||
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
|
||||
maxVram, err = strconv.ParseUint(s, 10, 64)
|
||||
if err != nil {
|
||||
t.Fatalf("invalid OLLAMA_MAX_VRAM %v", err)
|
||||
}
|
||||
} else {
|
||||
slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
|
||||
}
|
||||
|
||||
data, err := ioutil.ReadFile(filepath.Join("testdata", "shakespeare.txt"))
|
||||
if err != nil {
|
||||
t.Fatalf("failed to open test data file: %s", err)
|
||||
}
|
||||
longPrompt := "summarize the following: " + string(data)
|
||||
|
||||
var chatModels []string
|
||||
if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
|
||||
chatModels = ollamaEngineChatModels
|
||||
} else {
|
||||
chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
|
||||
}
|
||||
|
||||
for _, model := range chatModels {
|
||||
t.Run(model, func(t *testing.T) {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
t.Skip("skipping remaining tests to avoid excessive runtime")
|
||||
}
|
||||
if err := PullIfMissing(ctx, client, model); err != nil {
|
||||
t.Fatalf("pull failed %s", err)
|
||||
}
|
||||
var maxContext int
|
||||
|
||||
resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
|
||||
if err != nil {
|
||||
t.Fatalf("show failed: %s", err)
|
||||
}
|
||||
arch := resp.ModelInfo["general.architecture"].(string)
|
||||
maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))
|
||||
|
||||
if maxVram > 0 {
|
||||
resp, err := client.List(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("list models failed %v", err)
|
||||
}
|
||||
for _, m := range resp.Models {
|
||||
// For these tests we want to exercise a some amount of overflow on the CPU
|
||||
if m.Name == model && float32(m.Size)*0.75 > float32(maxVram) {
|
||||
t.Skipf("model %s is too large %s for available VRAM %s", model, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
|
||||
}
|
||||
}
|
||||
}
|
||||
slog.Info("scneario", "model", model, "max_context", maxContext)
|
||||
loaded := false
|
||||
defer func() {
|
||||
// best effort unload once we're done with the model
|
||||
if loaded {
|
||||
client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
|
||||
}
|
||||
}()
|
||||
|
||||
// Some models don't handle the long context data well so skip them to avoid flaky test results
|
||||
longContextFlake := false
|
||||
for _, flake := range longContextFlakes {
|
||||
if model == flake {
|
||||
longContextFlake = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// iterate through a few context sizes for coverage without excessive runtime
|
||||
var contexts []int
|
||||
keepGoing := true
|
||||
if maxContext > 16384 {
|
||||
contexts = []int{4096, 8192, 16384, maxContext}
|
||||
} else if maxContext > 8192 {
|
||||
contexts = []int{4096, 8192, maxContext}
|
||||
} else if maxContext > 4096 {
|
||||
contexts = []int{4096, maxContext}
|
||||
} else if maxContext > 0 {
|
||||
contexts = []int{maxContext}
|
||||
} else {
|
||||
t.Fatal("unknown max context size")
|
||||
}
|
||||
for _, numCtx := range contexts {
|
||||
if !keepGoing && numCtx > 8192 { // Always try up to 8k before bailing out
|
||||
break
|
||||
}
|
||||
skipLongPrompt := false
|
||||
|
||||
// Workaround bug 11172 temporarily...
|
||||
maxPrompt := longPrompt
|
||||
// If we fill the context too full with the prompt, many models
|
||||
// quickly hit context shifting and go bad.
|
||||
if len(maxPrompt) > numCtx*2 { // typically yields ~1/2 full context
|
||||
maxPrompt = maxPrompt[:numCtx*2]
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
prompt string
|
||||
anyResp []string
|
||||
}{
|
||||
{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
|
||||
{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
|
||||
}
|
||||
var gpuPercent int
|
||||
for _, tc := range testCases {
|
||||
if len(tc.prompt) > 100 && (longContextFlake || skipLongPrompt) {
|
||||
slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
|
||||
continue
|
||||
}
|
||||
req := api.GenerateRequest{
|
||||
Model: model,
|
||||
Prompt: tc.prompt,
|
||||
KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
|
||||
Options: map[string]interface{}{
|
||||
"temperature": 0,
|
||||
"seed": 123,
|
||||
"num_ctx": numCtx,
|
||||
},
|
||||
}
|
||||
atLeastOne := false
|
||||
var resp api.GenerateResponse
|
||||
|
||||
stream := false
|
||||
req.Stream = &stream
|
||||
|
||||
// Avoid potentially getting stuck indefinitely
|
||||
limit := 5 * time.Minute
|
||||
genCtx, cancel := context.WithDeadlineCause(
|
||||
ctx,
|
||||
time.Now().Add(limit),
|
||||
fmt.Errorf("generate on model %s with ctx %d took longer than %v", model, numCtx, limit),
|
||||
)
|
||||
defer cancel()
|
||||
|
||||
err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
|
||||
resp = rsp
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
// Avoid excessive test runs, but don't consider a failure with massive context
|
||||
if numCtx > 16384 && strings.Contains(err.Error(), "took longer") {
|
||||
slog.Warn("max context was taking too long, skipping", "error", err)
|
||||
keepGoing = false
|
||||
skipLongPrompt = true
|
||||
continue
|
||||
}
|
||||
t.Fatalf("generate error: ctx:%d err:%s", numCtx, err)
|
||||
}
|
||||
loaded = true
|
||||
for _, expResp := range tc.anyResp {
|
||||
if strings.Contains(strings.ToLower(resp.Response), expResp) {
|
||||
atLeastOne = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !atLeastOne {
|
||||
t.Fatalf("response didn't contain expected values: ctx:%d expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
|
||||
}
|
||||
models, err := client.ListRunning(ctx)
|
||||
if err != nil {
|
||||
slog.Warn("failed to list running models", "error", err)
|
||||
continue
|
||||
}
|
||||
if len(models.Models) > 1 {
|
||||
slog.Warn("multiple models loaded, may impact performance results", "loaded", models.Models)
|
||||
}
|
||||
for _, m := range models.Models {
|
||||
if m.Name == model {
|
||||
if m.SizeVRAM == 0 {
|
||||
slog.Info("Model fully loaded into CPU")
|
||||
gpuPercent = 0
|
||||
keepGoing = false
|
||||
skipLongPrompt = true
|
||||
} else if m.SizeVRAM == m.Size {
|
||||
slog.Info("Model fully loaded into GPU")
|
||||
gpuPercent = 100
|
||||
} else {
|
||||
sizeCPU := m.Size - m.SizeVRAM
|
||||
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
|
||||
gpuPercent = int(100 - cpuPercent)
|
||||
slog.Info("Model split between CPU/GPU", "CPU", cpuPercent, "GPU", gpuPercent)
|
||||
keepGoing = false
|
||||
|
||||
// Heuristic to avoid excessive test run time
|
||||
if gpuPercent < 90 {
|
||||
skipLongPrompt = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
|
||||
"MODEL",
|
||||
"CONTEXT",
|
||||
"GPU PERCENT",
|
||||
"PROMPT COUNT",
|
||||
"LOAD TIME",
|
||||
"PROMPT EVAL TPS",
|
||||
"EVAL TPS",
|
||||
)
|
||||
fmt.Fprintf(os.Stderr, "MODEL_PERF_DATA:%s,%d,%d,%d,%0.2f,%0.2f,%0.2f\n",
|
||||
model,
|
||||
numCtx,
|
||||
gpuPercent,
|
||||
resp.PromptEvalCount,
|
||||
float64(resp.LoadDuration)/1000000000.0,
|
||||
float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
|
||||
float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
|
||||
)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
124456
integration/testdata/shakespeare.txt
vendored
Normal file
124456
integration/testdata/shakespeare.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@@ -32,6 +32,229 @@ const (
|
||||
smol = "llama3.2:1b"
|
||||
)
|
||||
|
||||
var (
|
||||
started = time.Now()
|
||||
|
||||
// Note: add newer models at the top of the list to test them first
|
||||
ollamaEngineChatModels = []string{
|
||||
"gemma3n:e2b",
|
||||
"mistral-small3.2:latest",
|
||||
"deepseek-r1:1.5b",
|
||||
"llama3.2-vision:latest",
|
||||
"qwen2.5-coder:latest",
|
||||
"qwen2.5vl:3b",
|
||||
"qwen3:0.6b", // dense
|
||||
"qwen3:30b", // MOE
|
||||
"gemma3:1b",
|
||||
"llama3.1:latest",
|
||||
"llama3.2:latest",
|
||||
"gemma2:latest",
|
||||
"minicpm-v:latest", // arch=qwen2
|
||||
"granite-code:latest", // arch=llama
|
||||
}
|
||||
llamaRunnerChatModels = []string{
|
||||
"mistral:latest",
|
||||
"falcon3:latest",
|
||||
"granite3-moe:latest",
|
||||
"command-r:latest",
|
||||
"nemotron-mini:latest",
|
||||
"phi3.5:latest",
|
||||
"solar-pro:latest",
|
||||
"internlm2:latest",
|
||||
"codellama:latest", // arch=llama
|
||||
"phi3:latest",
|
||||
"falcon2:latest",
|
||||
"gemma:latest",
|
||||
"llama2:latest",
|
||||
"nous-hermes:latest",
|
||||
"orca-mini:latest",
|
||||
"qwen:latest",
|
||||
"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
|
||||
"falcon:latest",
|
||||
}
|
||||
|
||||
// Some library models are quite large - ensure large VRAM and sufficient disk space
|
||||
// before running scenarios based on this set
|
||||
libraryChatModels = []string{
|
||||
"alfred",
|
||||
"athene-v2",
|
||||
"aya-expanse",
|
||||
"aya",
|
||||
"bakllava",
|
||||
"bespoke-minicheck",
|
||||
"codebooga",
|
||||
"codegeex4",
|
||||
"codegemma",
|
||||
"codellama",
|
||||
"codeqwen",
|
||||
"codestral",
|
||||
"codeup",
|
||||
"cogito",
|
||||
"command-a",
|
||||
"command-r-plus",
|
||||
"command-r",
|
||||
"command-r7b-arabic",
|
||||
"command-r7b",
|
||||
"dbrx",
|
||||
"deepcoder",
|
||||
"deepscaler",
|
||||
"deepseek-coder-v2",
|
||||
"deepseek-coder",
|
||||
"deepseek-llm",
|
||||
"deepseek-r1",
|
||||
// "deepseek-v2.5", // requires 155 GB VRAM
|
||||
"deepseek-v2",
|
||||
// "deepseek-v3", // requires 482 GB VRAM
|
||||
"devstral",
|
||||
"dolphin-llama3",
|
||||
"dolphin-mistral",
|
||||
"dolphin-mixtral",
|
||||
"dolphin-phi",
|
||||
"dolphin3",
|
||||
"dolphincoder",
|
||||
"duckdb-nsql",
|
||||
"everythinglm",
|
||||
"exaone-deep",
|
||||
"exaone3.5",
|
||||
"falcon",
|
||||
"falcon2",
|
||||
"falcon3",
|
||||
"firefunction-v2",
|
||||
"gemma",
|
||||
"gemma2",
|
||||
"gemma3",
|
||||
"gemma3n",
|
||||
"glm4",
|
||||
"goliath",
|
||||
"granite-code",
|
||||
"granite3-dense",
|
||||
"granite3-guardian",
|
||||
"granite3-moe",
|
||||
"granite3.1-dense",
|
||||
"granite3.1-moe",
|
||||
"granite3.2-vision",
|
||||
"granite3.2",
|
||||
"granite3.3",
|
||||
"hermes3",
|
||||
"internlm2",
|
||||
"llama-guard3",
|
||||
"llama-pro",
|
||||
"llama2-chinese",
|
||||
"llama2-uncensored",
|
||||
"llama2",
|
||||
"llama3-chatqa",
|
||||
"llama3-gradient",
|
||||
"llama3-groq-tool-use",
|
||||
"llama3.1",
|
||||
"llama3.2-vision",
|
||||
"llama3.2",
|
||||
"llama3.3",
|
||||
"llama3",
|
||||
"llama4",
|
||||
"llava-llama3",
|
||||
"llava-phi3",
|
||||
"llava",
|
||||
"magicoder",
|
||||
"magistral",
|
||||
"marco-o1",
|
||||
"mathstral",
|
||||
"meditron",
|
||||
"medllama2",
|
||||
"megadolphin",
|
||||
"minicpm-v",
|
||||
"mistral-large",
|
||||
"mistral-nemo",
|
||||
"mistral-openorca",
|
||||
"mistral-small",
|
||||
"mistral-small3.1",
|
||||
"mistral-small3.2",
|
||||
"mistral",
|
||||
"mistrallite",
|
||||
"mixtral",
|
||||
"moondream",
|
||||
"nemotron-mini",
|
||||
"nemotron",
|
||||
"neural-chat",
|
||||
"nexusraven",
|
||||
"notus",
|
||||
"nous-hermes",
|
||||
"nous-hermes2-mixtral",
|
||||
"nous-hermes2",
|
||||
"nuextract",
|
||||
"olmo2",
|
||||
"open-orca-platypus2",
|
||||
"openchat",
|
||||
"opencoder",
|
||||
"openhermes",
|
||||
"openthinker",
|
||||
"orca-mini",
|
||||
"orca2",
|
||||
// "phi", // unreliable
|
||||
"phi3.5",
|
||||
"phi3",
|
||||
"phi4-mini-reasoning",
|
||||
"phi4-mini",
|
||||
"phi4-reasoning",
|
||||
"phi4",
|
||||
"phind-codellama",
|
||||
"qwen",
|
||||
"qwen2-math",
|
||||
"qwen2.5-coder",
|
||||
"qwen2.5",
|
||||
"qwen2.5vl",
|
||||
"qwen2",
|
||||
"qwen3:0.6b", // dense
|
||||
"qwen3:30b", // MOE
|
||||
"qwq",
|
||||
"r1-1776",
|
||||
"reader-lm",
|
||||
"reflection",
|
||||
"sailor2",
|
||||
"samantha-mistral",
|
||||
"shieldgemma",
|
||||
"smallthinker",
|
||||
"smollm",
|
||||
"smollm2",
|
||||
"solar-pro",
|
||||
"solar",
|
||||
"sqlcoder",
|
||||
"stable-beluga",
|
||||
"stable-code",
|
||||
"stablelm-zephyr",
|
||||
"stablelm2",
|
||||
"starcoder",
|
||||
"starcoder2",
|
||||
"starling-lm",
|
||||
"tinydolphin",
|
||||
"tinyllama",
|
||||
"tulu3",
|
||||
"vicuna",
|
||||
"wizard-math",
|
||||
"wizard-vicuna-uncensored",
|
||||
"wizard-vicuna",
|
||||
"wizardcoder",
|
||||
"wizardlm-uncensored",
|
||||
"wizardlm2",
|
||||
"xwinlm",
|
||||
"yarn-llama2",
|
||||
"yarn-mistral",
|
||||
"yi-coder",
|
||||
"yi",
|
||||
"zephyr",
|
||||
}
|
||||
libraryEmbedModels = []string{
|
||||
"all-minilm",
|
||||
"bge-large",
|
||||
"bge-m3",
|
||||
"granite-embedding",
|
||||
"mxbai-embed-large",
|
||||
"nomic-embed-text",
|
||||
"paraphrase-multilingual",
|
||||
"snowflake-arctic-embed",
|
||||
"snowflake-arctic-embed2",
|
||||
}
|
||||
)
|
||||
|
||||
func Init() {
|
||||
lifecycle.InitLogging()
|
||||
}
|
||||
@@ -271,6 +494,10 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
|
||||
t.Errorf("generate stalled. Response so far:%s", buf.String())
|
||||
}
|
||||
case <-done:
|
||||
if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
|
||||
slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
|
||||
return
|
||||
}
|
||||
require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
|
||||
// Verify the response contains the expected data
|
||||
response := buf.String()
|
||||
|
||||
@@ -7,31 +7,31 @@ This enables matching up devices and information reported by the backend
|
||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 39 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.m | 1 +
|
||||
3 files changed, 35 insertions(+)
|
||||
3 files changed, 41 insertions(+)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 74e46716..a880df33 100644
|
||||
index 74e46716..48839339 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -152,6 +152,7 @@ extern "C" {
|
||||
struct ggml_backend_dev_props {
|
||||
const char * name;
|
||||
const char * description;
|
||||
+ const char * uuid;
|
||||
+ const char * id;
|
||||
size_t memory_free;
|
||||
size_t memory_total;
|
||||
enum ggml_backend_dev_type type;
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index cb0d8528..4c829153 100644
|
||||
index cb0d8528..d6960174 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
+ std::string uuid;
|
||||
+ std::string id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -39,9 +39,9 @@ index cb0d8528..4c829153 100644
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
|
||||
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
+ return ctx->uuid.c_str();
|
||||
+ return ctx->id.c_str();
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
@@ -51,17 +51,17 @@ index cb0d8528..4c829153 100644
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
|
||||
+ props->id = ggml_backend_cuda_device_get_id(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
+ #if !defined(GGML_USE_HIP)
|
||||
+ char uuid[64];
|
||||
+ snprintf(uuid, sizeof(uuid),
|
||||
+ char id[64];
|
||||
+ snprintf(id, sizeof(id),
|
||||
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
+ (unsigned char)prop.uuid.bytes[0],
|
||||
+ (unsigned char)prop.uuid.bytes[1],
|
||||
@@ -80,23 +80,29 @@ index cb0d8528..4c829153 100644
|
||||
+ (unsigned char)prop.uuid.bytes[14],
|
||||
+ (unsigned char)prop.uuid.bytes[15]
|
||||
+ );
|
||||
+ dev_ctx->uuid = uuid;
|
||||
+ dev_ctx->id = id;
|
||||
+ #else
|
||||
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
+ #ifdef _WIN32
|
||||
+ char id[16];
|
||||
+ snprintf(id, sizeof(id), "%d", i);
|
||||
+ dev_ctx->id = id;
|
||||
+ #else
|
||||
+ dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
+ #endif
|
||||
+ #endif
|
||||
+
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 1b56f858..ee4f2dcb 100644
|
||||
index 1b56f858..a9eeebc6 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
+ props->uuid = "0";
|
||||
+ props->id = "0";
|
||||
props->type = ggml_backend_metal_device_get_type(dev);
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = (struct ggml_backend_dev_caps) {
|
||||
|
||||
@@ -124,9 +124,9 @@ type DeviceMemory struct {
|
||||
// may not be persistent across instances of the runner.
|
||||
Name string
|
||||
|
||||
// UUID is a unique persistent identifier for the device for matching
|
||||
// with system management libraries
|
||||
UUID string
|
||||
// ID is an identifier for the device for matching with system
|
||||
// management libraries.
|
||||
ID string
|
||||
|
||||
// Weights is the per-layer memory needed for the model weights.
|
||||
Weights []Memory
|
||||
@@ -156,8 +156,8 @@ func (m DeviceMemory) LogValue() slog.Value {
|
||||
attrs = append(attrs, slog.Any("Graph", m.Graph))
|
||||
}
|
||||
|
||||
if len(attrs) > 0 && m.UUID != "" {
|
||||
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
|
||||
if len(attrs) > 0 && m.ID != "" {
|
||||
attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
|
||||
}
|
||||
|
||||
return slog.GroupValue(attrs...)
|
||||
|
||||
@@ -138,7 +138,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
|
||||
var props C.struct_ggml_backend_dev_props
|
||||
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
|
||||
requiredMemory.CPU.UUID = C.GoString(props.uuid)
|
||||
requiredMemory.CPU.ID = C.GoString(props.id)
|
||||
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
|
||||
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
|
||||
|
||||
@@ -155,7 +155,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
|
||||
var props C.struct_ggml_backend_dev_props
|
||||
C.ggml_backend_dev_get_props(d, &props)
|
||||
requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
|
||||
requiredMemory.GPUs[i].ID = C.GoString(props.id)
|
||||
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
|
||||
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
|
||||
}
|
||||
@@ -355,6 +355,26 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
bbs[c] = b
|
||||
}
|
||||
|
||||
// Mimic llama runner logs summarizing layers and memory
|
||||
gpuLayers := 0
|
||||
for _, layer := range layers {
|
||||
if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
|
||||
gpuLayers++
|
||||
}
|
||||
}
|
||||
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
|
||||
|
||||
switch C.ggml_backend_dev_type(output.d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
slog.Info("offloading output layer to CPU")
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
slog.Info("offloading output layer to GPU")
|
||||
gpuLayers++
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
slog.Info("offloading output layer to ACCEL")
|
||||
}
|
||||
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
|
||||
|
||||
for bs := range maps.Values(bbs) {
|
||||
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
|
||||
}
|
||||
@@ -400,7 +420,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
||||
C.int(len(schedBackends)),
|
||||
C.size_t(maxGraphNodes),
|
||||
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
|
||||
C._Bool(false),
|
||||
C._Bool(false),
|
||||
),
|
||||
schedBackends: schedBackends,
|
||||
|
||||
2
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
2
ml/backend/ggml/ggml/include/ggml-backend.h
vendored
@@ -152,7 +152,7 @@ extern "C" {
|
||||
struct ggml_backend_dev_props {
|
||||
const char * name;
|
||||
const char * description;
|
||||
const char * uuid;
|
||||
const char * id;
|
||||
size_t memory_free;
|
||||
size_t memory_total;
|
||||
enum ggml_backend_dev_type type;
|
||||
|
||||
22
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
vendored
22
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
vendored
@@ -2888,7 +2888,7 @@ struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string uuid;
|
||||
std::string id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -2901,9 +2901,9 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
|
||||
static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ctx->uuid.c_str();
|
||||
return ctx->id.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
@@ -2920,7 +2920,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
props->uuid = ggml_backend_cuda_device_get_uuid(dev);
|
||||
props->id = ggml_backend_cuda_device_get_id(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -3471,8 +3471,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
#if !defined(GGML_USE_HIP)
|
||||
char uuid[64];
|
||||
snprintf(uuid, sizeof(uuid),
|
||||
char id[64];
|
||||
snprintf(id, sizeof(id),
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
(unsigned char)prop.uuid.bytes[0],
|
||||
(unsigned char)prop.uuid.bytes[1],
|
||||
@@ -3491,9 +3491,15 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
(unsigned char)prop.uuid.bytes[14],
|
||||
(unsigned char)prop.uuid.bytes[15]
|
||||
);
|
||||
dev_ctx->uuid = uuid;
|
||||
dev_ctx->id = id;
|
||||
#else
|
||||
dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
#ifdef _WIN32
|
||||
char id[16];
|
||||
snprintf(id, sizeof(id), "%d", i);
|
||||
dev_ctx->id = id;
|
||||
#else
|
||||
dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
|
||||
@@ -5726,7 +5726,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
props->uuid = "0";
|
||||
props->id = "0";
|
||||
props->type = ggml_backend_metal_device_get_type(dev);
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = (struct ggml_backend_dev_caps) {
|
||||
|
||||
@@ -4,6 +4,6 @@ package metal
|
||||
|
||||
//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
|
||||
|
||||
// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
|
||||
// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -DGGML_METAL_USE_BF16 -I.. -I../../include
|
||||
// #cgo LDFLAGS: -framework Metal -framework MetalKit
|
||||
import "C"
|
||||
|
||||
@@ -39,7 +39,6 @@ func New(c fs.Config) (model.Model, error) {
|
||||
),
|
||||
}
|
||||
|
||||
// TODO: setup hybrid (local sliding window + global) cache
|
||||
m.Cache = kvcache.NewWrapperCache(
|
||||
kvcache.NewCausalCache(m.Shift),
|
||||
kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),
|
||||
|
||||
@@ -2,6 +2,7 @@ package llama
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
@@ -33,6 +34,14 @@ type Model struct {
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
// This model currently only supports the gpt2 tokenizer
|
||||
if c.String("tokenizer.ggml.model") == "llama" {
|
||||
return nil, fmt.Errorf("unsupported tokenizer: llama")
|
||||
}
|
||||
// Best effort detection of library/deepseek-coder model(s) which are incompatible
|
||||
if c.String("general.name") == "deepseek-ai" {
|
||||
return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
|
||||
}
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
|
||||
@@ -2,7 +2,9 @@ package qwen2
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
@@ -126,6 +128,14 @@ func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
// This model currently only supports the gpt2 tokenizer
|
||||
if c.String("tokenizer.ggml.model") == "llama" {
|
||||
return nil, fmt.Errorf("unsupported tokenizer: llama")
|
||||
}
|
||||
// detect library/qwen model(s) which are incompatible
|
||||
if strings.HasPrefix(c.String("general.name"), "Qwen2-beta") {
|
||||
return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
|
||||
}
|
||||
m := Model{
|
||||
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
|
||||
@@ -423,7 +423,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
}
|
||||
}
|
||||
|
||||
types := []string{"jpeg", "jpg", "png"}
|
||||
types := []string{"jpeg", "jpg", "png", "webp"}
|
||||
valid := false
|
||||
for _, t := range types {
|
||||
prefix := "data:image/" + t + ";base64,"
|
||||
|
||||
@@ -231,6 +231,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
|
||||
// do not quantize relative position bias (T5)
|
||||
quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")
|
||||
|
||||
quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")
|
||||
|
||||
newType := fsggml.TensorType(t.Kind)
|
||||
if quantize {
|
||||
// get more optimal quantization type based on the tensor shape, layer, etc.
|
||||
|
||||
@@ -842,8 +842,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
}
|
||||
resp.Parameters = strings.Join(params, "\n")
|
||||
|
||||
for k, v := range req.Options {
|
||||
if _, ok := req.Options[k]; ok {
|
||||
if len(req.Options) > 0 {
|
||||
if m.Options == nil {
|
||||
m.Options = make(map[string]any)
|
||||
}
|
||||
for k, v := range req.Options {
|
||||
m.Options[k] = v
|
||||
}
|
||||
}
|
||||
@@ -1404,6 +1407,9 @@ func (s *Server) PsHandler(c *gin.Context) {
|
||||
Details: modelDetails,
|
||||
ExpiresAt: v.expiresAt,
|
||||
}
|
||||
if v.Options != nil {
|
||||
mr.ContextLength = v.Options.NumCtx / v.numParallel
|
||||
}
|
||||
// The scheduler waits to set expiresAt, so if a model is loading it's
|
||||
// possible that it will be set to the unix epoch. For those cases, just
|
||||
// calculate the time w/ the sessionDuration instead.
|
||||
|
||||
@@ -57,9 +57,7 @@ type Scheduler struct {
|
||||
var defaultModelsPerGPU = 3
|
||||
|
||||
// Default automatic value for parallel setting
|
||||
// Model will still need to fit in VRAM. If this setting won't fit
|
||||
// we'll back off down to 1 to try to get it to fit
|
||||
var defaultParallel = 2
|
||||
var defaultParallel = 1
|
||||
|
||||
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
|
||||
|
||||
|
||||
@@ -310,21 +310,23 @@ func (t *Template) Execute(w io.Writer, v Values) error {
|
||||
}
|
||||
|
||||
// collate messages based on role. consecutive messages of the same role are merged
|
||||
// into a single message. collate also collects and returns all system messages.
|
||||
// into a single message (except for tool messages which preserve individual metadata).
|
||||
// collate also collects and returns all system messages.
|
||||
// collate mutates message content adding image tags ([img-%d]) as needed
|
||||
// todo(parthsareen): revisit for contextual image support
|
||||
func collate(msgs []api.Message) (string, []*api.Message) {
|
||||
var system []string
|
||||
var collated []*api.Message
|
||||
for i := range msgs {
|
||||
msg := msgs[i]
|
||||
if msg.Role == "system" {
|
||||
system = append(system, msg.Content)
|
||||
if msgs[i].Role == "system" {
|
||||
system = append(system, msgs[i].Content)
|
||||
}
|
||||
|
||||
if len(collated) > 0 && collated[len(collated)-1].Role == msg.Role {
|
||||
collated[len(collated)-1].Content += "\n\n" + msg.Content
|
||||
// merges consecutive messages of the same role into a single message (except for tool messages)
|
||||
if len(collated) > 0 && collated[len(collated)-1].Role == msgs[i].Role && msgs[i].Role != "tool" {
|
||||
collated[len(collated)-1].Content += "\n\n" + msgs[i].Content
|
||||
} else {
|
||||
collated = append(collated, &msg)
|
||||
collated = append(collated, &msgs[i])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -163,10 +163,12 @@ func TestParse(t *testing.T) {
|
||||
{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
|
||||
{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
|
||||
{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
|
||||
{"{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role", "toolname"}},
|
||||
{`{{- range .Messages }}
|
||||
{{- if eq .Role "system" }}SYSTEM:
|
||||
{{- else if eq .Role "user" }}USER:
|
||||
{{- else if eq .Role "assistant" }}ASSISTANT:
|
||||
{{- else if eq .Role "tool" }}TOOL:
|
||||
{{- end }} {{ .Content }}
|
||||
{{- end }}`, []string{"content", "messages", "role"}},
|
||||
{`{{- if .Messages }}
|
||||
@@ -376,3 +378,99 @@ func TestExecuteWithSuffix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollate(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
msgs []api.Message
|
||||
expected []*api.Message
|
||||
system string
|
||||
}{
|
||||
{
|
||||
name: "consecutive user messages are merged",
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "Hello"},
|
||||
{Role: "user", Content: "How are you?"},
|
||||
},
|
||||
expected: []*api.Message{
|
||||
{Role: "user", Content: "Hello\n\nHow are you?"},
|
||||
},
|
||||
system: "",
|
||||
},
|
||||
{
|
||||
name: "consecutive tool messages are NOT merged",
|
||||
msgs: []api.Message{
|
||||
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
|
||||
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
|
||||
},
|
||||
expected: []*api.Message{
|
||||
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
|
||||
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
|
||||
},
|
||||
system: "",
|
||||
},
|
||||
{
|
||||
name: "tool messages preserve all fields",
|
||||
msgs: []api.Message{
|
||||
{Role: "user", Content: "What's the weather?"},
|
||||
{Role: "tool", Content: "sunny", ToolName: "get_conditions"},
|
||||
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
|
||||
},
|
||||
expected: []*api.Message{
|
||||
{Role: "user", Content: "What's the weather?"},
|
||||
{Role: "tool", Content: "sunny", ToolName: "get_conditions"},
|
||||
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
|
||||
},
|
||||
system: "",
|
||||
},
|
||||
{
|
||||
name: "mixed messages with system",
|
||||
msgs: []api.Message{
|
||||
{Role: "system", Content: "You are helpful"},
|
||||
{Role: "user", Content: "Hello"},
|
||||
{Role: "assistant", Content: "Hi there!"},
|
||||
{Role: "user", Content: "What's the weather?"},
|
||||
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
|
||||
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
|
||||
{Role: "user", Content: "Thanks"},
|
||||
},
|
||||
expected: []*api.Message{
|
||||
{Role: "system", Content: "You are helpful"},
|
||||
{Role: "user", Content: "Hello"},
|
||||
{Role: "assistant", Content: "Hi there!"},
|
||||
{Role: "user", Content: "What's the weather?"},
|
||||
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
|
||||
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
|
||||
{Role: "user", Content: "Thanks"},
|
||||
},
|
||||
system: "You are helpful",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
system, collated := collate(tt.msgs)
|
||||
if diff := cmp.Diff(system, tt.system); diff != "" {
|
||||
t.Errorf("system mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
|
||||
// Compare the messages
|
||||
if len(collated) != len(tt.expected) {
|
||||
t.Errorf("expected %d messages, got %d", len(tt.expected), len(collated))
|
||||
return
|
||||
}
|
||||
|
||||
for i := range collated {
|
||||
if collated[i].Role != tt.expected[i].Role {
|
||||
t.Errorf("message %d role mismatch: got %q, want %q", i, collated[i].Role, tt.expected[i].Role)
|
||||
}
|
||||
if collated[i].Content != tt.expected[i].Content {
|
||||
t.Errorf("message %d content mismatch: got %q, want %q", i, collated[i].Content, tt.expected[i].Content)
|
||||
}
|
||||
if collated[i].ToolName != tt.expected[i].ToolName {
|
||||
t.Errorf("message %d tool name mismatch: got %q, want %q", i, collated[i].ToolName, tt.expected[i].ToolName)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
129
tools/tools.go
129
tools/tools.go
@@ -115,35 +115,21 @@ func (p *Parser) findTag() (int, bool) {
|
||||
// parseToolCall finds the next complete tool call in the buffer
|
||||
// incrementing n and advancing the buffer.
|
||||
func (p *Parser) parseToolCall() *api.ToolCall {
|
||||
var tool *api.Tool
|
||||
var end int = len(p.buffer)
|
||||
var i int
|
||||
|
||||
// find tool name
|
||||
for _, t := range p.tools {
|
||||
n := t.Function.Name
|
||||
if i = bytes.Index(p.buffer, []byte(n)); i != -1 {
|
||||
if i+len(n) < end {
|
||||
tool = &t
|
||||
end = i + len(n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tool, end := findTool(p.tools, p.buffer)
|
||||
if tool == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// only look for arguments if the tool has parameters
|
||||
// only look for arguments after the tool name if the tool has parameters
|
||||
// TODO (jmorganca): while probably uncommon, this doesn't support
|
||||
// parsing arguments before the tool name, which may be needed in the future
|
||||
args := map[string]any{}
|
||||
if len(tool.Function.Parameters.Properties) > 0 {
|
||||
if args, i = p.findArguments(*tool); args == nil {
|
||||
var i int
|
||||
if args, i = findArguments(*tool, p.buffer[end:]); args == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if i > end {
|
||||
end = i
|
||||
}
|
||||
end += i
|
||||
}
|
||||
|
||||
tc := &api.ToolCall{
|
||||
@@ -159,15 +145,80 @@ func (p *Parser) parseToolCall() *api.ToolCall {
|
||||
return tc
|
||||
}
|
||||
|
||||
// findArguments returns the first object that appears to be
|
||||
// arguments for the provided tool, returning nil
|
||||
func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
|
||||
if len(p.buffer) == 0 {
|
||||
// findTool finds the first tool name in the list that matches the
|
||||
// beginning of the buffer, returning nil if no tool is found
|
||||
// or if the buffer ends with a partial tool name since we need
|
||||
// to wait for more data to disambiguate.
|
||||
// The second return value is the end position of the tool name
|
||||
// if one is found, otherwise 0.
|
||||
func findTool(tools []api.Tool, buf []byte) (*api.Tool, int) {
|
||||
if len(buf) == 0 {
|
||||
return nil, 0
|
||||
}
|
||||
|
||||
// no arguments to parse
|
||||
if len(tool.Function.Parameters.Properties) == 0 {
|
||||
// check if buffer ends with a partial tool name
|
||||
// this prevents matching "get" when seeing "get_weather"
|
||||
var longest string
|
||||
for _, t := range tools {
|
||||
if len(t.Function.Name) > len(longest) {
|
||||
longest = t.Function.Name
|
||||
}
|
||||
}
|
||||
|
||||
// Only check up to longest characters from the end
|
||||
for i := 1; i <= min(len(buf), len(longest)); i++ {
|
||||
tail := buf[len(buf)-i:]
|
||||
for _, t := range tools {
|
||||
name := []byte(t.Function.Name)
|
||||
if len(tail) < len(name) && bytes.HasPrefix(name, tail) {
|
||||
return nil, 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find first occurrence of the longest tool name
|
||||
var found *api.Tool
|
||||
start := -1
|
||||
end := -1
|
||||
|
||||
for i := range tools {
|
||||
name := []byte(tools[i].Function.Name)
|
||||
pos := bytes.Index(buf, name)
|
||||
if pos == -1 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if we have a better match already
|
||||
if start != -1 {
|
||||
if pos > start {
|
||||
continue
|
||||
}
|
||||
if pos == start && len(name) <= len(found.Function.Name) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
found = &tools[i]
|
||||
start = pos
|
||||
end = pos + len(name)
|
||||
}
|
||||
|
||||
if found != nil {
|
||||
return found, end
|
||||
}
|
||||
|
||||
return nil, 0
|
||||
}
|
||||
|
||||
// findArguments returns the first object that appears to be
|
||||
// arguments for the provided tool in the provided buffer,
|
||||
// returning nil if no arguments are found and the end position
|
||||
// TODO (jmorganca): this does not support parsing omitted arguments
|
||||
// objects for functions that have all-optional parameters
|
||||
// e.g. `{"name": "get_conditions", "arguments": {}}` will work but
|
||||
// `{"name": "get_conditions"}` will not currently work
|
||||
func findArguments(tool api.Tool, buffer []byte) (map[string]any, int) {
|
||||
if len(buffer) == 0 {
|
||||
return nil, 0
|
||||
}
|
||||
|
||||
@@ -177,7 +228,7 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
|
||||
var object []byte
|
||||
|
||||
// find any outer json object
|
||||
for i, c := range p.buffer {
|
||||
for i, c := range buffer {
|
||||
if c == '{' {
|
||||
braces++
|
||||
if start == -1 {
|
||||
@@ -190,7 +241,7 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
|
||||
braces--
|
||||
if braces == 0 {
|
||||
end = i + 1
|
||||
object = p.buffer[start:end]
|
||||
object = buffer[start:end]
|
||||
break
|
||||
}
|
||||
}
|
||||
@@ -202,8 +253,6 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
|
||||
}
|
||||
|
||||
var data map[string]any
|
||||
|
||||
// not valid json
|
||||
if err := json.Unmarshal(object, &data); err != nil {
|
||||
return nil, 0
|
||||
}
|
||||
@@ -212,15 +261,27 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
|
||||
find = func(obj any) map[string]any {
|
||||
switch obj := obj.(type) {
|
||||
case map[string]any:
|
||||
found := true
|
||||
valid := true
|
||||
// check if all keys in the object exist in the tool's parameters
|
||||
for key := range obj {
|
||||
if _, exists := tool.Function.Parameters.Properties[key]; !exists {
|
||||
found = false
|
||||
valid = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
// check for required parameters
|
||||
// TODO (jmorganca): this should error instead of silently failing
|
||||
if valid {
|
||||
for _, required := range tool.Function.Parameters.Required {
|
||||
if _, exists := obj[required]; !exists {
|
||||
valid = false
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if valid {
|
||||
return obj
|
||||
}
|
||||
|
||||
|
||||
@@ -52,7 +52,8 @@ func TestParser(t *testing.T) {
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
} `json:"properties"`
|
||||
}{
|
||||
Type: "object",
|
||||
Type: "object",
|
||||
Required: []string{"city"},
|
||||
Properties: map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
@@ -111,6 +112,81 @@ func TestParser(t *testing.T) {
|
||||
Description: "Say hello",
|
||||
},
|
||||
},
|
||||
{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "say_hello_world",
|
||||
Description: "Say hello world",
|
||||
},
|
||||
},
|
||||
{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "get_address",
|
||||
Description: "Get the address of a given location",
|
||||
Parameters: struct {
|
||||
Type string `json:"type"`
|
||||
Defs any `json:"$defs,omitempty"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Required []string `json:"required"`
|
||||
Properties map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Description string `json:"description"`
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
} `json:"properties"`
|
||||
}{
|
||||
Type: "object",
|
||||
Properties: map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Description string `json:"description"`
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
}{
|
||||
"location": {
|
||||
Type: api.PropertyType{"string"},
|
||||
Description: "The location to get the address for",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: "add",
|
||||
Description: "Add two numbers",
|
||||
Parameters: struct {
|
||||
Type string `json:"type"`
|
||||
Defs any `json:"$defs,omitempty"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Required []string `json:"required"`
|
||||
Properties map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Description string `json:"description"`
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
} `json:"properties"`
|
||||
}{
|
||||
Type: "object",
|
||||
Properties: map[string]struct {
|
||||
Type api.PropertyType `json:"type"`
|
||||
Items any `json:"items,omitempty"`
|
||||
Description string `json:"description"`
|
||||
Enum []any `json:"enum,omitempty"`
|
||||
}{
|
||||
"a": {
|
||||
Type: api.PropertyType{"string"},
|
||||
Description: "The first number to add",
|
||||
},
|
||||
"b": {
|
||||
Type: api.PropertyType{"string"},
|
||||
Description: "The second number to add",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
@@ -159,8 +235,23 @@ func TestParser(t *testing.T) {
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "missing args",
|
||||
inputs: []string{`<tool_call>{"name": "get_conditions"}</tool_call>`},
|
||||
name: "empty args",
|
||||
inputs: []string{`<tool_call>{"name": "get_conditions", "arguments": {}}</tool_call>`},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "get_conditions",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "missing required args",
|
||||
inputs: []string{`<tool_call>{"name": "get_temperature", "arguments": {}}</tool_call>`},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: nil,
|
||||
@@ -259,9 +350,9 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "qwen two tool calls one with no args",
|
||||
inputs: []string{`Let me check the weather. <tool_call>{"name": "say_hello"}</tool_call><tool_call>{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`},
|
||||
content: "Let me check the weather. ",
|
||||
name: "empty args followed by args",
|
||||
inputs: []string{`Let me say hello and check the weather. <tool_call>{"name": "say_hello", "arguments": {}}</tool_call><tool_call>{"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}</tool_call>`},
|
||||
content: "Let me say hello and check the weather. ",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
@@ -271,6 +362,31 @@ func TestParser(t *testing.T) {
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 1,
|
||||
Name: "get_temperature",
|
||||
Arguments: api.ToolCallFunctionArguments{
|
||||
"city": "London",
|
||||
"format": "fahrenheit",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "qwen empty followed by args",
|
||||
inputs: []string{`Let me check the weather. <tool_call>{"name": "get_conditions", "arguments": {}}</tool_call><tool_call>{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`},
|
||||
content: "Let me check the weather. ",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "get_conditions",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 1,
|
||||
@@ -588,6 +704,173 @@ func TestParser(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with collision",
|
||||
inputs: []string{
|
||||
"<tool_call>",
|
||||
"{",
|
||||
"\"name\": \"say_hello",
|
||||
"_world\",",
|
||||
"}",
|
||||
"}",
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello_world",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with collision multiple",
|
||||
inputs: []string{
|
||||
"<tool_call>",
|
||||
"{",
|
||||
"\"name\": \"say_hello",
|
||||
"_world\",",
|
||||
"}",
|
||||
"</tool_call>",
|
||||
"<tool_call>",
|
||||
"{",
|
||||
"\"name\": \"say_hello",
|
||||
"\",",
|
||||
"}",
|
||||
"</tool_call>",
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello_world",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 1,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with collision non streaming",
|
||||
inputs: []string{
|
||||
`<tool_call>{"name": "say_hello`,
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: nil,
|
||||
},
|
||||
{
|
||||
name: "tool name with collision non streaming multiple",
|
||||
inputs: []string{
|
||||
`<tool_call>{"name": "say_hello"}</tool_call><tool_call>{"name": "say_hello_world"}`,
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 1,
|
||||
Name: "say_hello_world",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with collision non streaming shorter",
|
||||
inputs: []string{
|
||||
`<tool_call>{"name": "say_hello"}</tool_call>`,
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with collision non streaming longer",
|
||||
inputs: []string{
|
||||
`<tool_call>{"name": "say_hello_world"}</tool_call>`,
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "say_hello_world",
|
||||
Arguments: api.ToolCallFunctionArguments{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with substring of another",
|
||||
inputs: []string{
|
||||
"{",
|
||||
"\"name\": \"get_address\",",
|
||||
"\"arguments\": {",
|
||||
"\"location\": \"London\"",
|
||||
"}",
|
||||
"}",
|
||||
},
|
||||
content: "",
|
||||
tmpl: json,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "get_address",
|
||||
Arguments: api.ToolCallFunctionArguments{
|
||||
"location": "London",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool name with substring of another",
|
||||
inputs: []string{
|
||||
`<tool_call>{"name": "get_address", "arguments": {"location": "London"}}</tool_call>`,
|
||||
},
|
||||
content: "",
|
||||
tmpl: qwen,
|
||||
calls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Index: 0,
|
||||
Name: "get_address",
|
||||
Arguments: api.ToolCallFunctionArguments{
|
||||
"location": "London",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -1035,16 +1318,19 @@ func TestFindArguments(t *testing.T) {
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
{
|
||||
name: "deepseek",
|
||||
buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
|
||||
want: map[string]any{
|
||||
"location": "Tokyo",
|
||||
},
|
||||
tool: tool,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
parser := &Parser{
|
||||
buffer: tt.buffer,
|
||||
tools: []api.Tool{tool, tool2},
|
||||
}
|
||||
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, _ := parser.findArguments(tool)
|
||||
got, _ := findArguments(tt.tool, tt.buffer)
|
||||
|
||||
if diff := cmp.Diff(got, tt.want); diff != "" {
|
||||
t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff)
|
||||
|
||||
Reference in New Issue
Block a user