Compare commits
167 Commits
jmorganca/
...
bruce/iq-q
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d40497b9a2 | ||
|
|
828e4bf101 | ||
|
|
05105903d8 | ||
|
|
abf3b1fb34 | ||
|
|
82fcc0601d | ||
|
|
185a927210 | ||
|
|
096ea2c8c3 | ||
|
|
06b31e2e24 | ||
|
|
b7a87a22b6 | ||
|
|
e8aaea030e | ||
|
|
267e25a750 | ||
|
|
9a32c514cb | ||
|
|
e9ae607ece | ||
|
|
93707fa3f2 | ||
|
|
94c369095f | ||
|
|
9164b0161b | ||
|
|
bf4fc25f7b | ||
|
|
5b806d8d24 | ||
|
|
cb1e072643 | ||
|
|
45b6a12e45 | ||
|
|
68755f1f5e | ||
|
|
997a455039 | ||
|
|
88775e1ff9 | ||
|
|
8867e744ff | ||
|
|
4fd064bea6 | ||
|
|
59fbceedcc | ||
|
|
321d57e1a0 | ||
|
|
ba26c7aa00 | ||
|
|
63c763685f | ||
|
|
34a4a94f13 | ||
|
|
f4a73d57a4 | ||
|
|
948114e3e3 | ||
|
|
a3e60d9058 | ||
|
|
8acb233668 | ||
|
|
119589fcb3 | ||
|
|
5ea844964e | ||
|
|
bd8eed57fc | ||
|
|
9cf0f2e973 | ||
|
|
176ad3aa6e | ||
|
|
4d08363580 | ||
|
|
8907bf51d2 | ||
|
|
abe614c705 | ||
|
|
238715037d | ||
|
|
c0a00f68ae | ||
|
|
f0c454ab57 | ||
|
|
b9f74ff3d6 | ||
|
|
fcf4d60eee | ||
|
|
e33d5c2dbc | ||
|
|
18d9a7e1f1 | ||
|
|
8488388cbd | ||
|
|
588901f449 | ||
|
|
0a7fdbe533 | ||
|
|
5950c176ca | ||
|
|
23d23409a0 | ||
|
|
9009bedf13 | ||
|
|
d4ac57e240 | ||
|
|
7b59d1770f | ||
|
|
95ead8ffba | ||
|
|
7aa08a77ca | ||
|
|
7e432cdfac | ||
|
|
586672f490 | ||
|
|
b03408de74 | ||
|
|
1e6a28bf5b | ||
|
|
d6e3b64582 | ||
|
|
114c932a8e | ||
|
|
7f7103de06 | ||
|
|
c631a9c726 | ||
|
|
8fd9e56804 | ||
|
|
8a65717f55 | ||
|
|
6d3152a98a | ||
|
|
b438d485f1 | ||
|
|
204349b17b | ||
|
|
86e67fc4a9 | ||
|
|
2bed62926e | ||
|
|
aad8d128a0 | ||
|
|
ec1acbb867 | ||
|
|
e4859c4563 | ||
|
|
8e30eb26bd | ||
|
|
0b5c589ca2 | ||
|
|
65fadddc85 | ||
|
|
ed5fb088c4 | ||
|
|
f81f308118 | ||
|
|
b1390a7b37 | ||
|
|
11d83386a5 | ||
|
|
bb31def011 | ||
|
|
41e03ede95 | ||
|
|
7fea1ecdf6 | ||
|
|
054894271d | ||
|
|
6fef042f0b | ||
|
|
5c0c2d1d09 | ||
|
|
37f9c8ad99 | ||
|
|
2a80f55e2a | ||
|
|
421c878a2d | ||
|
|
36666c2142 | ||
|
|
85801317d1 | ||
|
|
2ed0d65948 | ||
|
|
d459dc4ad1 | ||
|
|
40bc4622ef | ||
|
|
c0f818a07a | ||
|
|
8671fdeda6 | ||
|
|
2619850fb4 | ||
|
|
8feb97dc0d | ||
|
|
4e1ff6dcbb | ||
|
|
8589d752ac | ||
|
|
de4ded68b0 | ||
|
|
9b5a3c5991 | ||
|
|
00b0699c75 | ||
|
|
993cf8bf55 | ||
|
|
7bb7cb8a60 | ||
|
|
b123be5b71 | ||
|
|
ddf5c09a9b | ||
|
|
5f73c08729 | ||
|
|
f503a848c2 | ||
|
|
36a6daccab | ||
|
|
ceb0e26e5e | ||
|
|
284e02bed0 | ||
|
|
3450a57d4a | ||
|
|
592dae31c8 | ||
|
|
2010cbc5fa | ||
|
|
ac0801eced | ||
|
|
ad66e5b060 | ||
|
|
ade4b55520 | ||
|
|
a6d62e0617 | ||
|
|
6e76348df7 | ||
|
|
0d6687f84c | ||
|
|
74d2a9ef9a | ||
|
|
14476d48cc | ||
|
|
ce8ce82567 | ||
|
|
4dc4f1be34 | ||
|
|
16b52331a4 | ||
|
|
5445aaa94e | ||
|
|
2ac3dd6853 | ||
|
|
d8851cb7a0 | ||
|
|
058f6cd2cc | ||
|
|
790cf34d17 | ||
|
|
928d844896 | ||
|
|
939d6a8606 | ||
|
|
58888a74bc | ||
|
|
cc5a71e0e3 | ||
|
|
e83bcf7f9a | ||
|
|
5690e5ce99 | ||
|
|
f2ea8470e5 | ||
|
|
34b9db5afc | ||
|
|
8711d03df7 | ||
|
|
ee448deaba | ||
|
|
6e8db04716 | ||
|
|
658e60cf73 | ||
|
|
4c78f028f8 | ||
|
|
435cc866a3 | ||
|
|
c7d3a558f6 | ||
|
|
089cdb2877 | ||
|
|
ea1e9aa36b | ||
|
|
d0d28ef90d | ||
|
|
6654186a7c | ||
|
|
aa72281eae | ||
|
|
74bcbf828f | ||
|
|
fe39147e64 | ||
|
|
fad00a85e5 | ||
|
|
9c0db4cc83 | ||
|
|
6f18297b3a | ||
|
|
15016413de | ||
|
|
440b7190ed | ||
|
|
3934c15895 | ||
|
|
d524e5ef5e | ||
|
|
52f5370c48 | ||
|
|
1b42b4b59a | ||
|
|
7c000ec3ed |
20
.github/workflows/release.yaml
vendored
20
.github/workflows/release.yaml
vendored
@@ -103,6 +103,7 @@ jobs:
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
llm/build/**/*.a
|
||||
dist/windows-amd64/**
|
||||
|
||||
# ROCm generation step
|
||||
generate-windows-rocm:
|
||||
@@ -173,7 +174,9 @@ jobs:
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: generate-windows-rocm
|
||||
path: llm/build/**/bin/*
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: windows-rocm-deps
|
||||
@@ -253,7 +256,9 @@ jobs:
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: generate-windows-cuda
|
||||
path: llm/build/**/bin/*
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: windows-cuda-deps
|
||||
@@ -306,23 +311,18 @@ jobs:
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: generate-windows-cpu
|
||||
path: llm/build
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: generate-windows-cuda
|
||||
path: llm/build
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: windows-cuda-deps
|
||||
path: dist/deps
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: windows-rocm-deps
|
||||
path: dist/deps
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: generate-windows-rocm
|
||||
path: llm/build
|
||||
- run: dir llm/build
|
||||
- run: |
|
||||
$gopath=(get-command go).source | split-path -parent
|
||||
@@ -331,13 +331,13 @@ jobs:
|
||||
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
|
||||
$env:PATH="$gopath;$env:PATH"
|
||||
$env:OLLAMA_SKIP_GENERATE="1"
|
||||
$env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
|
||||
$env:HIP_PATH=$(resolve-path ".\dist\deps")
|
||||
& .\scripts\build_windows.ps1
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-windows
|
||||
path: dist/*.exe
|
||||
path: |
|
||||
dist/OllamaSetup.exe
|
||||
dist/ollama-windows-*.zip
|
||||
|
||||
# Linux x86 assets built using the container based build
|
||||
build-linux-amd64:
|
||||
|
||||
34
.github/workflows/test.yaml
vendored
34
.github/workflows/test.yaml
vendored
@@ -1,5 +1,15 @@
|
||||
name: test
|
||||
|
||||
concurrency:
|
||||
# For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
|
||||
# cancels running CI jobs and starts all new ones.
|
||||
#
|
||||
# For non-PR pushes, concurrency.group needs to be unique for every distinct
|
||||
# CI run we want to have happen. Use run_id, which in practice means all
|
||||
# non-PR CI runs will be allowed to run without preempting each other.
|
||||
group: ${{ github.workflow }}-$${{ github.pull_request.number || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
@@ -21,7 +31,9 @@ jobs:
|
||||
- id: changes
|
||||
run: |
|
||||
changed() {
|
||||
git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
|
||||
git diff-tree -r --no-commit-id --name-only \
|
||||
$(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
|
||||
${{ github.event.pull_request.head.sha }} \
|
||||
| xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
|
||||
}
|
||||
|
||||
@@ -103,7 +115,9 @@ jobs:
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: cuda-${{ matrix.cuda-version }}-libraries
|
||||
path: llm/build/**/bin/*
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
generate-rocm:
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
|
||||
@@ -134,7 +148,9 @@ jobs:
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: rocm-${{ matrix.rocm-version }}-libraries
|
||||
path: llm/build/**/bin/*
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
|
||||
# ROCm generation step
|
||||
generate-windows-rocm:
|
||||
@@ -253,14 +269,9 @@ jobs:
|
||||
mkdir -p llm/build/darwin/$ARCH/stub/bin
|
||||
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'macos-') }}
|
||||
- run: |
|
||||
mkdir -p llm/build/windows/$ARCH/stub/bin
|
||||
touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'windows-') }}
|
||||
shell: bash
|
||||
- uses: golangci/golangci-lint-action@v4
|
||||
with:
|
||||
args: --timeout 8m0s
|
||||
args: --timeout 8m0s -v
|
||||
test:
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -284,7 +295,6 @@ jobs:
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache: true
|
||||
- run: go get
|
||||
- run: |
|
||||
case ${{ matrix.arch }} in
|
||||
amd64) echo ARCH=x86_64 ;;
|
||||
@@ -299,10 +309,6 @@ jobs:
|
||||
mkdir -p llm/build/darwin/$ARCH/stub/bin
|
||||
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'macos-') }}
|
||||
- run: |
|
||||
mkdir -p llm/build/windows/$ARCH/stub/bin
|
||||
touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'windows-') }}
|
||||
shell: bash
|
||||
- run: go generate ./...
|
||||
- run: go build
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,4 +11,5 @@ ggml-metal.metal
|
||||
.idea
|
||||
test_data
|
||||
*.crt
|
||||
llm/build
|
||||
llm/build
|
||||
__debug_bin*
|
||||
24
README.md
24
README.md
@@ -1,5 +1,5 @@
|
||||
<div align="center">
|
||||
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
|
||||
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
|
||||
</div>
|
||||
|
||||
# Ollama
|
||||
@@ -51,16 +51,12 @@ Here are some example models that can be downloaded:
|
||||
| ------------------ | ---------- | ----- | ------------------------------ |
|
||||
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
|
||||
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
|
||||
| Phi-3 | 3.8B | 2.3GB | `ollama run phi3` |
|
||||
| Mistral | 7B | 4.1GB | `ollama run mistral` |
|
||||
| Dolphin Phi | 2.7B | 1.6GB | `ollama run dolphin-phi` |
|
||||
| Phi-2 | 2.7B | 1.7GB | `ollama run phi` |
|
||||
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
|
||||
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
|
||||
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
|
||||
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
|
||||
| Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` |
|
||||
| Llama 2 70B | 70B | 39GB | `ollama run llama2:70b` |
|
||||
| Orca Mini | 3B | 1.9GB | `ollama run orca-mini` |
|
||||
| LLaVA | 7B | 4.5GB | `ollama run llava` |
|
||||
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
|
||||
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
|
||||
@@ -177,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
|
||||
The image features a yellow smiley face, which is likely the central focus of the picture.
|
||||
```
|
||||
|
||||
### Pass in prompt as arguments
|
||||
### Pass the prompt as an argument
|
||||
|
||||
```
|
||||
$ ollama run llama3 "Summarize this file: $(cat README.md)"
|
||||
@@ -260,16 +256,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
|
||||
### Web & Desktop
|
||||
|
||||
- [Open WebUI](https://github.com/open-webui/open-webui)
|
||||
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
|
||||
- [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
|
||||
- [LibreChat](https://github.com/danny-avila/LibreChat)
|
||||
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
|
||||
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
|
||||
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
|
||||
- [Saddle](https://github.com/jikkuatwork/saddle)
|
||||
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
|
||||
- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
|
||||
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
|
||||
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
|
||||
- [Open WebUI](https://github.com/open-webui/open-webui)
|
||||
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
|
||||
- [big-AGI](https://github.com/enricoros/big-AGI/blob/main/docs/config-local-ollama.md)
|
||||
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
|
||||
@@ -291,9 +288,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
|
||||
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
|
||||
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
|
||||
- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
|
||||
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
|
||||
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
|
||||
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
|
||||
- [chat: chat web app for teams](https://github.com/swuecho/chat)
|
||||
- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
|
||||
- [Ollama RAG Chatbot: Local Chat with multiple PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
|
||||
|
||||
### Terminal
|
||||
|
||||
@@ -309,11 +310,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
|
||||
- [cmdh](https://github.com/pgibler/cmdh)
|
||||
- [ooo](https://github.com/npahlfer/ooo)
|
||||
- [shell-pilot](https://github.com/reid41/shell-pilot)
|
||||
- [tenere](https://github.com/pythops/tenere)
|
||||
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
|
||||
- [typechat-cli](https://github.com/anaisbetts/typechat-cli)
|
||||
- [ShellOracle](https://github.com/djcopley/ShellOracle)
|
||||
- [tlm](https://github.com/yusufcanb/tlm)
|
||||
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
|
||||
|
||||
### Database
|
||||
|
||||
@@ -378,6 +381,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
|
||||
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
|
||||
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
|
||||
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
|
||||
|
||||
### Supported backends
|
||||
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
|
||||
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"net/url"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
@@ -57,12 +58,36 @@ func checkError(resp *http.Response, body []byte) error {
|
||||
// If the variable is not specified, a default ollama host and port will be
|
||||
// used.
|
||||
func ClientFromEnvironment() (*Client, error) {
|
||||
ollamaHost, err := GetOllamaHost()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &Client{
|
||||
base: &url.URL{
|
||||
Scheme: ollamaHost.Scheme,
|
||||
Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
|
||||
},
|
||||
http: http.DefaultClient,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type OllamaHost struct {
|
||||
Scheme string
|
||||
Host string
|
||||
Port string
|
||||
}
|
||||
|
||||
func GetOllamaHost() (OllamaHost, error) {
|
||||
defaultPort := "11434"
|
||||
|
||||
scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
|
||||
hostVar := os.Getenv("OLLAMA_HOST")
|
||||
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
|
||||
|
||||
scheme, hostport, ok := strings.Cut(hostVar, "://")
|
||||
switch {
|
||||
case !ok:
|
||||
scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
|
||||
scheme, hostport = "http", hostVar
|
||||
case scheme == "http":
|
||||
defaultPort = "80"
|
||||
case scheme == "https":
|
||||
@@ -82,15 +107,24 @@ func ClientFromEnvironment() (*Client, error) {
|
||||
}
|
||||
}
|
||||
|
||||
return &Client{
|
||||
base: &url.URL{
|
||||
Scheme: scheme,
|
||||
Host: net.JoinHostPort(host, port),
|
||||
},
|
||||
http: http.DefaultClient,
|
||||
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
|
||||
return OllamaHost{}, ErrInvalidHostPort
|
||||
}
|
||||
|
||||
return OllamaHost{
|
||||
Scheme: scheme,
|
||||
Host: host,
|
||||
Port: port,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func NewClient(base *url.URL, http *http.Client) *Client {
|
||||
return &Client{
|
||||
base: base,
|
||||
http: http,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
|
||||
var reqBody io.Reader
|
||||
var data []byte
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
package api
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestClientFromEnvironment(t *testing.T) {
|
||||
type testCase struct {
|
||||
@@ -40,4 +46,40 @@ func TestClientFromEnvironment(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
hostTestCases := map[string]*testCase{
|
||||
"empty": {value: "", expect: "127.0.0.1:11434"},
|
||||
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
|
||||
"only port": {value: ":1234", expect: ":1234"},
|
||||
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
|
||||
"hostname": {value: "example.com", expect: "example.com:11434"},
|
||||
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
|
||||
"zero port": {value: ":0", expect: ":0"},
|
||||
"too large port": {value: ":66000", err: ErrInvalidHostPort},
|
||||
"too small port": {value: ":-1", err: ErrInvalidHostPort},
|
||||
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
|
||||
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
|
||||
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
|
||||
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
|
||||
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
|
||||
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
|
||||
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
|
||||
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
|
||||
}
|
||||
|
||||
for k, v := range hostTestCases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_HOST", v.value)
|
||||
|
||||
oh, err := GetOllamaHost()
|
||||
if err != v.err {
|
||||
t.Fatalf("expected %s, got %s", v.err, err)
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
host := net.JoinHostPort(oh.Host, oh.Port)
|
||||
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
15
api/types.go
15
api/types.go
@@ -98,8 +98,7 @@ type ChatResponse struct {
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
Message Message `json:"message"`
|
||||
|
||||
Done bool `json:"done"`
|
||||
DoneReason string `json:"done_reason,omitempty"`
|
||||
Done bool `json:"done"`
|
||||
|
||||
Metrics
|
||||
}
|
||||
@@ -266,9 +265,8 @@ type GenerateResponse struct {
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
Response string `json:"response"`
|
||||
|
||||
Done bool `json:"done"`
|
||||
DoneReason string `json:"done_reason,omitempty"`
|
||||
Context []int `json:"context,omitempty"`
|
||||
Done bool `json:"done"`
|
||||
Context []int `json:"context,omitempty"`
|
||||
|
||||
Metrics
|
||||
}
|
||||
@@ -311,6 +309,7 @@ func (m *Metrics) Summary() {
|
||||
}
|
||||
|
||||
var ErrInvalidOpts = errors.New("invalid options")
|
||||
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
|
||||
|
||||
func (opts *Options) FromMap(m map[string]interface{}) error {
|
||||
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
|
||||
@@ -398,8 +397,10 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
|
||||
func DefaultOptions() Options {
|
||||
return Options{
|
||||
// options set on request to runner
|
||||
NumPredict: -1,
|
||||
NumKeep: 0,
|
||||
NumPredict: -1,
|
||||
|
||||
// set a minimal num_keep to avoid issues on context shifts
|
||||
NumKeep: 4,
|
||||
Temperature: 0.8,
|
||||
TopK: 40,
|
||||
TopP: 0.9,
|
||||
|
||||
@@ -43,37 +43,36 @@ func getCLIFullPath(command string) string {
|
||||
return command
|
||||
}
|
||||
|
||||
func SpawnServer(ctx context.Context, command string) (chan int, error) {
|
||||
done := make(chan int)
|
||||
|
||||
logDir := filepath.Dir(ServerLogFile)
|
||||
_, err := os.Stat(logDir)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
if err := os.MkdirAll(logDir, 0o755); err != nil {
|
||||
return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
|
||||
}
|
||||
}
|
||||
|
||||
func start(ctx context.Context, command string) (*exec.Cmd, error) {
|
||||
cmd := getCmd(ctx, getCLIFullPath(command))
|
||||
// send stdout and stderr to a file
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
|
||||
return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
|
||||
}
|
||||
stderr, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
|
||||
}
|
||||
stdin, err := cmd.StdinPipe()
|
||||
if err != nil {
|
||||
return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
|
||||
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
|
||||
}
|
||||
|
||||
// TODO - rotation
|
||||
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
|
||||
if err != nil {
|
||||
return done, fmt.Errorf("failed to create server log %w", err)
|
||||
return nil, fmt.Errorf("failed to create server log: %w", err)
|
||||
}
|
||||
|
||||
logDir := filepath.Dir(ServerLogFile)
|
||||
_, err = os.Stat(logDir)
|
||||
if err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
|
||||
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(logDir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
|
||||
}
|
||||
}
|
||||
|
||||
go func() {
|
||||
defer logFile.Close()
|
||||
io.Copy(logFile, stdout) //nolint:errcheck
|
||||
@@ -117,19 +116,33 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
|
||||
|
||||
// run the command and wait for it to finish
|
||||
if err := cmd.Start(); err != nil {
|
||||
return done, fmt.Errorf("failed to start server %w", err)
|
||||
return nil, fmt.Errorf("failed to start server %w", err)
|
||||
}
|
||||
if cmd.Process != nil {
|
||||
slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
|
||||
}
|
||||
slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))
|
||||
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
func SpawnServer(ctx context.Context, command string) (chan int, error) {
|
||||
done := make(chan int)
|
||||
|
||||
go func() {
|
||||
// Keep the server running unless we're shuttind down the app
|
||||
crashCount := 0
|
||||
for {
|
||||
slog.Info("starting server...")
|
||||
cmd, err := start(ctx, command)
|
||||
if err != nil {
|
||||
crashCount++
|
||||
slog.Error(fmt.Sprintf("failed to start server %s", err))
|
||||
time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
|
||||
continue
|
||||
}
|
||||
|
||||
cmd.Wait() //nolint:errcheck
|
||||
stdin.Close()
|
||||
var code int
|
||||
if cmd.ProcessState != nil {
|
||||
code = cmd.ProcessState.ExitCode()
|
||||
@@ -143,15 +156,12 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
|
||||
default:
|
||||
crashCount++
|
||||
slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Error(fmt.Sprintf("failed to restart server %s", err))
|
||||
// Keep trying, but back off if we keep failing
|
||||
time.Sleep(time.Duration(crashCount) * time.Second)
|
||||
}
|
||||
time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
|
||||
break
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return done, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -88,15 +88,12 @@ DialogFontSize=12
|
||||
[Files]
|
||||
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
|
||||
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
|
||||
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
|
||||
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
|
||||
; Assumes v5.7, may need adjustments for v6
|
||||
#if GetEnv("HIP_PATH") != ""
|
||||
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
|
||||
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
|
||||
; amdhip64.dll dependency comes from the driver and must be installed already
|
||||
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
|
||||
#if DirExists("..\dist\windows-amd64\rocm")
|
||||
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
|
||||
#endif
|
||||
|
||||
|
||||
@@ -132,7 +129,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
|
||||
|
||||
|
||||
;FinishedHeadingLabel=Run your first model
|
||||
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama2
|
||||
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3
|
||||
;ClickFinish=%n
|
||||
|
||||
[Registry]
|
||||
|
||||
36
auth/auth.go
36
auth/auth.go
@@ -10,12 +10,44 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
const defaultPrivateKey = "id_ed25519"
|
||||
|
||||
func keyPath() (string, error) {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return filepath.Join(home, ".ollama", defaultPrivateKey), nil
|
||||
}
|
||||
|
||||
func GetPublicKey() (string, error) {
|
||||
keyPath, err := keyPath()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
privateKeyFile, err := os.ReadFile(keyPath)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
|
||||
return "", err
|
||||
}
|
||||
|
||||
privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
|
||||
|
||||
return strings.TrimSpace(string(publicKey)), nil
|
||||
}
|
||||
|
||||
func NewNonce(r io.Reader, length int) (string, error) {
|
||||
nonce := make([]byte, length)
|
||||
if _, err := io.ReadFull(r, nonce); err != nil {
|
||||
@@ -26,13 +58,11 @@ func NewNonce(r io.Reader, length int) (string, error) {
|
||||
}
|
||||
|
||||
func Sign(ctx context.Context, bts []byte) (string, error) {
|
||||
home, err := os.UserHomeDir()
|
||||
keyPath, err := keyPath()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
|
||||
|
||||
privateKeyFile, err := os.ReadFile(keyPath)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
|
||||
|
||||
289
cmd/cmd.go
289
cmd/cmd.go
@@ -17,6 +17,7 @@ import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"syscall"
|
||||
@@ -31,10 +32,12 @@ import (
|
||||
"golang.org/x/term"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/auth"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/progress"
|
||||
"github.com/ollama/ollama/server"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/version"
|
||||
)
|
||||
|
||||
@@ -53,14 +56,13 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
p := progress.NewProgress(os.Stderr)
|
||||
defer p.Stop()
|
||||
|
||||
bars := make(map[string]*progress.Bar)
|
||||
|
||||
modelfile, err := os.ReadFile(filename)
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
commands, err := parser.Parse(bytes.NewReader(modelfile))
|
||||
modelfile, err := model.ParseFile(f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -74,10 +76,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
spinner := progress.NewSpinner(status)
|
||||
p.Add(status, spinner)
|
||||
|
||||
for _, c := range commands {
|
||||
switch c.Name {
|
||||
for i := range modelfile.Commands {
|
||||
switch modelfile.Commands[i].Name {
|
||||
case "model", "adapter":
|
||||
path := c.Args
|
||||
path := modelfile.Commands[i].Args
|
||||
if path == "~" {
|
||||
path = home
|
||||
} else if strings.HasPrefix(path, "~/") {
|
||||
@@ -89,101 +91,22 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
|
||||
fi, err := os.Stat(path)
|
||||
if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
|
||||
if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
|
||||
continue
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO make this work w/ adapters
|
||||
if fi.IsDir() {
|
||||
tf, err := os.CreateTemp("", "ollama-tf")
|
||||
// this is likely a safetensors or pytorch directory
|
||||
// TODO make this work w/ adapters
|
||||
tempfile, err := tempZipFiles(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.RemoveAll(tf.Name())
|
||||
defer os.RemoveAll(tempfile)
|
||||
|
||||
zf := zip.NewWriter(tf)
|
||||
|
||||
files := []string{}
|
||||
|
||||
tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
|
||||
if err != nil {
|
||||
return err
|
||||
} else if len(tfiles) == 0 {
|
||||
tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
files = append(files, tfiles...)
|
||||
|
||||
if len(files) == 0 {
|
||||
return fmt.Errorf("no models were found in '%s'", path)
|
||||
}
|
||||
|
||||
// add the safetensor/torch config file + tokenizer
|
||||
files = append(files, filepath.Join(path, "config.json"))
|
||||
files = append(files, filepath.Join(path, "params.json"))
|
||||
files = append(files, filepath.Join(path, "added_tokens.json"))
|
||||
files = append(files, filepath.Join(path, "tokenizer.model"))
|
||||
|
||||
for _, fn := range files {
|
||||
f, err := os.Open(fn)
|
||||
|
||||
// just skip whatever files aren't there
|
||||
if os.IsNotExist(err) {
|
||||
if strings.HasSuffix(fn, "tokenizer.model") {
|
||||
// try the parent dir before giving up
|
||||
parentDir := filepath.Dir(path)
|
||||
newFn := filepath.Join(parentDir, "tokenizer.model")
|
||||
f, err = os.Open(newFn)
|
||||
if os.IsNotExist(err) {
|
||||
continue
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h, err := zip.FileInfoHeader(fi)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h.Name = filepath.Base(fn)
|
||||
h.Method = zip.Store
|
||||
|
||||
w, err := zf.CreateHeader(h)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = io.Copy(w, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if err := zf.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := tf.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
path = tf.Name()
|
||||
path = tempfile
|
||||
}
|
||||
|
||||
digest, err := createBlob(cmd, client, path)
|
||||
@@ -191,10 +114,11 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
modelfile = bytes.ReplaceAll(modelfile, []byte(c.Args), []byte("@"+digest))
|
||||
modelfile.Commands[i].Args = "@" + digest
|
||||
}
|
||||
}
|
||||
|
||||
bars := make(map[string]*progress.Bar)
|
||||
fn := func(resp api.ProgressResponse) error {
|
||||
if resp.Digest != "" {
|
||||
spinner.Stop()
|
||||
@@ -220,7 +144,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
|
||||
quantization, _ := cmd.Flags().GetString("quantization")
|
||||
|
||||
request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
|
||||
request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
|
||||
if err := client.Create(cmd.Context(), &request, fn); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -228,6 +152,114 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func tempZipFiles(path string) (string, error) {
|
||||
tempfile, err := os.CreateTemp("", "ollama-tf")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer tempfile.Close()
|
||||
|
||||
zipfile := zip.NewWriter(tempfile)
|
||||
defer zipfile.Close()
|
||||
|
||||
detectContentType := func(path string) (string, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var b bytes.Buffer
|
||||
b.Grow(512)
|
||||
|
||||
if _, err := io.CopyN(&b, f, 512); err != nil && !errors.Is(err, io.EOF) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
contentType, _, _ := strings.Cut(http.DetectContentType(b.Bytes()), ";")
|
||||
return contentType, nil
|
||||
}
|
||||
|
||||
glob := func(pattern, contentType string) ([]string, error) {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, safetensor := range matches {
|
||||
if ct, err := detectContentType(safetensor); err != nil {
|
||||
return nil, err
|
||||
} else if ct != contentType {
|
||||
return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor)
|
||||
}
|
||||
}
|
||||
|
||||
return matches, nil
|
||||
}
|
||||
|
||||
var files []string
|
||||
if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 {
|
||||
// safetensors files might be unresolved git lfs references; skip if they are
|
||||
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
|
||||
files = append(files, st...)
|
||||
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
|
||||
// pytorch files might also be unresolved git lfs references; skip if they are
|
||||
// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
|
||||
files = append(files, pt...)
|
||||
} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
|
||||
// pytorch files might also be unresolved git lfs references; skip if they are
|
||||
// covers consolidated.x.pth, consolidated.pth
|
||||
files = append(files, pt...)
|
||||
} else {
|
||||
return "", errors.New("no safetensors or torch files found")
|
||||
}
|
||||
|
||||
// add configuration files, json files are detected as text/plain
|
||||
js, err := glob(filepath.Join(path, "*.json"), "text/plain")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
files = append(files, js...)
|
||||
|
||||
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
|
||||
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
|
||||
// tokenizer.model might be a unresolved git lfs reference; error if it is
|
||||
files = append(files, tks...)
|
||||
} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
|
||||
// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
|
||||
files = append(files, tks...)
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
zfi, err := zip.FileInfoHeader(fi)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
zf, err := zipfile.CreateHeader(zfi)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if _, err := io.Copy(zf, f); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
return tempfile.Name(), nil
|
||||
}
|
||||
|
||||
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
|
||||
bin, err := os.Open(path)
|
||||
if err != nil {
|
||||
@@ -322,6 +354,47 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
return generateInteractive(cmd, opts)
|
||||
}
|
||||
|
||||
func errFromUnknownKey(unknownKeyErr error) error {
|
||||
// find SSH public key in the error message
|
||||
sshKeyPattern := `ssh-\w+ [^\s"]+`
|
||||
re := regexp.MustCompile(sshKeyPattern)
|
||||
matches := re.FindStringSubmatch(unknownKeyErr.Error())
|
||||
|
||||
if len(matches) > 0 {
|
||||
serverPubKey := matches[0]
|
||||
|
||||
localPubKey, err := auth.GetPublicKey()
|
||||
if err != nil {
|
||||
return unknownKeyErr
|
||||
}
|
||||
|
||||
if runtime.GOOS == "linux" && serverPubKey != localPubKey {
|
||||
// try the ollama service public key
|
||||
svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
|
||||
if err != nil {
|
||||
return unknownKeyErr
|
||||
}
|
||||
localPubKey = strings.TrimSpace(string(svcPubKey))
|
||||
}
|
||||
|
||||
// check if the returned public key matches the local public key, this prevents adding a remote key to the user's account
|
||||
if serverPubKey != localPubKey {
|
||||
return unknownKeyErr
|
||||
}
|
||||
|
||||
var msg strings.Builder
|
||||
msg.WriteString(unknownKeyErr.Error())
|
||||
msg.WriteString("\n\nYour ollama key is:\n")
|
||||
msg.WriteString(localPubKey)
|
||||
msg.WriteString("\nAdd your key at:\n")
|
||||
msg.WriteString("https://ollama.com/settings/keys")
|
||||
|
||||
return errors.New(msg.String())
|
||||
}
|
||||
|
||||
return unknownKeyErr
|
||||
}
|
||||
|
||||
func PushHandler(cmd *cobra.Command, args []string) error {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
@@ -369,6 +442,20 @@ func PushHandler(cmd *cobra.Command, args []string) error {
|
||||
|
||||
request := api.PushRequest{Name: args[0], Insecure: insecure}
|
||||
if err := client.Push(cmd.Context(), &request, fn); err != nil {
|
||||
if spinner != nil {
|
||||
spinner.Stop()
|
||||
}
|
||||
if strings.Contains(err.Error(), "access denied") {
|
||||
return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
|
||||
}
|
||||
host := model.ParseName(args[0]).Host
|
||||
isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
|
||||
if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
|
||||
// the user has not added their ollama key to ollama.com
|
||||
// re-throw an error with a more user-friendly message
|
||||
return errFromUnknownKey(err)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -796,19 +883,17 @@ func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
}
|
||||
|
||||
func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
|
||||
// retrieve the OLLAMA_HOST environment variable
|
||||
ollamaHost, err := api.GetOllamaHost()
|
||||
if err != nil {
|
||||
host, port = "127.0.0.1", "11434"
|
||||
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
|
||||
host = ip.String()
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
if err := initializeKeypair(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
|
||||
ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -1034,7 +1119,7 @@ Environment Variables:
|
||||
RunE: ListHandler,
|
||||
}
|
||||
copyCmd := &cobra.Command{
|
||||
Use: "cp SOURCE TARGET",
|
||||
Use: "cp SOURCE DESTINATION",
|
||||
Short: "Copy a model",
|
||||
Args: cobra.ExactArgs(2),
|
||||
PreRunE: checkServerHeartbeat,
|
||||
|
||||
@@ -94,6 +94,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
fmt.Fprintln(os.Stderr, " /show Show model information")
|
||||
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
|
||||
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
|
||||
fmt.Fprintln(os.Stderr, " /clear Clear session context")
|
||||
fmt.Fprintln(os.Stderr, " /bye Exit")
|
||||
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
|
||||
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
|
||||
@@ -280,6 +281,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
}
|
||||
fmt.Printf("Created new model '%s'\n", args[1])
|
||||
continue
|
||||
case strings.HasPrefix(line, "/clear"):
|
||||
opts.Messages = []api.Message{}
|
||||
fmt.Println("Cleared session context")
|
||||
continue
|
||||
case strings.HasPrefix(line, "/set"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -18,19 +19,23 @@ import (
|
||||
)
|
||||
|
||||
type Params struct {
|
||||
Architectures []string `json:"architectures"`
|
||||
VocabSize int `json:"vocab_size"`
|
||||
HiddenSize int `json:"hidden_size"` // n_embd
|
||||
HiddenLayers int `json:"num_hidden_layers"` // n_layer
|
||||
ContextSize int `json:"max_position_embeddings"`
|
||||
IntermediateSize int `json:"intermediate_size"`
|
||||
AttentionHeads int `json:"num_attention_heads"` // n_head
|
||||
KeyValHeads int `json:"num_key_value_heads"`
|
||||
NormEPS float64 `json:"rms_norm_eps"`
|
||||
BoSTokenID int `json:"bos_token_id"`
|
||||
EoSTokenID int `json:"eos_token_id"`
|
||||
HeadDimension int `json:"head_dim"`
|
||||
PaddingTokenID int `json:"pad_token_id"`
|
||||
Architectures []string `json:"architectures"`
|
||||
VocabSize int `json:"vocab_size"`
|
||||
HiddenSize int `json:"hidden_size"` // n_embd
|
||||
HiddenLayers int `json:"num_hidden_layers"` // n_layer
|
||||
ContextSize int `json:"max_position_embeddings"`
|
||||
IntermediateSize int `json:"intermediate_size"`
|
||||
AttentionHeads int `json:"num_attention_heads"` // n_head
|
||||
KeyValHeads int `json:"num_key_value_heads"`
|
||||
NormEPS float64 `json:"rms_norm_eps"`
|
||||
BoSTokenID int `json:"bos_token_id"`
|
||||
EoSTokenID int `json:"eos_token_id"`
|
||||
HeadDimension int `json:"head_dim"`
|
||||
PaddingTokenID int `json:"pad_token_id"`
|
||||
RopeFrequencyBase float64 `json:"rope_theta"`
|
||||
|
||||
Experts int `json:"num_local_experts"`
|
||||
ExpertsUsed int `json:"num_experts_per_tok"`
|
||||
|
||||
ByteOrder
|
||||
}
|
||||
@@ -43,7 +48,7 @@ type ByteOrder interface {
|
||||
type ModelArch interface {
|
||||
GetTensors() error
|
||||
LoadVocab() error
|
||||
WriteGGUF() (string, error)
|
||||
WriteGGUF(io.WriteSeeker) error
|
||||
}
|
||||
|
||||
type ModelFormat interface {
|
||||
|
||||
@@ -94,7 +94,7 @@ func (m *GemmaModel) LoadVocab() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *GemmaModel) WriteGGUF() (string, error) {
|
||||
func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
|
||||
kv := llm.KV{
|
||||
"general.architecture": "gemma",
|
||||
"general.name": m.Name,
|
||||
@@ -122,16 +122,5 @@ func (m *GemmaModel) WriteGGUF() (string, error) {
|
||||
"tokenizer.ggml.add_eos_token": false,
|
||||
}
|
||||
|
||||
f, err := os.CreateTemp("", "ollama-gguf")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
mod := llm.NewGGUFV3(m.Params.ByteOrder)
|
||||
if err := mod.Encode(f, kv, m.Tensors); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return f.Name(), nil
|
||||
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
|
||||
}
|
||||
|
||||
@@ -132,7 +132,7 @@ func (m *LlamaModel) LoadVocab() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *LlamaModel) WriteGGUF() (string, error) {
|
||||
func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
|
||||
kv := llm.KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": m.Name,
|
||||
@@ -161,16 +161,9 @@ func (m *LlamaModel) WriteGGUF() (string, error) {
|
||||
|
||||
f, err := os.CreateTemp("", "ollama-gguf")
|
||||
if err != nil {
|
||||
return "", err
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
mod := llm.NewGGUFV3(m.Params.ByteOrder)
|
||||
if err := mod.Encode(f, kv, m.Tensors); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
|
||||
|
||||
return f.Name(), nil
|
||||
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(f, kv, m.Tensors)
|
||||
}
|
||||
|
||||
@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MistralModel) WriteGGUF() (string, error) {
|
||||
func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
|
||||
kv := llm.KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": m.Name,
|
||||
@@ -158,16 +158,5 @@ func (m *MistralModel) WriteGGUF() (string, error) {
|
||||
"tokenizer.ggml.unknown_token_id": uint32(0),
|
||||
}
|
||||
|
||||
f, err := os.CreateTemp("", "ollama-gguf")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
mod := llm.NewGGUFV3(m.Params.ByteOrder)
|
||||
if err := mod.Encode(f, kv, m.Tensors); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return f.Name(), nil
|
||||
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
|
||||
}
|
||||
|
||||
85
convert/mixtral.go
Normal file
85
convert/mixtral.go
Normal file
@@ -0,0 +1,85 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"io"
|
||||
"regexp"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type MixtralModel struct {
|
||||
ModelData
|
||||
}
|
||||
|
||||
func (m *MixtralModel) GetTensors() error {
|
||||
t, err := m.Format.GetTensors(m.Path, m.Params)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
m.Tensors = []llm.Tensor{}
|
||||
|
||||
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
|
||||
re, err := regexp.Compile(pattern)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, l := range t {
|
||||
matches := re.FindAllStringSubmatch(l.Name, -1)
|
||||
if len(matches) > 0 {
|
||||
wt := l.WriterTo.(safetensorWriterTo)
|
||||
wt.handler = mistralLayerHandler
|
||||
l.WriterTo = wt
|
||||
}
|
||||
m.Tensors = append(m.Tensors, l)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MixtralModel) LoadVocab() error {
|
||||
v, err := LoadSentencePieceTokens(m.Path, m.Params)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m.Vocab = v
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
|
||||
kv := llm.KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": m.Name,
|
||||
"llama.block_count": uint32(m.Params.HiddenLayers),
|
||||
"llama.context_length": uint32(m.Params.ContextSize),
|
||||
"llama.embedding_length": uint32(m.Params.HiddenSize),
|
||||
"llama.feed_forward_length": uint32(m.Params.IntermediateSize),
|
||||
"llama.attention.head_count": uint32(m.Params.AttentionHeads),
|
||||
"llama.attention.head_count_kv": uint32(m.Params.KeyValHeads),
|
||||
|
||||
"llama.rope.freq_base": float32(m.Params.RopeFrequencyBase),
|
||||
"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
|
||||
|
||||
"llama.expert_count": uint32(m.Params.Experts),
|
||||
"llama.expert_used_count": uint32(m.Params.ExpertsUsed),
|
||||
|
||||
"llama.vocab_size": uint32(len(m.Vocab.Tokens)),
|
||||
"llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
|
||||
|
||||
"general.file_type": uint32(1),
|
||||
"tokenizer.ggml.model": "llama",
|
||||
|
||||
"tokenizer.ggml.tokens": m.Vocab.Tokens,
|
||||
"tokenizer.ggml.scores": m.Vocab.Scores,
|
||||
"tokenizer.ggml.token_type": m.Vocab.Types,
|
||||
|
||||
"tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID),
|
||||
"tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID),
|
||||
"tokenizer.ggml.unknown_token_id": uint32(0),
|
||||
"tokenizer.ggml.add_bos_token": true,
|
||||
"tokenizer.ggml.add_eos_token": false,
|
||||
}
|
||||
|
||||
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
|
||||
}
|
||||
@@ -93,7 +93,6 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
|
||||
}
|
||||
|
||||
slices.Sort(keys)
|
||||
|
||||
slog.Info("converting layers")
|
||||
|
||||
var tensors []llm.Tensor
|
||||
@@ -105,7 +104,6 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
slog.Debug(fmt.Sprintf("metadata = %#v", data))
|
||||
var size uint64
|
||||
var kind uint32
|
||||
switch len(data.Shape) {
|
||||
@@ -150,11 +148,13 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
|
||||
padding: 8 + jsonSize,
|
||||
}
|
||||
|
||||
tensors = append(tensors, t)
|
||||
offset += size
|
||||
tensors = append(tensors, t)
|
||||
}
|
||||
|
||||
slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
|
||||
slog.Debug(fmt.Sprintf("offset = %d", offset))
|
||||
|
||||
return tensors, offset, nil
|
||||
}
|
||||
|
||||
@@ -185,15 +185,19 @@ func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
|
||||
}
|
||||
|
||||
tMap := map[string]string{
|
||||
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
|
||||
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
|
||||
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
|
||||
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
|
||||
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
|
||||
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
|
||||
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
|
||||
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
|
||||
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
|
||||
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
|
||||
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
|
||||
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
|
||||
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
|
||||
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
|
||||
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
|
||||
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
|
||||
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
|
||||
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
|
||||
"model.layers.(\\d+).block_sparse_moe.gate.weight": "blk.$1.ffn_gate_inp.weight",
|
||||
"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w1.weight": "blk.$1.ffn_gate.$2.weight",
|
||||
"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w2.weight": "blk.$1.ffn_down.$2.weight",
|
||||
"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w3.weight": "blk.$1.ffn_up.$2.weight",
|
||||
}
|
||||
|
||||
v, ok := directMap[n]
|
||||
@@ -286,6 +290,15 @@ func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (M
|
||||
Format: m,
|
||||
},
|
||||
}, nil
|
||||
case "MixtralForCausalLM":
|
||||
return &MixtralModel{
|
||||
ModelData{
|
||||
Name: name,
|
||||
Path: dirPath,
|
||||
Params: params,
|
||||
Format: m,
|
||||
},
|
||||
}, nil
|
||||
case "GemmaForCausalLM":
|
||||
return &GemmaModel{
|
||||
ModelData{
|
||||
|
||||
58
docs/api.md
58
docs/api.md
@@ -17,7 +17,7 @@
|
||||
|
||||
### Model names
|
||||
|
||||
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
||||
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
||||
|
||||
### Durations
|
||||
|
||||
@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"prompt": "Why is the sky blue?"
|
||||
}'
|
||||
```
|
||||
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"response": "The",
|
||||
"done": false
|
||||
@@ -99,7 +99,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "",
|
||||
"done": true,
|
||||
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"stream": false
|
||||
}'
|
||||
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "The sky is blue because it is the color of the sky.",
|
||||
"done": true,
|
||||
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"prompt": "What color is the sky at different times of the day? Respond using JSON",
|
||||
"format": "json",
|
||||
"stream": false
|
||||
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-11-09T21:07:55.186497Z",
|
||||
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
|
||||
"done": true,
|
||||
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"stream": false,
|
||||
"options": {
|
||||
@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "The sky is blue because it is the color of the sky.",
|
||||
"done": true,
|
||||
@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2"
|
||||
"model": "llama3"
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -364,7 +364,7 @@ A single JSON object is returned:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-12-18T19:52:07.071755Z",
|
||||
"response": "",
|
||||
"done": true
|
||||
@@ -407,7 +407,7 @@ Send a chat message with a streaming response.
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -423,7 +423,7 @@ A stream of JSON objects is returned:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
@@ -438,7 +438,7 @@ Final response:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"done": true,
|
||||
"total_duration": 4883583458,
|
||||
@@ -456,7 +456,7 @@ Final response:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "registry.ollama.ai/library/llama2:latest",
|
||||
"model": "registry.ollama.ai/library/llama3:latest",
|
||||
"created_at": "2023-12-12T14:13:43.416799Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -519,7 +519,7 @@ A stream of JSON objects is returned:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
@@ -533,7 +533,7 @@ Final response:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"done": true,
|
||||
"total_duration": 8113331500,
|
||||
@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "registry.ollama.ai/library/llama2:latest",
|
||||
"model": "registry.ollama.ai/library/llama3:latest",
|
||||
"created_at": "2023-12-12T14:13:43.416799Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`.
|
||||
```shell
|
||||
curl http://localhost:11434/api/create -d '{
|
||||
"name": "mario",
|
||||
"modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
|
||||
"modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -758,7 +758,7 @@ A single JSON object will be returned.
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "llama2:latest",
|
||||
"name": "llama3:latest",
|
||||
"modified_at": "2023-12-07T09:32:18.757212583-08:00",
|
||||
"size": 3825819519,
|
||||
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
|
||||
@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/show -d '{
|
||||
"name": "llama2"
|
||||
"name": "llama3"
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/copy -d '{
|
||||
"source": "llama2",
|
||||
"destination": "llama2-backup"
|
||||
"source": "llama3",
|
||||
"destination": "llama3-backup"
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -854,7 +854,7 @@ Delete a model and its data.
|
||||
|
||||
```shell
|
||||
curl -X DELETE http://localhost:11434/api/delete -d '{
|
||||
"name": "llama2:13b"
|
||||
"name": "llama3:13b"
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/pull -d '{
|
||||
"name": "llama2"
|
||||
"name": "llama3"
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
|
||||
or installation approach uses unusual paths, you can specify the location by
|
||||
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
|
||||
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
|
||||
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
|
||||
a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
|
||||
|
||||
Then generate dependencies:
|
||||
|
||||
@@ -142,4 +142,4 @@ In addition to the common Windows development tools described above, install AMD
|
||||
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
|
||||
- [Strawberry Perl](https://strawberryperl.com/)
|
||||
|
||||
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
|
||||
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
|
||||
|
||||
14
docs/faq.md
14
docs/faq.md
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
|
||||
|
||||
```
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"options": {
|
||||
"num_ctx": 4096
|
||||
@@ -88,9 +88,9 @@ On windows, Ollama inherits your user and system environment variables.
|
||||
|
||||
3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
|
||||
|
||||
4. Click OK/Apply to save
|
||||
4. Click OK/Apply to save
|
||||
|
||||
5. Run `ollama` from a new terminal window
|
||||
5. Run `ollama` from a new terminal window
|
||||
|
||||
|
||||
## How can I expose Ollama on my network?
|
||||
@@ -221,10 +221,14 @@ The `keep_alive` parameter can be set to:
|
||||
|
||||
For example, to preload a model and leave it in memory use:
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
|
||||
```
|
||||
|
||||
To unload the model and free up memory use:
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
|
||||
```
|
||||
|
||||
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
|
||||
|
||||
If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
|
||||
|
||||
@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
|
||||
- [Examples](#examples)
|
||||
- [Instructions](#instructions)
|
||||
- [FROM (Required)](#from-required)
|
||||
- [Build from llama2](#build-from-llama2)
|
||||
- [Build from llama3](#build-from-llama3)
|
||||
- [Build from a bin file](#build-from-a-bin-file)
|
||||
- [PARAMETER](#parameter)
|
||||
- [Valid Parameters and Values](#valid-parameters-and-values)
|
||||
@@ -48,7 +48,7 @@ INSTRUCTION arguments
|
||||
An example of a `Modelfile` creating a mario blueprint:
|
||||
|
||||
```modelfile
|
||||
FROM llama2
|
||||
FROM llama3
|
||||
# sets the temperature to 1 [higher is more creative, lower is more coherent]
|
||||
PARAMETER temperature 1
|
||||
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
|
||||
@@ -67,33 +67,25 @@ To use this:
|
||||
|
||||
More examples are available in the [examples directory](../examples).
|
||||
|
||||
### `Modelfile`s in [ollama.com/library][1]
|
||||
|
||||
There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
|
||||
|
||||
- Option 1: view a details page from a model's tags page:
|
||||
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
|
||||
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
|
||||
3. Scroll down to "Layers"
|
||||
- Note: if the [`FROM` instruction](#from-required) is not present,
|
||||
it means the model was created from a local file
|
||||
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
|
||||
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
|
||||
|
||||
```bash
|
||||
> ollama show --modelfile llama2:13b
|
||||
> ollama show --modelfile llama3
|
||||
# Modelfile generated by "ollama show"
|
||||
# To build a new Modelfile based on this one, replace the FROM line with:
|
||||
# FROM llama2:13b
|
||||
# FROM llama3:latest
|
||||
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
|
||||
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
|
||||
|
||||
FROM /root/.ollama/models/blobs/sha256:123abc
|
||||
TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
|
||||
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
|
||||
|
||||
{{ end }}{{ .Prompt }} [/INST] """
|
||||
SYSTEM """"""
|
||||
PARAMETER stop [INST]
|
||||
PARAMETER stop [/INST]
|
||||
PARAMETER stop <<SYS>>
|
||||
PARAMETER stop <</SYS>>
|
||||
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
{{ .Response }}<|eot_id|>"""
|
||||
PARAMETER stop "<|start_header_id|>"
|
||||
PARAMETER stop "<|end_header_id|>"
|
||||
PARAMETER stop "<|eot_id|>"
|
||||
PARAMETER stop "<|reserved_special_token"
|
||||
```
|
||||
|
||||
## Instructions
|
||||
@@ -106,10 +98,10 @@ The `FROM` instruction defines the base model to use when creating a model.
|
||||
FROM <model name>:<tag>
|
||||
```
|
||||
|
||||
#### Build from llama2
|
||||
#### Build from llama3
|
||||
|
||||
```modelfile
|
||||
FROM llama2
|
||||
FROM llama3
|
||||
```
|
||||
|
||||
A list of available base models:
|
||||
|
||||
@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
|
||||
'content': 'Say this is a test',
|
||||
}
|
||||
],
|
||||
model='llama2',
|
||||
model='llama3',
|
||||
)
|
||||
```
|
||||
|
||||
@@ -43,7 +43,7 @@ const openai = new OpenAI({
|
||||
|
||||
const chatCompletion = await openai.chat.completions.create({
|
||||
messages: [{ role: 'user', content: 'Say this is a test' }],
|
||||
model: 'llama2',
|
||||
model: 'llama3',
|
||||
})
|
||||
```
|
||||
|
||||
@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
|
||||
curl http://localhost:11434/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "llama2",
|
||||
"model": "llama3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
|
||||
Before using a model, pull it locally `ollama pull`:
|
||||
|
||||
```shell
|
||||
ollama pull llama2
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
### Default model names
|
||||
@@ -121,7 +121,7 @@ ollama pull llama2
|
||||
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
|
||||
|
||||
```
|
||||
ollama cp llama2 gpt-3.5-turbo
|
||||
ollama cp llama3 gpt-3.5-turbo
|
||||
```
|
||||
|
||||
Afterwards, this new model name can be specified the `model` field:
|
||||
|
||||
@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama";
|
||||
|
||||
const ollama = new Ollama({
|
||||
baseUrl: "http://localhost:11434",
|
||||
model: "llama2",
|
||||
model: "llama3",
|
||||
});
|
||||
|
||||
const answer = await ollama.invoke(`why is the sky blue?`);
|
||||
@@ -23,10 +23,10 @@ const answer = await ollama.invoke(`why is the sky blue?`);
|
||||
console.log(answer);
|
||||
```
|
||||
|
||||
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
|
||||
That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
|
||||
|
||||
```bash
|
||||
npm install cheerio
|
||||
npm install cheerio
|
||||
```
|
||||
|
||||
```javascript
|
||||
|
||||
@@ -17,10 +17,12 @@ Let's start by asking a simple question that we can get an answer to from the **
|
||||
Then we can create a model and ask the question:
|
||||
|
||||
```python
|
||||
from langchain.llms import Ollama
|
||||
ollama = Ollama(base_url='http://localhost:11434',
|
||||
model="llama2")
|
||||
print(ollama("why is the sky blue"))
|
||||
from langchain_community.llms import Ollama
|
||||
ollama = Ollama(
|
||||
base_url='http://localhost:11434',
|
||||
model="llama3"
|
||||
)
|
||||
print(ollama.invoke("why is the sky blue"))
|
||||
```
|
||||
|
||||
Notice that we are defining the model and the base URL for Ollama.
|
||||
|
||||
@@ -1,47 +1,48 @@
|
||||
# Ollama Windows Preview
|
||||
|
||||
Welcome to the Ollama Windows preview.
|
||||
|
||||
No more WSL required!
|
||||
|
||||
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
|
||||
After installing Ollama Windows Preview, Ollama will run in the background and
|
||||
the `ollama` command line is available in `cmd`, `powershell` or your favorite
|
||||
terminal application. As usual the Ollama [api](./api.md) will be served on
|
||||
`http://localhost:11434`.
|
||||
|
||||
As this is a preview release, you should expect a few bugs here and there. If
|
||||
you run into a problem you can reach out on
|
||||
[Discord](https://discord.gg/ollama), or file an
|
||||
[issue](https://github.com/ollama/ollama/issues).
|
||||
Logs will often be helpful in dianosing the problem (see
|
||||
[Troubleshooting](#troubleshooting) below)
|
||||
|
||||
## System Requirements
|
||||
|
||||
* Windows 10 or newer, Home or Pro
|
||||
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
|
||||
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
|
||||
|
||||
## API Access
|
||||
|
||||
Here's a quick example showing API access from `powershell`
|
||||
```powershell
|
||||
(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
|
||||
a "view logs" menu item to the app, and increses logging for the GUI app and
|
||||
server.
|
||||
|
||||
Ollama on Windows stores files in a few different locations. You can view them in
|
||||
the explorer window by hitting `<cmd>+R` and type in:
|
||||
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
|
||||
- *app.log* contains logs from the GUI application
|
||||
- *server.log* contains the server logs
|
||||
- *upgrade.log* contains log output for upgrades
|
||||
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
|
||||
- `explorer %HOMEPATH%\.ollama` contains models and configuration
|
||||
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
|
||||
<<<<<<< HEAD
|
||||
# Ollama Windows Preview
|
||||
|
||||
Welcome to the Ollama Windows preview.
|
||||
|
||||
No more WSL required!
|
||||
|
||||
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
|
||||
After installing Ollama Windows Preview, Ollama will run in the background and
|
||||
the `ollama` command line is available in `cmd`, `powershell` or your favorite
|
||||
terminal application. As usual the Ollama [api](./api.md) will be served on
|
||||
`http://localhost:11434`.
|
||||
|
||||
As this is a preview release, you should expect a few bugs here and there. If
|
||||
you run into a problem you can reach out on
|
||||
[Discord](https://discord.gg/ollama), or file an
|
||||
[issue](https://github.com/ollama/ollama/issues).
|
||||
Logs will often be helpful in diagnosing the problem (see
|
||||
[Troubleshooting](#troubleshooting) below)
|
||||
|
||||
## System Requirements
|
||||
|
||||
* Windows 10 or newer, Home or Pro
|
||||
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
|
||||
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
|
||||
|
||||
## API Access
|
||||
|
||||
Here's a quick example showing API access from `powershell`
|
||||
```powershell
|
||||
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
|
||||
a "view logs" menu item to the app, and increses logging for the GUI app and
|
||||
server.
|
||||
|
||||
Ollama on Windows stores files in a few different locations. You can view them in
|
||||
the explorer window by hitting `<cmd>+R` and type in:
|
||||
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
|
||||
- *app.log* contains logs from the GUI application
|
||||
- *server.log* contains the server logs
|
||||
- *upgrade.log* contains log output for upgrades
|
||||
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
|
||||
- `explorer %HOMEPATH%\.ollama` contains models and configuration
|
||||
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
|
||||
|
||||
`ollama run llama2 < sourcequestions.txt`
|
||||
`ollama run llama3 < sourcequestions.txt`
|
||||
|
||||
This concept is used in the following example.
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ func main() {
|
||||
|
||||
ctx := context.Background()
|
||||
req := &api.ChatRequest{
|
||||
Model: "llama2",
|
||||
Model: "llama3",
|
||||
Messages: messages,
|
||||
}
|
||||
|
||||
|
||||
@@ -40,9 +40,9 @@ while True:
|
||||
continue
|
||||
|
||||
# Prompt
|
||||
template = """Use the following pieces of context to answer the question at the end.
|
||||
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
||||
Use three sentences maximum and keep the answer as concise as possible.
|
||||
template = """Use the following pieces of context to answer the question at the end.
|
||||
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
||||
Use three sentences maximum and keep the answer as concise as possible.
|
||||
{context}
|
||||
Question: {question}
|
||||
Helpful Answer:"""
|
||||
@@ -51,11 +51,11 @@ while True:
|
||||
template=template,
|
||||
)
|
||||
|
||||
llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
qa_chain = RetrievalQA.from_chain_type(
|
||||
llm,
|
||||
retriever=vectorstore.as_retriever(),
|
||||
chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
|
||||
)
|
||||
|
||||
result = qa_chain({"query": query})
|
||||
result = qa_chain({"query": query})
|
||||
|
||||
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama2
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
@@ -21,4 +21,3 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from langchain.llms import Ollama
|
||||
|
||||
input = input("What is your question?")
|
||||
llm = Ollama(model="llama2")
|
||||
llm = Ollama(model="llama3")
|
||||
res = llm.predict(input)
|
||||
print (res)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM llama2
|
||||
FROM llama3
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from super mario bros, acting as an assistant.
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
# Example character: Mario
|
||||
|
||||
This example shows how to create a basic character using Llama2 as the base model.
|
||||
This example shows how to create a basic character using Llama3 as the base model.
|
||||
|
||||
To run this example:
|
||||
|
||||
1. Download the Modelfile
|
||||
2. `ollama pull llama2` to get the base model used in the model file.
|
||||
2. `ollama pull llama3` to get the base model used in the model file.
|
||||
3. `ollama create NAME -f ./Modelfile`
|
||||
4. `ollama run NAME`
|
||||
|
||||
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
|
||||
What the model file looks like:
|
||||
|
||||
```
|
||||
FROM llama2
|
||||
FROM llama3
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from Super Mario Bros, acting as an assistant.
|
||||
|
||||
@@ -2,16 +2,16 @@ import requests
|
||||
import json
|
||||
import random
|
||||
|
||||
model = "llama2"
|
||||
model = "llama3"
|
||||
template = {
|
||||
"firstName": "",
|
||||
"lastName": "",
|
||||
"firstName": "",
|
||||
"lastName": "",
|
||||
"address": {
|
||||
"street": "",
|
||||
"city": "",
|
||||
"state": "",
|
||||
"street": "",
|
||||
"city": "",
|
||||
"state": "",
|
||||
"zipCode": ""
|
||||
},
|
||||
},
|
||||
"phoneNumber": ""
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ countries = [
|
||||
"France",
|
||||
]
|
||||
country = random.choice(countries)
|
||||
model = "llama2"
|
||||
model = "llama3"
|
||||
|
||||
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
|
||||
|
||||
|
||||
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama2
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import requests
|
||||
|
||||
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
|
||||
model = "llama2" # TODO: update this for whatever model you wish to use
|
||||
model = "llama3" # TODO: update this for whatever model you wish to use
|
||||
|
||||
|
||||
def chat(messages):
|
||||
|
||||
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama2
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
|
||||
|
||||
## Usage
|
||||
|
||||
1. Add llama2 to have the mentors ask your questions:
|
||||
1. Add llama3 to have the mentors ask your questions:
|
||||
|
||||
```bash
|
||||
ollama pull llama2
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
2. Install prerequisites:
|
||||
|
||||
@@ -15,7 +15,7 @@ async function characterGenerator() {
|
||||
ollama.setModel("stablebeluga2:70b-q4_K_M");
|
||||
const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);
|
||||
|
||||
const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
|
||||
const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
|
||||
|
||||
fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
|
||||
if (err) throw err;
|
||||
@@ -23,4 +23,4 @@ async function characterGenerator() {
|
||||
});
|
||||
}
|
||||
|
||||
characterGenerator();
|
||||
characterGenerator();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import * as readline from "readline";
|
||||
|
||||
const model = "llama2";
|
||||
const model = "llama3";
|
||||
type Message = {
|
||||
role: "assistant" | "user" | "system";
|
||||
content: string;
|
||||
@@ -74,4 +74,4 @@ async function main() {
|
||||
|
||||
}
|
||||
|
||||
main();
|
||||
main();
|
||||
|
||||
@@ -15,6 +15,7 @@ const (
|
||||
|
||||
KibiByte = Byte * 1024
|
||||
MebiByte = KibiByte * 1024
|
||||
GibiByte = MebiByte * 1024
|
||||
)
|
||||
|
||||
func HumanBytes(b int64) string {
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"runtime"
|
||||
"strings"
|
||||
)
|
||||
|
||||
@@ -35,22 +35,64 @@ func GetSupportedGFX(libDir string) ([]string, error) {
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
|
||||
// Set the visible devices if not already set
|
||||
// TODO - does sort order matter?
|
||||
devices := []string{}
|
||||
for i := range ids {
|
||||
if _, skipped := skip[i]; skipped {
|
||||
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||
ids := []string{}
|
||||
for _, info := range gpuInfo {
|
||||
if info.Library != "rocm" {
|
||||
// TODO shouldn't happen if things are wired correctly...
|
||||
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
||||
continue
|
||||
}
|
||||
devices = append(devices, strconv.Itoa(i))
|
||||
ids = append(ids, info.ID)
|
||||
}
|
||||
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||
}
|
||||
|
||||
func commonAMDValidateLibDir() (string, error) {
|
||||
// We try to favor system paths first, so that we can wire up the subprocess to use
|
||||
// the system version. Only use our bundled version if the system version doesn't work
|
||||
// This gives users a more recovery options if versions have subtle problems at runtime
|
||||
|
||||
// Prefer explicit HIP env var
|
||||
hipPath := os.Getenv("HIP_PATH")
|
||||
if hipPath != "" {
|
||||
hipLibDir := filepath.Join(hipPath, "bin")
|
||||
if rocmLibUsable(hipLibDir) {
|
||||
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
|
||||
return hipLibDir, nil
|
||||
}
|
||||
}
|
||||
|
||||
val := strings.Join(devices, ",")
|
||||
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
|
||||
} else {
|
||||
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
|
||||
// Scan the LD_LIBRARY_PATH or PATH
|
||||
pathEnv := "LD_LIBRARY_PATH"
|
||||
if runtime.GOOS == "windows" {
|
||||
pathEnv = "PATH"
|
||||
}
|
||||
|
||||
paths := os.Getenv(pathEnv)
|
||||
for _, path := range filepath.SplitList(paths) {
|
||||
d, err := filepath.Abs(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if rocmLibUsable(d) {
|
||||
return d, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Well known location(s)
|
||||
if rocmLibUsable(RocmStandardLocation) {
|
||||
return RocmStandardLocation, nil
|
||||
}
|
||||
|
||||
// Installer payload location if we're running the installed binary
|
||||
exe, err := os.Executable()
|
||||
if err == nil {
|
||||
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
|
||||
if rocmLibUsable(rocmTargetDir) {
|
||||
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
|
||||
return rocmTargetDir, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ func NewHipLib() (*HipLib, error) {
|
||||
func (hl *HipLib) Release() {
|
||||
err := windows.FreeLibrary(hl.dll)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
|
||||
slog.Warn("failed to unload amdhip64.dll", "error", err)
|
||||
}
|
||||
hl.dll = 0
|
||||
}
|
||||
@@ -98,7 +98,7 @@ func (hl *HipLib) HipGetDeviceCount() int {
|
||||
return 0
|
||||
}
|
||||
if status != hipSuccess {
|
||||
slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
|
||||
slog.Warn("failed call to hipGetDeviceCount", "status", status, "error", err)
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
483
gpu/amd_linux.go
483
gpu/amd_linux.go
@@ -11,6 +11,8 @@ import (
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
// Discovery logic for AMD/ROCm GPUs
|
||||
@@ -24,9 +26,6 @@ const (
|
||||
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
||||
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
||||
RocmStandardLocation = "/opt/rocm/lib"
|
||||
|
||||
// TODO find a better way to detect iGPU instead of minimum memory
|
||||
IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -35,14 +34,11 @@ var (
|
||||
)
|
||||
|
||||
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
|
||||
// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices
|
||||
// and the user hasn't already set this variable
|
||||
func AMDGetGPUInfo(resp *GpuInfo) {
|
||||
// TODO - DRY this out with windows
|
||||
func AMDGetGPUInfo() []GpuInfo {
|
||||
resp := []GpuInfo{}
|
||||
if !AMDDetected() {
|
||||
return
|
||||
return resp
|
||||
}
|
||||
skip := map[int]interface{}{}
|
||||
|
||||
// Opportunistic logging of driver version to aid in troubleshooting
|
||||
ver, err := AMDDriverVersion()
|
||||
@@ -50,160 +46,117 @@ func AMDGetGPUInfo(resp *GpuInfo) {
|
||||
slog.Info("AMD Driver: " + ver)
|
||||
} else {
|
||||
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
|
||||
slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err))
|
||||
slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
|
||||
}
|
||||
|
||||
// If the user has specified exactly which GPUs to use, look up their memory
|
||||
visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES")
|
||||
if visibleDevices != "" {
|
||||
ids := []int{}
|
||||
for _, idStr := range strings.Split(visibleDevices, ",") {
|
||||
id, err := strconv.Atoi(idStr)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err))
|
||||
} else {
|
||||
ids = append(ids, id)
|
||||
}
|
||||
}
|
||||
amdProcMemLookup(resp, nil, ids)
|
||||
return
|
||||
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
|
||||
var visibleDevices []string
|
||||
hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only
|
||||
rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
|
||||
gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index
|
||||
switch {
|
||||
// TODO is this priorty order right?
|
||||
case hipVD != "":
|
||||
visibleDevices = strings.Split(hipVD, ",")
|
||||
case rocrVD != "":
|
||||
visibleDevices = strings.Split(rocrVD, ",")
|
||||
// TODO - since we don't yet support UUIDs, consider detecting and reporting here
|
||||
// all our test systems show GPU-XX indicating UUID is not supported
|
||||
case gpuDO != "":
|
||||
visibleDevices = strings.Split(gpuDO, ",")
|
||||
}
|
||||
|
||||
// Gather GFX version information from all detected cards
|
||||
gfx := AMDGFXVersions()
|
||||
verStrings := []string{}
|
||||
for i, v := range gfx {
|
||||
verStrings = append(verStrings, v.ToGFXString())
|
||||
if v.Major == 0 {
|
||||
// Silently skip CPUs
|
||||
skip[i] = struct{}{}
|
||||
continue
|
||||
}
|
||||
if v.Major < 9 {
|
||||
// TODO consider this a build-time setting if we can support 8xx family GPUs
|
||||
slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString()))
|
||||
skip[i] = struct{}{}
|
||||
}
|
||||
}
|
||||
slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))
|
||||
|
||||
// Abort if all GPUs are skipped
|
||||
if len(skip) >= len(gfx) {
|
||||
slog.Info("all detected amdgpus are skipped, falling back to CPU")
|
||||
return
|
||||
}
|
||||
|
||||
// If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib
|
||||
libDir, err := AMDValidateLibDir()
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
updateLibPath(libDir)
|
||||
|
||||
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
|
||||
if gfxOverride == "" {
|
||||
supported, err := GetSupportedGFX(libDir)
|
||||
var supported []string
|
||||
libDir := ""
|
||||
|
||||
// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
|
||||
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
|
||||
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||
cpuCount := 0
|
||||
for _, match := range matches {
|
||||
slog.Debug("evaluating amdgpu node " + match)
|
||||
fp, err := os.Open(match)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
|
||||
return
|
||||
}
|
||||
slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))
|
||||
|
||||
for i, v := range gfx {
|
||||
if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
|
||||
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
|
||||
// TODO - consider discrete markdown just for ROCM troubleshooting?
|
||||
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
|
||||
skip[i] = struct{}{}
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
|
||||
}
|
||||
|
||||
if len(skip) >= len(gfx) {
|
||||
slog.Info("all detected amdgpus are skipped, falling back to CPU")
|
||||
return
|
||||
}
|
||||
|
||||
ids := make([]int, len(gfx))
|
||||
i := 0
|
||||
for k := range gfx {
|
||||
ids[i] = k
|
||||
i++
|
||||
}
|
||||
amdProcMemLookup(resp, skip, ids)
|
||||
if resp.memInfo.DeviceCount == 0 {
|
||||
return
|
||||
}
|
||||
if len(skip) > 0 {
|
||||
amdSetVisibleDevices(ids, skip)
|
||||
}
|
||||
}
|
||||
|
||||
func updateLibPath(libDir string) {
|
||||
ldPaths := []string{}
|
||||
if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
|
||||
ldPaths = strings.Split(val, ":")
|
||||
}
|
||||
for _, d := range ldPaths {
|
||||
if d == libDir {
|
||||
return
|
||||
}
|
||||
}
|
||||
val := strings.Join(append(ldPaths, libDir), ":")
|
||||
slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
|
||||
os.Setenv("LD_LIBRARY_PATH", val)
|
||||
}
|
||||
|
||||
// Walk the sysfs nodes for the available GPUs and gather information from them
|
||||
// skipping over any devices in the skip map
|
||||
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||
resp.memInfo.DeviceCount = 0
|
||||
resp.memInfo.TotalMemory = 0
|
||||
resp.memInfo.FreeMemory = 0
|
||||
slog.Debug("discovering VRAM for amdgpu devices")
|
||||
if len(ids) == 0 {
|
||||
entries, err := os.ReadDir(AMDNodesSysfsDir)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
|
||||
return
|
||||
}
|
||||
for _, node := range entries {
|
||||
if !node.IsDir() {
|
||||
continue
|
||||
}
|
||||
id, err := strconv.Atoi(node.Name())
|
||||
if err != nil {
|
||||
slog.Warn("malformed amdgpu sysfs node id " + node.Name())
|
||||
continue
|
||||
}
|
||||
ids = append(ids, id)
|
||||
}
|
||||
}
|
||||
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
|
||||
|
||||
for _, id := range ids {
|
||||
if _, skipped := skip[id]; skipped {
|
||||
slog.Debug("failed to open sysfs node", "file", match, "error", err)
|
||||
continue
|
||||
}
|
||||
defer fp.Close()
|
||||
nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
|
||||
if err != nil {
|
||||
slog.Debug("failed to parse node ID", "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(fp)
|
||||
isCPU := false
|
||||
var major, minor, patch uint64
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
|
||||
if strings.HasPrefix(line, "gfx_target_version") {
|
||||
ver := strings.Fields(line)
|
||||
|
||||
// Detect CPUs
|
||||
if len(ver) == 2 && ver[1] == "0" {
|
||||
slog.Debug("detected CPU " + match)
|
||||
isCPU = true
|
||||
break
|
||||
}
|
||||
|
||||
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||
slog.Warn("malformed "+match, "gfx_target_version", line)
|
||||
// If this winds up being a CPU, our offsets may be wrong
|
||||
continue
|
||||
}
|
||||
l := len(ver[1])
|
||||
var err1, err2, err3 error
|
||||
patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
|
||||
minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
|
||||
major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
|
||||
if err1 != nil || err2 != nil || err3 != nil {
|
||||
slog.Debug("malformed int " + line)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// TODO - any other properties we want to extract and record?
|
||||
// vendor_id + device_id -> pci lookup for "Name"
|
||||
// Other metrics that may help us understand relative performance between multiple GPUs
|
||||
}
|
||||
|
||||
if isCPU {
|
||||
cpuCount++
|
||||
continue
|
||||
}
|
||||
|
||||
// CPUs are always first in the list
|
||||
gpuID := nodeID - cpuCount
|
||||
|
||||
// Shouldn't happen, but just in case...
|
||||
if gpuID < 0 {
|
||||
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
|
||||
return []GpuInfo{}
|
||||
}
|
||||
|
||||
if int(major) < RocmComputeMin {
|
||||
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
|
||||
continue
|
||||
}
|
||||
|
||||
// Look up the memory for the current node
|
||||
totalMemory := uint64(0)
|
||||
usedMemory := uint64(0)
|
||||
// Adjust for sysfs vs HIP ids
|
||||
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
|
||||
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
|
||||
propFiles, err := filepath.Glob(propGlob)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
|
||||
slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
|
||||
}
|
||||
// 1 or more memory banks - sum the values of all of them
|
||||
for _, propFile := range propFiles {
|
||||
fp, err := os.Open(propFile)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err))
|
||||
slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
|
||||
continue
|
||||
}
|
||||
defer fp.Close()
|
||||
@@ -226,49 +179,113 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||
}
|
||||
}
|
||||
if totalMemory == 0 {
|
||||
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
|
||||
skip[id] = struct{}{}
|
||||
slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
|
||||
continue
|
||||
}
|
||||
if totalMemory < IGPUMemLimit {
|
||||
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
|
||||
skip[id] = struct{}{}
|
||||
continue
|
||||
}
|
||||
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
|
||||
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
|
||||
usedFiles, err := filepath.Glob(usedGlob)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err))
|
||||
slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
|
||||
continue
|
||||
}
|
||||
for _, usedFile := range usedFiles {
|
||||
fp, err := os.Open(usedFile)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err))
|
||||
slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
|
||||
continue
|
||||
}
|
||||
defer fp.Close()
|
||||
data, err := io.ReadAll(fp)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err))
|
||||
slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
|
||||
continue
|
||||
}
|
||||
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err))
|
||||
slog.Warn("malformed used memory", "data", string(data), "error", err)
|
||||
continue
|
||||
}
|
||||
usedMemory += used
|
||||
}
|
||||
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
|
||||
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
|
||||
resp.memInfo.DeviceCount++
|
||||
resp.memInfo.TotalMemory += totalMemory
|
||||
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
|
||||
|
||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
||||
if totalMemory < IGPUMemLimit {
|
||||
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
|
||||
slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
|
||||
gpuInfo := GpuInfo{
|
||||
Library: "rocm",
|
||||
memInfo: memInfo{
|
||||
TotalMemory: totalMemory,
|
||||
FreeMemory: (totalMemory - usedMemory),
|
||||
},
|
||||
ID: fmt.Sprintf("%d", gpuID),
|
||||
// Name: not exposed in sysfs directly, would require pci device id lookup
|
||||
Major: int(major),
|
||||
Minor: int(minor),
|
||||
Patch: int(patch),
|
||||
MinimumMemory: rocmMinimumMemory,
|
||||
}
|
||||
|
||||
// If the user wants to filter to a subset of devices, filter out if we aren't a match
|
||||
if len(visibleDevices) > 0 {
|
||||
include := false
|
||||
for _, visible := range visibleDevices {
|
||||
if visible == gpuInfo.ID {
|
||||
include = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !include {
|
||||
slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Final validation is gfx compatibility - load the library if we haven't already loaded it
|
||||
// even if the user overrides, we still need to validate the library
|
||||
if libDir == "" {
|
||||
libDir, err = AMDValidateLibDir()
|
||||
if err != nil {
|
||||
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
|
||||
return []GpuInfo{}
|
||||
}
|
||||
}
|
||||
gpuInfo.DependencyPath = libDir
|
||||
|
||||
if gfxOverride == "" {
|
||||
// Only load supported list once
|
||||
if len(supported) == 0 {
|
||||
supported, err = GetSupportedGFX(libDir)
|
||||
if err != nil {
|
||||
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
|
||||
return []GpuInfo{}
|
||||
}
|
||||
slog.Debug("rocm supported GPUs", "types", supported)
|
||||
}
|
||||
gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
|
||||
if !slices.Contains[[]string, string](supported, gfx) {
|
||||
slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
|
||||
// TODO - consider discrete markdown just for ROCM troubleshooting?
|
||||
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
|
||||
continue
|
||||
} else {
|
||||
slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
|
||||
}
|
||||
} else {
|
||||
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
|
||||
}
|
||||
|
||||
// The GPU has passed all the verification steps and is supported
|
||||
resp = append(resp, gpuInfo)
|
||||
}
|
||||
if resp.memInfo.DeviceCount > 0 {
|
||||
resp.Library = "rocm"
|
||||
if len(resp) == 0 {
|
||||
slog.Info("no compatible amdgpu devices detected")
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// Quick check for AMD driver so we can skip amdgpu discovery if not present
|
||||
@@ -280,87 +297,24 @@ func AMDDetected() bool {
|
||||
slog.Debug("amdgpu driver not detected " + sysfsDir)
|
||||
return false
|
||||
} else if err != nil {
|
||||
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
|
||||
slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func setupLink(source, target string) error {
|
||||
if err := os.RemoveAll(target); err != nil {
|
||||
return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
|
||||
}
|
||||
if err := os.Symlink(source, target); err != nil {
|
||||
return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
|
||||
}
|
||||
slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Ensure the AMD rocm lib dir is wired up
|
||||
// Prefer to use host installed ROCm, as long as it meets our minimum requirements
|
||||
// failing that, tell the user how to download it on their own
|
||||
func AMDValidateLibDir() (string, error) {
|
||||
// We rely on the rpath compiled into our library to find rocm
|
||||
// so we establish a symlink to wherever we find it on the system
|
||||
// to <payloads>/rocm
|
||||
payloadsDir, err := PayloadsDir()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// If we already have a rocm dependency wired, nothing more to do
|
||||
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
|
||||
if rocmLibUsable(rocmTargetDir) {
|
||||
return rocmTargetDir, nil
|
||||
}
|
||||
|
||||
// next to the running binary
|
||||
exe, err := os.Executable()
|
||||
libDir, err := commonAMDValidateLibDir()
|
||||
if err == nil {
|
||||
peerDir := filepath.Dir(exe)
|
||||
if rocmLibUsable(peerDir) {
|
||||
slog.Debug("detected ROCM next to ollama executable " + peerDir)
|
||||
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
|
||||
}
|
||||
peerDir = filepath.Join(filepath.Dir(exe), "rocm")
|
||||
if rocmLibUsable(peerDir) {
|
||||
slog.Debug("detected ROCM next to ollama executable " + peerDir)
|
||||
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
|
||||
}
|
||||
return libDir, nil
|
||||
}
|
||||
|
||||
// Well known ollama installer path
|
||||
installedRocmDir := "/usr/share/ollama/lib/rocm"
|
||||
if rocmLibUsable(installedRocmDir) {
|
||||
return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir)
|
||||
}
|
||||
|
||||
// Prefer explicit HIP env var
|
||||
hipPath := os.Getenv("HIP_PATH")
|
||||
if hipPath != "" {
|
||||
hipLibDir := filepath.Join(hipPath, "lib")
|
||||
if rocmLibUsable(hipLibDir) {
|
||||
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
|
||||
return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
|
||||
}
|
||||
}
|
||||
|
||||
// Scan the library path for potential matches
|
||||
ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
|
||||
for _, ldPath := range ldPaths {
|
||||
d, err := filepath.Abs(ldPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if rocmLibUsable(d) {
|
||||
return rocmTargetDir, setupLink(d, rocmTargetDir)
|
||||
}
|
||||
}
|
||||
|
||||
// Well known location(s)
|
||||
if rocmLibUsable("/opt/rocm/lib") {
|
||||
return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
|
||||
return installedRocmDir, nil
|
||||
}
|
||||
|
||||
// If we still haven't found a usable rocm, the user will have to install it on their own
|
||||
@@ -384,68 +338,3 @@ func AMDDriverVersion() (string, error) {
|
||||
}
|
||||
return strings.TrimSpace(string(verString)), nil
|
||||
}
|
||||
|
||||
func AMDGFXVersions() map[int]Version {
|
||||
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
|
||||
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
|
||||
res := map[int]Version{}
|
||||
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||
for _, match := range matches {
|
||||
fp, err := os.Open(match)
|
||||
if err != nil {
|
||||
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
|
||||
continue
|
||||
}
|
||||
defer fp.Close()
|
||||
i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
|
||||
if err != nil {
|
||||
slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
|
||||
continue
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
// Skipping the CPU
|
||||
continue
|
||||
}
|
||||
// Align with HIP IDs (zero is first GPU, not CPU)
|
||||
i -= 1
|
||||
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if strings.HasPrefix(line, "gfx_target_version") {
|
||||
ver := strings.Fields(line)
|
||||
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||
if ver[1] != "0" {
|
||||
slog.Debug("malformed " + line)
|
||||
}
|
||||
res[i] = Version{
|
||||
Major: 0,
|
||||
Minor: 0,
|
||||
Patch: 0,
|
||||
}
|
||||
continue
|
||||
}
|
||||
l := len(ver[1])
|
||||
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
|
||||
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
|
||||
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
|
||||
if err1 != nil || err2 != nil || err3 != nil {
|
||||
slog.Debug("malformed int " + line)
|
||||
continue
|
||||
}
|
||||
|
||||
res[i] = Version{
|
||||
Major: uint(major),
|
||||
Minor: uint(minor),
|
||||
Patch: uint(patch),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (v Version) ToGFXString() string {
|
||||
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
|
||||
}
|
||||
|
||||
@@ -7,7 +7,10 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -22,36 +25,32 @@ var (
|
||||
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
|
||||
)
|
||||
|
||||
func AMDGetGPUInfo(resp *GpuInfo) {
|
||||
func AMDGetGPUInfo() []GpuInfo {
|
||||
resp := []GpuInfo{}
|
||||
hl, err := NewHipLib()
|
||||
if err != nil {
|
||||
slog.Debug(err.Error())
|
||||
return
|
||||
return nil
|
||||
}
|
||||
defer hl.Release()
|
||||
skip := map[int]interface{}{}
|
||||
ids := []int{}
|
||||
resp.memInfo.DeviceCount = 0
|
||||
resp.memInfo.TotalMemory = 0
|
||||
resp.memInfo.FreeMemory = 0
|
||||
|
||||
ver, err := hl.AMDDriverVersion()
|
||||
if err == nil {
|
||||
slog.Info("AMD Driver: " + ver)
|
||||
} else {
|
||||
// For now this is benign, but we may eventually need to fail compatibility checks
|
||||
slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
|
||||
slog.Debug("error looking up amd driver version", "error", err)
|
||||
}
|
||||
|
||||
// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
|
||||
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
|
||||
count := hl.HipGetDeviceCount()
|
||||
if count == 0 {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
libDir, err := AMDValidateLibDir()
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
|
||||
return
|
||||
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var supported []string
|
||||
@@ -59,95 +58,120 @@ func AMDGetGPUInfo(resp *GpuInfo) {
|
||||
if gfxOverride == "" {
|
||||
supported, err = GetSupportedGFX(libDir)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
|
||||
return
|
||||
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
|
||||
return nil
|
||||
}
|
||||
} else {
|
||||
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("detected %d hip devices", count))
|
||||
slog.Info("detected hip devices", "count", count)
|
||||
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
|
||||
for i := 0; i < count; i++ {
|
||||
ids = append(ids, i)
|
||||
err = hl.HipSetDevice(i)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
|
||||
skip[i] = struct{}{}
|
||||
slog.Warn("set device", "id", i, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
props, err := hl.HipGetDeviceProperties(i)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
|
||||
skip[i] = struct{}{}
|
||||
slog.Warn("get properties", "id", i, "error", err)
|
||||
continue
|
||||
}
|
||||
n := bytes.IndexByte(props.Name[:], 0)
|
||||
name := string(props.Name[:n])
|
||||
slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
|
||||
// TODO is UUID actually populated on windows?
|
||||
// Can luid be used on windows for setting visible devices (and is it actually set?)
|
||||
n = bytes.IndexByte(props.GcnArchName[:], 0)
|
||||
gfx := string(props.GcnArchName[:n])
|
||||
slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
|
||||
slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
|
||||
var major, minor, patch string
|
||||
switch len(gfx) {
|
||||
case 6:
|
||||
major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
|
||||
case 7:
|
||||
major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
|
||||
}
|
||||
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
|
||||
// TODO Why isn't props.iGPU accurate!?
|
||||
if strings.EqualFold(name, iGPUName) {
|
||||
slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
|
||||
skip[i] = struct{}{}
|
||||
slog.Info("iGPU detected skipping", "id", i)
|
||||
continue
|
||||
}
|
||||
if gfxOverride == "" {
|
||||
if !slices.Contains[[]string, string](supported, gfx) {
|
||||
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
|
||||
slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
|
||||
// TODO - consider discrete markdown just for ROCM troubleshooting?
|
||||
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
|
||||
skip[i] = struct{}{}
|
||||
continue
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
|
||||
slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
|
||||
}
|
||||
}
|
||||
|
||||
totalMemory, freeMemory, err := hl.HipMemGetInfo()
|
||||
freeMemory, totalMemory, err := hl.HipMemGetInfo()
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
|
||||
slog.Warn("get mem info", "id", i, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// TODO according to docs, freeMem may lie on windows!
|
||||
slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
|
||||
slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory))
|
||||
resp.memInfo.DeviceCount++
|
||||
resp.memInfo.TotalMemory += totalMemory
|
||||
resp.memInfo.FreeMemory += freeMemory
|
||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
||||
if totalMemory < IGPUMemLimit {
|
||||
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
||||
continue
|
||||
}
|
||||
|
||||
// TODO revisit this once ROCm v6 is available on windows.
|
||||
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
|
||||
slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
||||
slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
|
||||
gpuInfo := GpuInfo{
|
||||
Library: "rocm",
|
||||
memInfo: memInfo{
|
||||
TotalMemory: totalMemory,
|
||||
FreeMemory: freeMemory,
|
||||
},
|
||||
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
|
||||
DependencyPath: libDir,
|
||||
MinimumMemory: rocmMinimumMemory,
|
||||
}
|
||||
if major != "" {
|
||||
gpuInfo.Major, err = strconv.Atoi(major)
|
||||
if err != nil {
|
||||
slog.Info("failed to parse version", "version", gfx, "error", err)
|
||||
}
|
||||
}
|
||||
if minor != "" {
|
||||
gpuInfo.Minor, err = strconv.Atoi(minor)
|
||||
if err != nil {
|
||||
slog.Info("failed to parse version", "version", gfx, "error", err)
|
||||
}
|
||||
}
|
||||
if patch != "" {
|
||||
// Patch rev is hex; e.g. gfx90a
|
||||
p, err := strconv.ParseInt(patch, 16, 0)
|
||||
if err != nil {
|
||||
slog.Info("failed to parse version", "version", gfx, "error", err)
|
||||
} else {
|
||||
gpuInfo.Patch = int(p)
|
||||
}
|
||||
}
|
||||
if gpuInfo.Major < RocmComputeMin {
|
||||
slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
|
||||
continue
|
||||
}
|
||||
|
||||
resp = append(resp, gpuInfo)
|
||||
}
|
||||
if resp.memInfo.DeviceCount > 0 {
|
||||
resp.Library = "rocm"
|
||||
}
|
||||
// Abort if all GPUs are skipped
|
||||
if len(skip) >= count {
|
||||
slog.Info("all detected amdgpus are skipped, falling back to CPU")
|
||||
return
|
||||
}
|
||||
if len(skip) > 0 {
|
||||
amdSetVisibleDevices(ids, skip)
|
||||
}
|
||||
UpdatePath(libDir)
|
||||
|
||||
return resp
|
||||
}
|
||||
|
||||
func AMDValidateLibDir() (string, error) {
|
||||
// On windows non-admins typically can't create links
|
||||
// so instead of trying to rely on rpath and a link in
|
||||
// $LibDir/rocm, we instead rely on setting PATH to point
|
||||
// to the location of the ROCm library
|
||||
|
||||
// Installer payload location if we're running the installed binary
|
||||
exe, err := os.Executable()
|
||||
libDir, err := commonAMDValidateLibDir()
|
||||
if err == nil {
|
||||
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
|
||||
if rocmLibUsable(rocmTargetDir) {
|
||||
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
|
||||
return rocmTargetDir, nil
|
||||
}
|
||||
return libDir, nil
|
||||
}
|
||||
|
||||
// Installer payload (if we're running from some other location)
|
||||
@@ -159,21 +183,6 @@ func AMDValidateLibDir() (string, error) {
|
||||
return rocmTargetDir, nil
|
||||
}
|
||||
|
||||
// Prefer explicit HIP env var
|
||||
hipPath := os.Getenv("HIP_PATH")
|
||||
if hipPath != "" {
|
||||
hipLibDir := filepath.Join(hipPath, "bin")
|
||||
if rocmLibUsable(hipLibDir) {
|
||||
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
|
||||
return hipLibDir, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Well known location(s)
|
||||
if rocmLibUsable(RocmStandardLocation) {
|
||||
return RocmStandardLocation, nil
|
||||
}
|
||||
|
||||
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
|
||||
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
|
||||
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
|
||||
|
||||
@@ -24,6 +24,51 @@ func PayloadsDir() (string, error) {
|
||||
defer lock.Unlock()
|
||||
var err error
|
||||
if payloadsDir == "" {
|
||||
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
|
||||
// On Windows we do not carry the payloads inside the main executable
|
||||
if runtime.GOOS == "windows" && runnersDir == "" {
|
||||
appExe, err := os.Executable()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup executable path", "error", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup working directory", "error", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
var paths []string
|
||||
for _, root := range []string{filepath.Dir(appExe), cwd} {
|
||||
paths = append(paths,
|
||||
filepath.Join(root),
|
||||
filepath.Join(root, "windows-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
|
||||
)
|
||||
}
|
||||
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, p := range paths {
|
||||
candidate := filepath.Join(p, "ollama_runners")
|
||||
_, err := os.Stat(candidate)
|
||||
if err == nil {
|
||||
runnersDir = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
if runnersDir == "" {
|
||||
err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
|
||||
slog.Error("incomplete distribution", "error", err)
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
if runnersDir != "" {
|
||||
payloadsDir = runnersDir
|
||||
return payloadsDir, nil
|
||||
}
|
||||
|
||||
// The remainder only applies on non-windows where we still carry payloads in the main executable
|
||||
cleanupTmpDirs()
|
||||
tmpDir := os.Getenv("OLLAMA_TMPDIR")
|
||||
if tmpDir == "" {
|
||||
@@ -80,7 +125,7 @@ func cleanupTmpDirs() {
|
||||
}
|
||||
err = os.RemoveAll(d)
|
||||
if err != nil {
|
||||
slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err))
|
||||
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -88,7 +133,8 @@ func cleanupTmpDirs() {
|
||||
func Cleanup() {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
if payloadsDir != "" {
|
||||
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
|
||||
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
|
||||
// We want to fully clean up the tmpdir parent of the payloads dir
|
||||
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
|
||||
slog.Debug("cleaning up", "dir", tmpDir)
|
||||
@@ -120,7 +166,7 @@ func UpdatePath(dir string) {
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
|
||||
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
|
||||
slog.Info("updating", "PATH", newPath)
|
||||
os.Setenv("PATH", newPath)
|
||||
}
|
||||
// linux and darwin rely on rpath
|
||||
|
||||
22
gpu/cuda_common.go
Normal file
22
gpu/cuda_common.go
Normal file
@@ -0,0 +1,22 @@
|
||||
//go:build linux || windows
|
||||
|
||||
package gpu
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||
ids := []string{}
|
||||
for _, info := range gpuInfo {
|
||||
if info.Library != "cuda" {
|
||||
// TODO shouldn't happen if things are wired correctly...
|
||||
slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
|
||||
continue
|
||||
}
|
||||
ids = append(ids, info.ID)
|
||||
}
|
||||
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||
|
||||
}
|
||||
233
gpu/gpu.go
233
gpu/gpu.go
@@ -16,7 +16,6 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
@@ -25,8 +24,8 @@ import (
|
||||
)
|
||||
|
||||
type handles struct {
|
||||
nvml *C.nvml_handle_t
|
||||
cudart *C.cudart_handle_t
|
||||
deviceCount int
|
||||
cudart *C.cudart_handle_t
|
||||
}
|
||||
|
||||
const (
|
||||
@@ -39,26 +38,10 @@ var gpuMutex sync.Mutex
|
||||
// With our current CUDA compile flags, older than 5.0 will not work properly
|
||||
var CudaComputeMin = [2]C.int{5, 0}
|
||||
|
||||
// Possible locations for the nvidia-ml library
|
||||
var NvmlLinuxGlobs = []string{
|
||||
"/usr/local/cuda/lib64/libnvidia-ml.so*",
|
||||
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
|
||||
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
|
||||
"/usr/lib/wsl/lib/libnvidia-ml.so*",
|
||||
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
|
||||
"/opt/cuda/lib64/libnvidia-ml.so*",
|
||||
"/usr/lib*/libnvidia-ml.so*",
|
||||
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
|
||||
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
|
||||
"/usr/local/lib*/libnvidia-ml.so*",
|
||||
var RocmComputeMin = 9
|
||||
|
||||
// TODO: are these stubs ever valid?
|
||||
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
|
||||
}
|
||||
|
||||
var NvmlWindowsGlobs = []string{
|
||||
"c:\\Windows\\System32\\nvml.dll",
|
||||
}
|
||||
// TODO find a better way to detect iGPU instead of minimum memory
|
||||
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
||||
|
||||
var CudartLinuxGlobs = []string{
|
||||
"/usr/local/cuda/lib64/libcudart.so*",
|
||||
@@ -88,26 +71,18 @@ func initGPUHandles() *handles {
|
||||
|
||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
||||
|
||||
gpuHandles := &handles{nil, nil}
|
||||
var nvmlMgmtName string
|
||||
var nvmlMgmtPatterns []string
|
||||
gpuHandles := &handles{}
|
||||
var cudartMgmtName string
|
||||
var cudartMgmtPatterns []string
|
||||
|
||||
tmpDir, _ := PayloadsDir()
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
nvmlMgmtName = "nvml.dll"
|
||||
nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
|
||||
copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
|
||||
cudartMgmtName = "cudart64_*.dll"
|
||||
localAppData := os.Getenv("LOCALAPPDATA")
|
||||
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
|
||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
|
||||
case "linux":
|
||||
nvmlMgmtName = "libnvidia-ml.so"
|
||||
nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
|
||||
copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
|
||||
cudartMgmtName = "libcudart.so*"
|
||||
if tmpDir != "" {
|
||||
// TODO - add "payloads" for subprocess
|
||||
@@ -118,31 +93,21 @@ func initGPUHandles() *handles {
|
||||
return gpuHandles
|
||||
}
|
||||
|
||||
slog.Info("Detecting GPU type")
|
||||
slog.Info("Detecting GPUs")
|
||||
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
|
||||
if len(cudartLibPaths) > 0 {
|
||||
cudart := LoadCUDARTMgmt(cudartLibPaths)
|
||||
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
|
||||
if cudart != nil {
|
||||
slog.Info("Nvidia GPU detected via cudart")
|
||||
slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
|
||||
gpuHandles.cudart = cudart
|
||||
return gpuHandles
|
||||
}
|
||||
}
|
||||
|
||||
// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
|
||||
nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
|
||||
if len(nvmlLibPaths) > 0 {
|
||||
nvml := LoadNVMLMgmt(nvmlLibPaths)
|
||||
if nvml != nil {
|
||||
slog.Info("Nvidia GPU detected via nvidia-ml")
|
||||
gpuHandles.nvml = nvml
|
||||
gpuHandles.deviceCount = deviceCount
|
||||
return gpuHandles
|
||||
}
|
||||
}
|
||||
return gpuHandles
|
||||
}
|
||||
|
||||
func GetGPUInfo() GpuInfo {
|
||||
func GetGPUInfo() GpuInfoList {
|
||||
// TODO - consider exploring lspci (and equivalent on windows) to check for
|
||||
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
|
||||
gpuMutex.Lock()
|
||||
@@ -150,9 +115,6 @@ func GetGPUInfo() GpuInfo {
|
||||
|
||||
gpuHandles := initGPUHandles()
|
||||
defer func() {
|
||||
if gpuHandles.nvml != nil {
|
||||
C.nvml_release(*gpuHandles.nvml)
|
||||
}
|
||||
if gpuHandles.cudart != nil {
|
||||
C.cudart_release(*gpuHandles.cudart)
|
||||
}
|
||||
@@ -165,72 +127,63 @@ func GetGPUInfo() GpuInfo {
|
||||
}
|
||||
|
||||
var memInfo C.mem_info_t
|
||||
resp := GpuInfo{}
|
||||
if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||
C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
|
||||
resp := []GpuInfo{}
|
||||
|
||||
// NVIDIA first
|
||||
for i := 0; i < gpuHandles.deviceCount; i++ {
|
||||
// TODO once we support CPU compilation variants of GPU libraries refine this...
|
||||
if cpuVariant == "" && runtime.GOARCH == "amd64" {
|
||||
continue
|
||||
}
|
||||
gpuInfo := GpuInfo{
|
||||
Library: "cuda",
|
||||
}
|
||||
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
|
||||
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else if memInfo.count > 0 {
|
||||
// Verify minimum compute capability
|
||||
var cc C.nvml_compute_capability_t
|
||||
C.nvml_compute_capability(*gpuHandles.nvml, &cc)
|
||||
if cc.err != nil {
|
||||
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
|
||||
C.free(unsafe.Pointer(cc.err))
|
||||
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
|
||||
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
resp.Library = "cuda"
|
||||
resp.MinimumMemory = cudaMinimumMemory
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
}
|
||||
continue
|
||||
}
|
||||
} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||
C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else if memInfo.count > 0 {
|
||||
// Verify minimum compute capability
|
||||
var cc C.cudart_compute_capability_t
|
||||
C.cudart_compute_capability(*gpuHandles.cudart, &cc)
|
||||
if cc.err != nil {
|
||||
slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
|
||||
C.free(unsafe.Pointer(cc.err))
|
||||
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
|
||||
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
resp.Library = "cuda"
|
||||
resp.MinimumMemory = cudaMinimumMemory
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
}
|
||||
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
|
||||
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
AMDGetGPUInfo(&resp)
|
||||
if resp.Library != "" {
|
||||
resp.MinimumMemory = rocmMinimumMemory
|
||||
return resp
|
||||
}
|
||||
}
|
||||
if resp.Library == "" {
|
||||
C.cpu_check_ram(&memInfo)
|
||||
resp.Library = "cpu"
|
||||
resp.Variant = cpuVariant
|
||||
}
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
return resp
|
||||
gpuInfo.TotalMemory = uint64(memInfo.total)
|
||||
gpuInfo.FreeMemory = uint64(memInfo.free)
|
||||
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
||||
gpuInfo.Major = int(memInfo.major)
|
||||
gpuInfo.Minor = int(memInfo.minor)
|
||||
gpuInfo.MinimumMemory = cudaMinimumMemory
|
||||
|
||||
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
||||
resp = append(resp, gpuInfo)
|
||||
}
|
||||
|
||||
// Then AMD
|
||||
resp = append(resp, AMDGetGPUInfo()...)
|
||||
|
||||
if len(resp) == 0 {
|
||||
C.cpu_check_ram(&memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
return resp
|
||||
}
|
||||
gpuInfo := GpuInfo{
|
||||
Library: "cpu",
|
||||
Variant: cpuVariant,
|
||||
}
|
||||
gpuInfo.TotalMemory = uint64(memInfo.total)
|
||||
gpuInfo.FreeMemory = uint64(memInfo.free)
|
||||
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
||||
|
||||
resp = append(resp, gpuInfo)
|
||||
}
|
||||
|
||||
resp.DeviceCount = uint32(memInfo.count)
|
||||
resp.FreeMemory = uint64(memInfo.free)
|
||||
resp.TotalMemory = uint64(memInfo.total)
|
||||
return resp
|
||||
}
|
||||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
func GetCPUMem() (memInfo, error) {
|
||||
var ret memInfo
|
||||
var info C.mem_info_t
|
||||
C.cpu_check_ram(&info)
|
||||
@@ -243,29 +196,11 @@ func getCPUMem() (memInfo, error) {
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func CheckVRAM() (uint64, error) {
|
||||
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
if userLimit != "" {
|
||||
avail, err := strconv.ParseInt(userLimit, 10, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
|
||||
}
|
||||
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
|
||||
return uint64(avail), nil
|
||||
}
|
||||
gpuInfo := GetGPUInfo()
|
||||
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
|
||||
return gpuInfo.FreeMemory, nil
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
|
||||
}
|
||||
|
||||
func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
||||
var ldPaths []string
|
||||
gpuLibPaths := []string{}
|
||||
slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName))
|
||||
slog.Debug("Searching for GPU library", "name", baseLibName)
|
||||
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
@@ -283,7 +218,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||
}
|
||||
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
|
||||
}
|
||||
slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns))
|
||||
slog.Debug("gpu library search", "globs", patterns)
|
||||
for _, pattern := range patterns {
|
||||
// Ignore glob discovery errors
|
||||
matches, _ := filepath.Glob(pattern)
|
||||
@@ -311,28 +246,11 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||
}
|
||||
}
|
||||
}
|
||||
slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
|
||||
slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
|
||||
return gpuLibPaths
|
||||
}
|
||||
|
||||
func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
|
||||
var resp C.nvml_init_resp_t
|
||||
resp.ch.verbose = getVerboseState()
|
||||
for _, libPath := range nvmlLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.nvml_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return &resp.ch
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
|
||||
func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
|
||||
var resp C.cudart_init_resp_t
|
||||
resp.ch.verbose = getVerboseState()
|
||||
for _, libPath := range cudartLibPaths {
|
||||
@@ -340,13 +258,13 @@ func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.cudart_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
|
||||
slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return &resp.ch
|
||||
return int(resp.num_devices), &resp.ch, libPath
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return 0, nil, ""
|
||||
}
|
||||
|
||||
func getVerboseState() C.uint16_t {
|
||||
@@ -355,3 +273,22 @@ func getVerboseState() C.uint16_t {
|
||||
}
|
||||
return C.uint16_t(0)
|
||||
}
|
||||
|
||||
// Given the list of GPUs this instantiation is targeted for,
|
||||
// figure out the visible devices environment variable
|
||||
//
|
||||
// If different libraries are detected, the first one is what we use
|
||||
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
|
||||
if len(l) == 0 {
|
||||
return "", ""
|
||||
}
|
||||
switch l[0].Library {
|
||||
case "cuda":
|
||||
return cudaGetVisibleDevicesEnv(l)
|
||||
case "rocm":
|
||||
return rocmGetVisibleDevicesEnv(l)
|
||||
default:
|
||||
slog.Debug("no filter required for library " + l[0].Library)
|
||||
return "", ""
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,52 +9,47 @@ package gpu
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
|
||||
func CheckVRAM() (uint64, error) {
|
||||
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
if userLimit != "" {
|
||||
avail, err := strconv.ParseInt(userLimit, 10, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
|
||||
}
|
||||
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
|
||||
return uint64(avail), nil
|
||||
}
|
||||
const (
|
||||
metalMinimumMemory = 512 * format.MebiByte
|
||||
)
|
||||
|
||||
func GetGPUInfo() GpuInfoList {
|
||||
mem, _ := GetCPUMem()
|
||||
if runtime.GOARCH == "amd64" {
|
||||
// gpu not supported, this may not be metal
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return uint64(C.getRecommendedMaxVRAM()), nil
|
||||
}
|
||||
|
||||
func GetGPUInfo() GpuInfo {
|
||||
mem, _ := getCPUMem()
|
||||
if runtime.GOARCH == "amd64" {
|
||||
return GpuInfo{
|
||||
Library: "cpu",
|
||||
Variant: GetCPUVariant(),
|
||||
memInfo: mem,
|
||||
return []GpuInfo{
|
||||
{
|
||||
Library: "cpu",
|
||||
Variant: GetCPUVariant(),
|
||||
memInfo: mem,
|
||||
},
|
||||
}
|
||||
}
|
||||
return GpuInfo{
|
||||
info := GpuInfo{
|
||||
Library: "metal",
|
||||
memInfo: mem,
|
||||
ID: "0",
|
||||
}
|
||||
info.TotalMemory = uint64(C.getRecommendedMaxVRAM())
|
||||
|
||||
// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
|
||||
info.FreeMemory = info.TotalMemory
|
||||
|
||||
info.MinimumMemory = metalMinimumMemory
|
||||
return []GpuInfo{info}
|
||||
}
|
||||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
func GetCPUMem() (memInfo, error) {
|
||||
return memInfo{
|
||||
TotalMemory: uint64(C.getPhysicalMemory()),
|
||||
FreeMemory: 0,
|
||||
DeviceCount: 1,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
|
||||
// No-op on darwin
|
||||
return "", ""
|
||||
}
|
||||
|
||||
@@ -38,12 +38,17 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GPU_ID_LEN 64
|
||||
|
||||
typedef struct mem_info {
|
||||
char *err; // If non-nill, caller responsible for freeing
|
||||
char gpu_id[GPU_ID_LEN];
|
||||
uint64_t total;
|
||||
uint64_t free;
|
||||
unsigned int count;
|
||||
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
|
||||
char *err; // If non-nill, caller responsible for freeing
|
||||
|
||||
// Compute Capability
|
||||
int major;
|
||||
int minor;
|
||||
} mem_info_t;
|
||||
|
||||
void cpu_check_ram(mem_info_t *resp);
|
||||
@@ -52,7 +57,6 @@ void cpu_check_ram(mem_info_t *resp);
|
||||
}
|
||||
#endif
|
||||
|
||||
#include "gpu_info_nvml.h"
|
||||
#include "gpu_info_cudart.h"
|
||||
|
||||
#endif // __GPU_INFO_H__
|
||||
|
||||
@@ -8,9 +8,11 @@ void cpu_check_ram(mem_info_t *resp) {
|
||||
MEMORYSTATUSEX info;
|
||||
info.dwLength = sizeof(info);
|
||||
if (GlobalMemoryStatusEx(&info) != 0) {
|
||||
resp->count = 1;
|
||||
resp->total = info.ullTotalPhys;
|
||||
resp->free = info.ullAvailPhys;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
|
||||
} else {
|
||||
resp->err = LOAD_ERR();
|
||||
}
|
||||
@@ -27,9 +29,11 @@ void cpu_check_ram(mem_info_t *resp) {
|
||||
if (sysinfo(&info) != 0) {
|
||||
resp->err = strdup(strerror(errno));
|
||||
} else {
|
||||
resp->count = 1;
|
||||
resp->total = info.totalram * info.mem_unit;
|
||||
resp->free = info.freeram * info.mem_unit;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
|
||||
cudartReturn_t ret;
|
||||
resp->err = NULL;
|
||||
resp->num_devices = 0;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
@@ -21,6 +22,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
|
||||
{"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
|
||||
{"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
|
||||
{"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
|
||||
{"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
@@ -36,13 +38,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO once we've squashed the remaining corner cases remove this log
|
||||
LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
|
||||
|
||||
for (i = 0; l[i].s != NULL; i++) {
|
||||
// TODO once we've squashed the remaining corner cases remove this log
|
||||
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
|
||||
|
||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||
if (!l[i].p) {
|
||||
char *msg = LOAD_ERR();
|
||||
@@ -63,7 +59,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
|
||||
resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
|
||||
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
|
||||
return;
|
||||
}
|
||||
snprintf(buf, buflen, "cudart init failure: %d", ret);
|
||||
@@ -85,110 +81,95 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
|
||||
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
|
||||
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
|
||||
}
|
||||
|
||||
ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
|
||||
void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
|
||||
resp->err = NULL;
|
||||
cudartMemory_t memInfo = {0,0,0};
|
||||
cudartReturn_t ret;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("cudart handle isn't initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
// cudaGetDeviceCount takes int type, resp-> count is uint
|
||||
int deviceCount;
|
||||
ret = (*h.cudaGetDeviceCount)(&deviceCount);
|
||||
ret = (*h.cudaSetDevice)(i);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
snprintf(buf, buflen, "cudart device failed to initialize");
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
cudaDeviceProp_t props;
|
||||
ret = (*h.cudaGetDeviceProperties)(&props, i);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
} else {
|
||||
resp->count = (unsigned int)deviceCount;
|
||||
}
|
||||
|
||||
resp->total = 0;
|
||||
resp->free = 0;
|
||||
for (i = 0; i < resp-> count; i++) {
|
||||
ret = (*h.cudaSetDevice)(i);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "cudart device failed to initialize");
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
int allNull = 1;
|
||||
for (int j = 0; j < 16; j++) {
|
||||
if (props.uuid.bytes[j] != 0) {
|
||||
allNull = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
if (allNull != 0) {
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
||||
} else {
|
||||
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
props.uuid.bytes[0],
|
||||
props.uuid.bytes[1],
|
||||
props.uuid.bytes[2],
|
||||
props.uuid.bytes[3],
|
||||
props.uuid.bytes[4],
|
||||
props.uuid.bytes[5],
|
||||
props.uuid.bytes[6],
|
||||
props.uuid.bytes[7],
|
||||
props.uuid.bytes[8],
|
||||
props.uuid.bytes[9],
|
||||
props.uuid.bytes[10],
|
||||
props.uuid.bytes[11],
|
||||
props.uuid.bytes[12],
|
||||
props.uuid.bytes[13],
|
||||
props.uuid.bytes[14],
|
||||
props.uuid.bytes[15]
|
||||
);
|
||||
}
|
||||
resp->major = props.major;
|
||||
resp->minor = props.minor;
|
||||
|
||||
LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
|
||||
LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
|
||||
|
||||
resp->total += memInfo.total;
|
||||
resp->free += memInfo.free;
|
||||
// TODO add other useful properties from props
|
||||
}
|
||||
}
|
||||
|
||||
void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
|
||||
resp->err = NULL;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
cudartReturn_t ret;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("cudart handle not initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
int devices;
|
||||
ret = (*h.cudaGetDeviceCount)(&devices);
|
||||
ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
|
||||
snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < devices; i++) {
|
||||
ret = (*h.cudaSetDevice)(i);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "cudart device failed to initialize");
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
resp->total = memInfo.total;
|
||||
resp->free = memInfo.free;
|
||||
|
||||
ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
|
||||
if (ret != CUDART_SUCCESS) {
|
||||
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
// Report the lowest major.minor we detect as that limits our compatibility
|
||||
if (resp->major == 0 || resp->major > major ) {
|
||||
resp->major = major;
|
||||
resp->minor = minor;
|
||||
} else if ( resp->major == major && resp->minor > minor ) {
|
||||
resp->minor = minor;
|
||||
}
|
||||
}
|
||||
LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
|
||||
LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
|
||||
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
|
||||
}
|
||||
|
||||
void cudart_release(cudart_handle_t h) {
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
// Just enough typedef's to dlopen/dlsym for memory information
|
||||
typedef enum cudartReturn_enum {
|
||||
CUDART_SUCCESS = 0,
|
||||
CUDART_UNSUPPORTED = 1,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
// Other values omitted for now...
|
||||
} cudartReturn_t;
|
||||
@@ -14,6 +15,11 @@ typedef enum cudartReturn_enum {
|
||||
typedef enum cudartDeviceAttr_enum {
|
||||
cudartDevAttrComputeCapabilityMajor = 75,
|
||||
cudartDevAttrComputeCapabilityMinor = 76,
|
||||
|
||||
// TODO - not yet wired up but may be useful for Jetson or other
|
||||
// integrated GPU scenarios with shared memory
|
||||
cudaDevAttrIntegrated = 18
|
||||
|
||||
} cudartDeviceAttr_t;
|
||||
|
||||
typedef void *cudartDevice_t; // Opaque is sufficient
|
||||
@@ -28,6 +34,92 @@ typedef struct cudartDriverVersion {
|
||||
int minor;
|
||||
} cudartDriverVersion_t;
|
||||
|
||||
typedef struct cudaUUID {
|
||||
unsigned char bytes[16];
|
||||
} cudaUUID_t;
|
||||
typedef struct cudaDeviceProp {
|
||||
char name[256]; /**< ASCII string identifying device */
|
||||
cudaUUID_t uuid; /**< 16-byte unique identifier */
|
||||
char luid[8]; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
|
||||
unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
|
||||
size_t totalGlobalMem; /**< Global memory available on device in bytes */
|
||||
size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */
|
||||
int regsPerBlock; /**< 32-bit registers available per block */
|
||||
int warpSize; /**< Warp size in threads */
|
||||
size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */
|
||||
int maxThreadsPerBlock; /**< Maximum number of threads per block */
|
||||
int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
|
||||
int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
|
||||
int clockRate; /**< Clock frequency in kilohertz */
|
||||
size_t totalConstMem; /**< Constant memory available on device in bytes */
|
||||
int major; /**< Major compute capability */
|
||||
int minor; /**< Minor compute capability */
|
||||
size_t textureAlignment; /**< Alignment requirement for textures */
|
||||
size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */
|
||||
int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
|
||||
int multiProcessorCount; /**< Number of multiprocessors on device */
|
||||
int kernelExecTimeoutEnabled; /**< Specified whether there is a run time limit on kernels */
|
||||
int integrated; /**< Device is integrated as opposed to discrete */
|
||||
int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
|
||||
int computeMode; /**< Compute mode (See ::cudaComputeMode) */
|
||||
int maxTexture1D; /**< Maximum 1D texture size */
|
||||
int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */
|
||||
int maxTexture1DLinear; /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
|
||||
int maxTexture2D[2]; /**< Maximum 2D texture dimensions */
|
||||
int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */
|
||||
int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
|
||||
int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
|
||||
int maxTexture3D[3]; /**< Maximum 3D texture dimensions */
|
||||
int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */
|
||||
int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */
|
||||
int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */
|
||||
int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */
|
||||
int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
|
||||
int maxSurface1D; /**< Maximum 1D surface size */
|
||||
int maxSurface2D[2]; /**< Maximum 2D surface dimensions */
|
||||
int maxSurface3D[3]; /**< Maximum 3D surface dimensions */
|
||||
int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */
|
||||
int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */
|
||||
int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */
|
||||
int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
|
||||
size_t surfaceAlignment; /**< Alignment requirements for surfaces */
|
||||
int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */
|
||||
int ECCEnabled; /**< Device has ECC support enabled */
|
||||
int pciBusID; /**< PCI bus ID of the device */
|
||||
int pciDeviceID; /**< PCI device ID of the device */
|
||||
int pciDomainID; /**< PCI domain ID of the device */
|
||||
int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
|
||||
int asyncEngineCount; /**< Number of asynchronous engines */
|
||||
int unifiedAddressing; /**< Device shares a unified address space with the host */
|
||||
int memoryClockRate; /**< Peak memory clock frequency in kilohertz */
|
||||
int memoryBusWidth; /**< Global memory bus width in bits */
|
||||
int l2CacheSize; /**< Size of L2 cache in bytes */
|
||||
int persistingL2CacheMaxSize; /**< Device's maximum l2 persisting lines capacity setting in bytes */
|
||||
int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
|
||||
int streamPrioritiesSupported; /**< Device supports stream priorities */
|
||||
int globalL1CacheSupported; /**< Device supports caching globals in L1 */
|
||||
int localL1CacheSupported; /**< Device supports caching locals in L1 */
|
||||
size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
|
||||
int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */
|
||||
int managedMemory; /**< Device supports allocating managed memory on this system */
|
||||
int isMultiGpuBoard; /**< Device is on a multi-GPU board */
|
||||
int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */
|
||||
int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */
|
||||
int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
|
||||
int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
|
||||
int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */
|
||||
int computePreemptionSupported; /**< Device supports Compute Preemption */
|
||||
int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
|
||||
int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
|
||||
int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
|
||||
size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */
|
||||
int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
|
||||
int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
|
||||
int maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
|
||||
int accessPolicyMaxWindowSize; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
|
||||
size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */
|
||||
} cudaDeviceProp_t;
|
||||
|
||||
typedef struct cudart_handle {
|
||||
void *handle;
|
||||
uint16_t verbose;
|
||||
@@ -38,23 +130,17 @@ typedef struct cudart_handle {
|
||||
cudartReturn_t (*cudaGetDeviceCount)(int *);
|
||||
cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
|
||||
cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
|
||||
cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
|
||||
} cudart_handle_t;
|
||||
|
||||
typedef struct cudart_init_resp {
|
||||
char *err; // If err is non-null handle is invalid
|
||||
cudart_handle_t ch;
|
||||
int num_devices;
|
||||
} cudart_init_resp_t;
|
||||
|
||||
typedef struct cudart_compute_capability {
|
||||
char *err;
|
||||
int major;
|
||||
int minor;
|
||||
} cudart_compute_capability_t;
|
||||
|
||||
|
||||
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
|
||||
void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
|
||||
void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
|
||||
void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
|
||||
void cudart_release(cudart_handle_t ch);
|
||||
|
||||
#endif // __GPU_INFO_CUDART_H__
|
||||
|
||||
@@ -1,221 +0,0 @@
|
||||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "gpu_info_nvml.h"
|
||||
|
||||
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
||||
nvmlReturn_t ret;
|
||||
resp->err = NULL;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
struct lookup {
|
||||
char *s;
|
||||
void **p;
|
||||
} l[] = {
|
||||
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
|
||||
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
|
||||
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
|
||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
|
||||
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
|
||||
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
|
||||
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
|
||||
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
|
||||
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
|
||||
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
|
||||
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
|
||||
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
|
||||
if (!resp->ch.handle) {
|
||||
char *msg = LOAD_ERR();
|
||||
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
||||
nvml_lib_path, msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO once we've squashed the remaining corner cases remove this log
|
||||
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
|
||||
|
||||
for (i = 0; l[i].s != NULL; i++) {
|
||||
// TODO once we've squashed the remaining corner cases remove this log
|
||||
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
|
||||
|
||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||
if (!l[i].p) {
|
||||
resp->ch.handle = NULL;
|
||||
char *msg = LOAD_ERR();
|
||||
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
||||
msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
ret = (*resp->ch.nvmlInit_v2)();
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
// Report driver version if we're in verbose mode, ignore errors
|
||||
ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
|
||||
}
|
||||
}
|
||||
|
||||
void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
|
||||
resp->err = NULL;
|
||||
nvmlDevice_t device;
|
||||
nvmlMemory_t memInfo = {0};
|
||||
nvmlReturn_t ret;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("nvml handle isn't initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->total = 0;
|
||||
resp->free = 0;
|
||||
for (i = 0; i < resp->count; i++) {
|
||||
ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
if (h.verbose) {
|
||||
nvmlBrandType_t brand = 0;
|
||||
// When in verbose mode, report more information about
|
||||
// the card we discover, but don't fail on error
|
||||
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
|
||||
}
|
||||
}
|
||||
|
||||
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
|
||||
LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
|
||||
|
||||
resp->total += memInfo.total;
|
||||
resp->free += memInfo.free;
|
||||
}
|
||||
}
|
||||
|
||||
void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
|
||||
resp->err = NULL;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
nvmlDevice_t device;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
nvmlReturn_t ret;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("nvml handle not initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned int devices;
|
||||
ret = (*h.nvmlDeviceGetCount_v2)(&devices);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < devices; i++) {
|
||||
ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
// Report the lowest major.minor we detect as that limits our compatibility
|
||||
if (resp->major == 0 || resp->major > major ) {
|
||||
resp->major = major;
|
||||
resp->minor = minor;
|
||||
} else if ( resp->major == major && resp->minor > minor ) {
|
||||
resp->minor = minor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void nvml_release(nvml_handle_t h) {
|
||||
LOG(h.verbose, "releasing nvml library\n");
|
||||
UNLOAD_LIBRARY(h.handle);
|
||||
h.handle = NULL;
|
||||
}
|
||||
|
||||
#endif // __APPLE__
|
||||
@@ -1,57 +0,0 @@
|
||||
#ifndef __APPLE__
|
||||
#ifndef __GPU_INFO_NVML_H__
|
||||
#define __GPU_INFO_NVML_H__
|
||||
#include "gpu_info.h"
|
||||
|
||||
// Just enough typedef's to dlopen/dlsym for memory information
|
||||
typedef enum nvmlReturn_enum {
|
||||
NVML_SUCCESS = 0,
|
||||
// Other values omitted for now...
|
||||
} nvmlReturn_t;
|
||||
typedef void *nvmlDevice_t; // Opaque is sufficient
|
||||
typedef struct nvmlMemory_st {
|
||||
unsigned long long total;
|
||||
unsigned long long free;
|
||||
unsigned long long used;
|
||||
} nvmlMemory_t;
|
||||
|
||||
typedef enum nvmlBrandType_enum
|
||||
{
|
||||
NVML_BRAND_UNKNOWN = 0,
|
||||
} nvmlBrandType_t;
|
||||
|
||||
typedef struct nvml_handle {
|
||||
void *handle;
|
||||
uint16_t verbose;
|
||||
nvmlReturn_t (*nvmlInit_v2)(void);
|
||||
nvmlReturn_t (*nvmlShutdown)(void);
|
||||
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
|
||||
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||
nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
|
||||
nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
|
||||
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
|
||||
} nvml_handle_t;
|
||||
|
||||
typedef struct nvml_init_resp {
|
||||
char *err; // If err is non-null handle is invalid
|
||||
nvml_handle_t ch;
|
||||
} nvml_init_resp_t;
|
||||
|
||||
typedef struct nvml_compute_capability {
|
||||
char *err;
|
||||
int major;
|
||||
int minor;
|
||||
} nvml_compute_capability_t;
|
||||
|
||||
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
|
||||
void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
|
||||
void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
|
||||
void nvml_release(nvml_handle_t ch);
|
||||
|
||||
#endif // __GPU_INFO_NVML_H__
|
||||
#endif // __APPLE__
|
||||
@@ -9,23 +9,16 @@ import (
|
||||
|
||||
func TestBasicGetGPUInfo(t *testing.T) {
|
||||
info := GetGPUInfo()
|
||||
assert.Contains(t, "cuda rocm cpu metal", info.Library)
|
||||
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
// TODO - remove this once MacOS returns some size for CPU
|
||||
return
|
||||
case "linux", "windows":
|
||||
assert.Greater(t, info.TotalMemory, uint64(0))
|
||||
assert.Greater(t, info.FreeMemory, uint64(0))
|
||||
assert.Greater(t, info.DeviceCount, uint32(0))
|
||||
default:
|
||||
return
|
||||
assert.Greater(t, len(info), 0)
|
||||
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
|
||||
if info[0].Library != "cpu" {
|
||||
assert.Greater(t, info[0].TotalMemory, uint64(0))
|
||||
assert.Greater(t, info[0].FreeMemory, uint64(0))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPUMemInfo(t *testing.T) {
|
||||
info, err := getCPUMem()
|
||||
info, err := GetCPUMem()
|
||||
assert.NoError(t, err)
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
|
||||
49
gpu/types.go
49
gpu/types.go
@@ -3,7 +3,6 @@ package gpu
|
||||
type memInfo struct {
|
||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
DeviceCount uint32 `json:"device_count,omitempty"`
|
||||
}
|
||||
|
||||
// Beginning of an `ollama info` command
|
||||
@@ -17,11 +16,49 @@ type GpuInfo struct {
|
||||
// MinimumMemory represents the minimum memory required to use the GPU
|
||||
MinimumMemory uint64 `json:"-"`
|
||||
|
||||
// TODO add other useful attributes about the card here for discovery information
|
||||
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
|
||||
DependencyPath string `json:"lib_path,omitempty"`
|
||||
|
||||
// GPU information
|
||||
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
|
||||
Name string `json:"name"` // user friendly name if available
|
||||
Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx)
|
||||
Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
|
||||
Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD
|
||||
|
||||
// TODO other performance capability info to help in scheduling decisions
|
||||
}
|
||||
|
||||
type Version struct {
|
||||
Major uint
|
||||
Minor uint
|
||||
Patch uint
|
||||
type GpuInfoList []GpuInfo
|
||||
|
||||
// Split up the set of gpu info's by Library and variant
|
||||
func (l GpuInfoList) ByLibrary() []GpuInfoList {
|
||||
resp := []GpuInfoList{}
|
||||
libs := []string{}
|
||||
for _, info := range l {
|
||||
found := false
|
||||
requested := info.Library
|
||||
if info.Variant != "" {
|
||||
requested += "_" + info.Variant
|
||||
}
|
||||
for i, lib := range libs {
|
||||
if lib == requested {
|
||||
resp[i] = append(resp[i], info)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
libs = append(libs, info.Library)
|
||||
resp = append(resp, []GpuInfo{info})
|
||||
}
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// Sort by Free Space
|
||||
type ByFreeMemory []GpuInfo
|
||||
|
||||
func (a ByFreeMemory) Len() int { return len(a) }
|
||||
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
|
||||
|
||||
@@ -4,11 +4,14 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestOrcaMiniBlueSky(t *testing.T) {
|
||||
@@ -24,5 +27,44 @@ func TestOrcaMiniBlueSky(t *testing.T) {
|
||||
"seed": 123,
|
||||
},
|
||||
}
|
||||
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh", "scattering"})
|
||||
GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
|
||||
}
|
||||
|
||||
func TestUnicodeModelDir(t *testing.T) {
|
||||
// This is only useful for Windows with utf-16 characters, so skip this test for other platforms
|
||||
if runtime.GOOS != "windows" {
|
||||
t.Skip("Unicode test only applicable to windows")
|
||||
}
|
||||
// Only works for local testing
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
|
||||
t.Skip("TestUnicodeModelDir only works for local testing, skipping")
|
||||
}
|
||||
|
||||
modelDir, err := os.MkdirTemp("", "ollama_埃")
|
||||
require.NoError(t, err)
|
||||
defer os.RemoveAll(modelDir)
|
||||
slog.Info("unicode", "OLLAMA_MODELS", modelDir)
|
||||
|
||||
oldModelsDir := os.Getenv("OLLAMA_MODELS")
|
||||
if oldModelsDir == "" {
|
||||
defer os.Unsetenv("OLLAMA_MODELS")
|
||||
} else {
|
||||
defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
|
||||
}
|
||||
err = os.Setenv("OLLAMA_MODELS", modelDir)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
req := api.GenerateRequest{
|
||||
Model: "orca-mini",
|
||||
Prompt: "why is the sky blue?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"temperature": 0,
|
||||
"seed": 123,
|
||||
},
|
||||
}
|
||||
GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
|
||||
}
|
||||
|
||||
225
integration/concurrency_test.go
Normal file
225
integration/concurrency_test.go
Normal file
@@ -0,0 +1,225 @@
|
||||
//go:build integration
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMultiModelConcurrency(t *testing.T) {
|
||||
var (
|
||||
req = [2]api.GenerateRequest{
|
||||
{
|
||||
Model: "orca-mini",
|
||||
Prompt: "why is the ocean blue?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "tinydolphin",
|
||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
},
|
||||
}
|
||||
resp = [2][]string{
|
||||
[]string{"sunlight"},
|
||||
[]string{"england", "english", "massachusetts", "pilgrims"},
|
||||
}
|
||||
)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(req))
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
|
||||
defer cancel()
|
||||
for i := 0; i < len(req); i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
GenerateTestHelper(ctx, t, req[i], resp[i])
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
req, resp := GenerateRequests()
|
||||
// Get the server running (if applicable) warm the model up with a single initial request
|
||||
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(req))
|
||||
for i := 0; i < len(req); i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
for j := 0; j < 5; j++ {
|
||||
slog.Info("Starting", "req", i, "iter", j)
|
||||
// On slower GPUs it can take a while to process the 4 concurrent requests
|
||||
// so we allow a much longer initial timeout
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
|
||||
func TestMultiModelStress(t *testing.T) {
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
if vram == "" {
|
||||
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
|
||||
}
|
||||
max, err := strconv.ParseUint(vram, 10, 64)
|
||||
require.NoError(t, err)
|
||||
const MB = uint64(1024 * 1024)
|
||||
type model struct {
|
||||
name string
|
||||
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
|
||||
}
|
||||
|
||||
smallModels := []model{
|
||||
{
|
||||
name: "orca-mini",
|
||||
size: 2992 * MB,
|
||||
},
|
||||
{
|
||||
name: "phi",
|
||||
size: 2616 * MB,
|
||||
},
|
||||
{
|
||||
name: "gemma:2b",
|
||||
size: 2364 * MB,
|
||||
},
|
||||
{
|
||||
name: "stable-code:3b",
|
||||
size: 2608 * MB,
|
||||
},
|
||||
{
|
||||
name: "starcoder2:3b",
|
||||
size: 2166 * MB,
|
||||
},
|
||||
}
|
||||
mediumModels := []model{
|
||||
{
|
||||
name: "llama2",
|
||||
size: 5118 * MB,
|
||||
},
|
||||
{
|
||||
name: "mistral",
|
||||
size: 4620 * MB,
|
||||
},
|
||||
{
|
||||
name: "orca-mini:7b",
|
||||
size: 5118 * MB,
|
||||
},
|
||||
{
|
||||
name: "dolphin-mistral",
|
||||
size: 4620 * MB,
|
||||
},
|
||||
{
|
||||
name: "gemma:7b",
|
||||
size: 5000 * MB,
|
||||
},
|
||||
// TODO - uncomment this once #3565 is merged and this is rebased on it
|
||||
// {
|
||||
// name: "codellama:7b",
|
||||
// size: 5118 * MB,
|
||||
// },
|
||||
}
|
||||
|
||||
// These seem to be too slow to be useful...
|
||||
// largeModels := []model{
|
||||
// {
|
||||
// name: "llama2:13b",
|
||||
// size: 7400 * MB,
|
||||
// },
|
||||
// {
|
||||
// name: "codellama:13b",
|
||||
// size: 7400 * MB,
|
||||
// },
|
||||
// {
|
||||
// name: "orca-mini:13b",
|
||||
// size: 7400 * MB,
|
||||
// },
|
||||
// {
|
||||
// name: "gemma:7b",
|
||||
// size: 5000 * MB,
|
||||
// },
|
||||
// {
|
||||
// name: "starcoder2:15b",
|
||||
// size: 9100 * MB,
|
||||
// },
|
||||
// }
|
||||
|
||||
var chosenModels []model
|
||||
switch {
|
||||
case max < 10000*MB:
|
||||
slog.Info("selecting small models")
|
||||
chosenModels = smallModels
|
||||
// case max < 30000*MB:
|
||||
default:
|
||||
slog.Info("selecting medium models")
|
||||
chosenModels = mediumModels
|
||||
// default:
|
||||
// slog.Info("selecting large models")
|
||||
// chosenModels = largModels
|
||||
}
|
||||
|
||||
req, resp := GenerateRequests()
|
||||
|
||||
for i := range req {
|
||||
if i > len(chosenModels) {
|
||||
break
|
||||
}
|
||||
req[i].Model = chosenModels[i].name
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
// Make sure all the models are pulled before we get started
|
||||
for _, r := range req {
|
||||
require.NoError(t, PullIfMissing(ctx, client, r.Model))
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
consumed := uint64(256 * MB) // Assume some baseline usage
|
||||
for i := 0; i < len(req); i++ {
|
||||
// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
|
||||
if i > 1 && consumed > max {
|
||||
slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
|
||||
break
|
||||
}
|
||||
consumed += chosenModels[i].size
|
||||
slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
|
||||
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
for j := 0; j < 3; j++ {
|
||||
slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
@@ -4,7 +4,6 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -25,5 +24,5 @@ func TestContextExhaustion(t *testing.T) {
|
||||
"num_ctx": 128,
|
||||
},
|
||||
}
|
||||
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"once", "upon", "lived"})
|
||||
GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ package integration
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -29,10 +28,11 @@ func TestIntegrationMultimodal(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
resp := "the ollamas"
|
||||
// Note: sometimes it returns "the ollamas" sometimes "the ollams"
|
||||
resp := "the ollam"
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||
defer cancel()
|
||||
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{resp})
|
||||
GenerateTestHelper(ctx, t, req, []string{resp})
|
||||
}
|
||||
|
||||
const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
|
||||
|
||||
@@ -4,8 +4,6 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -45,25 +43,5 @@ var (
|
||||
func TestIntegrationSimpleOrcaMini(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
|
||||
defer cancel()
|
||||
GenerateTestHelper(ctx, t, &http.Client{}, req[0], resp[0])
|
||||
GenerateTestHelper(ctx, t, req[0], resp[0])
|
||||
}
|
||||
|
||||
// TODO
|
||||
// The server always loads a new runner and closes the old one, which forces serial execution
|
||||
// At present this test case fails with concurrency problems. Eventually we should try to
|
||||
// get true concurrency working with n_parallel support in the backend
|
||||
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(req))
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
|
||||
defer cancel()
|
||||
for i := 0; i < len(req); i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
GenerateTestHelper(ctx, t, &http.Client{}, req[i], resp[i])
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// TODO - create a parallel test with 2 different models once we support concurrency
|
||||
|
||||
@@ -5,13 +5,14 @@ package integration
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"math/rand"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
@@ -23,9 +24,13 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/app/lifecycle"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func Init() {
|
||||
lifecycle.InitLogging()
|
||||
}
|
||||
|
||||
func FindPort() string {
|
||||
port := 0
|
||||
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
|
||||
@@ -41,7 +46,7 @@ func FindPort() string {
|
||||
return strconv.Itoa(port)
|
||||
}
|
||||
|
||||
func GetTestEndpoint() (string, string) {
|
||||
func GetTestEndpoint() (*api.Client, string) {
|
||||
defaultPort := "11434"
|
||||
ollamaHost := os.Getenv("OLLAMA_HOST")
|
||||
|
||||
@@ -67,16 +72,20 @@ func GetTestEndpoint() (string, string) {
|
||||
port = FindPort()
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s:%s", host, port)
|
||||
slog.Info("server connection", "url", url)
|
||||
return scheme, url
|
||||
slog.Info("server connection", "host", host, "port", port)
|
||||
|
||||
return api.NewClient(
|
||||
&url.URL{
|
||||
Scheme: scheme,
|
||||
Host: net.JoinHostPort(host, port),
|
||||
},
|
||||
http.DefaultClient), fmt.Sprintf("%s:%s", host, port)
|
||||
}
|
||||
|
||||
// TODO make fanicier, grab logs, etc.
|
||||
var serverMutex sync.Mutex
|
||||
var serverReady bool
|
||||
|
||||
func StartServer(ctx context.Context, ollamaHost string) error {
|
||||
func startServer(ctx context.Context, ollamaHost string) error {
|
||||
// Make sure the server has been built
|
||||
CLIName, err := filepath.Abs("../ollama")
|
||||
if err != nil {
|
||||
@@ -98,7 +107,7 @@ func StartServer(ctx context.Context, ollamaHost string) error {
|
||||
|
||||
if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
|
||||
slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
|
||||
os.Setenv("OLLAMA_HOST", ollamaHost)
|
||||
t.Setenv("OLLAMA_HOST", ollamaHost)
|
||||
}
|
||||
|
||||
slog.Info("starting server", "url", ollamaHost)
|
||||
@@ -125,67 +134,76 @@ func StartServer(ctx context.Context, ollamaHost string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
|
||||
func PullIfMissing(ctx context.Context, client *api.Client, modelName string) error {
|
||||
slog.Info("checking status of model", "model", modelName)
|
||||
showReq := &api.ShowRequest{Name: modelName}
|
||||
requestJSON, err := json.Marshal(showReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/show", bytes.NewReader(requestJSON))
|
||||
if err != nil {
|
||||
showCtx, cancel := context.WithDeadlineCause(
|
||||
ctx,
|
||||
time.Now().Add(5*time.Second),
|
||||
fmt.Errorf("show for existing model %s took too long", modelName),
|
||||
)
|
||||
defer cancel()
|
||||
_, err := client.Show(showCtx, showReq)
|
||||
var statusError api.StatusError
|
||||
switch {
|
||||
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
|
||||
break
|
||||
case err != nil:
|
||||
return err
|
||||
}
|
||||
|
||||
// Make the request with the HTTP client
|
||||
response, err := client.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer response.Body.Close()
|
||||
if response.StatusCode == 200 {
|
||||
default:
|
||||
slog.Info("model already present", "model", modelName)
|
||||
return nil
|
||||
}
|
||||
slog.Info("model missing", "status", response.StatusCode)
|
||||
slog.Info("model missing", "model", modelName)
|
||||
|
||||
stallDuration := 30 * time.Second // This includes checksum verification, which can take a while on larger models
|
||||
stallTimer := time.NewTimer(stallDuration)
|
||||
fn := func(resp api.ProgressResponse) error {
|
||||
// fmt.Print(".")
|
||||
if !stallTimer.Reset(stallDuration) {
|
||||
return fmt.Errorf("stall was detected, aborting status reporting")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
stream := true
|
||||
pullReq := &api.PullRequest{Name: modelName, Stream: &stream}
|
||||
requestJSON, err = json.Marshal(pullReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req, err = http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/pull", bytes.NewReader(requestJSON))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
slog.Info("pulling", "model", modelName)
|
||||
var pullError error
|
||||
|
||||
response, err = client.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return err
|
||||
done := make(chan int)
|
||||
go func() {
|
||||
pullError = client.Pull(ctx, pullReq, fn)
|
||||
done <- 0
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-stallTimer.C:
|
||||
return fmt.Errorf("download stalled")
|
||||
case <-done:
|
||||
return pullError
|
||||
}
|
||||
defer response.Body.Close()
|
||||
if response.StatusCode != 200 {
|
||||
return fmt.Errorf("failed to pull model") // TODO more details perhaps
|
||||
}
|
||||
slog.Info("model pulled", "model", modelName)
|
||||
return nil
|
||||
}
|
||||
|
||||
var serverProcMutex sync.Mutex
|
||||
|
||||
func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
|
||||
|
||||
// TODO maybe stuff in an init routine?
|
||||
lifecycle.InitLogging()
|
||||
|
||||
requestJSON, err := json.Marshal(genReq)
|
||||
if err != nil {
|
||||
t.Fatalf("Error serializing request: %v", err)
|
||||
// Returns an Client, the testEndpoint, and a cleanup function, fails the test on errors
|
||||
// Starts the server if needed
|
||||
func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, string, func()) {
|
||||
client, testEndpoint := GetTestEndpoint()
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
|
||||
serverProcMutex.Lock()
|
||||
fp, err := os.CreateTemp("", "ollama-server-*.log")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to generate log file: %s", err)
|
||||
}
|
||||
lifecycle.ServerLogFile = fp.Name()
|
||||
fp.Close()
|
||||
require.NoError(t, startServer(ctx, testEndpoint))
|
||||
}
|
||||
defer func() {
|
||||
|
||||
return client, testEndpoint, func() {
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
|
||||
defer serverProcMutex.Unlock()
|
||||
if t.Failed() {
|
||||
@@ -203,63 +221,118 @@ func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client,
|
||||
os.Stderr.Write(data)
|
||||
slog.Warn("END OF SERVER")
|
||||
}
|
||||
err = os.Remove(lifecycle.ServerLogFile)
|
||||
err := os.Remove(lifecycle.ServerLogFile)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
|
||||
}
|
||||
}
|
||||
}()
|
||||
scheme, testEndpoint := GetTestEndpoint()
|
||||
|
||||
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
|
||||
serverProcMutex.Lock()
|
||||
fp, err := os.CreateTemp("", "ollama-server-*.log")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to generate log file: %s", err)
|
||||
}
|
||||
lifecycle.ServerLogFile = fp.Name()
|
||||
fp.Close()
|
||||
assert.NoError(t, StartServer(ctx, testEndpoint))
|
||||
}
|
||||
|
||||
err = PullIfMissing(ctx, client, scheme, testEndpoint, genReq.Model)
|
||||
if err != nil {
|
||||
t.Fatalf("Error pulling model: %v", err)
|
||||
}
|
||||
|
||||
// Make the request and get the response
|
||||
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/generate", bytes.NewReader(requestJSON))
|
||||
if err != nil {
|
||||
t.Fatalf("Error creating request: %v", err)
|
||||
}
|
||||
|
||||
// Set the content type for the request
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
// Make the request with the HTTP client
|
||||
response, err := client.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
t.Fatalf("Error making request: %v", err)
|
||||
}
|
||||
defer response.Body.Close()
|
||||
body, err := io.ReadAll(response.Body)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, response.StatusCode, 200, string(body))
|
||||
|
||||
// Verify the response is valid JSON
|
||||
var payload api.GenerateResponse
|
||||
err = json.Unmarshal(body, &payload)
|
||||
if err != nil {
|
||||
assert.NoError(t, err, body)
|
||||
}
|
||||
|
||||
// Verify the response contains the expected data
|
||||
atLeastOne := false
|
||||
for _, resp := range anyResp {
|
||||
if strings.Contains(strings.ToLower(payload.Response), resp) {
|
||||
atLeastOne = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, atLeastOne, "none of %v found in %s", anyResp, payload.Response)
|
||||
}
|
||||
|
||||
func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRequest, anyResp []string) {
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
require.NoError(t, PullIfMissing(ctx, client, genReq.Model))
|
||||
DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
|
||||
}
|
||||
|
||||
func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) {
|
||||
stallTimer := time.NewTimer(initialTimeout)
|
||||
var buf bytes.Buffer
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
// fmt.Print(".")
|
||||
buf.Write([]byte(response.Response))
|
||||
if !stallTimer.Reset(streamTimeout) {
|
||||
return fmt.Errorf("stall was detected while streaming response, aborting")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
stream := true
|
||||
genReq.Stream = &stream
|
||||
done := make(chan int)
|
||||
var genErr error
|
||||
go func() {
|
||||
genErr = client.Generate(ctx, &genReq, fn)
|
||||
done <- 0
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-stallTimer.C:
|
||||
if buf.Len() == 0 {
|
||||
t.Errorf("generate never started. Timed out after :%s", initialTimeout.String())
|
||||
} else {
|
||||
t.Errorf("generate stalled. Response so far:%s", buf.String())
|
||||
}
|
||||
case <-done:
|
||||
require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
|
||||
// Verify the response contains the expected data
|
||||
response := buf.String()
|
||||
atLeastOne := false
|
||||
for _, resp := range anyResp {
|
||||
if strings.Contains(strings.ToLower(response), resp) {
|
||||
atLeastOne = true
|
||||
break
|
||||
}
|
||||
}
|
||||
require.True(t, atLeastOne, "none of %v found in %s", anyResp, response)
|
||||
slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
|
||||
case <-ctx.Done():
|
||||
t.Error("outer test context done while waiting for generate")
|
||||
}
|
||||
}
|
||||
|
||||
// Generate a set of requests
|
||||
// By default each request uses orca-mini as the model
|
||||
func GenerateRequests() ([]api.GenerateRequest, [][]string) {
|
||||
return []api.GenerateRequest{
|
||||
{
|
||||
Model: "orca-mini",
|
||||
Prompt: "why is the ocean blue?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "orca-mini",
|
||||
Prompt: "why is the color of dirt brown?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "orca-mini",
|
||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "orca-mini",
|
||||
Prompt: "what is the origin of independence day?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "orca-mini",
|
||||
Prompt: "what is the composition of air?",
|
||||
Stream: &stream,
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
[][]string{
|
||||
[]string{"sunlight"},
|
||||
[]string{"soil", "organic", "earth", "black", "tan"},
|
||||
[]string{"england", "english", "massachusetts", "pilgrims"},
|
||||
[]string{"fourth", "july", "declaration", "independence"},
|
||||
[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
|
||||
}
|
||||
}
|
||||
|
||||
15
llm/ext_server/server.cpp
vendored
15
llm/ext_server/server.cpp
vendored
@@ -1032,7 +1032,7 @@ struct llama_server_context
|
||||
slot.has_next_token = false;
|
||||
}
|
||||
|
||||
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
|
||||
if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
|
||||
{
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
@@ -1144,12 +1144,15 @@ struct llama_server_context
|
||||
|
||||
res.result_json = json
|
||||
{
|
||||
{"content", tkn.text_to_send},
|
||||
{"stop", false},
|
||||
{"slot_id", slot.id},
|
||||
{"multimodal", multimodal}
|
||||
};
|
||||
|
||||
if (!llama_token_is_eog(model, tkn.tok)) {
|
||||
res.result_json["content"] = tkn.text_to_send;
|
||||
}
|
||||
|
||||
if (slot.sparams.n_probs > 0)
|
||||
{
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
@@ -2644,18 +2647,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
kvo.val_i64 = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
kvo.val_f64 = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
kvo.val_bool = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
kvo.val_bool = false;
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
|
||||
165
llm/filetype.go
Normal file
165
llm/filetype.go
Normal file
@@ -0,0 +1,165 @@
|
||||
package llm
|
||||
|
||||
import "fmt"
|
||||
|
||||
type fileType uint32
|
||||
|
||||
const (
|
||||
fileTypeF32 fileType = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ4_2 // unused
|
||||
fileTypeQ4_3 // unused
|
||||
fileTypeQ8_0
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeQ3_K_XS
|
||||
fileTypeIQ3_XXS
|
||||
fileTypeIQ1_S
|
||||
fileTypeIQ4_NL
|
||||
fileTypeIQ3_S
|
||||
fileTypeIQ2_S
|
||||
fileTypeIQ4_XS
|
||||
|
||||
fileTypeUnknown
|
||||
)
|
||||
|
||||
func ParseFileType(s string) (fileType, error) {
|
||||
switch s {
|
||||
case "F32":
|
||||
return fileTypeF32, nil
|
||||
case "F16":
|
||||
return fileTypeF16, nil
|
||||
case "Q4_0":
|
||||
return fileTypeQ4_0, nil
|
||||
case "Q4_1":
|
||||
return fileTypeQ4_1, nil
|
||||
case "Q4_1_F16":
|
||||
return fileTypeQ4_1_F16, nil
|
||||
case "Q8_0":
|
||||
return fileTypeQ8_0, nil
|
||||
case "Q5_0":
|
||||
return fileTypeQ5_0, nil
|
||||
case "Q5_1":
|
||||
return fileTypeQ5_1, nil
|
||||
case "Q2_K":
|
||||
return fileTypeQ2_K, nil
|
||||
case "Q3_K_S":
|
||||
return fileTypeQ3_K_S, nil
|
||||
case "Q3_K_M":
|
||||
return fileTypeQ3_K_M, nil
|
||||
case "Q3_K_L":
|
||||
return fileTypeQ3_K_L, nil
|
||||
case "Q4_K_S":
|
||||
return fileTypeQ4_K_S, nil
|
||||
case "Q4_K_M":
|
||||
return fileTypeQ4_K_M, nil
|
||||
case "Q5_K_S":
|
||||
return fileTypeQ5_K_S, nil
|
||||
case "Q5_K_M":
|
||||
return fileTypeQ5_K_M, nil
|
||||
case "Q6_K":
|
||||
return fileTypeQ6_K, nil
|
||||
case "IQ2_XXS":
|
||||
return fileTypeIQ2_XXS, nil
|
||||
case "IQ2_XS":
|
||||
return fileTypeIQ2_XS, nil
|
||||
case "Q2_K_S":
|
||||
return fileTypeQ2_K_S, nil
|
||||
case "Q3_K_XS":
|
||||
return fileTypeQ3_K_XS, nil
|
||||
case "IQ1_S":
|
||||
return fileTypeIQ1_S, nil
|
||||
case "IQ4_NL":
|
||||
return fileTypeIQ4_NL, nil
|
||||
case "IQ3_S":
|
||||
return fileTypeIQ3_S, nil
|
||||
case "IQ2_S":
|
||||
return fileTypeIQ2_S, nil
|
||||
case "IQ4_XS":
|
||||
return fileTypeIQ4_XS, nil
|
||||
case "IQ3_XXS":
|
||||
return fileTypeIQ3_XXS, nil
|
||||
default:
|
||||
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
|
||||
}
|
||||
}
|
||||
|
||||
func (t fileType) String() string {
|
||||
switch t {
|
||||
case fileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
case fileTypeIQ2_XXS:
|
||||
return "IQ2_XXS"
|
||||
case fileTypeIQ2_XS:
|
||||
return "IQ2_XS"
|
||||
case fileTypeQ2_K_S:
|
||||
return "Q2_K_S"
|
||||
case fileTypeQ3_K_XS:
|
||||
return "Q3_K_XS"
|
||||
case fileTypeIQ1_S:
|
||||
return "IQ1_S"
|
||||
case fileTypeIQ4_NL:
|
||||
return "IQ4_NL"
|
||||
case fileTypeIQ3_S:
|
||||
return "IQ3_S"
|
||||
case fileTypeIQ2_S:
|
||||
return "IQ2_S"
|
||||
case fileTypeIQ4_XS:
|
||||
return "IQ4_XS"
|
||||
case fileTypeIQ3_XXS:
|
||||
return "IQ3_XXS"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (t fileType) Value() uint32 {
|
||||
return uint32(t)
|
||||
}
|
||||
@@ -21,7 +21,7 @@ init_vars() {
|
||||
# TODO - add additional optimization flags...
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
|
||||
fi
|
||||
case $(uname -s) in
|
||||
case $(uname -s) in
|
||||
"Darwin")
|
||||
LIB_EXT="dylib"
|
||||
WHOLE_ARCHIVE="-Wl,-force_load"
|
||||
|
||||
@@ -165,14 +165,22 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
|
||||
fi
|
||||
if [ "${ARCH}" == "arm64" ]; then
|
||||
echo "ARM CPU detected - disabling unsupported AVX instructions"
|
||||
|
||||
|
||||
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
|
||||
#
|
||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||
# Disabling has minimal performance effect while maintaining compatibility.
|
||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||
# Disabling has minimal performance effect while maintaining compatibility.
|
||||
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
|
||||
fi
|
||||
CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
||||
if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
|
||||
echo "Building custom CUDA GPU"
|
||||
else
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
||||
fi
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||
build
|
||||
@@ -217,6 +225,12 @@ if [ -d "${ROCM_PATH}" ]; then
|
||||
fi
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
||||
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
|
||||
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
|
||||
echo "Building custom ROCM GPU"
|
||||
fi
|
||||
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
|
||||
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
|
||||
build
|
||||
|
||||
@@ -26,15 +26,25 @@ function amdGPUs {
|
||||
$GPU_LIST -join ';'
|
||||
}
|
||||
|
||||
|
||||
function init_vars {
|
||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
if (!$script:SRC_DIR) {
|
||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
||||
}
|
||||
if (!$script:llamacppDir) {
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
}
|
||||
if (!$script:cmakeTargets) {
|
||||
$script:cmakeTargets = @("ollama_llama_server")
|
||||
}
|
||||
$script:cmakeDefs = @(
|
||||
"-DBUILD_SHARED_LIBS=on",
|
||||
"-DLLAMA_NATIVE=off"
|
||||
)
|
||||
$script:cmakeTargets = @("ollama_llama_server")
|
||||
$script:ARCH = "amd64" # arm not yet supported.
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
|
||||
md "$script:DIST_BASE" -ea 0 > $null
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
|
||||
$script:config = "RelWithDebInfo"
|
||||
@@ -55,7 +65,6 @@ function init_vars {
|
||||
} else {
|
||||
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
|
||||
}
|
||||
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
|
||||
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
|
||||
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
|
||||
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
|
||||
@@ -134,21 +143,18 @@ function sign {
|
||||
}
|
||||
}
|
||||
|
||||
function compress {
|
||||
if ($script:GZIP -eq $null) {
|
||||
write-host "gzip not installed, not compressing files"
|
||||
return
|
||||
}
|
||||
write-host "Compressing binaries..."
|
||||
function install {
|
||||
write-host "Installing binaries to dist dir ${script:distDir}"
|
||||
mkdir ${script:distDir} -ErrorAction SilentlyContinue
|
||||
$binaries = dir "${script:buildDir}/bin/*.exe"
|
||||
foreach ($file in $binaries) {
|
||||
& "$script:GZIP" --best -f $file
|
||||
copy-item -Path $file -Destination ${script:distDir} -Force
|
||||
}
|
||||
|
||||
write-host "Compressing dlls..."
|
||||
write-host "Installing dlls to dist dir ${script:distDir}"
|
||||
$dlls = dir "${script:buildDir}/bin/*.dll"
|
||||
foreach ($file in $dlls) {
|
||||
& "$script:GZIP" --best -f $file
|
||||
copy-item -Path $file -Destination ${script:distDir} -Force
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,123 +175,195 @@ function cleanup {
|
||||
}
|
||||
}
|
||||
|
||||
init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
|
||||
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
|
||||
function build_static() {
|
||||
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
|
||||
# GCC build for direct linking into the Go binary
|
||||
init_vars
|
||||
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
|
||||
# as we need this to be compiled by gcc for golang to be able to link with itx
|
||||
write-host "Checking for MinGW..."
|
||||
# error action ensures we exit on failure
|
||||
get-command gcc
|
||||
get-command mingw32-make
|
||||
$oldTargets = $script:cmakeTargets
|
||||
$script:cmakeTargets = @("llama", "ggml")
|
||||
$script:cmakeDefs = @(
|
||||
"-G", "MinGW Makefiles"
|
||||
"-DCMAKE_C_COMPILER=gcc.exe",
|
||||
"-DCMAKE_CXX_COMPILER=g++.exe",
|
||||
"-DBUILD_SHARED_LIBS=off",
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_AVX=off",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
$script:cmakeTargets = $oldTargets
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu($gen_arch) {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
|
||||
# remaining llama.cpp builds use MSVC
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
||||
$script:distDir="$script:DIST_BASE\cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu_avx() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU AVX generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu_avx2() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU AVX2 generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cuda() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
||||
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
|
||||
write-host "building custom CUDA GPU"
|
||||
}
|
||||
build
|
||||
sign
|
||||
install
|
||||
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
} else {
|
||||
write-host "Skipping CUDA generation step"
|
||||
}
|
||||
}
|
||||
|
||||
function build_rocm() {
|
||||
if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
|
||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
||||
if ($null -ne $script:ROCM_VERSION) {
|
||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
||||
}
|
||||
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
|
||||
$script:cmakeDefs += @(
|
||||
"-G", "Ninja",
|
||||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DLLAMA_HIPBLAS=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
||||
"-DGPU_TARGETS=$(amdGPUs)"
|
||||
)
|
||||
|
||||
# Make sure the ROCm binary dir is first in the path
|
||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
||||
|
||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
||||
$env:LIB=""
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
|
||||
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
|
||||
write-host "building custom ROCM GPU"
|
||||
}
|
||||
write-host "Building ROCm"
|
||||
build
|
||||
# Ninja doesn't prefix with config name
|
||||
${script:config}=""
|
||||
if ($null -ne $script:DUMPBIN) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
||||
}
|
||||
sign
|
||||
install
|
||||
|
||||
# Assumes v5.7, may need adjustments for v6
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
|
||||
} else {
|
||||
write-host "Skipping ROCm generation step"
|
||||
}
|
||||
}
|
||||
|
||||
# GCC build for direct linking into the Go binary
|
||||
init_vars
|
||||
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
|
||||
# as we need this to be compiled by gcc for golang to be able to link with itx
|
||||
write-host "Checking for MinGW..."
|
||||
# error action ensures we exit on failure
|
||||
get-command gcc
|
||||
get-command mingw32-make
|
||||
$script:cmakeTargets = @("llama", "ggml")
|
||||
$script:cmakeDefs = @(
|
||||
"-G", "MinGW Makefiles"
|
||||
"-DCMAKE_C_COMPILER=gcc.exe",
|
||||
"-DCMAKE_CXX_COMPILER=g++.exe",
|
||||
"-DBUILD_SHARED_LIBS=off",
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_AVX=off",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
if ($($args.count) -eq 0) {
|
||||
git_module_setup
|
||||
apply_patches
|
||||
build_static
|
||||
if ($script:ARCH -eq "arm64") {
|
||||
build_cpu("ARM64")
|
||||
} else { # amd64
|
||||
build_cpu("x64")
|
||||
build_cpu_avx
|
||||
build_cpu_avx2
|
||||
build_cuda
|
||||
build_rocm
|
||||
}
|
||||
|
||||
# remaining llama.cpp builds use MSVC
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
sign
|
||||
compress
|
||||
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
sign
|
||||
compress
|
||||
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
sign
|
||||
compress
|
||||
cleanup
|
||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
|
||||
if ($null -ne $script:CUDA_LIB_DIR) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
build
|
||||
sign
|
||||
compress
|
||||
}
|
||||
|
||||
if ($null -ne $env:HIP_PATH) {
|
||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
||||
if ($null -ne $script:ROCM_VERSION) {
|
||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
||||
}
|
||||
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
||||
$script:cmakeDefs += @(
|
||||
"-G", "Ninja",
|
||||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DLLAMA_HIPBLAS=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
||||
"-DGPU_TARGETS=$(amdGPUs)"
|
||||
)
|
||||
|
||||
# Make sure the ROCm binary dir is first in the path
|
||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
||||
|
||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
||||
$env:LIB=""
|
||||
|
||||
write-host "Building ROCm"
|
||||
build
|
||||
# Ninja doesn't prefix with config name
|
||||
${script:config}=""
|
||||
if ($null -ne $script:DUMPBIN) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
||||
}
|
||||
sign
|
||||
compress
|
||||
}
|
||||
|
||||
|
||||
cleanup
|
||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
|
||||
for ( $i = 0; $i -lt $args.count; $i++ ) {
|
||||
write-host "performing $($args[$i])"
|
||||
& $($args[$i])
|
||||
}
|
||||
}
|
||||
119
llm/ggml.go
119
llm/ggml.go
@@ -13,82 +13,6 @@ type GGML struct {
|
||||
model
|
||||
}
|
||||
|
||||
const (
|
||||
fileTypeF32 uint32 = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ8_0 uint32 = iota + 2
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeQ3_K_XS
|
||||
fileTypeIQ3_XXS
|
||||
)
|
||||
|
||||
func fileType(fileType uint32) string {
|
||||
switch fileType {
|
||||
case fileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
case fileTypeIQ2_XXS:
|
||||
return "IQ2_XXS"
|
||||
case fileTypeIQ2_XS:
|
||||
return "IQ2_XS"
|
||||
case fileTypeQ2_K_S:
|
||||
return "Q2_K_S"
|
||||
case fileTypeQ3_K_XS:
|
||||
return "Q3_K_XS"
|
||||
case fileTypeIQ3_XXS:
|
||||
return "IQ3_XXS"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type model interface {
|
||||
KV() KV
|
||||
Tensors() Tensors
|
||||
@@ -123,7 +47,7 @@ func (kv KV) ParameterCount() uint64 {
|
||||
|
||||
func (kv KV) FileType() string {
|
||||
if u64 := kv.u64("general.file_type"); u64 > 0 {
|
||||
return fileType(uint32(u64))
|
||||
return fileType(uint32(u64)).String()
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
@@ -201,9 +125,9 @@ type Tensor struct {
|
||||
|
||||
func (t Tensor) blockSize() uint64 {
|
||||
switch {
|
||||
case t.Kind < 2:
|
||||
case t.Kind < 2 || (t.Kind > 23 && t.Kind < 29):
|
||||
return 1
|
||||
case t.Kind < 10:
|
||||
case t.Kind < 10 || t.Kind == 20:
|
||||
return 32
|
||||
default:
|
||||
return 256
|
||||
@@ -248,6 +172,16 @@ func (t Tensor) typeSize() uint64 {
|
||||
return 2 + 2*blockSize/8 + blockSize/32
|
||||
case 18: // IQ3_XXS
|
||||
return 2 + 3*blockSize/8
|
||||
case 19: // IQ1_S
|
||||
return 2 + blockSize/8 + blockSize/16
|
||||
case 20: // IQ4_NL
|
||||
return 2 + blockSize/2
|
||||
case 21: // IQ3_S
|
||||
return 2 + 2*blockSize/8 + blockSize/8 + blockSize/32 + 4
|
||||
case 22: // IQ2_S
|
||||
return 2 + blockSize/4 + blockSize/16
|
||||
case 23: // IQ4_XS
|
||||
return 4 + blockSize/2 + blockSize/64
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
@@ -286,6 +220,23 @@ const (
|
||||
|
||||
var ErrUnsupportedFormat = errors.New("unsupported model format")
|
||||
|
||||
func DetectGGMLType(b []byte) string {
|
||||
switch binary.LittleEndian.Uint32(b[:4]) {
|
||||
case FILE_MAGIC_GGML:
|
||||
return "ggml"
|
||||
case FILE_MAGIC_GGMF:
|
||||
return "ggmf"
|
||||
case FILE_MAGIC_GGJT:
|
||||
return "ggjt"
|
||||
case FILE_MAGIC_GGLA:
|
||||
return "ggla"
|
||||
case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
|
||||
return "gguf"
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
||||
var magic uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||
@@ -343,7 +294,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||
)
|
||||
|
||||
if ffnGateWeight, ok := layers["0"]["ffn_gate.0.weight"]; ok {
|
||||
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
||||
// mixtral 8x22b
|
||||
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
|
||||
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
|
||||
)
|
||||
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
|
||||
// mixtral 8x7b
|
||||
ffnGateWeight1 := ffnGateWeight.Shape[1]
|
||||
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
|
||||
partialOffload = max(
|
||||
|
||||
16
llm/gguf.go
16
llm/gguf.go
@@ -190,8 +190,6 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||
llm.kv[k] = v
|
||||
}
|
||||
|
||||
slog.Debug(fmt.Sprintf("general.architecture = %s", llm.kv["general.architecture"]))
|
||||
|
||||
// decode tensors
|
||||
for i := 0; uint64(i) < llm.numTensor(); i++ {
|
||||
name, err := readGGUFString(llm, rs)
|
||||
@@ -465,11 +463,13 @@ var ggufKVOrder = map[string][]string{
|
||||
"llama.embedding_length",
|
||||
"llama.block_count",
|
||||
"llama.feed_forward_length",
|
||||
"llama.rope.dimension_count",
|
||||
"llama.attention.head_count",
|
||||
"llama.attention.head_count_kv",
|
||||
"llama.attention.layer_norm_rms_epsilon",
|
||||
"llama.rope.freq_base",
|
||||
"llama.rope.dimension_count",
|
||||
"llama.expert_count",
|
||||
"llama.expert_used_count",
|
||||
"gemma.context_length",
|
||||
"gemma.embedding_length",
|
||||
"gemma.block_count",
|
||||
@@ -577,6 +577,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
|
||||
return err
|
||||
}
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("improper type for '%s'", k)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -598,9 +600,11 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
|
||||
return err
|
||||
}
|
||||
|
||||
dims := 1
|
||||
if tensor.Shape[1] > 0 {
|
||||
dims = 2
|
||||
dims := 0
|
||||
for cnt := 0; cnt < len(tensor.Shape); cnt++ {
|
||||
if tensor.Shape[cnt] > 0 {
|
||||
dims++
|
||||
}
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, llm.ByteOrder, uint32(dims)); err != nil {
|
||||
|
||||
Submodule llm/llama.cpp updated: 7593639ce3...952d03dbea
57
llm/llm.go
57
llm/llm.go
@@ -4,6 +4,7 @@ package llm
|
||||
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
|
||||
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
|
||||
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
|
||||
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
|
||||
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
|
||||
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
|
||||
// #include <stdlib.h>
|
||||
@@ -19,7 +20,7 @@ func SystemInfo() string {
|
||||
return C.GoString(C.llama_print_system_info())
|
||||
}
|
||||
|
||||
func Quantize(infile, outfile, filetype string) error {
|
||||
func Quantize(infile, outfile string, ftype fileType) error {
|
||||
cinfile := C.CString(infile)
|
||||
defer C.free(unsafe.Pointer(cinfile))
|
||||
|
||||
@@ -28,58 +29,10 @@ func Quantize(infile, outfile, filetype string) error {
|
||||
|
||||
params := C.llama_model_quantize_default_params()
|
||||
params.nthread = -1
|
||||
params.ftype = ftype.Value()
|
||||
|
||||
switch filetype {
|
||||
case "F32":
|
||||
params.ftype = fileTypeF32
|
||||
case "F16":
|
||||
params.ftype = fileTypeF16
|
||||
case "Q4_0":
|
||||
params.ftype = fileTypeQ4_0
|
||||
case "Q4_1":
|
||||
params.ftype = fileTypeQ4_1
|
||||
case "Q4_1_F16":
|
||||
params.ftype = fileTypeQ4_1_F16
|
||||
case "Q8_0":
|
||||
params.ftype = fileTypeQ8_0
|
||||
case "Q5_0":
|
||||
params.ftype = fileTypeQ5_0
|
||||
case "Q5_1":
|
||||
params.ftype = fileTypeQ5_1
|
||||
case "Q2_K":
|
||||
params.ftype = fileTypeQ2_K
|
||||
case "Q3_K_S":
|
||||
params.ftype = fileTypeQ3_K_S
|
||||
case "Q3_K_M":
|
||||
params.ftype = fileTypeQ3_K_M
|
||||
case "Q3_K_L":
|
||||
params.ftype = fileTypeQ3_K_L
|
||||
case "Q4_K_S":
|
||||
params.ftype = fileTypeQ4_K_S
|
||||
case "Q4_K_M":
|
||||
params.ftype = fileTypeQ4_K_M
|
||||
case "Q5_K_S":
|
||||
params.ftype = fileTypeQ5_K_S
|
||||
case "Q5_K_M":
|
||||
params.ftype = fileTypeQ5_K_M
|
||||
case "Q6_K":
|
||||
params.ftype = fileTypeQ6_K
|
||||
case "IQ2_XXS":
|
||||
params.ftype = fileTypeIQ2_XXS
|
||||
case "IQ2_XS":
|
||||
params.ftype = fileTypeIQ2_XS
|
||||
case "Q2_K_S":
|
||||
params.ftype = fileTypeQ2_K_S
|
||||
case "Q3_K_XS":
|
||||
params.ftype = fileTypeQ3_K_XS
|
||||
case "IQ3_XXS":
|
||||
params.ftype = fileTypeIQ3_XXS
|
||||
default:
|
||||
return fmt.Errorf("unknown filetype: %s", filetype)
|
||||
}
|
||||
|
||||
if retval := C.llama_model_quantize(cinfile, coutfile, ¶ms); retval != 0 {
|
||||
return fmt.Errorf("llama_model_quantize: %d", retval)
|
||||
if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 {
|
||||
return fmt.Errorf("llama_model_quantize: %d", rc)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -2,5 +2,5 @@ package llm
|
||||
|
||||
import "embed"
|
||||
|
||||
//go:embed build/windows/*/*/bin/*
|
||||
// unused on windows
|
||||
var libEmbed embed.FS
|
||||
|
||||
193
llm/memory.go
Normal file
193
llm/memory.go
Normal file
@@ -0,0 +1,193 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
var estimatedVRAM uint64
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
|
||||
opts.NumCtx = int(ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
// Split up the GPUs by type and try them
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
var layerCount int
|
||||
layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
if opts.NumGPU < 0 {
|
||||
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
|
||||
return true, estimatedVRAM
|
||||
}
|
||||
} else {
|
||||
if layerCount > 0 && layerCount >= opts.NumGPU {
|
||||
return true, estimatedVRAM
|
||||
}
|
||||
}
|
||||
}
|
||||
return false, estimatedVRAM
|
||||
}
|
||||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load
|
||||
// The GPUs provided must all be the same Library
|
||||
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
|
||||
if gpus[0].Library == "cpu" {
|
||||
return 0, 0
|
||||
}
|
||||
var memoryAvailable uint64
|
||||
for _, info := range gpus {
|
||||
memoryAvailable += info.FreeMemory
|
||||
}
|
||||
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
if userLimit != "" {
|
||||
avail, err := strconv.ParseUint(userLimit, 10, 64)
|
||||
if err != nil {
|
||||
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
|
||||
} else {
|
||||
slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
|
||||
memoryAvailable = avail
|
||||
}
|
||||
}
|
||||
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
|
||||
|
||||
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
|
||||
memoryMinimum := gpus[0].MinimumMemory
|
||||
|
||||
for _, projector := range projectors {
|
||||
memoryMinimum += projectorMemoryRequirements(projector)
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
|
||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||
|
||||
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
}
|
||||
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
graphFullOffload *= uint64(len(gpus))
|
||||
graphPartialOffload *= uint64(len(gpus))
|
||||
|
||||
// on metal there's no partial offload overhead
|
||||
if gpus[0].Library == "metal" {
|
||||
graphPartialOffload = graphFullOffload
|
||||
}
|
||||
|
||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
||||
|
||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
||||
|
||||
if memoryRequiredPartial > memoryAvailable {
|
||||
slog.Debug("insufficient VRAM to load any model layers")
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
layers := ggml.Tensors().Layers()
|
||||
|
||||
var memoryLayerOutput uint64
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if layer, ok := layers["output"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
} else if layer, ok := layers["token_embd"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if gpus[0].Library == "metal" && opts.UseMMap {
|
||||
// memory is preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
memoryRequiredPartial += memoryLayerOutput
|
||||
}
|
||||
|
||||
var layerCount int
|
||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
memoryLayer += kv / ggml.KV().BlockCount()
|
||||
|
||||
memoryRequiredTotal += memoryLayer
|
||||
if memoryAvailable > memoryRequiredPartial+memoryLayer {
|
||||
memoryRequiredPartial += memoryLayer
|
||||
layerCount++
|
||||
}
|
||||
}
|
||||
|
||||
if gpus[0].Library != "metal" || !opts.UseMMap {
|
||||
// memory was not preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
}
|
||||
|
||||
if memoryAvailable > memoryRequiredTotal {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
}
|
||||
|
||||
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
|
||||
|
||||
slog.Info(
|
||||
"offload to gpu",
|
||||
slog.Group(
|
||||
"layers",
|
||||
// actual number of layers offloaded
|
||||
"real", opts.NumGPU,
|
||||
// estimated number of layers that can be offloaded
|
||||
"estimate", layerCount,
|
||||
),
|
||||
slog.Group(
|
||||
"memory",
|
||||
// memory available for offloading
|
||||
"available", format.HumanBytes2(memoryAvailable),
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
"full", format.HumanBytes2(memoryRequiredTotal),
|
||||
// memory required to offload layers.estimate layers
|
||||
"partial", format.HumanBytes2(memoryRequiredPartial),
|
||||
// memory of KV cache
|
||||
"kv", format.HumanBytes2(kv),
|
||||
),
|
||||
slog.Group(
|
||||
"weights",
|
||||
// memory of the weights
|
||||
"total", format.HumanBytes2(memoryWeights),
|
||||
// memory of repeating layers
|
||||
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
|
||||
// memory of non-repeating layers
|
||||
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
|
||||
),
|
||||
slog.Group(
|
||||
"graph",
|
||||
// memory of graph when fully offloaded
|
||||
"full", format.HumanBytes2(graphFullOffload),
|
||||
// memory of graph when not fully offloaded
|
||||
"partial", format.HumanBytes2(graphPartialOffload),
|
||||
),
|
||||
),
|
||||
)
|
||||
return layerCount, uint64(memoryRequiredPartial)
|
||||
}
|
||||
12
llm/patches/02-clip-log.diff
Normal file
12
llm/patches/02-clip-log.diff
Normal file
@@ -0,0 +1,12 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index e431c7f7..f077e688 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -3,6 +3,7 @@
|
||||
// I'll gradually clean and extend it
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
+#include "common.h"
|
||||
#include "log.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
45
llm/patches/04-metal.diff
Normal file
45
llm/patches/04-metal.diff
Normal file
@@ -0,0 +1,45 @@
|
||||
diff --git a/ggml-metal.m b/ggml-metal.m
|
||||
index 0207b787..b5e9884b 100644
|
||||
--- a/ggml-metal.m
|
||||
+++ b/ggml-metal.m
|
||||
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
-#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
- switch (src0t) {
|
||||
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q4_0:
|
||||
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
- case GGML_TYPE_Q5_0: // not tested yet
|
||||
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
- default: ne11_mm_min = 1; break;
|
||||
- }
|
||||
+ switch (src0t) {
|
||||
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q4_0:
|
||||
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
+ case GGML_TYPE_Q5_0: // not tested yet
|
||||
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
+ default: ne11_mm_min = 1; break;
|
||||
}
|
||||
-#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
@@ -25,13 +26,15 @@ func Init() error {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Info("extracting embedded files", "dir", payloadsDir)
|
||||
binGlob := "build/*/*/*/bin/*"
|
||||
if runtime.GOOS != "windows" {
|
||||
slog.Info("extracting embedded files", "dir", payloadsDir)
|
||||
binGlob := "build/*/*/*/bin/*"
|
||||
|
||||
// extract server libraries
|
||||
err = extractFiles(payloadsDir, binGlob)
|
||||
if err != nil {
|
||||
return fmt.Errorf("extract binaries: %v", err)
|
||||
// extract server libraries
|
||||
err = extractFiles(payloadsDir, binGlob)
|
||||
if err != nil {
|
||||
return fmt.Errorf("extract binaries: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
var variants []string
|
||||
@@ -138,6 +141,23 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
||||
return servers
|
||||
}
|
||||
|
||||
// Return the optimal server for this CPU architecture
|
||||
func serverForCpu() string {
|
||||
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||
return "metal"
|
||||
}
|
||||
variant := gpu.GetCPUVariant()
|
||||
availableServers := availableServers()
|
||||
if variant != "" {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant {
|
||||
return cmp
|
||||
}
|
||||
}
|
||||
}
|
||||
return "cpu"
|
||||
}
|
||||
|
||||
// extract extracts the embedded files to the target directory
|
||||
func extractFiles(targetDir string, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
|
||||
366
llm/server.go
366
llm/server.go
@@ -21,21 +21,43 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
|
||||
// LlamaServer is an instance of the llama.cpp server
|
||||
type LlamaServer struct {
|
||||
type LlamaServer interface {
|
||||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||
Embedding(ctx context.Context, prompt string) ([]float64, error)
|
||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||
Close() error
|
||||
EstimatedVRAM() uint64
|
||||
}
|
||||
|
||||
// llmServer is an instance of the llama.cpp server
|
||||
type llmServer struct {
|
||||
port int
|
||||
cmd *exec.Cmd
|
||||
done chan error // Channel to signal when the process exits
|
||||
status *StatusWriter
|
||||
options api.Options
|
||||
|
||||
// TODO - this should be broken down by GPU
|
||||
estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
|
||||
|
||||
sem *semaphore.Weighted
|
||||
}
|
||||
|
||||
func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
|
||||
func LoadModel(model string) (*GGML, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f, err := os.Open(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -43,144 +65,66 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
defer f.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return ggml, err
|
||||
}
|
||||
|
||||
// NewLlamaServer will run a server for the given GPUs
|
||||
// The gpu list must be a single family.
|
||||
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
|
||||
var err error
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
|
||||
opts.NumCtx = int(ggml.KV().ContextLength())
|
||||
slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
memoryAvailable, _ := gpu.CheckVRAM()
|
||||
info := gpu.GetGPUInfo()
|
||||
cpuRunner := ""
|
||||
var estimatedVRAM uint64
|
||||
var systemMemory uint64
|
||||
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
|
||||
|
||||
memoryMinimum := info.MinimumMemory
|
||||
for _, projector := range projectors {
|
||||
memoryMinimum += projectorMemoryRequirements(projector)
|
||||
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
cpuRunner = serverForCpu()
|
||||
} else {
|
||||
if gpus[0].Library == "metal" {
|
||||
memInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
systemMemory = memInfo.TotalMemory
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
|
||||
}
|
||||
}
|
||||
var layers int
|
||||
layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
|
||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||
|
||||
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
}
|
||||
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
graphFullOffload *= uint64(info.DeviceCount)
|
||||
graphPartialOffload *= uint64(info.DeviceCount)
|
||||
|
||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
||||
|
||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
||||
|
||||
if info.Library != "metal" {
|
||||
if memoryRequiredPartial > memoryAvailable {
|
||||
info.Library = "cpu"
|
||||
if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
|
||||
// disable partial offloading when model is greater than total system memory as this
|
||||
// can lead to locking up the system
|
||||
opts.NumGPU = 0
|
||||
} else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
|
||||
opts.NumGPU = layers
|
||||
}
|
||||
}
|
||||
|
||||
var layerCount int
|
||||
layers := ggml.Tensors().Layers()
|
||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
memoryLayer += kv / ggml.KV().BlockCount()
|
||||
|
||||
memoryRequiredTotal += memoryLayer
|
||||
if memoryAvailable > memoryRequiredPartial+memoryLayer {
|
||||
memoryRequiredPartial += memoryLayer
|
||||
layerCount++
|
||||
}
|
||||
}
|
||||
|
||||
var memoryLayerOutput uint64
|
||||
for k, v := range layers {
|
||||
if !strings.HasPrefix(k, "blk.") {
|
||||
memoryLayerOutput += v.size()
|
||||
}
|
||||
}
|
||||
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
|
||||
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
|
||||
// disable partial offloading when model is greater than total system memory
|
||||
opts.NumGPU = 0
|
||||
} else if memoryAvailable > memoryRequiredTotal {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
}
|
||||
|
||||
if opts.NumGPU < 0 {
|
||||
opts.NumGPU = layerCount
|
||||
}
|
||||
|
||||
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
|
||||
|
||||
slog.Info(
|
||||
"offload to gpu",
|
||||
slog.Group(
|
||||
"layers",
|
||||
// actual number of layers offloaded
|
||||
"real", opts.NumGPU,
|
||||
// estimated number of layers that can be offloaded
|
||||
"estimate", layerCount,
|
||||
),
|
||||
slog.Group(
|
||||
"memory",
|
||||
// memory available for offloading
|
||||
"available", format.HumanBytes2(memoryAvailable),
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
"full", format.HumanBytes2(memoryRequiredTotal),
|
||||
// memory required to offload layers.estimate layers
|
||||
"partial", format.HumanBytes2(memoryRequiredPartial),
|
||||
// memory of KV cache
|
||||
"kv", format.HumanBytes2(kv),
|
||||
),
|
||||
slog.Group(
|
||||
"weights",
|
||||
// memory of the weights
|
||||
"total", format.HumanBytes2(memoryWeights),
|
||||
// memory of repeating layers
|
||||
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
|
||||
// memory of non-repeating layers
|
||||
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
|
||||
),
|
||||
slog.Group(
|
||||
"graph",
|
||||
// memory of graph when fully offloaded
|
||||
"full", format.HumanBytes2(graphFullOffload),
|
||||
// memory of graph when not fully offloaded
|
||||
"partial", format.HumanBytes2(graphPartialOffload),
|
||||
),
|
||||
),
|
||||
)
|
||||
// Loop through potential servers
|
||||
finalErr := fmt.Errorf("no suitable llama servers found")
|
||||
|
||||
if len(adapters) > 1 {
|
||||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||
}
|
||||
|
||||
availableServers := availableServers()
|
||||
servers := serversForGpu(info)
|
||||
|
||||
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
|
||||
var servers []string
|
||||
if cpuRunner != "" {
|
||||
servers = []string{cpuRunner}
|
||||
} else {
|
||||
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
}
|
||||
demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
|
||||
if demandLib != "" {
|
||||
serverPath := availableServers[demandLib]
|
||||
if serverPath == "" {
|
||||
@@ -192,7 +136,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
}
|
||||
|
||||
if len(servers) == 0 {
|
||||
return nil, fmt.Errorf("no servers found for %v", info)
|
||||
return nil, fmt.Errorf("no servers found for %v", gpus)
|
||||
}
|
||||
|
||||
params := []string{
|
||||
@@ -249,10 +193,26 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
params = append(params, "--numa")
|
||||
}
|
||||
|
||||
// Loop through potential servers
|
||||
var finalErr error
|
||||
// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
|
||||
numParallel := 1
|
||||
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||
numParallel, err = strconv.Atoi(onp)
|
||||
if err != nil || numParallel <= 0 {
|
||||
err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
|
||||
slog.Error("misconfiguration", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
|
||||
|
||||
for i := 0; i < len(servers); i++ {
|
||||
dir := availableServers[servers[i]]
|
||||
if dir == "" {
|
||||
// Shouldn't happen
|
||||
finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
|
||||
slog.Error("sever list inconsistent", "error", finalErr)
|
||||
continue
|
||||
}
|
||||
|
||||
// Find an availableServers port, retry on each iterration in case the failure was a port conflict race
|
||||
port := 0
|
||||
@@ -275,30 +235,60 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
}
|
||||
// append the server directory to LD_LIBRARY_PATH/PATH
|
||||
libraryPaths := []string{dir}
|
||||
|
||||
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
||||
// Append our runner directory to the path
|
||||
// This will favor system libraries over our bundled library dependencies
|
||||
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
|
||||
}
|
||||
|
||||
// Note: we always put the dependency path first
|
||||
// since this was the exact version we verified for AMD GPUs
|
||||
// and we favor what the user had in their path
|
||||
if gpus[0].DependencyPath != "" {
|
||||
// TODO refine for multi-gpu support
|
||||
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
|
||||
}
|
||||
|
||||
server := filepath.Join(dir, "ollama_llama_server")
|
||||
if runtime.GOOS == "windows" {
|
||||
server = server + ".exe"
|
||||
}
|
||||
|
||||
s := &LlamaServer{
|
||||
port: port,
|
||||
cmd: exec.Command(server, finalParams...),
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
// Detect tmp cleaners wiping out the file
|
||||
_, err := os.Stat(server)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
|
||||
err = Init()
|
||||
if err != nil {
|
||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
s := &llmServer{
|
||||
port: port,
|
||||
cmd: exec.Command(server, finalParams...),
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
estimatedVRAM: estimatedVRAM,
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
}
|
||||
|
||||
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
|
||||
slog.Debug(libEnv)
|
||||
s.cmd.Env = append(os.Environ(), libEnv)
|
||||
s.cmd.Stdout = os.Stdout
|
||||
s.cmd.Stderr = s.status
|
||||
|
||||
// TODO - multiple GPU selection logic...
|
||||
key, val := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
|
||||
if key != "" {
|
||||
s.cmd.Env = append(s.cmd.Env, key+"="+val)
|
||||
}
|
||||
|
||||
slog.Info("starting llama server", "cmd", s.cmd.String())
|
||||
// Log at debug as the environment is inherited and might contain sensitive information
|
||||
slog.Debug("subprocess", "environment", s.cmd.Env)
|
||||
|
||||
if err = s.cmd.Start(); err != nil {
|
||||
msg := ""
|
||||
@@ -310,12 +300,13 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
continue
|
||||
}
|
||||
|
||||
// reap subprocess when it exits
|
||||
go func() {
|
||||
// Exit status managed via getServerStatus
|
||||
_ = s.cmd.Wait()
|
||||
}()
|
||||
|
||||
// TODO - make sure this is all wired up correctly
|
||||
// if err = s.WaitUntilRunning(); err != nil {
|
||||
// slog.Error("error starting llama server", "server", servers[i], "error", err)
|
||||
// s.Close()
|
||||
// finalErr = err
|
||||
// continue
|
||||
// }
|
||||
return s, nil
|
||||
}
|
||||
|
||||
@@ -353,6 +344,21 @@ const ( // iota is reset to 0
|
||||
ServerStatusError
|
||||
)
|
||||
|
||||
func (s ServerStatus) ToString() string {
|
||||
switch s {
|
||||
case ServerStatusReady:
|
||||
return "llm server ready"
|
||||
case ServerStatusNoSlotsAvaialble:
|
||||
return "llm busy - no slots available"
|
||||
case ServerStatusLoadingModel:
|
||||
return "llm server loading model"
|
||||
case ServerStatusNotResponding:
|
||||
return "llm server not responding"
|
||||
default:
|
||||
return "llm server error"
|
||||
}
|
||||
}
|
||||
|
||||
type ServerStatusResp struct {
|
||||
Status string `json:"status"`
|
||||
SlotsIdle int `json:"slots_idle"`
|
||||
@@ -360,7 +366,7 @@ type ServerStatusResp struct {
|
||||
Error string `json:"error"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
// Fail fast if its exited
|
||||
if s.cmd.ProcessState != nil {
|
||||
msg := ""
|
||||
@@ -407,7 +413,7 @@ func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Ping(ctx context.Context) error {
|
||||
func (s *llmServer) Ping(ctx context.Context) error {
|
||||
_, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
slog.Debug("server unhealthy", "error", err)
|
||||
@@ -416,7 +422,7 @@ func (s *LlamaServer) Ping(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *LlamaServer) WaitUntilRunning() error {
|
||||
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
// TODO we need to wire up a better way to detect hangs during model load and startup of the server
|
||||
expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
|
||||
@@ -427,6 +433,9 @@ func (s *LlamaServer) WaitUntilRunning() error {
|
||||
var lastStatus ServerStatus = -1
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Info("context expired before server started")
|
||||
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
|
||||
case err := <-s.done:
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
@@ -450,9 +459,9 @@ func (s *LlamaServer) WaitUntilRunning() error {
|
||||
return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
|
||||
c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
|
||||
defer cancel()
|
||||
status, err := s.getServerStatus(ctx)
|
||||
status, err := s.getServerStatus(c)
|
||||
if err != nil && lastStatus != status {
|
||||
slog.Debug("server not yet available", "error", err)
|
||||
lastStatus = status
|
||||
@@ -509,13 +518,10 @@ type ImageData struct {
|
||||
}
|
||||
|
||||
type completion struct {
|
||||
Content string `json:"content"`
|
||||
Model string `json:"model"`
|
||||
Prompt string `json:"prompt"`
|
||||
Stop bool `json:"stop"`
|
||||
StoppedEos bool `json:"stopped_eos"`
|
||||
StoppedWord bool `json:"stopped_word"`
|
||||
StoppedLimit bool `json:"stopped_limit"`
|
||||
Content string `json:"content"`
|
||||
Model string `json:"model"`
|
||||
Prompt string `json:"prompt"`
|
||||
Stop bool `json:"stop"`
|
||||
|
||||
Timings struct {
|
||||
PredictedN int `json:"predicted_n"`
|
||||
@@ -535,14 +541,25 @@ type CompletionRequest struct {
|
||||
type CompletionResponse struct {
|
||||
Content string
|
||||
Done bool
|
||||
DoneReason string
|
||||
PromptEvalCount int
|
||||
PromptEvalDuration time.Duration
|
||||
EvalCount int
|
||||
EvalDuration time.Duration
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
return err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// only allow maximum 10 "context shifts" to avoid infinite generation
|
||||
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
|
||||
req.Options.NumPredict = 10 * s.options.NumCtx
|
||||
slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
|
||||
}
|
||||
|
||||
request := map[string]any{
|
||||
"prompt": req.Prompt,
|
||||
"stream": true,
|
||||
@@ -573,7 +590,7 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
|
||||
if err != nil {
|
||||
return err
|
||||
} else if status != ServerStatusReady {
|
||||
return fmt.Errorf("unexpected server status: %d", status)
|
||||
return fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
if req.Format == "json" {
|
||||
@@ -652,8 +669,6 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
|
||||
return fmt.Errorf("error parsing llm response stream: %s", line)
|
||||
}
|
||||
|
||||
fmt.Println("c", string(evt))
|
||||
|
||||
var c completion
|
||||
if err := json.Unmarshal(evt, &c); err != nil {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
||||
@@ -680,18 +695,8 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
|
||||
}
|
||||
|
||||
if c.Stop {
|
||||
var doneReason string
|
||||
switch {
|
||||
case c.StoppedEos:
|
||||
doneReason = "stop"
|
||||
case c.StoppedWord:
|
||||
doneReason = "stop"
|
||||
case c.StoppedLimit:
|
||||
doneReason = "limit"
|
||||
}
|
||||
fn(CompletionResponse{
|
||||
Done: true,
|
||||
DoneReason: doneReason,
|
||||
PromptEvalCount: c.Timings.PromptN,
|
||||
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
|
||||
EvalCount: c.Timings.PredictedN,
|
||||
@@ -732,13 +737,18 @@ type EmbeddingResponse struct {
|
||||
Embedding []float64 `json:"embedding"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
|
||||
func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady {
|
||||
return nil, fmt.Errorf("unexpected server status: %d", status)
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
data, err := json.Marshal(TokenizeRequest{Content: prompt})
|
||||
@@ -784,13 +794,13 @@ type TokenizeResponse struct {
|
||||
Tokens []int `json:"tokens"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady {
|
||||
return nil, fmt.Errorf("unexpected server status: %d", status)
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
data, err := json.Marshal(TokenizeRequest{Content: content})
|
||||
@@ -836,13 +846,13 @@ type DetokenizeResponse struct {
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
||||
func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
} else if status != ServerStatusReady {
|
||||
return "", fmt.Errorf("unexpected server status: %d", status)
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
|
||||
return "", fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
|
||||
@@ -880,15 +890,25 @@ func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, err
|
||||
return decoded.Content, nil
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Close() error {
|
||||
func (s *llmServer) Close() error {
|
||||
if s.cmd != nil {
|
||||
slog.Debug("stopping llama server")
|
||||
return s.cmd.Process.Kill()
|
||||
if err := s.cmd.Process.Kill(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_ = s.cmd.Wait()
|
||||
|
||||
slog.Debug("llama server stopped")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *llmServer) EstimatedVRAM() uint64 {
|
||||
return s.estimatedVRAM
|
||||
}
|
||||
|
||||
func parseDurationMs(ms float64) time.Duration {
|
||||
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
|
||||
if err != nil {
|
||||
|
||||
@@ -19,7 +19,7 @@ export default function () {
|
||||
const [step, setStep] = useState<Step>(Step.WELCOME)
|
||||
const [commandCopied, setCommandCopied] = useState<boolean>(false)
|
||||
|
||||
const command = 'ollama run llama2'
|
||||
const command = 'ollama run llama3'
|
||||
|
||||
return (
|
||||
<div className='drag'>
|
||||
|
||||
132
parser/parser.go
132
parser/parser.go
@@ -1,132 +0,0 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"slices"
|
||||
)
|
||||
|
||||
type Command struct {
|
||||
Name string
|
||||
Args string
|
||||
}
|
||||
|
||||
func (c *Command) Reset() {
|
||||
c.Name = ""
|
||||
c.Args = ""
|
||||
}
|
||||
|
||||
func Parse(reader io.Reader) ([]Command, error) {
|
||||
var commands []Command
|
||||
var command, modelCommand Command
|
||||
|
||||
scanner := bufio.NewScanner(reader)
|
||||
scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), bufio.MaxScanTokenSize)
|
||||
scanner.Split(scanModelfile)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
|
||||
fields := bytes.SplitN(line, []byte(" "), 2)
|
||||
if len(fields) == 0 || len(fields[0]) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
switch string(bytes.ToUpper(fields[0])) {
|
||||
case "FROM":
|
||||
command.Name = "model"
|
||||
command.Args = string(bytes.TrimSpace(fields[1]))
|
||||
// copy command for validation
|
||||
modelCommand = command
|
||||
case "ADAPTER":
|
||||
command.Name = string(bytes.ToLower(fields[0]))
|
||||
command.Args = string(bytes.TrimSpace(fields[1]))
|
||||
case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT":
|
||||
command.Name = string(bytes.ToLower(fields[0]))
|
||||
command.Args = string(fields[1])
|
||||
case "PARAMETER":
|
||||
fields = bytes.SplitN(fields[1], []byte(" "), 2)
|
||||
if len(fields) < 2 {
|
||||
return nil, fmt.Errorf("missing value for %s", fields)
|
||||
}
|
||||
|
||||
command.Name = string(fields[0])
|
||||
command.Args = string(bytes.TrimSpace(fields[1]))
|
||||
case "EMBED":
|
||||
return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
|
||||
case "MESSAGE":
|
||||
command.Name = string(bytes.ToLower(fields[0]))
|
||||
fields = bytes.SplitN(fields[1], []byte(" "), 2)
|
||||
if len(fields) < 2 {
|
||||
return nil, fmt.Errorf("should be in the format <role> <message>")
|
||||
}
|
||||
if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
|
||||
return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
|
||||
}
|
||||
command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
|
||||
default:
|
||||
if !bytes.HasPrefix(fields[0], []byte("#")) {
|
||||
// log a warning for unknown commands
|
||||
slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0]))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
commands = append(commands, command)
|
||||
command.Reset()
|
||||
}
|
||||
|
||||
if modelCommand.Args == "" {
|
||||
return nil, errors.New("no FROM line for the model was specified")
|
||||
}
|
||||
|
||||
return commands, scanner.Err()
|
||||
}
|
||||
|
||||
func scanModelfile(data []byte, atEOF bool) (advance int, token []byte, err error) {
|
||||
advance, token, err = scan([]byte(`"""`), []byte(`"""`), data, atEOF)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if advance > 0 && token != nil {
|
||||
return advance, token, nil
|
||||
}
|
||||
|
||||
advance, token, err = scan([]byte(`"`), []byte(`"`), data, atEOF)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if advance > 0 && token != nil {
|
||||
return advance, token, nil
|
||||
}
|
||||
|
||||
return bufio.ScanLines(data, atEOF)
|
||||
}
|
||||
|
||||
func scan(openBytes, closeBytes, data []byte, atEOF bool) (advance int, token []byte, err error) {
|
||||
newline := bytes.IndexByte(data, '\n')
|
||||
|
||||
if start := bytes.Index(data, openBytes); start >= 0 && start < newline {
|
||||
end := bytes.Index(data[start+len(openBytes):], closeBytes)
|
||||
if end < 0 {
|
||||
if atEOF {
|
||||
return 0, nil, fmt.Errorf("unterminated %s: expecting %s", openBytes, closeBytes)
|
||||
} else {
|
||||
return 0, nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
n := start + len(openBytes) + end + len(closeBytes)
|
||||
|
||||
newData := data[:start]
|
||||
newData = append(newData, data[start+len(openBytes):n-len(closeBytes)]...)
|
||||
return n, newData, nil
|
||||
}
|
||||
|
||||
return 0, nil, nil
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func Test_Parser(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM model1
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
commands, err := Parse(reader)
|
||||
assert.Nil(t, err)
|
||||
|
||||
expectedCommands := []Command{
|
||||
{Name: "model", Args: "model1"},
|
||||
{Name: "adapter", Args: "adapter1"},
|
||||
{Name: "license", Args: "MIT"},
|
||||
{Name: "param1", Args: "value1"},
|
||||
{Name: "param2", Args: "value2"},
|
||||
{Name: "template", Args: "template1"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, commands)
|
||||
}
|
||||
|
||||
func Test_Parser_NoFromLine(t *testing.T) {
|
||||
|
||||
input := `
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "no FROM line")
|
||||
}
|
||||
|
||||
func Test_Parser_MissingValue(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
PARAMETER param1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "missing value for [param1]")
|
||||
|
||||
}
|
||||
|
||||
func Test_Parser_Messages(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
MESSAGE system You are a Parser. Always Parse things.
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
commands, err := Parse(reader)
|
||||
assert.Nil(t, err)
|
||||
|
||||
expectedCommands := []Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a Parser. Always Parse things."},
|
||||
{Name: "message", Args: "user: Hey there!"},
|
||||
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, commands)
|
||||
}
|
||||
|
||||
func Test_Parser_Messages_BadRole(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
MESSAGE badguy I'm a bad guy!
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"")
|
||||
}
|
||||
@@ -7,6 +7,8 @@
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
function checkEnv() {
|
||||
$script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
Write-host "Building for ${script:TARGET_ARCH}"
|
||||
write-host "Locating required tools and paths"
|
||||
$script:SRC_DIR=$PWD
|
||||
if (!$env:VCToolsRedistDir) {
|
||||
@@ -30,7 +32,7 @@ function checkEnv() {
|
||||
|
||||
$script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
|
||||
|
||||
$script:DEPS_DIR="${script:SRC_DIR}\dist\windeps"
|
||||
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
|
||||
$env:CGO_ENABLED="1"
|
||||
echo "Checking version"
|
||||
if (!$env:VERSION) {
|
||||
@@ -81,8 +83,8 @@ function buildOllama() {
|
||||
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
New-Item -ItemType Directory -Path .\dist -Force
|
||||
cp .\ollama.exe .\dist\ollama-windows-amd64.exe
|
||||
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
|
||||
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
|
||||
}
|
||||
|
||||
function buildApp() {
|
||||
@@ -101,7 +103,6 @@ function buildApp() {
|
||||
function gatherDependencies() {
|
||||
write-host "Gathering runtime dependencies"
|
||||
cd "${script:SRC_DIR}"
|
||||
rm -ea 0 -recurse -force -path "${script:DEPS_DIR}"
|
||||
md "${script:DEPS_DIR}" -ea 0 > $null
|
||||
|
||||
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
|
||||
@@ -110,9 +111,6 @@ function gatherDependencies() {
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
|
||||
|
||||
cp "${script:NVIDIA_DIR}\cudart64_*.dll" "${script:DEPS_DIR}\"
|
||||
cp "${script:NVIDIA_DIR}\cublas64_*.dll" "${script:DEPS_DIR}\"
|
||||
cp "${script:NVIDIA_DIR}\cublasLt64_*.dll" "${script:DEPS_DIR}\"
|
||||
|
||||
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
|
||||
if ("${env:KEY_CONTAINER}") {
|
||||
@@ -124,7 +122,6 @@ function gatherDependencies() {
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
function buildInstaller() {
|
||||
@@ -132,19 +129,25 @@ function buildInstaller() {
|
||||
cd "${script:SRC_DIR}\app"
|
||||
$env:PKG_VERSION=$script:PKG_VERSION
|
||||
if ("${env:KEY_CONTAINER}") {
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
|
||||
} else {
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" .\ollama.iss
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH .\ollama.iss
|
||||
}
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
|
||||
function distZip() {
|
||||
write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip"
|
||||
Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force
|
||||
}
|
||||
|
||||
try {
|
||||
checkEnv
|
||||
buildOllama
|
||||
buildApp
|
||||
gatherDependencies
|
||||
buildInstaller
|
||||
distZip
|
||||
} catch {
|
||||
write-host "Build Failed"
|
||||
write-host $_
|
||||
|
||||
723
server/images.go
723
server/images.go
@@ -1,16 +1,16 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"cmp"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
@@ -20,15 +20,15 @@ import (
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"text/template"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/convert"
|
||||
"github.com/ollama/ollama/auth"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/version"
|
||||
)
|
||||
|
||||
@@ -60,6 +60,76 @@ func (m *Model) IsEmbedding() bool {
|
||||
return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
|
||||
}
|
||||
|
||||
func (m *Model) String() string {
|
||||
var modelfile model.File
|
||||
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "model",
|
||||
Args: m.ModelPath,
|
||||
})
|
||||
|
||||
if m.Template != "" {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "template",
|
||||
Args: m.Template,
|
||||
})
|
||||
}
|
||||
|
||||
if m.System != "" {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "system",
|
||||
Args: m.System,
|
||||
})
|
||||
}
|
||||
|
||||
for _, adapter := range m.AdapterPaths {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "adapter",
|
||||
Args: adapter,
|
||||
})
|
||||
}
|
||||
|
||||
for _, projector := range m.ProjectorPaths {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "projector",
|
||||
Args: projector,
|
||||
})
|
||||
}
|
||||
|
||||
for k, v := range m.Options {
|
||||
switch v := v.(type) {
|
||||
case []any:
|
||||
for _, s := range v {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: k,
|
||||
Args: fmt.Sprintf("%v", s),
|
||||
})
|
||||
}
|
||||
default:
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: k,
|
||||
Args: fmt.Sprintf("%v", v),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
for _, license := range m.License {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "license",
|
||||
Args: license,
|
||||
})
|
||||
}
|
||||
|
||||
for _, msg := range m.Messages {
|
||||
modelfile.Commands = append(modelfile.Commands, model.Command{
|
||||
Name: "message",
|
||||
Args: fmt.Sprintf("%s %s", msg.Role, msg.Content),
|
||||
})
|
||||
}
|
||||
|
||||
return modelfile.String()
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
@@ -85,36 +155,6 @@ type ConfigV2 struct {
|
||||
RootFS RootFS `json:"rootfs"`
|
||||
}
|
||||
|
||||
func (c *ConfigV2) SetModelFormat(format string) {
|
||||
if c.ModelFormat == "" {
|
||||
c.ModelFormat = format
|
||||
}
|
||||
}
|
||||
|
||||
func (c *ConfigV2) SetModelFamily(families ...string) {
|
||||
for _, family := range families {
|
||||
if c.ModelFamily == "" {
|
||||
c.ModelFamily = family
|
||||
}
|
||||
|
||||
if !slices.Contains(c.ModelFamilies, family) {
|
||||
c.ModelFamilies = append(c.ModelFamilies, family)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *ConfigV2) SetModelType(modelType string) {
|
||||
if c.ModelType == "" {
|
||||
c.ModelType = modelType
|
||||
}
|
||||
}
|
||||
|
||||
func (c *ConfigV2) SetFileType(fileType string) {
|
||||
if c.FileType == "" {
|
||||
c.FileType = fileType
|
||||
}
|
||||
}
|
||||
|
||||
type RootFS struct {
|
||||
Type string `json:"type"`
|
||||
DiffIDs []string `json:"diff_ids"`
|
||||
@@ -259,7 +299,7 @@ func GetModel(name string) (*Model, error) {
|
||||
return model, nil
|
||||
}
|
||||
|
||||
func realpath(mfDir, from string) string {
|
||||
func realpath(rel, from string) string {
|
||||
abspath, err := filepath.Abs(from)
|
||||
if err != nil {
|
||||
return from
|
||||
@@ -276,22 +316,15 @@ func realpath(mfDir, from string) string {
|
||||
return filepath.Join(home, from[2:])
|
||||
}
|
||||
|
||||
if _, err := os.Stat(filepath.Join(mfDir, from)); err == nil {
|
||||
if _, err := os.Stat(filepath.Join(rel, from)); err == nil {
|
||||
// this is a file relative to the Modelfile
|
||||
return filepath.Join(mfDir, from)
|
||||
return filepath.Join(rel, from)
|
||||
}
|
||||
|
||||
return abspath
|
||||
}
|
||||
|
||||
func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
|
||||
deleteMap := make(map[string]struct{})
|
||||
if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
|
||||
for _, layer := range append(manifest.Layers, manifest.Config) {
|
||||
deleteMap[layer.Digest] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) (err error) {
|
||||
config := ConfigV2{
|
||||
OS: "linux",
|
||||
Architecture: "amd64",
|
||||
@@ -300,250 +333,181 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
|
||||
},
|
||||
}
|
||||
|
||||
var layers Layers
|
||||
messages := []string{}
|
||||
var messages []*api.Message
|
||||
parameters := make(map[string]any)
|
||||
|
||||
params := make(map[string][]string)
|
||||
fromParams := make(map[string]any)
|
||||
|
||||
for _, c := range commands {
|
||||
var layers []*Layer
|
||||
for _, c := range modelfile.Commands {
|
||||
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
|
||||
|
||||
switch c.Name {
|
||||
case "model":
|
||||
if strings.HasPrefix(c.Args, "@") {
|
||||
blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
|
||||
case "model", "adapter":
|
||||
var baseLayers []*layerWithGGML
|
||||
if name := model.ParseName(c.Args); name.IsValid() {
|
||||
baseLayers, err = parseFromModel(ctx, name, fn)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else if strings.HasPrefix(c.Args, "@") {
|
||||
blobpath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c.Args = blobPath
|
||||
}
|
||||
|
||||
pathName := realpath(modelFileDir, c.Args)
|
||||
|
||||
ggufName, err := convertModel(name, pathName, fn)
|
||||
if err != nil {
|
||||
var pathErr *fs.PathError
|
||||
switch {
|
||||
case errors.Is(err, zip.ErrFormat):
|
||||
// it's not a safetensor archive
|
||||
case errors.As(err, &pathErr):
|
||||
// it's not a file on disk, could be a model reference
|
||||
default:
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if ggufName != "" {
|
||||
pathName = ggufName
|
||||
defer os.RemoveAll(ggufName)
|
||||
|
||||
if quantization != "" {
|
||||
quantization = strings.ToUpper(quantization)
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", "F16", quantization)})
|
||||
tempfile, err := os.CreateTemp(filepath.Dir(ggufName), quantization)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.RemoveAll(tempfile.Name())
|
||||
|
||||
if err := llm.Quantize(ggufName, tempfile.Name(), quantization); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := tempfile.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
pathName = tempfile.Name()
|
||||
}
|
||||
}
|
||||
|
||||
bin, err := os.Open(pathName)
|
||||
if err != nil {
|
||||
// not a file on disk so must be a model reference
|
||||
modelpath := ParseModelPath(c.Args)
|
||||
manifest, _, err := GetManifest(modelpath)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
fn(api.ProgressResponse{Status: "pulling model"})
|
||||
if err := PullModel(ctx, c.Args, ®istryOptions{}, fn); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
manifest, _, err = GetManifest(modelpath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case err != nil:
|
||||
return err
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "reading model metadata"})
|
||||
fromConfigPath, err := GetBlobsPath(manifest.Config.Digest)
|
||||
blob, err := os.Open(blobpath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer blob.Close()
|
||||
|
||||
fromConfigFile, err := os.Open(fromConfigPath)
|
||||
baseLayers, err = parseFromFile(ctx, blob, fn)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer fromConfigFile.Close()
|
||||
} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
|
||||
defer file.Close()
|
||||
|
||||
var fromConfig ConfigV2
|
||||
if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
|
||||
baseLayers, err = parseFromFile(ctx, file, fn)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
return fmt.Errorf("invalid model reference: %s", c.Args)
|
||||
}
|
||||
|
||||
// if the model is still not in gguf format, error out
|
||||
if fromConfig.ModelFormat != "gguf" {
|
||||
return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
|
||||
}
|
||||
|
||||
config.SetModelFormat(fromConfig.ModelFormat)
|
||||
config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
|
||||
config.SetModelType(fromConfig.ModelType)
|
||||
config.SetFileType(fromConfig.FileType)
|
||||
|
||||
for _, layer := range manifest.Layers {
|
||||
deleteMap[layer.Digest] = struct{}{}
|
||||
if layer.MediaType == "application/vnd.ollama.image.params" {
|
||||
fromParamsPath, err := GetBlobsPath(layer.Digest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fromParamsFile, err := os.Open(fromParamsPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer fromParamsFile.Close()
|
||||
|
||||
if err := json.NewDecoder(fromParamsFile).Decode(&fromParams); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
|
||||
for _, baseLayer := range baseLayers {
|
||||
if quantization != "" &&
|
||||
baseLayer.MediaType == "application/vnd.ollama.image.model" &&
|
||||
baseLayer.GGML != nil &&
|
||||
baseLayer.GGML.Name() == "gguf" {
|
||||
ftype, err := llm.ParseFileType(quantization)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
layers.Add(layer)
|
||||
filetype := baseLayer.GGML.KV().FileType()
|
||||
if !slices.Contains([]string{"F16", "F32"}, filetype) {
|
||||
return errors.New("quantization is only supported for F16 and F32 models")
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", filetype, quantization)})
|
||||
|
||||
blob, err := GetBlobsPath(baseLayer.Digest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
temp, err := os.CreateTemp(filepath.Dir(blob), quantization)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer temp.Close()
|
||||
defer os.Remove(temp.Name())
|
||||
|
||||
if err := llm.Quantize(blob, temp.Name(), ftype); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseLayer.Layer, err = NewLayer(temp, baseLayer.Layer.MediaType)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
deleteMap[manifest.Config.Digest] = struct{}{}
|
||||
continue
|
||||
if baseLayer.GGML != nil {
|
||||
config.ModelFormat = cmp.Or(config.ModelFormat, baseLayer.GGML.Name())
|
||||
config.ModelFamily = cmp.Or(config.ModelFamily, baseLayer.GGML.KV().Architecture())
|
||||
config.ModelType = cmp.Or(config.ModelType, format.HumanNumber(baseLayer.GGML.KV().ParameterCount()))
|
||||
config.FileType = cmp.Or(config.FileType, baseLayer.GGML.KV().FileType())
|
||||
config.ModelFamilies = append(config.ModelFamilies, baseLayer.GGML.KV().Architecture())
|
||||
}
|
||||
|
||||
layers = append(layers, baseLayer.Layer)
|
||||
}
|
||||
defer bin.Close()
|
||||
|
||||
var offset int64
|
||||
for {
|
||||
fn(api.ProgressResponse{Status: "creating model layer"})
|
||||
if _, err := bin.Seek(offset, io.SeekStart); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ggml, size, err := llm.DecodeGGML(bin)
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if errors.Is(err, llm.ErrUnsupportedFormat) {
|
||||
return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.SetModelFormat(ggml.Name())
|
||||
config.SetModelFamily(ggml.KV().Architecture())
|
||||
config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
|
||||
config.SetFileType(ggml.KV().FileType())
|
||||
|
||||
mediatype := mediatype
|
||||
if ggml.KV().Architecture() == "clip" {
|
||||
mediatype = "application/vnd.ollama.image.projector"
|
||||
}
|
||||
|
||||
sr := io.NewSectionReader(bin, offset, size)
|
||||
layer, err := NewLayer(sr, mediatype)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
layers.Add(layer)
|
||||
|
||||
offset += size
|
||||
}
|
||||
case "adapter":
|
||||
if strings.HasPrefix(c.Args, "@") {
|
||||
blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c.Args = blobPath
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "creating adapter layer"})
|
||||
bin, err := os.Open(realpath(modelFileDir, c.Args))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer bin.Close()
|
||||
|
||||
_, size, err := llm.DecodeGGML(bin)
|
||||
case "license", "template", "system":
|
||||
blob := strings.NewReader(c.Args)
|
||||
layer, err := NewLayer(blob, mediatype)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sr := io.NewSectionReader(bin, 0, size)
|
||||
layer, err := NewLayer(sr, mediatype)
|
||||
if err != nil {
|
||||
return err
|
||||
if c.Name != "license" {
|
||||
// replace
|
||||
layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
|
||||
return layer.MediaType == mediatype
|
||||
})
|
||||
}
|
||||
|
||||
layers.Add(layer)
|
||||
case "license":
|
||||
fn(api.ProgressResponse{Status: "creating license layer"})
|
||||
|
||||
bin := strings.NewReader(c.Args)
|
||||
layer, err := NewLayer(bin, mediatype)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
layers.Add(layer)
|
||||
case "template", "system":
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("creating %s layer", c.Name)})
|
||||
|
||||
bin := strings.NewReader(c.Args)
|
||||
layer, err := NewLayer(bin, mediatype)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
layers.Replace(layer)
|
||||
layers = append(layers, layer)
|
||||
case "message":
|
||||
messages = append(messages, c.Args)
|
||||
role, content, ok := strings.Cut(c.Args, ": ")
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid message: %s", c.Args)
|
||||
}
|
||||
|
||||
messages = append(messages, &api.Message{Role: role, Content: content})
|
||||
default:
|
||||
params[c.Name] = append(params[c.Name], c.Args)
|
||||
ps, err := api.FormatParams(map[string][]string{c.Name: {c.Args}})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for k, v := range ps {
|
||||
if ks, ok := parameters[k].([]string); ok {
|
||||
parameters[k] = append(ks, v.([]string)...)
|
||||
} else if vs, ok := v.([]string); ok {
|
||||
parameters[k] = vs
|
||||
} else {
|
||||
parameters[k] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(messages) > 0 {
|
||||
fn(api.ProgressResponse{Status: "creating parameters layer"})
|
||||
var err2 error
|
||||
layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
|
||||
switch layer.MediaType {
|
||||
case "application/vnd.ollama.image.message":
|
||||
// if there are new messages, remove the inherited ones
|
||||
if len(messages) > 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
msgs := make([]api.Message, 0)
|
||||
return false
|
||||
case "application/vnd.ollama.image.params":
|
||||
// merge inherited parameters with new ones
|
||||
r, err := layer.Open()
|
||||
if err != nil {
|
||||
err2 = err
|
||||
return false
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
for _, m := range messages {
|
||||
// todo: handle images
|
||||
msg := strings.SplitN(m, ": ", 2)
|
||||
msgs = append(msgs, api.Message{Role: msg[0], Content: msg[1]})
|
||||
var ps map[string]any
|
||||
if err := json.NewDecoder(r).Decode(&ps); err != nil {
|
||||
err2 = err
|
||||
return false
|
||||
}
|
||||
|
||||
for k, v := range ps {
|
||||
if _, ok := parameters[k]; !ok {
|
||||
parameters[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
})
|
||||
|
||||
if err2 != nil {
|
||||
return err2
|
||||
}
|
||||
|
||||
if len(messages) > 0 {
|
||||
var b bytes.Buffer
|
||||
if err := json.NewEncoder(&b).Encode(msgs); err != nil {
|
||||
if err := json.NewEncoder(&b).Encode(messages); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -552,39 +516,25 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
|
||||
return err
|
||||
}
|
||||
|
||||
layers.Replace(layer)
|
||||
layers = append(layers, layer)
|
||||
}
|
||||
|
||||
if len(params) > 0 {
|
||||
fn(api.ProgressResponse{Status: "creating parameters layer"})
|
||||
|
||||
formattedParams, err := api.FormatParams(params)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for k, v := range fromParams {
|
||||
if _, ok := formattedParams[k]; !ok {
|
||||
formattedParams[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
if len(parameters) > 0 {
|
||||
var b bytes.Buffer
|
||||
if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
|
||||
if err := json.NewEncoder(&b).Encode(parameters); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "creating config layer"})
|
||||
layer, err := NewLayer(&b, "application/vnd.ollama.image.params")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
layers.Replace(layer)
|
||||
layers = append(layers, layer)
|
||||
}
|
||||
|
||||
digests := make([]string, len(layers.items))
|
||||
for i, layer := range layers.items {
|
||||
digests := make([]string, len(layers))
|
||||
for i, layer := range layers {
|
||||
digests[i] = layer.Digest
|
||||
}
|
||||
|
||||
@@ -595,36 +545,38 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
|
||||
return err
|
||||
}
|
||||
|
||||
configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
|
||||
layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
delete(deleteMap, configLayer.Digest)
|
||||
for _, layer := range append(layers, layer) {
|
||||
if layer.status != "" {
|
||||
fn(api.ProgressResponse{Status: layer.status})
|
||||
}
|
||||
}
|
||||
|
||||
for _, layer := range append(layers.items, configLayer) {
|
||||
committed, err := layer.Commit()
|
||||
if err != nil {
|
||||
return err
|
||||
unref := make(map[string]struct{})
|
||||
if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
|
||||
for _, layer := range manifest.Layers {
|
||||
if !slices.Contains(digests, layer.Digest) {
|
||||
unref[layer.Digest] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
status := "writing layer"
|
||||
if !committed {
|
||||
status = "using already created layer"
|
||||
if manifest.Config.Digest != layer.Digest {
|
||||
unref[manifest.Config.Digest] = struct{}{}
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("%s %s", status, layer.Digest)})
|
||||
|
||||
delete(deleteMap, layer.Digest)
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "writing manifest"})
|
||||
if err := WriteManifest(name, configLayer, layers.items); err != nil {
|
||||
if err := WriteManifest(name, layer, layers); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
|
||||
if os.Getenv("OLLAMA_NOPRUNE") == "" && len(unref) > 0 {
|
||||
fn(api.ProgressResponse{Status: "removing unused layers"})
|
||||
if err := deleteUnusedLayers(nil, unref, false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
@@ -633,104 +585,43 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
|
||||
return nil
|
||||
}
|
||||
|
||||
func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
|
||||
r, err := zip.OpenReader(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
func CopyModel(src, dst model.Name) error {
|
||||
if !dst.IsFullyQualified() {
|
||||
return model.Unqualified(dst)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
tempDir, err := os.MkdirTemp("", "ollama-convert")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.RemoveAll(tempDir)
|
||||
|
||||
fn(api.ProgressResponse{Status: "unpacking model metadata"})
|
||||
for _, f := range r.File {
|
||||
fpath := filepath.Join(tempDir, f.Name)
|
||||
outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
rc, err := f.Open()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
_, err = io.Copy(outFile, rc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
outFile.Close()
|
||||
rc.Close()
|
||||
if !src.IsFullyQualified() {
|
||||
return model.Unqualified(src)
|
||||
}
|
||||
|
||||
mf, err := convert.GetModelFormat(tempDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
if src.Filepath() == dst.Filepath() {
|
||||
return nil
|
||||
}
|
||||
|
||||
params, err := mf.GetParams(tempDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
mArch, err := mf.GetModelArch(name, tempDir, params)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "processing tensors"})
|
||||
if err := mArch.GetTensors(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if err := mArch.LoadVocab(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "converting model"})
|
||||
path, err = mArch.WriteGGUF()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return path, nil
|
||||
}
|
||||
|
||||
func CopyModel(src, dest string) error {
|
||||
srcModelPath := ParseModelPath(src)
|
||||
srcPath, err := srcModelPath.GetManifestPath()
|
||||
manifests, err := GetManifestPath()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
destModelPath := ParseModelPath(dest)
|
||||
destPath, err := destModelPath.GetManifestPath()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil {
|
||||
dstpath := filepath.Join(manifests, dst.Filepath())
|
||||
if err := os.MkdirAll(filepath.Dir(dstpath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// copy the file
|
||||
input, err := os.ReadFile(srcPath)
|
||||
srcpath := filepath.Join(manifests, src.Filepath())
|
||||
srcfile, err := os.Open(srcpath)
|
||||
if err != nil {
|
||||
fmt.Println("Error reading file:", err)
|
||||
return err
|
||||
}
|
||||
defer srcfile.Close()
|
||||
|
||||
err = os.WriteFile(destPath, input, 0o644)
|
||||
dstfile, err := os.Create(dstpath)
|
||||
if err != nil {
|
||||
fmt.Println("Error reading file:", err)
|
||||
return err
|
||||
}
|
||||
defer dstfile.Close()
|
||||
|
||||
return nil
|
||||
_, err = io.Copy(dstfile, srcfile)
|
||||
return err
|
||||
}
|
||||
|
||||
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
|
||||
@@ -890,67 +781,6 @@ func DeleteModel(name string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func ShowModelfile(model *Model) (string, error) {
|
||||
var mt struct {
|
||||
*Model
|
||||
From string
|
||||
Parameters map[string][]any
|
||||
}
|
||||
|
||||
mt.Parameters = make(map[string][]any)
|
||||
for k, v := range model.Options {
|
||||
if s, ok := v.([]any); ok {
|
||||
mt.Parameters[k] = s
|
||||
continue
|
||||
}
|
||||
|
||||
mt.Parameters[k] = []any{v}
|
||||
}
|
||||
|
||||
mt.Model = model
|
||||
mt.From = model.ModelPath
|
||||
|
||||
if model.ParentModel != "" {
|
||||
mt.From = model.ParentModel
|
||||
}
|
||||
|
||||
modelFile := `# Modelfile generated by "ollama show"
|
||||
# To build a new Modelfile based on this one, replace the FROM line with:
|
||||
# FROM {{ .ShortName }}
|
||||
|
||||
FROM {{ .From }}
|
||||
TEMPLATE """{{ .Template }}"""
|
||||
|
||||
{{- if .System }}
|
||||
SYSTEM """{{ .System }}"""
|
||||
{{- end }}
|
||||
|
||||
{{- range $adapter := .AdapterPaths }}
|
||||
ADAPTER {{ $adapter }}
|
||||
{{- end }}
|
||||
|
||||
{{- range $k, $v := .Parameters }}
|
||||
{{- range $parameter := $v }}
|
||||
PARAMETER {{ $k }} {{ printf "%#v" $parameter }}
|
||||
{{- end }}
|
||||
{{- end }}`
|
||||
|
||||
tmpl, err := template.New("").Parse(modelFile)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("error parsing template: %q", err))
|
||||
return "", err
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
|
||||
if err = tmpl.Execute(&buf, mt); err != nil {
|
||||
slog.Info(fmt.Sprintf("error executing template: %q", err))
|
||||
return "", err
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
|
||||
mp := ParseModelPath(name)
|
||||
fn(api.ProgressResponse{Status: "retrieving manifest"})
|
||||
@@ -972,9 +802,6 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
|
||||
for _, layer := range layers {
|
||||
if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
|
||||
slog.Info(fmt.Sprintf("error uploading blob: %v", err))
|
||||
if errors.Is(err, errUnauthorized) {
|
||||
return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository())
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
@@ -1137,9 +964,40 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
|
||||
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
|
||||
}
|
||||
|
||||
var errUnauthorized = errors.New("unauthorized")
|
||||
var errUnauthorized = fmt.Errorf("unauthorized: access denied")
|
||||
|
||||
// getTokenSubject returns the subject of a JWT token, it does not validate the token
|
||||
func getTokenSubject(token string) string {
|
||||
parts := strings.Split(token, ".")
|
||||
if len(parts) != 3 {
|
||||
slog.Error("jwt token does not contain 3 parts")
|
||||
return ""
|
||||
}
|
||||
|
||||
payload := parts[1]
|
||||
payloadBytes, err := base64.RawURLEncoding.DecodeString(payload)
|
||||
if err != nil {
|
||||
slog.Error(fmt.Sprintf("failed to decode jwt payload: %v", err))
|
||||
return ""
|
||||
}
|
||||
|
||||
var payloadMap map[string]interface{}
|
||||
if err := json.Unmarshal(payloadBytes, &payloadMap); err != nil {
|
||||
slog.Error(fmt.Sprintf("failed to unmarshal payload JSON: %v", err))
|
||||
return ""
|
||||
}
|
||||
|
||||
sub, ok := payloadMap["sub"]
|
||||
if !ok {
|
||||
slog.Error("jwt does not contain 'sub' field")
|
||||
return ""
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s", sub)
|
||||
}
|
||||
|
||||
func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
|
||||
anonymous := true // access will default to anonymous if no user is found associated with the public key
|
||||
for i := 0; i < 2; i++ {
|
||||
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
|
||||
if err != nil {
|
||||
@@ -1158,6 +1016,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
anonymous = getTokenSubject(token) == "anonymous"
|
||||
regOpts.Token = token
|
||||
if body != nil {
|
||||
_, err = body.Seek(0, io.SeekStart)
|
||||
@@ -1178,6 +1037,16 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
|
||||
}
|
||||
}
|
||||
|
||||
if anonymous {
|
||||
// no user is associated with the public key, and the request requires non-anonymous access
|
||||
pubKey, nestedErr := auth.GetPublicKey()
|
||||
if nestedErr != nil {
|
||||
slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr))
|
||||
return nil, errUnauthorized
|
||||
}
|
||||
return nil, &errtypes.UnknownOllamaKey{Key: pubKey}
|
||||
}
|
||||
// user is associated with the public key, but is not authorized to make the request
|
||||
return nil, errUnauthorized
|
||||
}
|
||||
|
||||
|
||||
@@ -5,39 +5,14 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
type Layers struct {
|
||||
items []*Layer
|
||||
}
|
||||
|
||||
func (ls *Layers) Add(layer *Layer) {
|
||||
if layer.Size > 0 {
|
||||
ls.items = append(ls.items, layer)
|
||||
}
|
||||
}
|
||||
|
||||
func (ls *Layers) Replace(layer *Layer) {
|
||||
if layer.Size > 0 {
|
||||
mediatype := layer.MediaType
|
||||
layers := slices.DeleteFunc(ls.items, func(l *Layer) bool {
|
||||
return l.MediaType == mediatype
|
||||
})
|
||||
|
||||
ls.items = append(layers, layer)
|
||||
}
|
||||
}
|
||||
|
||||
type Layer struct {
|
||||
MediaType string `json:"mediaType"`
|
||||
Digest string `json:"digest"`
|
||||
Size int64 `json:"size"`
|
||||
From string `json:"from,omitempty"`
|
||||
|
||||
tempFileName string
|
||||
status string
|
||||
}
|
||||
|
||||
func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
|
||||
@@ -46,14 +21,12 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
const delimiter = "-"
|
||||
|
||||
pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
|
||||
temp, err := os.CreateTemp(blobs, pattern)
|
||||
temp, err := os.CreateTemp(blobs, "sha256-")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer temp.Close()
|
||||
defer os.Remove(temp.Name())
|
||||
|
||||
sha256sum := sha256.New()
|
||||
n, err := io.Copy(io.MultiWriter(temp, sha256sum), r)
|
||||
@@ -61,11 +34,29 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := temp.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil))
|
||||
blob, err := GetBlobsPath(digest)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status := "using existing layer"
|
||||
if _, err := os.Stat(blob); err != nil {
|
||||
status = "creating new layer"
|
||||
if err := os.Rename(temp.Name(), blob); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return &Layer{
|
||||
MediaType: mediatype,
|
||||
Digest: fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)),
|
||||
Size: n,
|
||||
tempFileName: temp.Name(),
|
||||
MediaType: mediatype,
|
||||
Digest: digest,
|
||||
Size: n,
|
||||
status: fmt.Sprintf("%s %s", status, digest),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -85,21 +76,15 @@ func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) {
|
||||
Digest: digest,
|
||||
Size: fi.Size(),
|
||||
From: from,
|
||||
status: fmt.Sprintf("using existing layer %s", digest),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (l *Layer) Commit() (bool, error) {
|
||||
// always remove temp
|
||||
defer os.Remove(l.tempFileName)
|
||||
|
||||
func (l *Layer) Open() (io.ReadCloser, error) {
|
||||
blob, err := GetBlobsPath(l.Digest)
|
||||
if err != nil {
|
||||
return false, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if _, err := os.Stat(blob); err != nil {
|
||||
return true, os.Rename(l.tempFileName, blob)
|
||||
}
|
||||
|
||||
return false, nil
|
||||
return os.Open(blob)
|
||||
}
|
||||
259
server/model.go
Normal file
259
server/model.go
Normal file
@@ -0,0 +1,259 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/convert"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
type layerWithGGML struct {
|
||||
*Layer
|
||||
*llm.GGML
|
||||
}
|
||||
|
||||
func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
|
||||
modelpath := ParseModelPath(name.String())
|
||||
manifest, _, err := GetManifest(modelpath)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
if err := PullModel(ctx, name.String(), ®istryOptions{}, fn); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
modelpath = ParseModelPath(name.String())
|
||||
manifest, _, err = GetManifest(modelpath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
case err != nil:
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, layer := range manifest.Layers {
|
||||
layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch layer.MediaType {
|
||||
case "application/vnd.ollama.image.model",
|
||||
"application/vnd.ollama.image.projector",
|
||||
"application/vnd.ollama.image.adapter":
|
||||
blobpath, err := GetBlobsPath(layer.Digest)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
blob, err := os.Open(blobpath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer blob.Close()
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(blob)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layers = append(layers, &layerWithGGML{layer, ggml})
|
||||
default:
|
||||
layers = append(layers, &layerWithGGML{layer, nil})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return layers, nil
|
||||
}
|
||||
|
||||
func parseFromZipFile(_ context.Context, file *os.File, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
|
||||
stat, err := file.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r, err := zip.NewReader(file, stat.Size())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.RemoveAll(tempdir)
|
||||
|
||||
fn(api.ProgressResponse{Status: "unpacking model metadata"})
|
||||
for _, f := range r.File {
|
||||
// TODO(mxyng): this should not write out all files to disk
|
||||
outfile, err := os.Create(filepath.Join(tempdir, f.Name))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
infile, err := f.Open()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if _, err = io.Copy(outfile, infile); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := outfile.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := infile.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
mf, err := convert.GetModelFormat(tempdir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
params, err := mf.GetParams(tempdir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mArch, err := mf.GetModelArch("", tempdir, params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "processing tensors"})
|
||||
if err := mArch.GetTensors(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := mArch.LoadVocab(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "converting model"})
|
||||
|
||||
// TODO(mxyng): this should write directly into a layer
|
||||
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
|
||||
temp, err := os.CreateTemp(tempdir, "fp16")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer temp.Close()
|
||||
defer os.Remove(temp.Name())
|
||||
|
||||
if err = mArch.WriteGGUF(temp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if _, err := temp.Seek(0, io.SeekStart); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layer, err := NewLayer(temp, "application/vnd.ollama.image.model")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("aaa: %w", err)
|
||||
}
|
||||
|
||||
blobpath, err := GetBlobsPath(layer.Digest)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
bin, err := os.Open(blobpath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer bin.Close()
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(bin)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layer, err = NewLayerFromLayer(layer.Digest, layer.MediaType, "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layers = append(layers, &layerWithGGML{layer, ggml})
|
||||
return layers, nil
|
||||
}
|
||||
|
||||
func parseFromFile(ctx context.Context, file *os.File, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
|
||||
sr := io.NewSectionReader(file, 0, 512)
|
||||
contentType, err := detectContentType(sr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch contentType {
|
||||
case "gguf", "ggla":
|
||||
// noop
|
||||
case "application/zip":
|
||||
return parseFromZipFile(ctx, file, fn)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported content type: %s", contentType)
|
||||
}
|
||||
|
||||
stat, err := file.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var offset int64
|
||||
for offset < stat.Size() {
|
||||
ggml, n, err := llm.DecodeGGML(file)
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mediatype := "application/vnd.ollama.image.model"
|
||||
if ggml.Name() == "ggla" {
|
||||
mediatype = "application/vnd.ollama.image.adapter"
|
||||
} else if ggml.KV().Architecture() == "clip" {
|
||||
mediatype = "application/vnd.ollama.image.projector"
|
||||
}
|
||||
|
||||
layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layers = append(layers, &layerWithGGML{layer, ggml})
|
||||
offset = n
|
||||
}
|
||||
|
||||
return layers, nil
|
||||
}
|
||||
|
||||
func detectContentType(r io.Reader) (string, error) {
|
||||
var b bytes.Buffer
|
||||
if _, err := io.Copy(&b, r); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
|
||||
return contentType, nil
|
||||
}
|
||||
|
||||
if contentType := http.DetectContentType(b.Bytes()); contentType != "application/octet-stream" {
|
||||
return contentType, nil
|
||||
}
|
||||
|
||||
return "unknown", nil
|
||||
}
|
||||
@@ -91,7 +91,7 @@ func countTokens(tmpl string, system string, prompt string, response string, enc
|
||||
}
|
||||
|
||||
// ChatPrompt builds up a prompt from a series of messages, truncating based on context window size
|
||||
func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(string) ([]int, error)) (string, int, error) {
|
||||
func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(string) ([]int, error)) (string, error) {
|
||||
type prompt struct {
|
||||
System string
|
||||
Prompt string
|
||||
@@ -138,7 +138,7 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
|
||||
|
||||
p.Response = msg.Content
|
||||
default:
|
||||
return "", 0, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
|
||||
return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,7 +151,7 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
|
||||
for i, p := range prompts {
|
||||
tokens, err := countTokens(tmpl, p.System, p.Prompt, p.Response, encode)
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
return "", err
|
||||
}
|
||||
|
||||
prompts[i].tokens = tokens + len(prompts[i].images)*768
|
||||
@@ -160,17 +160,15 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
|
||||
// truncate images and prompts starting from the beginning of the list
|
||||
// until either one prompt remains or the total tokens fits the context window
|
||||
// TODO (jmorganca): this doesn't account for the context window room required for the response
|
||||
var required int
|
||||
for {
|
||||
required = 0
|
||||
var required int
|
||||
for _, p := range prompts {
|
||||
required += p.tokens
|
||||
}
|
||||
|
||||
required += 1 // for bos token
|
||||
|
||||
// leave ~1024 tokens for generation
|
||||
if required <= max(1024, window/2) {
|
||||
if required <= window {
|
||||
slog.Debug("prompt now fits in context window", "required", required, "window", window)
|
||||
break
|
||||
}
|
||||
@@ -196,7 +194,7 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
|
||||
|
||||
tokens, err := countTokens(tmpl, prompts[0].System, prompts[0].Prompt, prompts[0].Response, encode)
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
return "", err
|
||||
}
|
||||
|
||||
prompts[0].tokens = tokens + len(prompts[0].images)*768
|
||||
@@ -214,10 +212,10 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
|
||||
// last prompt should leave the response unrendered (for completion)
|
||||
rendered, err := Prompt(tmpl, p.System, p.Prompt, p.Response, i == len(prompts)-1)
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
return "", err
|
||||
}
|
||||
sb.WriteString(rendered)
|
||||
}
|
||||
|
||||
return sb.String(), required, nil
|
||||
return sb.String(), nil
|
||||
}
|
||||
|
||||
@@ -192,7 +192,7 @@ func TestChatPrompt(t *testing.T) {
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, _, err := ChatPrompt(tc.template, tc.messages, tc.window, encode)
|
||||
got, err := ChatPrompt(tc.template, tc.messages, tc.window, encode)
|
||||
if err != nil {
|
||||
t.Errorf("error = %v", err)
|
||||
}
|
||||
|
||||
349
server/routes.go
349
server/routes.go
@@ -1,6 +1,7 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
@@ -15,11 +16,8 @@ import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -31,14 +29,15 @@ import (
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/openai"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/version"
|
||||
)
|
||||
|
||||
var mode string = gin.DebugMode
|
||||
|
||||
type Server struct {
|
||||
addr net.Addr
|
||||
addr net.Addr
|
||||
sched *Scheduler
|
||||
}
|
||||
|
||||
func init() {
|
||||
@@ -53,88 +52,8 @@ func init() {
|
||||
gin.SetMode(mode)
|
||||
}
|
||||
|
||||
var loaded struct {
|
||||
mu sync.Mutex
|
||||
|
||||
llama *llm.LlamaServer
|
||||
|
||||
expireTimer *time.Timer
|
||||
|
||||
model string
|
||||
adapters []string
|
||||
projectors []string
|
||||
*api.Options
|
||||
}
|
||||
|
||||
var defaultSessionDuration = 5 * time.Minute
|
||||
|
||||
func unload() {
|
||||
if loaded.llama != nil {
|
||||
loaded.llama.Close()
|
||||
}
|
||||
|
||||
loaded.llama = nil
|
||||
loaded.model = ""
|
||||
loaded.adapters = nil
|
||||
loaded.projectors = nil
|
||||
loaded.Options = nil
|
||||
}
|
||||
|
||||
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
|
||||
func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
|
||||
ctx, cancel := context.WithTimeout(c, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
needLoad := loaded.llama == nil || // is there a model loaded?
|
||||
loaded.model != model.ModelPath || // has the base model changed?
|
||||
!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
|
||||
!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed?
|
||||
!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
|
||||
loaded.llama.Ping(ctx) != nil
|
||||
|
||||
if needLoad {
|
||||
if loaded.llama != nil {
|
||||
slog.Info("changing loaded model")
|
||||
unload()
|
||||
}
|
||||
|
||||
llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
|
||||
if err != nil {
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
// check for model compatibility
|
||||
if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") {
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
loaded.model = model.ModelPath
|
||||
loaded.adapters = model.AdapterPaths
|
||||
loaded.projectors = model.ProjectorPaths
|
||||
loaded.llama = llama
|
||||
loaded.Options = &opts
|
||||
|
||||
if err = llama.WaitUntilRunning(); err != nil {
|
||||
slog.Error("error loading llama server", "error", err)
|
||||
unload()
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if loaded.expireTimer == nil {
|
||||
loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
|
||||
loaded.mu.Lock()
|
||||
defer loaded.mu.Unlock()
|
||||
unload()
|
||||
})
|
||||
}
|
||||
|
||||
loaded.expireTimer.Reset(sessionDuration)
|
||||
return nil
|
||||
}
|
||||
|
||||
func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
|
||||
opts := api.DefaultOptions()
|
||||
if err := opts.FromMap(model.Options); err != nil {
|
||||
@@ -154,9 +73,7 @@ func isSupportedImageType(image []byte) bool {
|
||||
return slices.Contains(allowedTypes, contentType)
|
||||
}
|
||||
|
||||
func GenerateHandler(c *gin.Context) {
|
||||
loaded.mu.Lock()
|
||||
defer loaded.mu.Unlock()
|
||||
func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
|
||||
checkpointStart := time.Now()
|
||||
var req api.GenerateRequest
|
||||
@@ -224,7 +141,16 @@ func GenerateHandler(c *gin.Context) {
|
||||
sessionDuration = req.KeepAlive.Duration
|
||||
}
|
||||
|
||||
if err := load(c, model, opts, sessionDuration); err != nil {
|
||||
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
|
||||
var runner *runnerRef
|
||||
select {
|
||||
case runner = <-rCh:
|
||||
case err = <-eCh:
|
||||
if errors.Is(err, context.Canceled) {
|
||||
c.JSON(499, gin.H{"error": "request canceled"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -234,10 +160,9 @@ func GenerateHandler(c *gin.Context) {
|
||||
// of `raw` mode so we need to check for it too
|
||||
if req.Prompt == "" && req.Template == "" && req.System == "" {
|
||||
c.JSON(http.StatusOK, api.GenerateResponse{
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Model: req.Model,
|
||||
Done: true,
|
||||
DoneReason: "load",
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Model: req.Model,
|
||||
Done: true,
|
||||
})
|
||||
return
|
||||
}
|
||||
@@ -276,7 +201,7 @@ func GenerateHandler(c *gin.Context) {
|
||||
|
||||
sb.Reset()
|
||||
if req.Context != nil {
|
||||
prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
|
||||
prev, err := runner.llama.Detokenize(c.Request.Context(), req.Context)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
@@ -290,14 +215,6 @@ func GenerateHandler(c *gin.Context) {
|
||||
prompt = sb.String()
|
||||
}
|
||||
|
||||
tokens, err := loaded.llama.Tokenize(c.Request.Context(), prompt)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
opts.NumPredict = max(opts.NumCtx-len(tokens), 0)
|
||||
|
||||
slog.Debug("generate handler", "prompt", prompt)
|
||||
|
||||
ch := make(chan any)
|
||||
@@ -306,9 +223,6 @@ func GenerateHandler(c *gin.Context) {
|
||||
defer close(ch)
|
||||
|
||||
fn := func(r llm.CompletionResponse) {
|
||||
// Update model expiration
|
||||
loaded.expireTimer.Reset(sessionDuration)
|
||||
|
||||
// Build up the full response
|
||||
if _, err := generated.WriteString(r.Content); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
@@ -316,11 +230,10 @@ func GenerateHandler(c *gin.Context) {
|
||||
}
|
||||
|
||||
resp := api.GenerateResponse{
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Done: r.Done,
|
||||
DoneReason: r.DoneReason,
|
||||
Response: r.Content,
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Done: r.Done,
|
||||
Response: r.Content,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: r.PromptEvalCount,
|
||||
PromptEvalDuration: r.PromptEvalDuration,
|
||||
@@ -341,7 +254,7 @@ func GenerateHandler(c *gin.Context) {
|
||||
}
|
||||
|
||||
// TODO (jmorganca): encode() should not strip special tokens
|
||||
tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
|
||||
tokens, err := runner.llama.Tokenize(c.Request.Context(), p)
|
||||
if err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
return
|
||||
@@ -369,7 +282,7 @@ func GenerateHandler(c *gin.Context) {
|
||||
Images: images,
|
||||
Options: opts,
|
||||
}
|
||||
if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
|
||||
if err := runner.llama.Completion(c.Request.Context(), req, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}()
|
||||
@@ -431,10 +344,7 @@ func getDefaultSessionDuration() time.Duration {
|
||||
return defaultSessionDuration
|
||||
}
|
||||
|
||||
func EmbeddingsHandler(c *gin.Context) {
|
||||
loaded.mu.Lock()
|
||||
defer loaded.mu.Unlock()
|
||||
|
||||
func (s *Server) EmbeddingsHandler(c *gin.Context) {
|
||||
var req api.EmbeddingRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
@@ -479,7 +389,16 @@ func EmbeddingsHandler(c *gin.Context) {
|
||||
sessionDuration = req.KeepAlive.Duration
|
||||
}
|
||||
|
||||
if err := load(c, model, opts, sessionDuration); err != nil {
|
||||
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
|
||||
var runner *runnerRef
|
||||
select {
|
||||
case runner = <-rCh:
|
||||
case err = <-eCh:
|
||||
if errors.Is(err, context.Canceled) {
|
||||
c.JSON(499, gin.H{"error": "request canceled"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -490,7 +409,7 @@ func EmbeddingsHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
|
||||
embedding, err := runner.llama.Embedding(c.Request.Context(), req.Prompt)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
|
||||
@@ -503,7 +422,7 @@ func EmbeddingsHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func PullModelHandler(c *gin.Context) {
|
||||
func (s *Server) PullModelHandler(c *gin.Context) {
|
||||
var req api.PullRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
@@ -552,7 +471,7 @@ func PullModelHandler(c *gin.Context) {
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
func PushModelHandler(c *gin.Context) {
|
||||
func (s *Server) PushModelHandler(c *gin.Context) {
|
||||
var req api.PushRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
@@ -601,30 +520,19 @@ func PushModelHandler(c *gin.Context) {
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
func CreateModelHandler(c *gin.Context) {
|
||||
func (s *Server) CreateModelHandler(c *gin.Context) {
|
||||
var req api.CreateRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
case errors.Is(err, io.EOF):
|
||||
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
|
||||
return
|
||||
case err != nil:
|
||||
} else if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
var model string
|
||||
if req.Model != "" {
|
||||
model = req.Model
|
||||
} else if req.Name != "" {
|
||||
model = req.Name
|
||||
} else {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := ParseModelPath(model).Validate(); err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
name := model.ParseName(cmp.Or(req.Model, req.Name))
|
||||
if !name.IsValid() {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -633,19 +541,19 @@ func CreateModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
var modelfile io.Reader = strings.NewReader(req.Modelfile)
|
||||
var r io.Reader = strings.NewReader(req.Modelfile)
|
||||
if req.Path != "" && req.Modelfile == "" {
|
||||
mf, err := os.Open(req.Path)
|
||||
f, err := os.Open(req.Path)
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
|
||||
return
|
||||
}
|
||||
defer mf.Close()
|
||||
defer f.Close()
|
||||
|
||||
modelfile = mf
|
||||
r = f
|
||||
}
|
||||
|
||||
commands, err := parser.Parse(modelfile)
|
||||
modelfile, err := model.ParseFile(r)
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
@@ -661,7 +569,7 @@ func CreateModelHandler(c *gin.Context) {
|
||||
ctx, cancel := context.WithCancel(c.Request.Context())
|
||||
defer cancel()
|
||||
|
||||
if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
|
||||
if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}()
|
||||
@@ -674,7 +582,7 @@ func CreateModelHandler(c *gin.Context) {
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
func DeleteModelHandler(c *gin.Context) {
|
||||
func (s *Server) DeleteModelHandler(c *gin.Context) {
|
||||
var req api.DeleteRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
@@ -719,7 +627,7 @@ func DeleteModelHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, nil)
|
||||
}
|
||||
|
||||
func ShowModelHandler(c *gin.Context) {
|
||||
func (s *Server) ShowModelHandler(c *gin.Context) {
|
||||
var req api.ShowRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
@@ -809,17 +717,17 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
}
|
||||
}
|
||||
|
||||
mf, err := ShowModelfile(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp.Modelfile = mf
|
||||
var sb strings.Builder
|
||||
fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
|
||||
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
|
||||
fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
|
||||
fmt.Fprint(&sb, model.String())
|
||||
resp.Modelfile = sb.String()
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func ListModelsHandler(c *gin.Context) {
|
||||
func (s *Server) ListModelsHandler(c *gin.Context) {
|
||||
models := make([]api.ModelResponse, 0)
|
||||
manifestsPath, err := GetManifestPath()
|
||||
if err != nil {
|
||||
@@ -879,39 +787,36 @@ func ListModelsHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, api.ListResponse{Models: models})
|
||||
}
|
||||
|
||||
func CopyModelHandler(c *gin.Context) {
|
||||
var req api.CopyRequest
|
||||
err := c.ShouldBindJSON(&req)
|
||||
switch {
|
||||
case errors.Is(err, io.EOF):
|
||||
func (s *Server) CopyModelHandler(c *gin.Context) {
|
||||
var r api.CopyRequest
|
||||
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
|
||||
return
|
||||
case err != nil:
|
||||
} else if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
if req.Source == "" || req.Destination == "" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "source add destination are required"})
|
||||
src := model.ParseName(r.Source)
|
||||
if !src.IsValid() {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
|
||||
return
|
||||
}
|
||||
|
||||
if err := ParseModelPath(req.Destination).Validate(); err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
dst := model.ParseName(r.Destination)
|
||||
if !dst.IsValid() {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Source)})
|
||||
return
|
||||
}
|
||||
|
||||
if err := CopyModel(req.Source, req.Destination); err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Source)})
|
||||
} else {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
}
|
||||
return
|
||||
if err := CopyModel(src, dst); errors.Is(err, os.ErrNotExist) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found", r.Source)})
|
||||
} else if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
}
|
||||
}
|
||||
|
||||
func HeadBlobHandler(c *gin.Context) {
|
||||
func (s *Server) HeadBlobHandler(c *gin.Context) {
|
||||
path, err := GetBlobsPath(c.Param("digest"))
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
@@ -926,7 +831,7 @@ func HeadBlobHandler(c *gin.Context) {
|
||||
c.Status(http.StatusOK)
|
||||
}
|
||||
|
||||
func CreateBlobHandler(c *gin.Context) {
|
||||
func (s *Server) CreateBlobHandler(c *gin.Context) {
|
||||
path, err := GetBlobsPath(c.Param("digest"))
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
@@ -956,11 +861,6 @@ func CreateBlobHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := layer.Commit(); err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.Status(http.StatusCreated)
|
||||
}
|
||||
|
||||
@@ -1073,27 +973,27 @@ func (s *Server) GenerateRoutes() http.Handler {
|
||||
allowedHostsMiddleware(s.addr),
|
||||
)
|
||||
|
||||
r.POST("/api/pull", PullModelHandler)
|
||||
r.POST("/api/generate", GenerateHandler)
|
||||
r.POST("/api/chat", ChatHandler)
|
||||
r.POST("/api/embeddings", EmbeddingsHandler)
|
||||
r.POST("/api/create", CreateModelHandler)
|
||||
r.POST("/api/push", PushModelHandler)
|
||||
r.POST("/api/copy", CopyModelHandler)
|
||||
r.DELETE("/api/delete", DeleteModelHandler)
|
||||
r.POST("/api/show", ShowModelHandler)
|
||||
r.POST("/api/blobs/:digest", CreateBlobHandler)
|
||||
r.HEAD("/api/blobs/:digest", HeadBlobHandler)
|
||||
r.POST("/api/pull", s.PullModelHandler)
|
||||
r.POST("/api/generate", s.GenerateHandler)
|
||||
r.POST("/api/chat", s.ChatHandler)
|
||||
r.POST("/api/embeddings", s.EmbeddingsHandler)
|
||||
r.POST("/api/create", s.CreateModelHandler)
|
||||
r.POST("/api/push", s.PushModelHandler)
|
||||
r.POST("/api/copy", s.CopyModelHandler)
|
||||
r.DELETE("/api/delete", s.DeleteModelHandler)
|
||||
r.POST("/api/show", s.ShowModelHandler)
|
||||
r.POST("/api/blobs/:digest", s.CreateBlobHandler)
|
||||
r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
|
||||
|
||||
// Compatibility endpoints
|
||||
r.POST("/v1/chat/completions", openai.Middleware(), ChatHandler)
|
||||
r.POST("/v1/chat/completions", openai.Middleware(), s.ChatHandler)
|
||||
|
||||
for _, method := range []string{http.MethodGet, http.MethodHead} {
|
||||
r.Handle(method, "/", func(c *gin.Context) {
|
||||
c.String(http.StatusOK, "Ollama is running")
|
||||
})
|
||||
|
||||
r.Handle(method, "/api/tags", ListModelsHandler)
|
||||
r.Handle(method, "/api/tags", s.ListModelsHandler)
|
||||
r.Handle(method, "/api/version", func(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, gin.H{"version": version.Version})
|
||||
})
|
||||
@@ -1147,7 +1047,9 @@ func Serve(ln net.Listener) error {
|
||||
}
|
||||
}
|
||||
|
||||
s := &Server{addr: ln.Addr()}
|
||||
ctx, done := context.WithCancel(context.Background())
|
||||
sched := InitScheduler(ctx)
|
||||
s := &Server{addr: ln.Addr(), sched: sched}
|
||||
r := s.GenerateRoutes()
|
||||
|
||||
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
||||
@@ -1160,7 +1062,8 @@ func Serve(ln net.Listener) error {
|
||||
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
||||
go func() {
|
||||
<-signals
|
||||
unload()
|
||||
done()
|
||||
sched.unloadAllRunners()
|
||||
gpu.Cleanup()
|
||||
os.Exit(0)
|
||||
}()
|
||||
@@ -1168,12 +1071,12 @@ func Serve(ln net.Listener) error {
|
||||
if err := llm.Init(); err != nil {
|
||||
return fmt.Errorf("unable to initialize llm library %w", err)
|
||||
}
|
||||
if runtime.GOOS == "linux" { // TODO - windows too
|
||||
// check compatibility to log warnings
|
||||
if _, err := gpu.CheckVRAM(); err != nil {
|
||||
slog.Info(err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
s.sched.Run(ctx)
|
||||
|
||||
// At startup we retrieve GPU information so we can get log messages before loading a model
|
||||
// This will log warnings to the log in case we have problems with detected GPUs
|
||||
_ = gpu.GetGPUInfo()
|
||||
|
||||
return srvr.Serve(ln)
|
||||
}
|
||||
@@ -1229,23 +1132,20 @@ func streamResponse(c *gin.Context, ch chan any) {
|
||||
}
|
||||
|
||||
// ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
|
||||
func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, int, error) {
|
||||
func chatPrompt(ctx context.Context, runner *runnerRef, template string, messages []api.Message, numCtx int) (string, error) {
|
||||
encode := func(s string) ([]int, error) {
|
||||
return loaded.llama.Tokenize(ctx, s)
|
||||
return runner.llama.Tokenize(ctx, s)
|
||||
}
|
||||
|
||||
prompt, tokens, err := ChatPrompt(template, messages, numCtx, encode)
|
||||
prompt, err := ChatPrompt(template, messages, numCtx, encode)
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
return "", err
|
||||
}
|
||||
|
||||
return prompt, tokens, nil
|
||||
return prompt, nil
|
||||
}
|
||||
|
||||
func ChatHandler(c *gin.Context) {
|
||||
loaded.mu.Lock()
|
||||
defer loaded.mu.Unlock()
|
||||
|
||||
func (s *Server) ChatHandler(c *gin.Context) {
|
||||
checkpointStart := time.Now()
|
||||
|
||||
var req api.ChatRequest
|
||||
@@ -1302,7 +1202,16 @@ func ChatHandler(c *gin.Context) {
|
||||
sessionDuration = req.KeepAlive.Duration
|
||||
}
|
||||
|
||||
if err := load(c, model, opts, sessionDuration); err != nil {
|
||||
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
|
||||
var runner *runnerRef
|
||||
select {
|
||||
case runner = <-rCh:
|
||||
case err = <-eCh:
|
||||
if errors.Is(err, context.Canceled) {
|
||||
c.JSON(499, gin.H{"error": "request canceled"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -1319,22 +1228,19 @@ func ChatHandler(c *gin.Context) {
|
||||
}, req.Messages...)
|
||||
}
|
||||
|
||||
prompt, tokens, err := chatPrompt(c.Request.Context(), model.Template, req.Messages, opts.NumCtx)
|
||||
prompt, err := chatPrompt(c.Request.Context(), runner, model.Template, req.Messages, opts.NumCtx)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
opts.NumPredict = max(opts.NumCtx-tokens, 0)
|
||||
|
||||
// an empty request loads the model
|
||||
if len(req.Messages) == 0 || prompt == "" {
|
||||
resp := api.ChatResponse{
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Model: req.Model,
|
||||
Done: true,
|
||||
DoneReason: "load",
|
||||
Message: api.Message{Role: "assistant"},
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Model: req.Model,
|
||||
Done: true,
|
||||
Message: api.Message{Role: "assistant"},
|
||||
}
|
||||
c.JSON(http.StatusOK, resp)
|
||||
return
|
||||
@@ -1365,15 +1271,12 @@ func ChatHandler(c *gin.Context) {
|
||||
defer close(ch)
|
||||
|
||||
fn := func(r llm.CompletionResponse) {
|
||||
// Update model expiration
|
||||
loaded.expireTimer.Reset(sessionDuration)
|
||||
|
||||
resp := api.ChatResponse{
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Message: api.Message{Role: "assistant", Content: r.Content},
|
||||
Done: r.Done,
|
||||
DoneReason: r.DoneReason,
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Message: api.Message{Role: "assistant", Content: r.Content},
|
||||
Done: r.Done,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: r.PromptEvalCount,
|
||||
PromptEvalDuration: r.PromptEvalDuration,
|
||||
@@ -1390,7 +1293,7 @@ func ChatHandler(c *gin.Context) {
|
||||
ch <- resp
|
||||
}
|
||||
|
||||
if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
|
||||
if err := runner.llama.Completion(c.Request.Context(), llm.CompletionRequest{
|
||||
Prompt: prompt,
|
||||
Format: req.Format,
|
||||
Images: images,
|
||||
|
||||
@@ -17,7 +17,7 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/version"
|
||||
)
|
||||
|
||||
@@ -55,13 +55,13 @@ func Test_Routes(t *testing.T) {
|
||||
createTestModel := func(t *testing.T, name string) {
|
||||
fname := createTestFile(t, "ollama-model")
|
||||
|
||||
modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
|
||||
commands, err := parser.Parse(modelfile)
|
||||
r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
|
||||
modelfile, err := model.ParseFile(r)
|
||||
assert.Nil(t, err)
|
||||
fn := func(resp api.ProgressResponse) {
|
||||
t.Logf("Status: %s", resp.Status)
|
||||
}
|
||||
err = CreateModel(context.TODO(), name, "", "", commands, fn)
|
||||
err = CreateModel(context.TODO(), name, "", "", modelfile, fn)
|
||||
assert.Nil(t, err)
|
||||
}
|
||||
|
||||
@@ -124,14 +124,12 @@ func Test_Routes(t *testing.T) {
|
||||
Method: http.MethodPost,
|
||||
Path: "/api/create",
|
||||
Setup: func(t *testing.T, req *http.Request) {
|
||||
f, err := os.CreateTemp(t.TempDir(), "ollama-model")
|
||||
assert.Nil(t, err)
|
||||
defer f.Close()
|
||||
fname := createTestFile(t, "ollama-model")
|
||||
|
||||
stream := false
|
||||
createReq := api.CreateRequest{
|
||||
Name: "t-bone",
|
||||
Modelfile: fmt.Sprintf("FROM %s", f.Name()),
|
||||
Modelfile: fmt.Sprintf("FROM %s", fname),
|
||||
Stream: &stream,
|
||||
}
|
||||
jsonData, err := json.Marshal(createReq)
|
||||
@@ -216,28 +214,25 @@ func Test_Routes(t *testing.T) {
|
||||
httpSrv := httptest.NewServer(router)
|
||||
t.Cleanup(httpSrv.Close)
|
||||
|
||||
workDir, err := os.MkdirTemp("", "ollama-test")
|
||||
assert.Nil(t, err)
|
||||
defer os.RemoveAll(workDir)
|
||||
os.Setenv("OLLAMA_MODELS", workDir)
|
||||
t.Setenv("OLLAMA_MODELS", t.TempDir())
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Logf("Running Test: [%s]", tc.Name)
|
||||
u := httpSrv.URL + tc.Path
|
||||
req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil)
|
||||
assert.Nil(t, err)
|
||||
t.Run(tc.Name, func(t *testing.T) {
|
||||
u := httpSrv.URL + tc.Path
|
||||
req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil)
|
||||
assert.Nil(t, err)
|
||||
|
||||
if tc.Setup != nil {
|
||||
tc.Setup(t, req)
|
||||
}
|
||||
if tc.Setup != nil {
|
||||
tc.Setup(t, req)
|
||||
}
|
||||
|
||||
resp, err := httpSrv.Client().Do(req)
|
||||
assert.Nil(t, err)
|
||||
defer resp.Body.Close()
|
||||
|
||||
if tc.Expected != nil {
|
||||
tc.Expected(t, resp)
|
||||
}
|
||||
resp, err := httpSrv.Client().Do(req)
|
||||
assert.Nil(t, err)
|
||||
defer resp.Body.Close()
|
||||
|
||||
if tc.Expected != nil {
|
||||
tc.Expected(t, resp)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
563
server/sched.go
Normal file
563
server/sched.go
Normal file
@@ -0,0 +1,563 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
type LlmRequest struct {
|
||||
ctx context.Context //nolint:containedctx
|
||||
model *Model
|
||||
opts api.Options
|
||||
sessionDuration time.Duration
|
||||
successCh chan *runnerRef
|
||||
errCh chan error
|
||||
}
|
||||
|
||||
type Scheduler struct {
|
||||
pendingReqCh chan *LlmRequest
|
||||
finishedReqCh chan *LlmRequest
|
||||
expiredCh chan *runnerRef
|
||||
unloadedCh chan interface{}
|
||||
|
||||
loaded map[string]*runnerRef
|
||||
loadedMu sync.Mutex
|
||||
|
||||
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
|
||||
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
|
||||
getGpuFn func() gpu.GpuInfoList
|
||||
}
|
||||
|
||||
// TODO set this to zero after a release or two, to enable multiple models by default
|
||||
var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
|
||||
var maxQueuedRequests = 10 // TODO configurable
|
||||
var numParallel = 1
|
||||
|
||||
func InitScheduler(ctx context.Context) *Scheduler {
|
||||
maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
|
||||
if maxRunners != "" {
|
||||
m, err := strconv.Atoi(maxRunners)
|
||||
if err != nil {
|
||||
slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
|
||||
} else {
|
||||
loadedMax = m
|
||||
}
|
||||
}
|
||||
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||
p, err := strconv.Atoi(onp)
|
||||
if err != nil || p <= 0 {
|
||||
slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
|
||||
} else {
|
||||
numParallel = p
|
||||
}
|
||||
}
|
||||
|
||||
sched := &Scheduler{
|
||||
pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
|
||||
finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
|
||||
expiredCh: make(chan *runnerRef, maxQueuedRequests),
|
||||
unloadedCh: make(chan interface{}, maxQueuedRequests),
|
||||
loaded: make(map[string]*runnerRef),
|
||||
newServerFn: llm.NewLlamaServer,
|
||||
getGpuFn: gpu.GetGPUInfo,
|
||||
}
|
||||
sched.loadFn = sched.load
|
||||
return sched
|
||||
}
|
||||
|
||||
// context must be canceled to decrement ref count and release the runner
|
||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
|
||||
req := &LlmRequest{
|
||||
ctx: c,
|
||||
model: model,
|
||||
opts: opts,
|
||||
sessionDuration: sessionDuration,
|
||||
successCh: make(chan *runnerRef),
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
// context split across parallel threads
|
||||
opts.NumCtx = opts.NumCtx * numParallel
|
||||
select {
|
||||
case s.pendingReqCh <- req:
|
||||
default:
|
||||
req.errCh <- fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
|
||||
}
|
||||
return req.successCh, req.errCh
|
||||
}
|
||||
|
||||
// Returns immediately, spawns go routines for the scheduler which will shutdown when ctx is done
|
||||
func (s *Scheduler) Run(ctx context.Context) {
|
||||
slog.Debug("starting llm scheduler")
|
||||
go func() {
|
||||
s.processPending(ctx)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
s.processCompleted(ctx)
|
||||
}()
|
||||
}
|
||||
|
||||
func (s *Scheduler) processPending(ctx context.Context) {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Debug("shutting down scheduler pending loop")
|
||||
return
|
||||
case pending := <-s.pendingReqCh:
|
||||
// Block other requests until we get this pending request running
|
||||
for {
|
||||
var runnerToExpire *runnerRef
|
||||
s.loadedMu.Lock()
|
||||
runner := s.loaded[pending.model.ModelPath]
|
||||
loadedCount := len(s.loaded)
|
||||
s.loadedMu.Unlock()
|
||||
if runner != nil {
|
||||
if runner.needsReload(ctx, pending) {
|
||||
runnerToExpire = runner
|
||||
} else {
|
||||
// Runner is usable, return it
|
||||
pending.useLoadedRunner(runner, s.finishedReqCh)
|
||||
break
|
||||
}
|
||||
} else if loadedMax > 0 && loadedCount >= loadedMax {
|
||||
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
|
||||
runnerToExpire = s.findRunnerToUnload(pending)
|
||||
} else {
|
||||
// Either no models are loaded or below loadedMax
|
||||
// Get a refreshed GPU list
|
||||
gpus := s.getGpuFn()
|
||||
|
||||
// Load model for fitting
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
||||
if err != nil {
|
||||
pending.errCh <- err
|
||||
break
|
||||
}
|
||||
|
||||
// If we're CPU only mode, just limit by loadedMax above
|
||||
// TODO handle system memory exhaustion
|
||||
if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
|
||||
slog.Debug("cpu mode with existing models, loading")
|
||||
s.loadFn(pending, ggml, gpus)
|
||||
break
|
||||
}
|
||||
|
||||
// No models loaded. Load the model but prefer the best fit.
|
||||
if loadedCount == 0 {
|
||||
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
||||
g := pickBestFitGPUs(pending, ggml, gpus)
|
||||
if g != nil {
|
||||
gpus = g
|
||||
}
|
||||
s.loadFn(pending, ggml, gpus)
|
||||
break
|
||||
}
|
||||
|
||||
// More than one loaded model, so we have to see if the new one fits
|
||||
// Update free memory from currently loaded models
|
||||
s.updateFreeSpace(gpus)
|
||||
gpus = pickBestFitGPUs(pending, ggml, gpus)
|
||||
if gpus != nil {
|
||||
slog.Debug("new model fits with existing models, loading")
|
||||
s.loadFn(pending, ggml, gpus)
|
||||
break
|
||||
}
|
||||
runnerToExpire = s.findRunnerToUnload(pending)
|
||||
}
|
||||
|
||||
if runnerToExpire == nil {
|
||||
// Shouildn't happen
|
||||
slog.Error("runner to expire was nil!")
|
||||
continue
|
||||
}
|
||||
// Trigger an expiration to unload once it's done
|
||||
runnerToExpire.refMu.Lock()
|
||||
slog.Debug("resetting model to expire immediately to make room", "model", runnerToExpire.model, "refCount", runnerToExpire.refCount)
|
||||
if runnerToExpire.expireTimer != nil {
|
||||
runnerToExpire.expireTimer.Stop()
|
||||
runnerToExpire.expireTimer = nil
|
||||
}
|
||||
runnerToExpire.sessionDuration = 0
|
||||
if runnerToExpire.refCount <= 0 {
|
||||
s.expiredCh <- runnerToExpire
|
||||
}
|
||||
runnerToExpire.refMu.Unlock()
|
||||
// Wait for the unload to happen
|
||||
// Note: at this point we're queueing up all incoming requests, even if they were for
|
||||
// a different model that's loaded and not scheduled to be removed.
|
||||
slog.Debug("waiting for pending requests to complete and unload to occur", "model", runnerToExpire.model)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Debug("shutting down scheduler pending loop")
|
||||
return
|
||||
case <-s.unloadedCh:
|
||||
slog.Debug("unload completed", "model", runnerToExpire.model)
|
||||
continue
|
||||
}
|
||||
}
|
||||
case <-s.unloadedCh:
|
||||
// An unload request when there are no pending request can be ignored
|
||||
slog.Debug("ignoring unload event with no pending requests")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scheduler) processCompleted(ctx context.Context) {
|
||||
// Process completed requests, expired timers, and unloading models
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Debug("shutting down scheduler completed loop")
|
||||
return
|
||||
case finished := <-s.finishedReqCh:
|
||||
s.loadedMu.Lock()
|
||||
runner := s.loaded[finished.model.ModelPath]
|
||||
s.loadedMu.Unlock()
|
||||
if runner == nil {
|
||||
slog.Error("finished requeset signal received after model unloaded", "model", finished.model.ModelPath)
|
||||
continue
|
||||
}
|
||||
runner.refMu.Lock()
|
||||
runner.refCount--
|
||||
if runner.refCount <= 0 {
|
||||
if runner.sessionDuration <= 0 {
|
||||
slog.Debug("runner with zero duration has gone idle, expiring to unload", "model", runner.model)
|
||||
if runner.expireTimer != nil {
|
||||
runner.expireTimer.Stop()
|
||||
runner.expireTimer = nil
|
||||
}
|
||||
s.expiredCh <- runner
|
||||
} else if runner.expireTimer == nil {
|
||||
slog.Debug("runner with non-zero duration has gone idle, adding timer", "model", runner.model, "duration", runner.sessionDuration)
|
||||
runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() {
|
||||
slog.Debug("timer expired, expiring to unload", "model", runner.model)
|
||||
runner.refMu.Lock()
|
||||
defer runner.refMu.Unlock()
|
||||
if runner.expireTimer != nil {
|
||||
runner.expireTimer.Stop()
|
||||
runner.expireTimer = nil
|
||||
}
|
||||
s.expiredCh <- runner
|
||||
})
|
||||
} else {
|
||||
slog.Debug("runner with non-zero duration has gone idle, resetting timer", "model", runner.model, "duration", runner.sessionDuration)
|
||||
runner.expireTimer.Reset(runner.sessionDuration)
|
||||
}
|
||||
}
|
||||
slog.Debug("after processing request finished event", "model", runner.model, "refCount", runner.refCount)
|
||||
runner.refMu.Unlock()
|
||||
case runner := <-s.expiredCh:
|
||||
slog.Debug("runner expired event received", "model", runner.model)
|
||||
runner.refMu.Lock()
|
||||
if runner.refCount > 0 {
|
||||
// Shouldn't happen, but safeguard to ensure no leaked runners
|
||||
slog.Debug("expired event with positive ref count, retrying", "model", runner.model, "refCount", runner.refCount)
|
||||
go func(runner *runnerRef) {
|
||||
// We can't unload yet, but want to as soon as the current request completes
|
||||
// So queue up another expired event
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
s.expiredCh <- runner
|
||||
}(runner)
|
||||
runner.refMu.Unlock()
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Debug("got lock to unload", "model", runner.model)
|
||||
runner.unload()
|
||||
s.loadedMu.Lock()
|
||||
delete(s.loaded, runner.model)
|
||||
s.loadedMu.Unlock()
|
||||
slog.Debug("runner released", "model", runner.model)
|
||||
runner.refMu.Unlock()
|
||||
slog.Debug("sending an unloaded event", "model", runner.model)
|
||||
s.unloadedCh <- struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Complete the pending request and send the runner back to the requester
|
||||
// Wires up a finished event after the request context is completed
|
||||
// Updates session duration, and resets expiration timer
|
||||
func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *LlmRequest) {
|
||||
runner.refMu.Lock()
|
||||
defer runner.refMu.Unlock()
|
||||
runner.refCount++
|
||||
if runner.expireTimer != nil {
|
||||
runner.expireTimer.Stop()
|
||||
runner.expireTimer = nil
|
||||
}
|
||||
runner.sessionDuration = pending.sessionDuration
|
||||
pending.successCh <- runner
|
||||
go func() {
|
||||
<-pending.ctx.Done()
|
||||
slog.Debug("context for request finished")
|
||||
finished <- pending
|
||||
}()
|
||||
}
|
||||
|
||||
func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
|
||||
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
|
||||
if err != nil {
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
// check for model compatibility
|
||||
if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") {
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
|
||||
}
|
||||
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
|
||||
req.errCh <- err
|
||||
return
|
||||
}
|
||||
runner := &runnerRef{}
|
||||
runner.model = req.model.ModelPath
|
||||
runner.adapters = req.model.AdapterPaths
|
||||
runner.projectors = req.model.ProjectorPaths
|
||||
runner.llama = llama
|
||||
runner.Options = &req.opts
|
||||
runner.sessionDuration = req.sessionDuration
|
||||
runner.gpus = gpus
|
||||
runner.estimatedVRAM = llama.EstimatedVRAM()
|
||||
runner.loading = true
|
||||
runner.refCount = 1
|
||||
runner.refMu.Lock()
|
||||
s.loadedMu.Lock()
|
||||
s.loaded[req.model.ModelPath] = runner
|
||||
slog.Info("loaded runners", "count", len(s.loaded))
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
go func() {
|
||||
defer runner.refMu.Unlock()
|
||||
if err = llama.WaitUntilRunning(req.ctx); err != nil {
|
||||
slog.Error("error loading llama server", "error", err)
|
||||
runner.refCount--
|
||||
req.errCh <- err
|
||||
slog.Debug("triggering expiration for failed load", "model", runner.model)
|
||||
s.expiredCh <- runner
|
||||
return
|
||||
}
|
||||
slog.Debug("finished setting up runner", "model", req.model.ModelPath)
|
||||
runner.loading = false
|
||||
go func() {
|
||||
<-req.ctx.Done()
|
||||
slog.Debug("context for request finished")
|
||||
s.finishedReqCh <- req
|
||||
}()
|
||||
req.successCh <- runner
|
||||
}()
|
||||
}
|
||||
|
||||
func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
|
||||
type predKey struct {
|
||||
Library string
|
||||
ID string
|
||||
}
|
||||
predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
|
||||
s.loadedMu.Lock()
|
||||
for _, r := range s.loaded {
|
||||
r.refMu.Lock()
|
||||
gpuIDs := make([]string, 0, len(r.gpus))
|
||||
if r.llama != nil {
|
||||
|
||||
// TODO this should be broken down by GPU instead of assuming uniform spread
|
||||
estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
|
||||
for _, gpu := range r.gpus {
|
||||
gpuIDs = append(gpuIDs, gpu.ID)
|
||||
}
|
||||
for _, gpu := range allGpus {
|
||||
if slices.Contains(gpuIDs, gpu.ID) {
|
||||
predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
|
||||
}
|
||||
}
|
||||
} else {
|
||||
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
|
||||
}
|
||||
r.refMu.Unlock()
|
||||
}
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
|
||||
for i := range allGpus {
|
||||
if p, ok := predMap[predKey{allGpus[i].Library, allGpus[i].ID}]; ok {
|
||||
slog.Debug("gpu reported", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "available", format.HumanBytes2(allGpus[i].FreeMemory))
|
||||
if p > allGpus[i].TotalMemory {
|
||||
// Shouldn't happen
|
||||
slog.Warn("predicted usage exceeds VRAM", "gpu", allGpus[i].ID, "totalMemory", allGpus[i].TotalMemory, "predicted", p)
|
||||
allGpus[i].FreeMemory = 0
|
||||
} else if (allGpus[i].TotalMemory - p) < allGpus[i].FreeMemory { // predicted free is smaller than reported free, use it
|
||||
// TODO maybe we should just always trust our numbers, since cuda's free memory reporting is laggy
|
||||
// and we might unload models we didn't actually need to. The risk is if some other GPU intensive app is loaded
|
||||
// after we start our first runner, then we'll never acount for that, so picking the smallest free value seems prudent.
|
||||
allGpus[i].FreeMemory = allGpus[i].TotalMemory - p
|
||||
}
|
||||
slog.Info("updated VRAM", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type runnerRef struct {
|
||||
refMu sync.Mutex
|
||||
// refCond sync.Cond // Signaled on transition from 1 -> 0 refCount
|
||||
refCount uint // prevent unloading if > 0
|
||||
// unloading bool // set to true when we are trying to unload the runner
|
||||
|
||||
llama llm.LlamaServer
|
||||
loading bool // True only during initial load, then false forever
|
||||
gpus gpu.GpuInfoList // Recorded at time of provisioning
|
||||
estimatedVRAM uint64
|
||||
|
||||
sessionDuration time.Duration
|
||||
expireTimer *time.Timer
|
||||
|
||||
model string
|
||||
adapters []string
|
||||
projectors []string
|
||||
*api.Options
|
||||
}
|
||||
|
||||
// The refMu must already be held when calling unload
|
||||
func (runner *runnerRef) unload() {
|
||||
if runner.expireTimer != nil {
|
||||
runner.expireTimer.Stop()
|
||||
runner.expireTimer = nil
|
||||
}
|
||||
if runner.llama != nil {
|
||||
runner.llama.Close()
|
||||
}
|
||||
runner.llama = nil
|
||||
runner.adapters = nil
|
||||
runner.projectors = nil
|
||||
runner.Options = nil
|
||||
runner.gpus = nil
|
||||
}
|
||||
|
||||
func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool {
|
||||
slog.Debug("evaluating already loaded", "model", req.model.ModelPath)
|
||||
runner.refMu.Lock()
|
||||
defer runner.refMu.Unlock()
|
||||
|
||||
timeout := 10 * time.Second
|
||||
if runner.loading {
|
||||
timeout = 2 * time.Minute // Initial load can take a long time for big models on slow systems...
|
||||
}
|
||||
|
||||
// Don't reload runner if num_gpu=-1 was provided
|
||||
optsExisting := runner.Options.Runner
|
||||
optsNew := req.opts.Runner
|
||||
if optsNew.NumGPU < 0 {
|
||||
optsExisting.NumGPU = -1
|
||||
optsNew.NumGPU = -1
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
|
||||
!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
|
||||
!reflect.DeepEqual(optsExisting, optsNew) || // have the runner options changed?
|
||||
runner.llama.Ping(ctx) != nil {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
type ByDuration []*runnerRef
|
||||
|
||||
func (a ByDuration) Len() int { return len(a) }
|
||||
func (a ByDuration) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a ByDuration) Less(i, j int) bool {
|
||||
// uint64 to turn negative time (never unload) to largest
|
||||
return uint64(a[i].sessionDuration) < uint64(a[j].sessionDuration)
|
||||
}
|
||||
|
||||
// TODO - future consideration to pick runners based on size
|
||||
// type BySize []*runnerRef
|
||||
// func (a BySize) Len() int { return len(a) }
|
||||
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
|
||||
|
||||
// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
||||
// If the model can not be fit fully within the available GPU(s) nil is returned
|
||||
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
|
||||
var estimatedVRAM uint64
|
||||
for _, gl := range gpus.ByLibrary() {
|
||||
var ok bool
|
||||
sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
|
||||
|
||||
// TODO - potentially sort by performance capability, existing models loaded, etc.
|
||||
// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
|
||||
sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
|
||||
|
||||
// First attempt to fit the model into a single GPU
|
||||
for _, g := range sgl {
|
||||
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||
slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
||||
return []gpu.GpuInfo{g}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO future refinements
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// Now try all the GPUs
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||
slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
|
||||
return gl
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// findRunnerToUnload finds a runner to unload to make room for a new model
|
||||
func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef {
|
||||
s.loadedMu.Lock()
|
||||
runnerList := make([]*runnerRef, 0, len(s.loaded))
|
||||
for _, r := range s.loaded {
|
||||
runnerList = append(runnerList, r)
|
||||
}
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
// In the future we can enhance the algorithm to be smarter about picking the optimal runner to unload
|
||||
// e.g., if we have multiple options, will one make room for the request?
|
||||
sort.Sort(ByDuration(runnerList))
|
||||
|
||||
// First try to find a runner that's already idle
|
||||
for _, runner := range runnerList {
|
||||
runner.refMu.Lock()
|
||||
rc := runner.refCount
|
||||
runner.refMu.Unlock()
|
||||
if rc == 0 {
|
||||
slog.Debug("found an idle runner to unload")
|
||||
return runner
|
||||
}
|
||||
}
|
||||
// None appear idle, just wait for the one with the shortest duration
|
||||
slog.Debug("no idle runners, picking the shortest duration", "count", len(runnerList))
|
||||
return runnerList[0]
|
||||
}
|
||||
|
||||
func (s *Scheduler) unloadAllRunners() {
|
||||
s.loadedMu.Lock()
|
||||
defer s.loadedMu.Unlock()
|
||||
for model, runner := range s.loaded {
|
||||
if runner.llama != nil {
|
||||
slog.Debug("shutting down runner", "model", model)
|
||||
runner.llama.Close()
|
||||
}
|
||||
}
|
||||
}
|
||||
629
server/sched_test.go
Normal file
629
server/sched_test.go
Normal file
@@ -0,0 +1,629 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/app/lifecycle"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func init() {
|
||||
os.Setenv("OLLAMA_DEBUG", "1")
|
||||
lifecycle.InitLogging()
|
||||
}
|
||||
|
||||
func TestInitScheduler(t *testing.T) {
|
||||
ctx, done := context.WithCancel(context.Background())
|
||||
defer done()
|
||||
initialMax := loadedMax
|
||||
initialParallel := numParallel
|
||||
s := InitScheduler(ctx)
|
||||
require.Equal(t, initialMax, loadedMax)
|
||||
s.loadedMu.Lock()
|
||||
require.NotNil(t, s.loaded)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
|
||||
s = InitScheduler(ctx)
|
||||
require.Equal(t, initialMax, loadedMax)
|
||||
s.loadedMu.Lock()
|
||||
require.NotNil(t, s.loaded)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
|
||||
s = InitScheduler(ctx)
|
||||
require.Equal(t, 0, loadedMax)
|
||||
s.loadedMu.Lock()
|
||||
require.NotNil(t, s.loaded)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
|
||||
_ = InitScheduler(ctx)
|
||||
require.Equal(t, initialParallel, numParallel)
|
||||
os.Setenv("OLLAMA_NUM_PARALLEL", "10")
|
||||
_ = InitScheduler(ctx)
|
||||
require.Equal(t, 10, numParallel)
|
||||
}
|
||||
|
||||
func TestLoad(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
||||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
var ggml *llm.GGML // value not used in tests
|
||||
req := &LlmRequest{
|
||||
ctx: ctx,
|
||||
model: &Model{ModelPath: "foo"},
|
||||
opts: api.DefaultOptions(),
|
||||
successCh: make(chan *runnerRef, 1),
|
||||
errCh: make(chan error, 1),
|
||||
sessionDuration: 2,
|
||||
}
|
||||
// Fail to load model first
|
||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
|
||||
return nil, fmt.Errorf("something failed to load model blah")
|
||||
}
|
||||
gpus := gpu.GpuInfoList{}
|
||||
s.load(req, ggml, gpus)
|
||||
require.Len(t, req.successCh, 0)
|
||||
require.Len(t, req.errCh, 1)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 0)
|
||||
s.loadedMu.Unlock()
|
||||
err := <-req.errCh
|
||||
require.Contains(t, err.Error(), "this model may be incompatible")
|
||||
|
||||
server := &mockLlm{estimatedVRAM: 10}
|
||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
|
||||
return server, nil
|
||||
}
|
||||
s.load(req, ggml, gpus)
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
require.NoError(t, err)
|
||||
case resp := <-req.successCh:
|
||||
require.Equal(t, uint64(10), resp.estimatedVRAM)
|
||||
require.Equal(t, uint(1), resp.refCount)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 1)
|
||||
s.loadedMu.Unlock()
|
||||
}
|
||||
|
||||
req.model.ModelPath = "dummy_model_path"
|
||||
server.waitResp = fmt.Errorf("wait failure")
|
||||
s.load(req, ggml, gpus)
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
require.Contains(t, err.Error(), "wait failure")
|
||||
case resp := <-req.successCh:
|
||||
t.Errorf("unexpected success %v", resp)
|
||||
}
|
||||
s.loadedMu.Lock()
|
||||
runner := s.loaded["dummy_model_path"]
|
||||
s.loadedMu.Unlock()
|
||||
require.NotNil(t, runner)
|
||||
require.Equal(t, uint(0), runner.refCount)
|
||||
time.Sleep(1 * time.Millisecond)
|
||||
require.Len(t, s.expiredCh, 1)
|
||||
}
|
||||
|
||||
type bundle struct {
|
||||
ctx context.Context //nolint:containedctx
|
||||
ctxDone func()
|
||||
srv *mockLlm
|
||||
req *LlmRequest
|
||||
ggml *llm.GGML
|
||||
}
|
||||
|
||||
func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
|
||||
return scenario.srv, nil
|
||||
}
|
||||
|
||||
func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
|
||||
scenario := &bundle{}
|
||||
scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
|
||||
t.Helper()
|
||||
|
||||
f, err := os.CreateTemp(t.TempDir(), modelName)
|
||||
assert.Nil(t, err)
|
||||
defer f.Close()
|
||||
|
||||
gguf := llm.NewGGUFV3(binary.LittleEndian)
|
||||
err = gguf.Encode(f, llm.KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": "name",
|
||||
"llama.context_length": uint32(32),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.block_count": uint32(1),
|
||||
"llama.attention.head_count": uint32(32),
|
||||
"llama.attention.head_count_kv": uint32(32),
|
||||
"tokenizer.ggml.tokens": []string{" "},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []llm.Tensor{
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
})
|
||||
assert.Nil(t, err)
|
||||
|
||||
fname := f.Name()
|
||||
model := &Model{Name: modelName, ModelPath: fname}
|
||||
scenario.ggml, err = llm.LoadModel(model.ModelPath)
|
||||
require.NoError(t, err)
|
||||
|
||||
scenario.req = &LlmRequest{
|
||||
ctx: scenario.ctx,
|
||||
model: model,
|
||||
opts: api.DefaultOptions(),
|
||||
sessionDuration: 5 * time.Millisecond,
|
||||
successCh: make(chan *runnerRef, 1),
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
|
||||
return scenario
|
||||
}
|
||||
|
||||
func TestRequests(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
// Same model, same request
|
||||
scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
|
||||
scenario1a.req.sessionDuration = 0
|
||||
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
|
||||
scenario1b.req.model = scenario1a.req.model
|
||||
scenario1b.ggml = scenario1a.ggml
|
||||
scenario1b.req.sessionDuration = 0
|
||||
|
||||
// simple reload of same model
|
||||
scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
|
||||
scenario2a.req.model = scenario1a.req.model
|
||||
scenario2a.ggml = scenario1a.ggml
|
||||
|
||||
// Multiple loaded models
|
||||
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
|
||||
scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
|
||||
scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
|
||||
scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
|
||||
scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
|
||||
|
||||
s := InitScheduler(ctx)
|
||||
s.getGpuFn = func() gpu.GpuInfoList {
|
||||
g := gpu.GpuInfo{Library: "metal"}
|
||||
g.TotalMemory = 24 * format.GigaByte
|
||||
g.FreeMemory = 12 * format.GigaByte
|
||||
return []gpu.GpuInfo{g}
|
||||
}
|
||||
s.newServerFn = scenario1a.newServer
|
||||
slog.Info("scenario1a")
|
||||
s.pendingReqCh <- scenario1a.req
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
s.Run(ctx)
|
||||
select {
|
||||
case resp := <-scenario1a.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario1a.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario1a.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
|
||||
// Same runner as first request due to not needing a reload
|
||||
s.newServerFn = scenario1b.newServer
|
||||
slog.Info("scenario1b")
|
||||
s.pendingReqCh <- scenario1b.req
|
||||
select {
|
||||
case resp := <-scenario1b.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario1a.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario1b.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
|
||||
// Trigger a reload
|
||||
s.newServerFn = scenario2a.newServer
|
||||
scenario2a.req.model.AdapterPaths = []string{"new"}
|
||||
slog.Info("scenario2a")
|
||||
s.pendingReqCh <- scenario2a.req
|
||||
// finish first two requests, so model can reload
|
||||
time.Sleep(1 * time.Millisecond)
|
||||
scenario1a.ctxDone()
|
||||
scenario1b.ctxDone()
|
||||
select {
|
||||
case resp := <-scenario2a.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario2a.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario2a.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
|
||||
loadedMax = 1
|
||||
s.newServerFn = scenario3a.newServer
|
||||
slog.Info("scenario3a")
|
||||
s.pendingReqCh <- scenario3a.req
|
||||
// finish prior request, so new model can load
|
||||
time.Sleep(1 * time.Millisecond)
|
||||
scenario2a.ctxDone()
|
||||
select {
|
||||
case resp := <-scenario3a.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario3a.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario3a.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 1)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
loadedMax = 0
|
||||
s.newServerFn = scenario3b.newServer
|
||||
slog.Info("scenario3b")
|
||||
s.pendingReqCh <- scenario3b.req
|
||||
select {
|
||||
case resp := <-scenario3b.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario3b.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario3b.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 2)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
// This is a CPU load with NumGPU = 0 so it should load
|
||||
s.newServerFn = scenario3c.newServer
|
||||
slog.Info("scenario3c")
|
||||
s.pendingReqCh <- scenario3c.req
|
||||
select {
|
||||
case resp := <-scenario3c.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario3c.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario3c.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 3)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
// Try to load a model that wont fit
|
||||
s.newServerFn = scenario3d.newServer
|
||||
slog.Info("scenario3d")
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 3)
|
||||
s.loadedMu.Unlock()
|
||||
scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
|
||||
time.Sleep(2 * time.Millisecond)
|
||||
s.pendingReqCh <- scenario3d.req
|
||||
// finish prior request, so new model can load
|
||||
time.Sleep(6 * time.Millisecond)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 2)
|
||||
s.loadedMu.Unlock()
|
||||
scenario3b.ctxDone()
|
||||
select {
|
||||
case resp := <-scenario3d.req.successCh:
|
||||
require.Equal(t, resp.llama, scenario3d.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, scenario3d.req.errCh, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 2)
|
||||
s.loadedMu.Unlock()
|
||||
}
|
||||
|
||||
func TestGetRunner(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
// Same model, same request
|
||||
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
||||
scenario1a.req.sessionDuration = 0
|
||||
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
|
||||
scenario1b.req.sessionDuration = 0
|
||||
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
|
||||
scenario1c.req.sessionDuration = 0
|
||||
maxQueuedRequests = 1
|
||||
s := InitScheduler(ctx)
|
||||
s.getGpuFn = func() gpu.GpuInfoList {
|
||||
g := gpu.GpuInfo{Library: "metal"}
|
||||
g.TotalMemory = 24 * format.GigaByte
|
||||
g.FreeMemory = 12 * format.GigaByte
|
||||
return []gpu.GpuInfo{g}
|
||||
}
|
||||
s.newServerFn = scenario1a.newServer
|
||||
slog.Info("scenario1a")
|
||||
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
slog.Info("scenario1b")
|
||||
successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
require.Len(t, successCh1b, 0)
|
||||
require.Len(t, errCh1b, 1)
|
||||
err := <-errCh1b
|
||||
require.Contains(t, err.Error(), "server busy")
|
||||
s.Run(ctx)
|
||||
select {
|
||||
case resp := <-successCh1a:
|
||||
require.Equal(t, resp.llama, scenario1a.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, errCh1a, 0)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
scenario1a.ctxDone()
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 1)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
scenario1c.req.model.ModelPath = "bad path"
|
||||
slog.Info("scenario1c")
|
||||
successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, successCh1c, 0)
|
||||
require.Len(t, errCh1c, 0)
|
||||
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 0)
|
||||
s.loadedMu.Unlock()
|
||||
require.Len(t, errCh1c, 1)
|
||||
err = <-errCh1c
|
||||
require.Contains(t, err.Error(), "bad path")
|
||||
scenario1b.ctxDone()
|
||||
}
|
||||
|
||||
// TODO - add one scenario that triggers the bogus finished event with positive ref count
|
||||
func TestPrematureExpired(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
// Same model, same request
|
||||
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
||||
s := InitScheduler(ctx)
|
||||
s.getGpuFn = func() gpu.GpuInfoList {
|
||||
g := gpu.GpuInfo{Library: "metal"}
|
||||
g.TotalMemory = 24 * format.GigaByte
|
||||
g.FreeMemory = 12 * format.GigaByte
|
||||
return []gpu.GpuInfo{g}
|
||||
}
|
||||
s.newServerFn = scenario1a.newServer
|
||||
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
s.Run(ctx)
|
||||
select {
|
||||
case resp := <-successCh1a:
|
||||
require.Equal(t, resp.llama, scenario1a.srv)
|
||||
require.Len(t, s.pendingReqCh, 0)
|
||||
require.Len(t, errCh1a, 0)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 1)
|
||||
s.loadedMu.Unlock()
|
||||
slog.Info("sending premature expired event now")
|
||||
s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
time.Sleep(scenario1a.req.sessionDuration)
|
||||
scenario1a.ctxDone()
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
require.LessOrEqual(t, len(s.finishedReqCh), 1)
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
require.Len(t, s.finishedReqCh, 0)
|
||||
s.loadedMu.Lock()
|
||||
require.Len(t, s.loaded, 0)
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
// also shouldn't happen in real life
|
||||
s.finishedReqCh <- scenario1a.req
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
|
||||
func TestUseLoadedRunner(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
req := &LlmRequest{
|
||||
ctx: ctx,
|
||||
opts: api.DefaultOptions(),
|
||||
successCh: make(chan *runnerRef, 1),
|
||||
sessionDuration: 2,
|
||||
}
|
||||
finished := make(chan *LlmRequest)
|
||||
llm1 := &mockLlm{}
|
||||
r1 := &runnerRef{llama: llm1, sessionDuration: 1}
|
||||
req.useLoadedRunner(r1, finished)
|
||||
require.Equal(t, uint(1), r1.refCount)
|
||||
require.Equal(t, time.Duration(2), r1.sessionDuration)
|
||||
select {
|
||||
case success := <-req.successCh:
|
||||
require.Equal(t, r1, success)
|
||||
case <-ctx.Done():
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
done()
|
||||
fin := <-finished
|
||||
require.Equal(t, req, fin)
|
||||
}
|
||||
|
||||
func TestUpdateFreeSpace(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
gpus := gpu.GpuInfoList{
|
||||
{
|
||||
Library: "a",
|
||||
ID: "1",
|
||||
},
|
||||
{
|
||||
Library: "a",
|
||||
ID: "2",
|
||||
},
|
||||
}
|
||||
gpus[0].TotalMemory = 1000
|
||||
gpus[0].FreeMemory = 900
|
||||
gpus[1].TotalMemory = 2000
|
||||
gpus[1].FreeMemory = 1900
|
||||
llm1 := &mockLlm{estimatedVRAM: 100}
|
||||
llm2 := &mockLlm{estimatedVRAM: 200}
|
||||
r1 := &runnerRef{llama: llm1, gpus: gpus}
|
||||
r2 := &runnerRef{llama: llm2, gpus: gpus}
|
||||
|
||||
s := InitScheduler(ctx)
|
||||
s.loadedMu.Lock()
|
||||
s.loaded["a"] = r1
|
||||
s.loaded["b"] = r2
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
s.updateFreeSpace(gpus)
|
||||
require.Equal(t, uint64(850), gpus[0].FreeMemory)
|
||||
require.Equal(t, uint64(1850), gpus[1].FreeMemory)
|
||||
}
|
||||
|
||||
func TestFindRunnerToUnload(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
req := &LlmRequest{
|
||||
ctx: ctx,
|
||||
opts: api.DefaultOptions(),
|
||||
}
|
||||
r1 := &runnerRef{refCount: 1, sessionDuration: 1}
|
||||
r2 := &runnerRef{sessionDuration: 2}
|
||||
|
||||
s := InitScheduler(ctx)
|
||||
s.loadedMu.Lock()
|
||||
s.loaded["a"] = r1
|
||||
s.loaded["b"] = r2
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
resp := s.findRunnerToUnload(req)
|
||||
require.Equal(t, r2, resp)
|
||||
r2.refCount = 1
|
||||
resp = s.findRunnerToUnload(req)
|
||||
require.Equal(t, r1, resp)
|
||||
|
||||
}
|
||||
|
||||
func TestNeedsReload(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
llm := &mockLlm{}
|
||||
do := api.DefaultOptions()
|
||||
runner := &runnerRef{
|
||||
adapters: []string{"adapter1"},
|
||||
projectors: []string{"projector1"},
|
||||
Options: &do,
|
||||
llama: llm,
|
||||
}
|
||||
req := &LlmRequest{
|
||||
model: &Model{
|
||||
AdapterPaths: []string{"adapter2"},
|
||||
ProjectorPaths: []string{"projector2"},
|
||||
},
|
||||
opts: api.DefaultOptions(),
|
||||
}
|
||||
resp := runner.needsReload(ctx, req)
|
||||
require.True(t, resp)
|
||||
req.model.AdapterPaths = runner.adapters
|
||||
resp = runner.needsReload(ctx, req)
|
||||
require.True(t, resp)
|
||||
req.model.ProjectorPaths = runner.projectors
|
||||
runner.loading = true
|
||||
req.opts.NumBatch = 1234
|
||||
resp = runner.needsReload(ctx, req)
|
||||
require.True(t, resp)
|
||||
req.opts.NumBatch = runner.Options.NumBatch
|
||||
llm.pingResp = fmt.Errorf("foo")
|
||||
resp = runner.needsReload(ctx, req)
|
||||
require.True(t, resp)
|
||||
llm.pingResp = nil
|
||||
resp = runner.needsReload(ctx, req)
|
||||
require.False(t, resp)
|
||||
req.opts.NumGPU = 99
|
||||
resp = runner.needsReload(ctx, req)
|
||||
require.True(t, resp)
|
||||
req.opts.NumGPU = -1
|
||||
resp = runner.needsReload(ctx, req)
|
||||
require.False(t, resp)
|
||||
}
|
||||
|
||||
func TestUnloadAllRunners(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
llm1 := &mockLlm{}
|
||||
llm2 := &mockLlm{}
|
||||
s := InitScheduler(ctx)
|
||||
s.unloadAllRunners()
|
||||
|
||||
r1 := &runnerRef{llama: llm1}
|
||||
r2 := &runnerRef{llama: llm2}
|
||||
|
||||
s.loadedMu.Lock()
|
||||
s.loaded["a"] = r1
|
||||
s.loaded["b"] = r2
|
||||
s.loadedMu.Unlock()
|
||||
s.unloadAllRunners()
|
||||
|
||||
require.True(t, llm1.closeCalled)
|
||||
require.True(t, llm2.closeCalled)
|
||||
}
|
||||
|
||||
func TestUnload(t *testing.T) {
|
||||
llm1 := &mockLlm{}
|
||||
r1 := &runnerRef{llama: llm1}
|
||||
r2 := &runnerRef{adapters: []string{"A"}}
|
||||
r1.unload()
|
||||
require.True(t, llm1.closeCalled)
|
||||
r2.unload()
|
||||
require.Nil(t, r2.adapters)
|
||||
}
|
||||
|
||||
type mockLlm struct {
|
||||
pingResp error
|
||||
waitResp error
|
||||
completionResp error
|
||||
embeddingResp []float64
|
||||
embeddingRespErr error
|
||||
tokenizeResp []int
|
||||
tokenizeRespErr error
|
||||
detokenizeResp string
|
||||
detonekizeRespErr error
|
||||
closeResp error
|
||||
closeCalled bool
|
||||
estimatedVRAM uint64
|
||||
}
|
||||
|
||||
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
|
||||
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
|
||||
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
||||
return s.completionResp
|
||||
}
|
||||
func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
|
||||
return s.embeddingResp, s.embeddingRespErr
|
||||
}
|
||||
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
return s.tokenizeResp, s.tokenizeRespErr
|
||||
}
|
||||
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
||||
return s.detokenizeResp, s.detonekizeRespErr
|
||||
}
|
||||
func (s *mockLlm) Close() error {
|
||||
s.closeCalled = true
|
||||
return s.closeResp
|
||||
}
|
||||
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
|
||||
18
types/errtypes/errtypes.go
Normal file
18
types/errtypes/errtypes.go
Normal file
@@ -0,0 +1,18 @@
|
||||
// Package errtypes contains custom error types
|
||||
package errtypes
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const UnknownOllamaKeyErrMsg = "unknown ollama key"
|
||||
|
||||
// TODO: This should have a structured response from the API
|
||||
type UnknownOllamaKey struct {
|
||||
Key string
|
||||
}
|
||||
|
||||
func (e *UnknownOllamaKey) Error() string {
|
||||
return fmt.Sprintf("unauthorized: %s %q", UnknownOllamaKeyErrMsg, strings.TrimSpace(e.Key))
|
||||
}
|
||||
@@ -1,83 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Digest represents a digest of a model Manifest. It is a comparable value
|
||||
// type and is immutable.
|
||||
//
|
||||
// The zero Digest is not a valid digest.
|
||||
type Digest struct {
|
||||
s string
|
||||
}
|
||||
|
||||
// Type returns the digest type of the digest.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// ParseDigest("sha256-1234").Type() // returns "sha256"
|
||||
func (d Digest) Type() string {
|
||||
typ, _, _ := strings.Cut(d.s, "-")
|
||||
return typ
|
||||
}
|
||||
|
||||
// String returns the digest in the form of "<digest-type>-<digest>", or the
|
||||
// empty string if the digest is invalid.
|
||||
func (d Digest) String() string { return d.s }
|
||||
|
||||
// IsValid returns true if the digest is valid (not zero).
|
||||
//
|
||||
// A valid digest may be created only by ParseDigest, or
|
||||
// ParseName(name).Digest().
|
||||
func (d Digest) IsValid() bool { return d.s != "" }
|
||||
|
||||
// LogValue implements slog.Value.
|
||||
func (d Digest) LogValue() slog.Value {
|
||||
return slog.StringValue(d.String())
|
||||
}
|
||||
|
||||
var (
|
||||
_ slog.LogValuer = Digest{}
|
||||
)
|
||||
|
||||
// ParseDigest parses a string in the form of "<digest-type>-<digest>" into a
|
||||
// Digest.
|
||||
func ParseDigest(s string) Digest {
|
||||
typ, digest, ok := strings.Cut(s, "-")
|
||||
if !ok {
|
||||
typ, digest, ok = strings.Cut(s, ":")
|
||||
}
|
||||
if ok && isValidDigestType(typ) && isValidHex(digest) {
|
||||
return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
|
||||
}
|
||||
return Digest{}
|
||||
}
|
||||
|
||||
func isValidDigestType(s string) bool {
|
||||
if len(s) == 0 {
|
||||
return false
|
||||
}
|
||||
for _, r := range s {
|
||||
if !unicode.IsLower(r) && !unicode.IsDigit(r) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func isValidHex(s string) bool {
|
||||
if len(s) == 0 {
|
||||
return false
|
||||
}
|
||||
for i := range s {
|
||||
c := s[i]
|
||||
if c < '0' || c > '9' && c < 'a' || c > 'f' {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
package model
|
||||
|
||||
import "testing"
|
||||
|
||||
var testDigests = map[string]Digest{
|
||||
"": {},
|
||||
"sha256-1234": {s: "sha256-1234"},
|
||||
"sha256-5678": {s: "sha256-5678"},
|
||||
"blake2-9abc": {s: "blake2-9abc"},
|
||||
"-1234": {},
|
||||
"sha256-": {},
|
||||
"sha256-1234-5678": {},
|
||||
"sha256-P": {}, // invalid hex
|
||||
"sha256-1234P": {},
|
||||
"---": {},
|
||||
}
|
||||
|
||||
func TestDigestParse(t *testing.T) {
|
||||
// Test cases.
|
||||
for s, want := range testDigests {
|
||||
got := ParseDigest(s)
|
||||
t.Logf("ParseDigest(%q) = %#v", s, got)
|
||||
if got != want {
|
||||
t.Errorf("ParseDigest(%q) = %q; want %q", s, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDigestString(t *testing.T) {
|
||||
// Test cases.
|
||||
for s, d := range testDigests {
|
||||
want := s
|
||||
if !d.IsValid() {
|
||||
want = ""
|
||||
}
|
||||
got := d.String()
|
||||
if got != want {
|
||||
t.Errorf("ParseDigest(%q).String() = %q; want %q", s, got, want)
|
||||
}
|
||||
|
||||
got = ParseDigest(s).String()
|
||||
if got != want {
|
||||
t.Errorf("roundtrip ParseDigest(%q).String() = %q; want %q", s, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
303
types/model/file.go
Normal file
303
types/model/file.go
Normal file
@@ -0,0 +1,303 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type File struct {
|
||||
Commands []Command
|
||||
}
|
||||
|
||||
func (f File) String() string {
|
||||
var sb strings.Builder
|
||||
for _, cmd := range f.Commands {
|
||||
fmt.Fprintln(&sb, cmd.String())
|
||||
}
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
type Command struct {
|
||||
Name string
|
||||
Args string
|
||||
}
|
||||
|
||||
func (c Command) String() string {
|
||||
var sb strings.Builder
|
||||
switch c.Name {
|
||||
case "model":
|
||||
fmt.Fprintf(&sb, "FROM %s", c.Args)
|
||||
case "license", "template", "system", "adapter":
|
||||
fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
|
||||
case "message":
|
||||
role, message, _ := strings.Cut(c.Args, ": ")
|
||||
fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
|
||||
default:
|
||||
fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
|
||||
}
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
type state int
|
||||
|
||||
const (
|
||||
stateNil state = iota
|
||||
stateName
|
||||
stateValue
|
||||
stateParameter
|
||||
stateMessage
|
||||
stateComment
|
||||
)
|
||||
|
||||
var (
|
||||
errMissingFrom = errors.New("no FROM line")
|
||||
errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
|
||||
errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
|
||||
)
|
||||
|
||||
func ParseFile(r io.Reader) (*File, error) {
|
||||
var cmd Command
|
||||
var curr state
|
||||
var b bytes.Buffer
|
||||
var role string
|
||||
|
||||
var f File
|
||||
|
||||
br := bufio.NewReader(r)
|
||||
for {
|
||||
r, _, err := br.ReadRune()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
next, r, err := parseRuneForState(r, curr)
|
||||
if errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
return nil, fmt.Errorf("%w: %s", err, b.String())
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// process the state transition, some transitions need to be intercepted and redirected
|
||||
if next != curr {
|
||||
switch curr {
|
||||
case stateName:
|
||||
if !isValidCommand(b.String()) {
|
||||
return nil, errInvalidCommand
|
||||
}
|
||||
|
||||
// next state sometimes depends on the current buffer value
|
||||
switch s := strings.ToLower(b.String()); s {
|
||||
case "from":
|
||||
cmd.Name = "model"
|
||||
case "parameter":
|
||||
// transition to stateParameter which sets command name
|
||||
next = stateParameter
|
||||
case "message":
|
||||
// transition to stateMessage which validates the message role
|
||||
next = stateMessage
|
||||
fallthrough
|
||||
default:
|
||||
cmd.Name = s
|
||||
}
|
||||
case stateParameter:
|
||||
cmd.Name = b.String()
|
||||
case stateMessage:
|
||||
if !isValidMessageRole(b.String()) {
|
||||
return nil, errInvalidMessageRole
|
||||
}
|
||||
|
||||
role = b.String()
|
||||
case stateComment, stateNil:
|
||||
// pass
|
||||
case stateValue:
|
||||
s, ok := unquote(b.String())
|
||||
if !ok || isSpace(r) {
|
||||
if _, err := b.WriteRune(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if role != "" {
|
||||
s = role + ": " + s
|
||||
role = ""
|
||||
}
|
||||
|
||||
cmd.Args = s
|
||||
f.Commands = append(f.Commands, cmd)
|
||||
}
|
||||
|
||||
b.Reset()
|
||||
curr = next
|
||||
}
|
||||
|
||||
if strconv.IsPrint(r) {
|
||||
if _, err := b.WriteRune(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// flush the buffer
|
||||
switch curr {
|
||||
case stateComment, stateNil:
|
||||
// pass; nothing to flush
|
||||
case stateValue:
|
||||
s, ok := unquote(b.String())
|
||||
if !ok {
|
||||
return nil, io.ErrUnexpectedEOF
|
||||
}
|
||||
|
||||
if role != "" {
|
||||
s = role + ": " + s
|
||||
}
|
||||
|
||||
cmd.Args = s
|
||||
f.Commands = append(f.Commands, cmd)
|
||||
default:
|
||||
return nil, io.ErrUnexpectedEOF
|
||||
}
|
||||
|
||||
for _, cmd := range f.Commands {
|
||||
if cmd.Name == "model" {
|
||||
return &f, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errMissingFrom
|
||||
}
|
||||
|
||||
func parseRuneForState(r rune, cs state) (state, rune, error) {
|
||||
switch cs {
|
||||
case stateNil:
|
||||
switch {
|
||||
case r == '#':
|
||||
return stateComment, 0, nil
|
||||
case isSpace(r), isNewline(r):
|
||||
return stateNil, 0, nil
|
||||
default:
|
||||
return stateName, r, nil
|
||||
}
|
||||
case stateName:
|
||||
switch {
|
||||
case isAlpha(r):
|
||||
return stateName, r, nil
|
||||
case isSpace(r):
|
||||
return stateValue, 0, nil
|
||||
default:
|
||||
return stateNil, 0, errInvalidCommand
|
||||
}
|
||||
case stateValue:
|
||||
switch {
|
||||
case isNewline(r):
|
||||
return stateNil, r, nil
|
||||
case isSpace(r):
|
||||
return stateNil, r, nil
|
||||
default:
|
||||
return stateValue, r, nil
|
||||
}
|
||||
case stateParameter:
|
||||
switch {
|
||||
case isAlpha(r), isNumber(r), r == '_':
|
||||
return stateParameter, r, nil
|
||||
case isSpace(r):
|
||||
return stateValue, 0, nil
|
||||
default:
|
||||
return stateNil, 0, io.ErrUnexpectedEOF
|
||||
}
|
||||
case stateMessage:
|
||||
switch {
|
||||
case isAlpha(r):
|
||||
return stateMessage, r, nil
|
||||
case isSpace(r):
|
||||
return stateValue, 0, nil
|
||||
default:
|
||||
return stateNil, 0, io.ErrUnexpectedEOF
|
||||
}
|
||||
case stateComment:
|
||||
switch {
|
||||
case isNewline(r):
|
||||
return stateNil, 0, nil
|
||||
default:
|
||||
return stateComment, 0, nil
|
||||
}
|
||||
default:
|
||||
return stateNil, 0, errors.New("")
|
||||
}
|
||||
}
|
||||
|
||||
func quote(s string) string {
|
||||
if strings.Contains(s, "\n") || strings.HasPrefix(s, " ") || strings.HasSuffix(s, " ") {
|
||||
if strings.Contains(s, "\"") {
|
||||
return `"""` + s + `"""`
|
||||
}
|
||||
|
||||
return `"` + s + `"`
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
func unquote(s string) (string, bool) {
|
||||
if len(s) == 0 {
|
||||
return "", false
|
||||
}
|
||||
|
||||
// TODO: single quotes
|
||||
if len(s) >= 3 && s[:3] == `"""` {
|
||||
if len(s) >= 6 && s[len(s)-3:] == `"""` {
|
||||
return s[3 : len(s)-3], true
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
if len(s) >= 1 && s[0] == '"' {
|
||||
if len(s) >= 2 && s[len(s)-1] == '"' {
|
||||
return s[1 : len(s)-1], true
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
return s, true
|
||||
}
|
||||
|
||||
func isAlpha(r rune) bool {
|
||||
return r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z'
|
||||
}
|
||||
|
||||
func isNumber(r rune) bool {
|
||||
return r >= '0' && r <= '9'
|
||||
}
|
||||
|
||||
func isSpace(r rune) bool {
|
||||
return r == ' ' || r == '\t'
|
||||
}
|
||||
|
||||
func isNewline(r rune) bool {
|
||||
return r == '\r' || r == '\n'
|
||||
}
|
||||
|
||||
func isValidMessageRole(role string) bool {
|
||||
return role == "system" || role == "user" || role == "assistant"
|
||||
}
|
||||
|
||||
func isValidCommand(cmd string) bool {
|
||||
switch strings.ToLower(cmd) {
|
||||
case "from", "license", "template", "system", "adapter", "parameter", "message":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
507
types/model/file_test.go
Normal file
507
types/model/file_test.go
Normal file
@@ -0,0 +1,507 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestParseFileFile(t *testing.T) {
|
||||
input := `
|
||||
FROM model1
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
modelfile, err := ParseFile(reader)
|
||||
assert.NoError(t, err)
|
||||
|
||||
expectedCommands := []Command{
|
||||
{Name: "model", Args: "model1"},
|
||||
{Name: "adapter", Args: "adapter1"},
|
||||
{Name: "license", Args: "MIT"},
|
||||
{Name: "param1", Args: "value1"},
|
||||
{Name: "param2", Args: "value2"},
|
||||
{Name: "template", Args: "template1"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, modelfile.Commands)
|
||||
}
|
||||
|
||||
func TestParseFileFrom(t *testing.T) {
|
||||
var cases = []struct {
|
||||
input string
|
||||
expected []Command
|
||||
err error
|
||||
}{
|
||||
{
|
||||
"FROM foo",
|
||||
[]Command{{Name: "model", Args: "foo"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM /path/to/model",
|
||||
[]Command{{Name: "model", Args: "/path/to/model"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM /path/to/model/fp16.bin",
|
||||
[]Command{{Name: "model", Args: "/path/to/model/fp16.bin"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM llama3:latest",
|
||||
[]Command{{Name: "model", Args: "llama3:latest"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM llama3:7b-instruct-q4_K_M",
|
||||
[]Command{{Name: "model", Args: "llama3:7b-instruct-q4_K_M"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"", nil, errMissingFrom,
|
||||
},
|
||||
{
|
||||
"PARAMETER param1 value1",
|
||||
nil,
|
||||
errMissingFrom,
|
||||
},
|
||||
{
|
||||
"PARAMETER param1 value1\nFROM foo",
|
||||
[]Command{{Name: "param1", Args: "value1"}, {Name: "model", Args: "foo"}},
|
||||
nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||
assert.ErrorIs(t, err, c.err)
|
||||
if modelfile != nil {
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileParametersMissingValue(t *testing.T) {
|
||||
input := `
|
||||
FROM foo
|
||||
PARAMETER param1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := ParseFile(reader)
|
||||
assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
|
||||
}
|
||||
|
||||
func TestParseFileBadCommand(t *testing.T) {
|
||||
input := `
|
||||
FROM foo
|
||||
BADCOMMAND param1 value1
|
||||
`
|
||||
_, err := ParseFile(strings.NewReader(input))
|
||||
assert.ErrorIs(t, err, errInvalidCommand)
|
||||
|
||||
}
|
||||
|
||||
func TestParseFileMessages(t *testing.T) {
|
||||
var cases = []struct {
|
||||
input string
|
||||
expected []Command
|
||||
err error
|
||||
}{
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system You are a file parser. Always parse things.
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a file parser. Always parse things."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system You are a file parser. Always parse things.`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a file parser. Always parse things."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system You are a file parser. Always parse things.
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a file parser. Always parse things."},
|
||||
{Name: "message", Args: "user: Hey there!"},
|
||||
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system """
|
||||
You are a multiline file parser. Always parse things.
|
||||
"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: \nYou are a multiline file parser. Always parse things.\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE badguy I'm a bad guy!
|
||||
`,
|
||||
nil,
|
||||
errInvalidMessageRole,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system
|
||||
`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||
assert.ErrorIs(t, err, c.err)
|
||||
if modelfile != nil {
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileQuoted(t *testing.T) {
|
||||
var cases = []struct {
|
||||
multiline string
|
||||
expected []Command
|
||||
err error
|
||||
}{
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """
|
||||
This is a
|
||||
multiline system.
|
||||
"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "\nThis is a\nmultiline system.\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """
|
||||
This is a
|
||||
multiline system."""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "\nThis is a\nmultiline system."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """This is a
|
||||
multiline system."""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "This is a\nmultiline system."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """This is a multiline system."""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "This is a multiline system."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """This is a multiline system.""
|
||||
`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM "
|
||||
`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """
|
||||
This is a multiline system with "quotes".
|
||||
"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "\nThis is a multiline system with \"quotes\".\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: ""},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM ""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: ""},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM "'"
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "'"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """''"'""'""'"'''''""'""'"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: `''"'""'""'"'''''""'""'`},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
TEMPLATE """
|
||||
{{ .Prompt }}
|
||||
"""`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "template", Args: "\n{{ .Prompt }}\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.multiline))
|
||||
assert.ErrorIs(t, err, c.err)
|
||||
if modelfile != nil {
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileParameters(t *testing.T) {
|
||||
var cases = map[string]struct {
|
||||
name, value string
|
||||
}{
|
||||
"numa true": {"numa", "true"},
|
||||
"num_ctx 1": {"num_ctx", "1"},
|
||||
"num_batch 1": {"num_batch", "1"},
|
||||
"num_gqa 1": {"num_gqa", "1"},
|
||||
"num_gpu 1": {"num_gpu", "1"},
|
||||
"main_gpu 1": {"main_gpu", "1"},
|
||||
"low_vram true": {"low_vram", "true"},
|
||||
"f16_kv true": {"f16_kv", "true"},
|
||||
"logits_all true": {"logits_all", "true"},
|
||||
"vocab_only true": {"vocab_only", "true"},
|
||||
"use_mmap true": {"use_mmap", "true"},
|
||||
"use_mlock true": {"use_mlock", "true"},
|
||||
"num_thread 1": {"num_thread", "1"},
|
||||
"num_keep 1": {"num_keep", "1"},
|
||||
"seed 1": {"seed", "1"},
|
||||
"num_predict 1": {"num_predict", "1"},
|
||||
"top_k 1": {"top_k", "1"},
|
||||
"top_p 1.0": {"top_p", "1.0"},
|
||||
"tfs_z 1.0": {"tfs_z", "1.0"},
|
||||
"typical_p 1.0": {"typical_p", "1.0"},
|
||||
"repeat_last_n 1": {"repeat_last_n", "1"},
|
||||
"temperature 1.0": {"temperature", "1.0"},
|
||||
"repeat_penalty 1.0": {"repeat_penalty", "1.0"},
|
||||
"presence_penalty 1.0": {"presence_penalty", "1.0"},
|
||||
"frequency_penalty 1.0": {"frequency_penalty", "1.0"},
|
||||
"mirostat 1": {"mirostat", "1"},
|
||||
"mirostat_tau 1.0": {"mirostat_tau", "1.0"},
|
||||
"mirostat_eta 1.0": {"mirostat_eta", "1.0"},
|
||||
"penalize_newline true": {"penalize_newline", "true"},
|
||||
"stop ### User:": {"stop", "### User:"},
|
||||
"stop ### User: ": {"stop", "### User: "},
|
||||
"stop \"### User:\"": {"stop", "### User:"},
|
||||
"stop \"### User: \"": {"stop", "### User: "},
|
||||
"stop \"\"\"### User:\"\"\"": {"stop", "### User:"},
|
||||
"stop \"\"\"### User:\n\"\"\"": {"stop", "### User:\n"},
|
||||
"stop <|endoftext|>": {"stop", "<|endoftext|>"},
|
||||
"stop <|eot_id|>": {"stop", "<|eot_id|>"},
|
||||
"stop </s>": {"stop", "</s>"},
|
||||
}
|
||||
|
||||
for k, v := range cases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
fmt.Fprintln(&b, "FROM foo")
|
||||
fmt.Fprintln(&b, "PARAMETER", k)
|
||||
modelfile, err := ParseFile(&b)
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, []Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: v.name, Args: v.value},
|
||||
}, modelfile.Commands)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileComments(t *testing.T) {
|
||||
var cases = []struct {
|
||||
input string
|
||||
expected []Command
|
||||
}{
|
||||
{
|
||||
`
|
||||
# comment
|
||||
FROM foo
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileFormatParseFile(t *testing.T) {
|
||||
var cases = []string{
|
||||
`
|
||||
FROM foo
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
MESSAGE system You are a file parser. Always parse things.
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
`
|
||||
FROM foo
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
MESSAGE system """
|
||||
You are a store greeter. Always responsed with "Hello!".
|
||||
"""
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
`
|
||||
FROM foo
|
||||
ADAPTER adapter1
|
||||
LICENSE """
|
||||
Very long and boring legal text.
|
||||
Blah blah blah.
|
||||
"Oh look, a quote!"
|
||||
"""
|
||||
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
MESSAGE system """
|
||||
You are a store greeter. Always responsed with "Hello!".
|
||||
"""
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c))
|
||||
assert.NoError(t, err)
|
||||
|
||||
modelfile2, err := ParseFile(strings.NewReader(modelfile.String()))
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, modelfile, modelfile2)
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,2 +1,2 @@
|
||||
go test fuzz v1
|
||||
string("/0")
|
||||
string("00@")
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user