Compare commits

..

2 Commits

Author SHA1 Message Date
Blake Mizerany
39a199bb3e remove duplicate check for ".." 2024-04-24 15:06:19 -07:00
Blake Mizerany
1b21a22d0e types/model: require all names parts start with an alnum char 2024-04-24 11:54:49 -07:00
98 changed files with 2814 additions and 3903 deletions

View File

@@ -311,18 +311,29 @@ jobs:
- uses: actions/download-artifact@v4
with:
name: generate-windows-cpu
path: |
llm/build
dist/windows-amd64
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda
path: |
llm/build
dist/windows-amd64
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
path: |
llm/build
dist/windows-amd64
- run: dir llm/build
- run: |
$gopath=(get-command go).source | split-path -parent
@@ -331,6 +342,8 @@ jobs:
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_GENERATE="1"
$env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
$env:HIP_PATH=$(resolve-path ".\dist\deps")
& .\scripts\build_windows.ps1
- uses: actions/upload-artifact@v4
with:

View File

@@ -1,15 +1,5 @@
name: test
concurrency:
# For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
# cancels running CI jobs and starts all new ones.
#
# For non-PR pushes, concurrency.group needs to be unique for every distinct
# CI run we want to have happen. Use run_id, which in practice means all
# non-PR CI runs will be allowed to run without preempting each other.
group: ${{ github.workflow }}-$${{ github.pull_request.number || github.run_id }}
cancel-in-progress: true
on:
pull_request:
paths:
@@ -31,9 +21,7 @@ jobs:
- id: changes
run: |
changed() {
git diff-tree -r --no-commit-id --name-only \
$(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
${{ github.event.pull_request.head.sha }} \
git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
| xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
}
@@ -295,6 +283,7 @@ jobs:
with:
go-version-file: go.mod
cache: true
- run: go get
- run: |
case ${{ matrix.arch }} in
amd64) echo ARCH=x86_64 ;;

3
.gitignore vendored
View File

@@ -11,5 +11,4 @@ ggml-metal.metal
.idea
test_data
*.crt
llm/build
__debug_bin*
llm/build

View File

@@ -1,5 +1,5 @@
<div align="center">
 <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>
# Ollama
@@ -51,7 +51,7 @@ Here are some example models that can be downloaded:
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi-3 | 3.8B | 2.3GB | `ollama run phi3` |
| Phi-3 | 3,8B | 2.3GB | `ollama run phi3` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
The image features a yellow smiley face, which is likely the central focus of the picture.
```
### Pass the prompt as an argument
### Pass in prompt as arguments
```
$ ollama run llama3 "Summarize this file: $(cat README.md)"
@@ -284,19 +284,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
- [StreamDeploy](https://github.com/StreamDeploy-DevRel/streamdeploy-llm-app-scaffold) (LLM Application Scaffold)
- [chat](https://github.com/swuecho/chat) (chat web app for teams)
- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
- [chat: chat web app for teams](https://github.com/swuecho/chat)
- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
### Terminal
@@ -350,11 +348,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Elixir LangChain](https://github.com/brainlid/langchain)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama for R - ollama-r](https://github.com/hauselin/ollama-r)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
### Mobile
@@ -374,13 +370,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -389,5 +384,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -18,7 +18,6 @@ import (
"net/url"
"os"
"runtime"
"strconv"
"strings"
"github.com/ollama/ollama/format"
@@ -58,36 +57,12 @@ func checkError(resp *http.Response, body []byte) error {
// If the variable is not specified, a default ollama host and port will be
// used.
func ClientFromEnvironment() (*Client, error) {
ollamaHost, err := GetOllamaHost()
if err != nil {
return nil, err
}
return &Client{
base: &url.URL{
Scheme: ollamaHost.Scheme,
Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
},
http: http.DefaultClient,
}, nil
}
type OllamaHost struct {
Scheme string
Host string
Port string
}
func GetOllamaHost() (OllamaHost, error) {
defaultPort := "11434"
hostVar := os.Getenv("OLLAMA_HOST")
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
scheme, hostport, ok := strings.Cut(hostVar, "://")
scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
switch {
case !ok:
scheme, hostport = "http", hostVar
scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
case scheme == "http":
defaultPort = "80"
case scheme == "https":
@@ -107,14 +82,12 @@ func GetOllamaHost() (OllamaHost, error) {
}
}
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
return OllamaHost{}, ErrInvalidHostPort
}
return OllamaHost{
Scheme: scheme,
Host: host,
Port: port,
return &Client{
base: &url.URL{
Scheme: scheme,
Host: net.JoinHostPort(host, port),
},
http: http.DefaultClient,
}, nil
}

View File

@@ -1,12 +1,6 @@
package api
import (
"fmt"
"net"
"testing"
"github.com/stretchr/testify/assert"
)
import "testing"
func TestClientFromEnvironment(t *testing.T) {
type testCase struct {
@@ -46,40 +40,4 @@ func TestClientFromEnvironment(t *testing.T) {
}
})
}
hostTestCases := map[string]*testCase{
"empty": {value: "", expect: "127.0.0.1:11434"},
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
"only port": {value: ":1234", expect: ":1234"},
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
"hostname": {value: "example.com", expect: "example.com:11434"},
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
"zero port": {value: ":0", expect: ":0"},
"too large port": {value: ":66000", err: ErrInvalidHostPort},
"too small port": {value: ":-1", err: ErrInvalidHostPort},
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
}
for k, v := range hostTestCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
oh, err := GetOllamaHost()
if err != v.err {
t.Fatalf("expected %s, got %s", v.err, err)
}
if err == nil {
host := net.JoinHostPort(oh.Host, oh.Port)
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
}
})
}
}

View File

@@ -309,7 +309,6 @@ func (m *Metrics) Summary() {
}
var ErrInvalidOpts = errors.New("invalid options")
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
func (opts *Options) FromMap(m map[string]interface{}) error {
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
@@ -397,10 +396,8 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
func DefaultOptions() Options {
return Options{
// options set on request to runner
NumPredict: -1,
// set a minimal num_keep to avoid issues on context shifts
NumKeep: 4,
NumPredict: -1,
NumKeep: 0,
Temperature: 0.8,
TopK: 40,
TopP: 0.9,
@@ -436,13 +433,6 @@ type Duration struct {
time.Duration
}
func (d Duration) MarshalJSON() ([]byte, error) {
if d.Duration < 0 {
return []byte("-1"), nil
}
return []byte("\"" + d.Duration.String() + "\""), nil
}
func (d *Duration) UnmarshalJSON(b []byte) (err error) {
var v any
if err := json.Unmarshal(b, &v); err != nil {
@@ -456,7 +446,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
if t < 0 {
d.Duration = time.Duration(math.MaxInt64)
} else {
d.Duration = time.Duration(int(t) * int(time.Second))
d.Duration = time.Duration(t * float64(time.Second))
}
case string:
d.Duration, err = time.ParseDuration(t)
@@ -466,8 +456,6 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
if d.Duration < 0 {
d.Duration = time.Duration(math.MaxInt64)
}
default:
return fmt.Errorf("Unsupported type: '%s'", reflect.TypeOf(v))
}
return nil

View File

@@ -21,11 +21,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
req: `{ "keep_alive": 42 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Float",
req: `{ "keep_alive": 42.5 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
req: `{ "keep_alive": "42m" }`,
@@ -36,11 +31,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
req: `{ "keep_alive": -1 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Float",
req: `{ "keep_alive": -3.14 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Integer String",
req: `{ "keep_alive": "-1m" }`,
@@ -58,50 +48,3 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
})
}
}
func TestDurationMarshalUnmarshal(t *testing.T) {
tests := []struct {
name string
input time.Duration
expected time.Duration
}{
{
"negative duration",
time.Duration(-1),
time.Duration(math.MaxInt64),
},
{
"positive duration",
time.Duration(42 * time.Second),
time.Duration(42 * time.Second),
},
{
"another positive duration",
time.Duration(42 * time.Minute),
time.Duration(42 * time.Minute),
},
{
"zero duration",
time.Duration(0),
time.Duration(0),
},
{
"max duration",
time.Duration(math.MaxInt64),
time.Duration(math.MaxInt64),
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
b, err := json.Marshal(Duration{test.input})
require.NoError(t, err)
var d Duration
err = json.Unmarshal(b, &d)
require.NoError(t, err)
assert.Equal(t, test.expected, d.Duration, "input %v, marshalled %v, got %v", test.input, string(b), d.Duration)
})
}
}

View File

@@ -5,14 +5,12 @@ import (
"log/slog"
"os"
"path/filepath"
"github.com/ollama/ollama/server/envconfig"
)
func InitLogging() {
level := slog.LevelInfo
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
level = slog.LevelDebug
}

View File

@@ -43,36 +43,37 @@ func getCLIFullPath(command string) string {
return command
}
func start(ctx context.Context, command string) (*exec.Cmd, error) {
func SpawnServer(ctx context.Context, command string) (chan int, error) {
done := make(chan int)
logDir := filepath.Dir(ServerLogFile)
_, err := os.Stat(logDir)
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(logDir, 0o755); err != nil {
return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
}
}
cmd := getCmd(ctx, getCLIFullPath(command))
// send stdout and stderr to a file
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
}
stderr, err := cmd.StderrPipe()
if err != nil {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
}
stdin, err := cmd.StdinPipe()
if err != nil {
return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
}
// TODO - rotation
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)
return done, fmt.Errorf("failed to create server log %w", err)
}
logDir := filepath.Dir(ServerLogFile)
_, err = os.Stat(logDir)
if err != nil {
if !errors.Is(err, os.ErrNotExist) {
return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
}
if err := os.MkdirAll(logDir, 0o755); err != nil {
return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
}
}
go func() {
defer logFile.Close()
io.Copy(logFile, stdout) //nolint:errcheck
@@ -116,33 +117,19 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
// run the command and wait for it to finish
if err := cmd.Start(); err != nil {
return nil, fmt.Errorf("failed to start server %w", err)
return done, fmt.Errorf("failed to start server %w", err)
}
if cmd.Process != nil {
slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
}
slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))
return cmd, nil
}
func SpawnServer(ctx context.Context, command string) (chan int, error) {
done := make(chan int)
go func() {
// Keep the server running unless we're shuttind down the app
crashCount := 0
for {
slog.Info("starting server...")
cmd, err := start(ctx, command)
if err != nil {
crashCount++
slog.Error(fmt.Sprintf("failed to start server %s", err))
time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
continue
}
cmd.Wait() //nolint:errcheck
stdin.Close()
var code int
if cmd.ProcessState != nil {
code = cmd.ProcessState.ExitCode()
@@ -156,12 +143,15 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
default:
crashCount++
slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
break
time.Sleep(500 * time.Millisecond)
if err := cmd.Start(); err != nil {
slog.Error(fmt.Sprintf("failed to restart server %s", err))
// Keep trying, but back off if we keep failing
time.Sleep(time.Duration(crashCount) * time.Second)
}
}
}
}()
return done, nil
}

View File

@@ -31,13 +31,16 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
}
// make the upgrade as quiet as possible (no GUI, no prompts)
// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
// TODO - temporarily disable since we're pinning in debug mode for the preview
// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
installArgs = append(installArgs,
"/SP", // Skip the "This will install... Do you wish to continue" prompt
"/SUPPRESSMSGBOXES",
"/SILENT",
"/VERYSILENT",
)
// }
// Safeguard in case we have requests in flight that need to drain...
slog.Info("Waiting for server to shutdown")

View File

@@ -88,12 +88,16 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
; Assumes v5.7, may need adjustments for v6
#if GetEnv("HIP_PATH") != ""
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
; amdhip64.dll dependency comes from the driver and must be installed already
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
#endif
@@ -129,7 +133,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
;FinishedHeadingLabel=Run your first model
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama2
;ClickFinish=%n
[Registry]

View File

@@ -1,71 +1,71 @@
//go:build windows
package wintray
import (
"fmt"
"log/slog"
"unsafe"
"golang.org/x/sys/windows"
)
const (
updatAvailableMenuID = 1
updateMenuID = updatAvailableMenuID + 1
separatorMenuID = updateMenuID + 1
diagLogsMenuID = separatorMenuID + 1
diagSeparatorMenuID = diagLogsMenuID + 1
quitMenuID = diagSeparatorMenuID + 1
)
func (t *winTray) initMenus() error {
if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
return nil
}
func (t *winTray) UpdateAvailable(ver string) error {
if !t.updateNotified {
slog.Debug("updating menu and sending notification for new update")
if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
if err != nil {
return fmt.Errorf("unable to write icon data to temp file: %w", err)
}
if err := wt.setIcon(iconFilePath); err != nil {
return fmt.Errorf("unable to set icon: %w", err)
}
t.updateNotified = true
t.pendingUpdate = true
// Now pop up the notification
t.muNID.Lock()
defer t.muNID.Unlock()
copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
t.nid.Flags |= NIF_INFO
t.nid.Timeout = 10
t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
err = t.nid.modify()
if err != nil {
return err
}
}
return nil
}
//go:build windows
package wintray
import (
"fmt"
"log/slog"
"unsafe"
"golang.org/x/sys/windows"
)
const (
updatAvailableMenuID = 1
updateMenuID = updatAvailableMenuID + 1
separatorMenuID = updateMenuID + 1
diagLogsMenuID = separatorMenuID + 1
diagSeparatorMenuID = diagLogsMenuID + 1
quitMenuID = diagSeparatorMenuID + 1
)
func (t *winTray) initMenus() error {
if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
return nil
}
func (t *winTray) UpdateAvailable(ver string) error {
if !t.updateNotified {
slog.Debug("updating menu and sending notification for new update")
if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
if err != nil {
return fmt.Errorf("unable to write icon data to temp file: %w", err)
}
if err := wt.setIcon(iconFilePath); err != nil {
return fmt.Errorf("unable to set icon: %w", err)
}
t.updateNotified = true
t.pendingUpdate = true
// Now pop up the notification
t.muNID.Lock()
defer t.muNID.Unlock()
copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
t.nid.Flags |= NIF_INFO
t.nid.Timeout = 10
t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
err = t.nid.modify()
if err != nil {
return err
}
}
return nil
}

View File

@@ -10,44 +10,12 @@ import (
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/crypto/ssh"
)
const defaultPrivateKey = "id_ed25519"
func keyPath() (string, error) {
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return filepath.Join(home, ".ollama", defaultPrivateKey), nil
}
func GetPublicKey() (string, error) {
keyPath, err := keyPath()
if err != nil {
return "", err
}
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
return "", err
}
privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
if err != nil {
return "", err
}
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
return strings.TrimSpace(string(publicKey)), nil
}
func NewNonce(r io.Reader, length int) (string, error) {
nonce := make([]byte, length)
if _, err := io.ReadFull(r, nonce); err != nil {
@@ -58,11 +26,13 @@ func NewNonce(r io.Reader, length int) (string, error) {
}
func Sign(ctx context.Context, bts []byte) (string, error) {
keyPath, err := keyPath()
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))

View File

@@ -17,7 +17,6 @@ import (
"os"
"os/signal"
"path/filepath"
"regexp"
"runtime"
"strings"
"syscall"
@@ -32,12 +31,10 @@ import (
"golang.org/x/term"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
@@ -56,13 +53,14 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
p := progress.NewProgress(os.Stderr)
defer p.Stop()
f, err := os.Open(filename)
bars := make(map[string]*progress.Bar)
modelfile, err := os.ReadFile(filename)
if err != nil {
return err
}
defer f.Close()
modelfile, err := model.ParseFile(f)
commands, err := parser.Parse(bytes.NewReader(modelfile))
if err != nil {
return err
}
@@ -76,10 +74,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
spinner := progress.NewSpinner(status)
p.Add(status, spinner)
for i := range modelfile.Commands {
switch modelfile.Commands[i].Name {
for _, c := range commands {
switch c.Name {
case "model", "adapter":
path := modelfile.Commands[i].Args
path := c.Args
if path == "~" {
path = home
} else if strings.HasPrefix(path, "~/") {
@@ -91,22 +89,101 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
}
fi, err := os.Stat(path)
if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
continue
} else if err != nil {
return err
}
// TODO make this work w/ adapters
if fi.IsDir() {
// this is likely a safetensors or pytorch directory
// TODO make this work w/ adapters
tempfile, err := tempZipFiles(path)
tf, err := os.CreateTemp("", "ollama-tf")
if err != nil {
return err
}
defer os.RemoveAll(tempfile)
defer os.RemoveAll(tf.Name())
path = tempfile
zf := zip.NewWriter(tf)
files := []string{}
tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
if err != nil {
return err
} else if len(tfiles) == 0 {
tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return err
}
}
files = append(files, tfiles...)
if len(files) == 0 {
return fmt.Errorf("no models were found in '%s'", path)
}
// add the safetensor/torch config file + tokenizer
files = append(files, filepath.Join(path, "config.json"))
files = append(files, filepath.Join(path, "params.json"))
files = append(files, filepath.Join(path, "added_tokens.json"))
files = append(files, filepath.Join(path, "tokenizer.model"))
for _, fn := range files {
f, err := os.Open(fn)
// just skip whatever files aren't there
if os.IsNotExist(err) {
if strings.HasSuffix(fn, "tokenizer.model") {
// try the parent dir before giving up
parentDir := filepath.Dir(path)
newFn := filepath.Join(parentDir, "tokenizer.model")
f, err = os.Open(newFn)
if os.IsNotExist(err) {
continue
} else if err != nil {
return err
}
} else {
continue
}
} else if err != nil {
return err
}
fi, err := f.Stat()
if err != nil {
return err
}
h, err := zip.FileInfoHeader(fi)
if err != nil {
return err
}
h.Name = filepath.Base(fn)
h.Method = zip.Store
w, err := zf.CreateHeader(h)
if err != nil {
return err
}
_, err = io.Copy(w, f)
if err != nil {
return err
}
}
if err := zf.Close(); err != nil {
return err
}
if err := tf.Close(); err != nil {
return err
}
path = tf.Name()
}
digest, err := createBlob(cmd, client, path)
@@ -114,11 +191,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return err
}
modelfile.Commands[i].Args = "@" + digest
modelfile = bytes.ReplaceAll(modelfile, []byte(c.Args), []byte("@"+digest))
}
}
bars := make(map[string]*progress.Bar)
fn := func(resp api.ProgressResponse) error {
if resp.Digest != "" {
spinner.Stop()
@@ -144,7 +220,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
quantization, _ := cmd.Flags().GetString("quantization")
request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
if err := client.Create(cmd.Context(), &request, fn); err != nil {
return err
}
@@ -152,114 +228,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return nil
}
func tempZipFiles(path string) (string, error) {
tempfile, err := os.CreateTemp("", "ollama-tf")
if err != nil {
return "", err
}
defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
var b bytes.Buffer
b.Grow(512)
if _, err := io.CopyN(&b, f, 512); err != nil && !errors.Is(err, io.EOF) {
return "", err
}
contentType, _, _ := strings.Cut(http.DetectContentType(b.Bytes()), ";")
return contentType, nil
}
glob := func(pattern, contentType string) ([]string, error) {
matches, err := filepath.Glob(pattern)
if err != nil {
return nil, err
}
for _, safetensor := range matches {
if ct, err := detectContentType(safetensor); err != nil {
return nil, err
} else if ct != contentType {
return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor)
}
}
return matches, nil
}
var files []string
if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 {
// safetensors files might be unresolved git lfs references; skip if they are
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
files = append(files, st...)
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
// pytorch files might also be unresolved git lfs references; skip if they are
// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
files = append(files, pt...)
} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
// pytorch files might also be unresolved git lfs references; skip if they are
// covers consolidated.x.pth, consolidated.pth
files = append(files, pt...)
} else {
return "", errors.New("no safetensors or torch files found")
}
// add configuration files, json files are detected as text/plain
js, err := glob(filepath.Join(path, "*.json"), "text/plain")
if err != nil {
return "", err
}
files = append(files, js...)
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
// tokenizer.model might be a unresolved git lfs reference; error if it is
files = append(files, tks...)
} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
files = append(files, tks...)
}
for _, file := range files {
f, err := os.Open(file)
if err != nil {
return "", err
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
return "", err
}
zfi, err := zip.FileInfoHeader(fi)
if err != nil {
return "", err
}
zf, err := zipfile.CreateHeader(zfi)
if err != nil {
return "", err
}
if _, err := io.Copy(zf, f); err != nil {
return "", err
}
}
return tempfile.Name(), nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
bin, err := os.Open(path)
if err != nil {
@@ -354,47 +322,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return generateInteractive(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
// find SSH public key in the error message
sshKeyPattern := `ssh-\w+ [^\s"]+`
re := regexp.MustCompile(sshKeyPattern)
matches := re.FindStringSubmatch(unknownKeyErr.Error())
if len(matches) > 0 {
serverPubKey := matches[0]
localPubKey, err := auth.GetPublicKey()
if err != nil {
return unknownKeyErr
}
if runtime.GOOS == "linux" && serverPubKey != localPubKey {
// try the ollama service public key
svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
if err != nil {
return unknownKeyErr
}
localPubKey = strings.TrimSpace(string(svcPubKey))
}
// check if the returned public key matches the local public key, this prevents adding a remote key to the user's account
if serverPubKey != localPubKey {
return unknownKeyErr
}
var msg strings.Builder
msg.WriteString(unknownKeyErr.Error())
msg.WriteString("\n\nYour ollama key is:\n")
msg.WriteString(localPubKey)
msg.WriteString("\nAdd your key at:\n")
msg.WriteString("https://ollama.com/settings/keys")
return errors.New(msg.String())
}
return unknownKeyErr
}
func PushHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -442,20 +369,6 @@ func PushHandler(cmd *cobra.Command, args []string) error {
request := api.PushRequest{Name: args[0], Insecure: insecure}
if err := client.Push(cmd.Context(), &request, fn); err != nil {
if spinner != nil {
spinner.Stop()
}
if strings.Contains(err.Error(), "access denied") {
return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
}
host := model.ParseName(args[0]).Host
isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
// the user has not added their ollama key to ollama.com
// re-throw an error with a more user-friendly message
return errFromUnknownKey(err)
}
return err
}
@@ -883,27 +796,24 @@ func generate(cmd *cobra.Command, opts runOptions) error {
}
func RunServer(cmd *cobra.Command, _ []string) error {
// retrieve the OLLAMA_HOST environment variable
ollamaHost, err := api.GetOllamaHost()
host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
if err != nil {
return err
host, port = "127.0.0.1", "11434"
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
host = ip.String()
}
}
if err := initializeKeypair(); err != nil {
return err
}
ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
if err != nil {
return err
}
err = server.Serve(ln)
if errors.Is(err, http.ErrServerClosed) {
return nil
}
return err
return server.Serve(ln)
}
func initializeKeypair() error {
@@ -1124,7 +1034,7 @@ Environment Variables:
RunE: ListHandler,
}
copyCmd := &cobra.Command{
Use: "cp SOURCE DESTINATION",
Use: "cp SOURCE TARGET",
Short: "Copy a model",
Args: cobra.ExactArgs(2),
PreRunE: checkServerHeartbeat,

View File

@@ -94,7 +94,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /clear Clear session context")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
@@ -281,10 +280,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
}
fmt.Printf("Created new model '%s'\n", args[1])
continue
case strings.HasPrefix(line, "/clear"):
opts.Messages = []api.Message{}
fmt.Println("Cleared session context")
continue
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {

View File

@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
var err error
t, offset, err = m.readTensors(f, offset, params)
if err != nil {
slog.Error(err.Error())
slog.Error("%v", err)
return nil, err
}
tensors = append(tensors, t...)
@@ -122,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
ggufName, err := m.GetLayerName(k)
if err != nil {
slog.Error(err.Error())
slog.Error("%v", err)
return nil, 0, err
}

View File

@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
ggufName, err := tf.GetLayerName(k.(string))
if err != nil {
slog.Error(err.Error())
slog.Error("%v", err)
return nil, err
}
slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))

View File

@@ -17,7 +17,7 @@
### Model names
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
### Durations
@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?"
}'
```
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The",
"done": false
@@ -95,11 +95,11 @@ The final response in the stream also includes additional data about the generat
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
- `response`: empty if the response was streamed, if not streamed, this will contain the full response
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`.
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"done": true,
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"stream": false
}'
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json",
"stream": false
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"stream": false,
"options": {
@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3"
"model": "llama2"
}'
```
@@ -364,7 +364,7 @@ A single JSON object is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-12-18T19:52:07.071755Z",
"response": "",
"done": true
@@ -407,7 +407,7 @@ Send a chat message with a streaming response.
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -423,7 +423,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -438,7 +438,7 @@ Final response:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 4883583458,
@@ -456,7 +456,7 @@ Final response:
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "registry.ollama.ai/library/llama3:latest",
"model": "registry.ollama.ai/library/llama2:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -519,7 +519,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -533,7 +533,7 @@ Final response:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 8113331500,
@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "registry.ollama.ai/library/llama3:latest",
"model": "registry.ollama.ai/library/llama2:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`.
```shell
curl http://localhost:11434/api/create -d '{
"name": "mario",
"modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
"modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
}'
```
@@ -758,7 +758,7 @@ A single JSON object will be returned.
}
},
{
"name": "llama3:latest",
"name": "llama2:latest",
"modified_at": "2023-12-07T09:32:18.757212583-08:00",
"size": 3825819519,
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter
```shell
curl http://localhost:11434/api/show -d '{
"name": "llama3"
"name": "llama2"
}'
```
@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.
```shell
curl http://localhost:11434/api/copy -d '{
"source": "llama3",
"destination": "llama3-backup"
"source": "llama2",
"destination": "llama2-backup"
}'
```
@@ -854,7 +854,7 @@ Delete a model and its data.
```shell
curl -X DELETE http://localhost:11434/api/delete -d '{
"name": "llama3:13b"
"name": "llama2:13b"
}'
```
@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell
curl http://localhost:11434/api/pull -d '{
"name": "llama3"
"name": "llama2"
}'
```

View File

@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
@@ -142,4 +142,4 @@ In addition to the common Windows development tools described above, install AMD
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).

View File

@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
```
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
@@ -88,9 +88,9 @@ On windows, Ollama inherits your user and system environment variables.
3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
4. Click OK/Apply to save
4. Click OK/Apply to save
5. Run `ollama` from a new terminal window
5. Run `ollama` from a new terminal window
## How can I expose Ollama on my network?
@@ -140,7 +140,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
- macOS: `~/.ollama/models`
- Linux: `/usr/share/ollama/.ollama/models`
- Windows: `C:\Users\%username%\.ollama\models`
- Windows: `C:\Users\<username>\.ollama\models`
### How do I set them to a different location?
@@ -221,20 +221,14 @@ The `keep_alive` parameter can be set to:
For example, to preload a model and leave it in memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
```
To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
```
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
## How do I manage the maximum number of requests the server can queue
If too many requests are sent to the server, it will respond with a 503 error
indicating the server is overloaded. You can adjust how many requests may be
queue by setting `OLLAMA_MAX_QUEUE`

View File

@@ -125,7 +125,7 @@ Publishing models is in early alpha. If you'd like to publish your model to shar
1. Create [an account](https://ollama.com/signup)
2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
- macOS: `cat ~/.ollama/id_ed25519.pub`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
@@ -136,8 +136,6 @@ Next, copy your model to your username's namespace:
ollama cp example <your username>/example
```
> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.
Then push the model:
```

View File

@@ -105,7 +105,7 @@ sudo chmod +x /usr/bin/ollama
To view logs of Ollama running as a startup service, run:
```bash
journalctl -e -u ollama
journalctl -u ollama
```
## Uninstall

View File

@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples)
- [Instructions](#instructions)
- [FROM (Required)](#from-required)
- [Build from llama3](#build-from-llama3)
- [Build from llama2](#build-from-llama2)
- [Build from a bin file](#build-from-a-bin-file)
- [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values)
@@ -48,7 +48,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint:
```modelfile
FROM llama3
FROM llama2
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -67,25 +67,33 @@ To use this:
More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
### `Modelfile`s in [ollama.com/library][1]
There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
```bash
> ollama show --modelfile llama3
> ollama show --modelfile llama2:13b
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM llama3:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
# FROM llama2:13b
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
FROM /root/.ollama/models/blobs/sha256:123abc
TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ .Response }}<|eot_id|>"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|reserved_special_token"
{{ end }}{{ .Prompt }} [/INST] """
SYSTEM """"""
PARAMETER stop [INST]
PARAMETER stop [/INST]
PARAMETER stop <<SYS>>
PARAMETER stop <</SYS>>
```
## Instructions
@@ -98,10 +106,10 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag>
```
#### Build from llama3
#### Build from llama2
```modelfile
FROM llama3
FROM llama2
```
A list of available base models:

View File

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test',
}
],
model='llama3',
model='llama2',
)
```
@@ -43,7 +43,7 @@ const openai = new OpenAI({
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3',
model: 'llama2',
})
```
@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "system",
@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
Before using a model, pull it locally `ollama pull`:
```shell
ollama pull llama3
ollama pull llama2
```
### Default model names
@@ -121,7 +121,7 @@ ollama pull llama3
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
```
ollama cp llama3 gpt-3.5-turbo
ollama cp llama2 gpt-3.5-turbo
```
Afterwards, this new model name can be specified the `model` field:

View File

@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama";
const ollama = new Ollama({
baseUrl: "http://localhost:11434",
model: "llama3",
model: "llama2",
});
const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,10 +23,10 @@ const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer);
```
That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash
npm install cheerio
npm install cheerio
```
```javascript

View File

@@ -17,12 +17,10 @@ Let's start by asking a simple question that we can get an answer to from the **
Then we can create a model and ask the question:
```python
from langchain_community.llms import Ollama
ollama = Ollama(
base_url='http://localhost:11434',
model="llama3"
)
print(ollama.invoke("why is the sky blue"))
from langchain.llms import Ollama
ollama = Ollama(base_url='http://localhost:11434',
model="llama2")
print(ollama("why is the sky blue"))
```
Notice that we are defining the model and the base URL for Ollama.

View File

@@ -1,61 +1,47 @@
# Ollama Windows Preview
Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.
As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in diagnosing the problem (see
[Troubleshooting](#troubleshooting) below)
## System Requirements
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
## API Access
Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
## Standalone CLI
The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
installer. It installs in your account without requiring Administrator rights.
We update Ollama regularly to support the latest models, and this installer will
help you keep up to date.
If you'd like to install or integrate Ollama as a service, a standalone
`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
and GPU library dependencies for Nvidia and AMD. This allows for embedding
Ollama in existing applications, or running it as a system service via `ollama
serve` with tools such as [NSSM](https://nssm.cc/).
# Ollama Windows Preview
Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.
As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in dianosing the problem (see
[Troubleshooting](#troubleshooting) below)
## System Requirements
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
## API Access
Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories

View File

@@ -2,7 +2,7 @@
When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
`ollama run llama3 < sourcequestions.txt`
`ollama run llama2 < sourcequestions.txt`
This concept is used in the following example.

View File

@@ -1 +0,0 @@
fly.toml

View File

@@ -1,67 +0,0 @@
# Deploy Ollama to Fly.io
> Note: this example exposes a public endpoint and does not configure authentication. Use with care.
## Prerequisites
- Ollama: https://ollama.com/download
- Fly.io account. Sign up for a free account: https://fly.io/app/sign-up
## Steps
1. Login to Fly.io
```bash
fly auth login
```
1. Create a new Fly app
```bash
fly launch --name <name> --image ollama/ollama --internal-port 11434 --vm-size shared-cpu-8x --now
```
1. Pull and run `orca-mini:3b`
```bash
OLLAMA_HOST=https://<name>.fly.dev ollama run orca-mini:3b
```
`shared-cpu-8x` is a free-tier eligible machine type. For better performance, switch to a `performance` or `dedicated` machine type or attach a GPU for hardware acceleration (see below).
## (Optional) Persistent Volume
By default Fly Machines use ephemeral storage which is problematic if you want to use the same model across restarts without pulling it again. Create and attach a persistent volume to store the downloaded models:
1. Create the Fly Volume
```bash
fly volume create ollama
```
1. Update `fly.toml` and add `[mounts]`
```toml
[mounts]
source = "ollama"
destination = "/mnt/ollama/models"
```
1. Update `fly.toml` and add `[env]`
```toml
[env]
OLLAMA_MODELS = "/mnt/ollama/models"
```
1. Deploy your app
```bash
fly deploy
```
## (Optional) Hardware Acceleration
Fly.io GPU is currently in waitlist. Sign up for the waitlist: https://fly.io/gpu
Once you've been accepted, create the app with the additional flags `--vm-gpu-kind a100-pcie-40gb` or `--vm-gpu-kind a100-pcie-80gb`.

View File

@@ -35,7 +35,7 @@ func main() {
ctx := context.Background()
req := &api.ChatRequest{
Model: "llama3",
Model: "llama2",
Messages: messages,
}

View File

@@ -19,7 +19,7 @@ func main() {
}
defer resp.Body.Close()
responseData, err := io.ReadAll(resp.Body)
if err != nil {
log.Fatal(err)

View File

@@ -7,24 +7,12 @@
## Steps
1. Create the Ollama namespace, deployment, and service
1. Create the Ollama namespace, daemon set, and service
```bash
kubectl apply -f cpu.yaml
```
## (Optional) Hardware Acceleration
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin) which is deployed in Kubernetes in form of daemonset. Follow the link for more details.
Once configured, create a GPU enabled Ollama deployment.
```bash
kubectl apply -f gpu.yaml
```
## Test
1. Port forward the Ollama service to connect and use it locally
```bash
@@ -35,4 +23,14 @@ kubectl apply -f gpu.yaml
```bash
ollama run orca-mini:3b
```
```
## (Optional) Hardware Acceleration
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.
Once configured, create a GPU enabled Ollama deployment.
```bash
kubectl apply -f gpu.yaml
```

View File

@@ -40,9 +40,9 @@ while True:
continue
# Prompt
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
@@ -51,11 +51,11 @@ while True:
template=template,
)
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
llm,
retriever=vectorstore.as_retriever(),
chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)
result = qa_chain({"query": query})
result = qa_chain({"query": query})

View File

@@ -1,12 +1,12 @@
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain.llms import Ollama
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()
llm = Ollama(model="llama3")
llm = Ollama(model="llama2")
chain = load_summarize_chain(llm, chain_type="stuff")
result = chain.invoke(docs)
result = chain.run(docs)
print(result)

View File

@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install the Python Requirements.
@@ -21,3 +21,4 @@ This example is a basic "hello world" of using LangChain with Ollama.
```bash
python main.py
```

View File

@@ -1,6 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
llm = Ollama(model="llama3")
llm = Ollama(model="llama2")
res = llm.predict(input)
print (res)

View File

@@ -1,4 +1,4 @@
FROM llama3
FROM llama2
PARAMETER temperature 1
SYSTEM """
You are Mario from super mario bros, acting as an assistant.

View File

@@ -2,12 +2,12 @@
# Example character: Mario
This example shows how to create a basic character using Llama3 as the base model.
This example shows how to create a basic character using Llama2 as the base model.
To run this example:
1. Download the Modelfile
2. `ollama pull llama3` to get the base model used in the model file.
2. `ollama pull llama2` to get the base model used in the model file.
3. `ollama create NAME -f ./Modelfile`
4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
What the model file looks like:
```
FROM llama3
FROM llama2
PARAMETER temperature 1
SYSTEM """
You are Mario from Super Mario Bros, acting as an assistant.

View File

@@ -2,16 +2,16 @@ import requests
import json
import random
model = "llama3"
model = "llama2"
template = {
"firstName": "",
"lastName": "",
"firstName": "",
"lastName": "",
"address": {
"street": "",
"city": "",
"state": "",
"street": "",
"city": "",
"state": "",
"zipCode": ""
},
},
"phoneNumber": ""
}

View File

@@ -12,7 +12,7 @@ countries = [
"France",
]
country = random.choice(countries)
model = "llama3"
model = "llama2"
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

View File

@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install the Python Requirements.

View File

@@ -2,7 +2,7 @@ import json
import requests
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
model = "llama3" # TODO: update this for whatever model you wish to use
model = "llama2" # TODO: update this for whatever model you wish to use
def chat(messages):

View File

@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install the Python Requirements.

View File

@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
## Usage
1. Add llama3 to have the mentors ask your questions:
1. Add llama2 to have the mentors ask your questions:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install prerequisites:

View File

@@ -15,7 +15,7 @@ async function characterGenerator() {
ollama.setModel("stablebeluga2:70b-q4_K_M");
const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);
const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
if (err) throw err;
@@ -23,4 +23,4 @@ async function characterGenerator() {
});
}
characterGenerator();
characterGenerator();

View File

@@ -1,6 +1,6 @@
import * as readline from "readline";
const model = "llama3";
const model = "llama2";
type Message = {
role: "assistant" | "user" | "system";
content: string;
@@ -74,4 +74,4 @@ async function main() {
}
main();
main();

View File

@@ -81,10 +81,8 @@ func commonAMDValidateLibDir() (string, error) {
}
// Well known location(s)
for _, path := range RocmStandardLocations {
if rocmLibUsable(path) {
return path, nil
}
if rocmLibUsable(RocmStandardLocation) {
return RocmStandardLocation, nil
}
// Installer payload location if we're running the installed binary

View File

@@ -25,12 +25,12 @@ const (
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
)
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected

View File

@@ -14,6 +14,7 @@ import (
)
const (
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
// TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
iGPUName = "AMD Radeon(TM) Graphics"
@@ -21,8 +22,7 @@ const (
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
)
func AMDGetGPUInfo() []GpuInfo {

View File

@@ -12,8 +12,6 @@ import (
"sync"
"syscall"
"time"
"github.com/ollama/ollama/server/envconfig"
)
var (
@@ -26,8 +24,29 @@ func PayloadsDir() (string, error) {
defer lock.Unlock()
var err error
if payloadsDir == "" {
runnersDir := envconfig.RunnersDir
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
// On Windows we do not carry the payloads inside the main executable
if runtime.GOOS == "windows" && runnersDir == "" {
appExe, err := os.Executable()
if err != nil {
slog.Error("failed to lookup executable path", "error", err)
return "", err
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, d := range []string{".", "windows-" + runtime.GOARCH, "dist\\windows-" + runtime.GOARCH} {
candidate := filepath.Join(filepath.Dir(appExe), d, "ollama_runners")
_, err := os.Stat(candidate)
if err == nil {
runnersDir = candidate
break
}
}
if runnersDir == "" {
err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
slog.Error("incomplete distribution", "error", err)
return "", err
}
}
if runnersDir != "" {
payloadsDir = runnersDir
return payloadsDir, nil
@@ -35,7 +54,7 @@ func PayloadsDir() (string, error) {
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs()
tmpDir := envconfig.TmpDir
tmpDir := os.Getenv("OLLAMA_TMPDIR")
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
@@ -98,7 +117,7 @@ func cleanupTmpDirs() {
func Cleanup() {
lock.Lock()
defer lock.Unlock()
runnersDir := envconfig.RunnersDir
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))

View File

@@ -21,18 +21,16 @@ import (
"unsafe"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/server/envconfig"
)
type handles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
}
const (
cudaMinimumMemory = 256 * format.MebiByte
rocmMinimumMemory = 256 * format.MebiByte
cudaMinimumMemory = 457 * format.MebiByte
rocmMinimumMemory = 457 * format.MebiByte
)
var gpuMutex sync.Mutex
@@ -64,22 +62,6 @@ var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvcudaLinuxGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -92,8 +74,6 @@ func initGPUHandles() *handles {
gpuHandles := &handles{}
var cudartMgmtName string
var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
@@ -102,9 +82,6 @@ func initGPUHandles() *handles {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
case "linux":
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
@@ -112,25 +89,11 @@ func initGPUHandles() *handles {
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
default:
return gpuHandles
}
slog.Info("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles
}
}
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
@@ -155,9 +118,6 @@ func GetGPUInfo() GpuInfoList {
if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart)
}
if gpuHandles.nvcuda != nil {
C.nvcuda_release(*gpuHandles.nvcuda)
}
}()
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
@@ -166,12 +126,6 @@ func GetGPUInfo() GpuInfoList {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Dir(envconfig.RunnersDir)
}
var memInfo C.mem_info_t
resp := []GpuInfo{}
@@ -184,11 +138,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo := GpuInfo{
Library: "cuda",
}
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
}
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
@@ -204,7 +154,6 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.Major = int(memInfo.major)
gpuInfo.Minor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
@@ -247,10 +196,9 @@ func GetCPUMem() (memInfo, error) {
return ret, nil
}
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
func FindGPULibs(baseLibName string, patterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
var patterns []string
gpuLibPaths := []string{}
slog.Debug("Searching for GPU library", "name", baseLibName)
@@ -270,14 +218,8 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
}
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
}
patterns = append(patterns, defaultPatterns...)
slog.Debug("gpu library search", "globs", patterns)
for _, pattern := range patterns {
// Nvidia PhysX known to return bogus results
if strings.Contains(pattern, "PhysX") {
slog.Debug("skipping PhysX cuda library path", "path", pattern)
}
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
for _, match := range matches {
@@ -325,25 +267,8 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
return 0, nil, ""
}
func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
var resp C.nvcuda_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range nvcudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.nvcuda_init(lib, &resp)
if resp.err != nil {
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.ch, libPath
}
}
return 0, nil, ""
}
func getVerboseState() C.uint16_t {
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
return C.uint16_t(1)
}
return C.uint16_t(0)

View File

@@ -10,12 +10,6 @@ package gpu
import "C"
import (
"runtime"
"github.com/ollama/ollama/format"
)
const (
metalMinimumMemory = 384 * format.MebiByte
)
func GetGPUInfo() GpuInfoList {
@@ -38,7 +32,7 @@ func GetGPUInfo() GpuInfoList {
// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
info.FreeMemory = info.TotalMemory
info.MinimumMemory = metalMinimumMemory
info.MinimumMemory = 0
return []GpuInfo{info}
}

View File

@@ -58,7 +58,6 @@ void cpu_check_ram(mem_info_t *resp);
#endif
#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

View File

@@ -6,9 +6,9 @@
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0,
CUDART_ERROR_INVALID_VALUE = 1,
CUDART_ERROR_MEMORY_ALLOCATION = 2,
CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} cudartReturn_t;

View File

@@ -1,203 +0,0 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvcuda.h"
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret;
resp->err = NULL;
resp->num_devices = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cuInit", (void *)&resp->ch.cuInit},
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvcuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.cuInit)(0);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
resp->err = strdup(buf);
return;
}
int version = 0;
nvcudaDriverVersion_t driverVersion;
driverVersion.major = 0;
driverVersion.minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cuDriverGetVersion)(&version);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
} else {
driverVersion.major = version / 1000;
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
}
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
}
const int buflen = 256;
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
CUdevice device = -1;
CUcontext ctx = NULL;
char buf[buflen + 1];
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized");
return;
}
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize");
resp->err = strdup(buf);
return;
}
resp->major = 0;
resp->minor = 0;
int major = 0;
int minor = 0;
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
} else {
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
} else {
resp->minor = minor;
resp->major = major;
}
}
ret = (*h.cuDeviceGetUuid)(&uuid, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
} else {
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid.bytes[0],
uuid.bytes[1],
uuid.bytes[2],
uuid.bytes[3],
uuid.bytes[4],
uuid.bytes[5],
uuid.bytes[6],
uuid.bytes[7],
uuid.bytes[8],
uuid.bytes[9],
uuid.bytes[10],
uuid.bytes[11],
uuid.bytes[12],
uuid.bytes[13],
uuid.bytes[14],
uuid.bytes[15]
);
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
resp->err = strdup(buf);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release primary device context %d", ret);
}
}
void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n");
UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic?
h.handle = NULL;
}
#endif // __APPLE__

View File

@@ -1,71 +0,0 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_NVCUDA_H__
#define __GPU_INFO_NVCUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} CUresult;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
// TODO - not yet wired up but may be useful for Jetson or other
// integrated GPU scenarios with shared memory
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
} CUdevice_attribute;
typedef void *nvcudaDevice_t; // Opaque is sufficient
typedef struct nvcudaMemory_st {
uint64_t total;
uint64_t free;
} nvcudaMemory_t;
typedef struct nvcudaDriverVersion {
int major;
int minor;
} nvcudaDriverVersion_t;
typedef struct CUuuid_st {
unsigned char bytes[16];
} CUuuid;
typedef int CUdevice;
typedef void* CUcontext;
typedef struct nvcuda_handle {
void *handle;
uint16_t verbose;
CUresult (*cuInit)(unsigned int Flags);
CUresult (*cuDriverGetVersion)(int *driverVersion);
CUresult (*cuDeviceGetCount)(int *);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
// Context specific aspects
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
CUresult (*cuCtxDestroy)(CUcontext ctx);
} nvcuda_handle_t;
typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
#endif // __APPLE__

View File

@@ -1,117 +0,0 @@
//go:build integration
package integration
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestMaxQueue(t *testing.T) {
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
mq := os.Getenv("OLLAMA_MAX_QUEUE")
if mq != "" {
var err error
threadCount, err = strconv.Atoi(mq)
require.NoError(t, err)
} else {
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
}
req := api.GenerateRequest{
Model: "orca-mini",
Prompt: "write a long historical fiction story about christopher columbus. use at least 10 facts from his actual journey",
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}
resp := []string{"explore", "discover", "ocean"}
// CPU mode takes much longer at the limit with a large queue setting
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// Context for the worker threads so we can shut them down
// embedCtx, embedCancel := context.WithCancel(ctx)
embedCtx := ctx
var genwg sync.WaitGroup
go func() {
genwg.Add(1)
defer genwg.Done()
slog.Info("Starting generate request")
DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
slog.Info("generate completed")
}()
// Give the generate a chance to get started before we start hammering on embed requests
time.Sleep(5 * time.Millisecond)
threadCount += 10 // Add a few extra to ensure we push the queue past its limit
busyCount := 0
resetByPeerCount := 0
canceledCount := 0
succesCount := 0
counterMu := sync.Mutex{}
var embedwg sync.WaitGroup
for i := 0; i < threadCount; i++ {
go func(i int) {
embedwg.Add(1)
defer embedwg.Done()
slog.Info("embed started", "id", i)
embedReq := api.EmbeddingRequest{
Model: req.Model,
Prompt: req.Prompt,
Options: req.Options,
}
// Fresh client for every request
client, _ = GetTestEndpoint()
resp, genErr := client.Embeddings(embedCtx, &embedReq)
counterMu.Lock()
defer counterMu.Unlock()
switch {
case genErr == nil:
succesCount++
require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
case errors.Is(genErr, context.Canceled):
canceledCount++
case strings.Contains(genErr.Error(), "busy"):
busyCount++
case strings.Contains(genErr.Error(), "connection reset by peer"):
resetByPeerCount++
default:
require.NoError(t, genErr, "%d request failed", i)
}
slog.Info("embed finished", "id", i)
}(i)
}
genwg.Wait()
slog.Info("generate done, waiting for embeds")
embedwg.Wait()
require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
require.True(t, busyCount > 0, "no requests hit busy error but some should have")
require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
}

View File

@@ -1032,7 +1032,7 @@ struct llama_server_context
slot.has_next_token = false;
}
if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
{
slot.stopped_eos = true;
slot.has_next_token = false;
@@ -1144,15 +1144,12 @@ struct llama_server_context
res.result_json = json
{
{"content", tkn.text_to_send},
{"stop", false},
{"slot_id", slot.id},
{"multimodal", multimodal}
};
if (!llama_token_is_eog(model, tkn.tok)) {
res.result_json["content"] = tkn.text_to_send;
}
if (slot.sparams.n_probs > 0)
{
std::vector<completion_token_output> probs_output = {};
@@ -1186,6 +1183,8 @@ struct llama_server_context
{"model", params.model_alias},
{"tokens_predicted", slot.n_decoded},
{"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)},
{"prompt", slot.prompt},
{"truncated", slot.truncated},
{"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word},
@@ -2645,18 +2644,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = std::atol(sep);
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.val_f64 = std::atof(sep);
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.val_bool = true;
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.val_bool = false;
kvo.bool_value = false;
} else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;

View File

@@ -21,7 +21,7 @@ init_vars() {
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"

View File

@@ -165,11 +165,11 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
fi
if [ "${ARCH}" == "arm64" ]; then
echo "ARM CPU detected - disabling unsupported AVX instructions"
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
#
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp

View File

@@ -26,25 +26,16 @@ function amdGPUs {
$GPU_LIST -join ';'
}
function init_vars {
if (!$script:SRC_DIR) {
$script:SRC_DIR = $(resolve-path "..\..\")
}
if (!$script:llamacppDir) {
$script:llamacppDir = "../llama.cpp"
}
if (!$script:cmakeTargets) {
$script:cmakeTargets = @("ollama_llama_server")
}
$script:SRC_DIR = $(resolve-path "..\..\")
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
$script:cmakeTargets = @("ollama_llama_server")
$script:ARCH = "amd64" # arm not yet supported.
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
md "$script:DIST_BASE" -ea 0 > $null
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
$script:config = "RelWithDebInfo"
@@ -175,195 +166,137 @@ function cleanup {
}
}
init_vars
git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
function build_static() {
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
# GCC build for direct linking into the Go binary
init_vars
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with itx
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$oldTargets = $script:cmakeTargets
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
$script:cmakeTargets = $oldTargets
} else {
write-host "Skipping CPU generation step as requested"
}
}
function build_cpu($gen_arch) {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU"
build
sign
install
} else {
write-host "Skipping CPU generation step as requested"
}
}
function build_cpu_avx() {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
$script:distDir="$script:DIST_BASE\cpu_avx"
write-host "Building AVX CPU"
build
sign
install
} else {
write-host "Skipping CPU AVX generation step as requested"
}
}
function build_cpu_avx2() {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
$script:distDir="$script:DIST_BASE\cpu_avx2"
write-host "Building AVX2 CPU"
build
sign
install
} else {
write-host "Skipping CPU AVX2 generation step as requested"
}
}
function build_cuda() {
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
write-host "building custom CUDA GPU"
}
build
sign
install
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
} else {
write-host "Skipping CUDA generation step"
}
}
function build_rocm() {
if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
if ($null -ne $script:ROCM_VERSION) {
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DHIP_PLATFORM=amd",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
)
# Make sure the ROCm binary dir is first in the path
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
# We have to clobber the LIB var from the developer shell for clang to work properly
$env:LIB=""
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
write-host "building custom ROCM GPU"
}
write-host "Building ROCm"
build
# Ninja doesn't prefix with config name
${script:config}=""
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
}
sign
install
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
} else {
write-host "Skipping ROCm generation step"
}
}
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
# GCC build for direct linking into the Go binary
init_vars
if ($($args.count) -eq 0) {
git_module_setup
apply_patches
build_static
if ($script:ARCH -eq "arm64") {
build_cpu("ARM64")
} else { # amd64
build_cpu("x64")
build_cpu_avx
build_cpu_avx2
build_cuda
build_rocm
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with itx
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU"
build
sign
install
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
$script:distDir="$script:DIST_BASE\cpu_avx"
write-host "Building AVX CPU"
build
sign
install
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
$script:distDir="$script:DIST_BASE\cpu_avx2"
write-host "Building AVX2 CPU"
build
sign
install
} else {
write-host "Skipping CPU generation step as requested"
}
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
write-host "building custom CUDA GPU"
}
build
sign
install
}
if ($null -ne $env:HIP_PATH) {
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
if ($null -ne $script:ROCM_VERSION) {
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
}
cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
} else {
for ( $i = 0; $i -lt $args.count; $i++ ) {
write-host "performing $($args[$i])"
& $($args[$i])
}
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DHIP_PLATFORM=amd",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
)
# Make sure the ROCm binary dir is first in the path
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
# We have to clobber the LIB var from the developer shell for clang to work properly
$env:LIB=""
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
write-host "building custom ROCM GPU"
}
write-host "Building ROCm"
build
# Ninja doesn't prefix with config name
${script:config}=""
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
}
sign
install
}
cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"

View File

@@ -4,7 +4,6 @@ package llm
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include <stdlib.h>

View File

@@ -3,11 +3,13 @@ package llm
import (
"fmt"
"log/slog"
"os"
"strconv"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/server/envconfig"
)
// This algorithm looks for a complete fit to determine if we need to unload other models
@@ -49,8 +51,15 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
for _, info := range gpus {
memoryAvailable += info.FreeMemory
}
if envconfig.MaxVRAM > 0 {
memoryAvailable = envconfig.MaxVRAM
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseUint(userLimit, 10, 64)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
} else {
slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
memoryAvailable = avail
}
}
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
@@ -80,42 +89,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
graphFullOffload *= uint64(len(gpus))
graphPartialOffload *= uint64(len(gpus))
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload
}
layers := ggml.Tensors().Layers()
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
memoryRequiredPartial := memoryMinimum + graphPartialOffload
if memoryRequiredPartial > memoryAvailable {
slog.Debug("insufficient VRAM to load any model layers")
return 0, 0
}
var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.size()
}
if layer, ok := layers["output"]; ok {
memoryLayerOutput += layer.size()
} else if layer, ok := layers["token_embd"]; ok {
memoryLayerOutput += layer.size()
}
if gpus[0].Library == "metal" && opts.UseMMap {
// memory is preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
memoryRequiredPartial += memoryLayerOutput
}
var layerCount int
layers := ggml.Tensors().Layers()
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
@@ -129,11 +115,15 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
}
}
if gpus[0].Library != "metal" || !opts.UseMMap {
// memory was not preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
var memoryLayerOutput uint64
for k, v := range layers {
if !strings.HasPrefix(k, "blk.") {
memoryLayerOutput += v.size()
}
}
memoryRequiredTotal += memoryLayerOutput
if memoryAvailable > memoryRequiredTotal {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal

View File

@@ -1,12 +0,0 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e431c7f7..f077e688 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
+#include "common.h"
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"

View File

@@ -1,45 +0,0 @@
diff --git a/ggml-metal.m b/ggml-metal.m
index 0207b787..b5e9884b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
// to the matrix-vector kernel
int ne11_mm_min = 1;
-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel

View File

@@ -1,24 +0,0 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4..b43f892d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+ }
+ ggml_set_name(embeddings, "embeddings");
+ ggml_set_input(embeddings);
+
+ if (ctx->has_class_embedding) {
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
- ggml_set_name(embeddings, "embeddings");
- ggml_set_input(embeddings);
-
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");

View File

@@ -26,7 +26,6 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/server/envconfig"
)
type LlamaServer interface {
@@ -74,7 +73,8 @@ func LoadModel(model string) (*GGML, error) {
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
var err error
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
@@ -125,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else {
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
}
demandLib := envconfig.LLMLibrary
demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
@@ -146,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--embedding",
}
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--log-format", "json")
} else {
params = append(params, "--log-disable")
@@ -156,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--verbose")
}
@@ -194,15 +194,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(projectors) > 0 {
numParallel = 1
slog.Warn("multimodal models don't support parallel requests yet")
// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
numParallel := 1
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
numParallel, err = strconv.Atoi(onp)
if err != nil || numParallel <= 0 {
err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
slog.Error("misconfiguration", "error", err)
return nil, err
}
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
for i := 0; i < len(servers); i++ {
@@ -233,13 +234,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// prepend the server directory to LD_LIBRARY_PATH/PATH
// append the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
// This will favor system libraries over our bundled library dependencies
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
}
// Note: we always put the dependency path first
@@ -275,31 +276,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
sem: semaphore.NewWeighted(int64(numParallel)),
}
s.cmd.Env = os.Environ()
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
s.cmd.Env = append(os.Environ(), libEnv)
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path and visible devices variable with our adjusted version
pathNeeded := true
devicesNeeded := visibleDevicesEnv != ""
for i := range s.cmd.Env {
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
}
}
if pathNeeded {
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
}
if devicesNeeded {
s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
// TODO - multiple GPU selection logic...
key, val := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
if key != "" {
s.cmd.Env = append(s.cmd.Env, key+"="+val)
}
slog.Info("starting llama server", "cmd", s.cmd.String())
@@ -316,6 +301,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
continue
}
// reap subprocess when it exits
go func() {
// Exit status managed via getServerStatus
_ = s.cmd.Wait()
}()
// TODO - make sure this is all wired up correctly
// if err = s.WaitUntilRunning(); err != nil {
// slog.Error("error starting llama server", "server", servers[i], "error", err)
// s.Close()
// finalErr = err
// continue
// }
return s, nil
}
@@ -347,7 +345,7 @@ type ServerStatus int
const ( // iota is reset to 0
ServerStatusReady ServerStatus = iota
ServerStatusNoSlotsAvailable
ServerStatusNoSlotsAvaialble
ServerStatusLoadingModel
ServerStatusNotResponding
ServerStatusError
@@ -357,7 +355,7 @@ func (s ServerStatus) ToString() string {
switch s {
case ServerStatusReady:
return "llm server ready"
case ServerStatusNoSlotsAvailable:
case ServerStatusNoSlotsAvaialble:
return "llm busy - no slots available"
case ServerStatusLoadingModel:
return "llm server loading model"
@@ -414,7 +412,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
case "ok":
return ServerStatusReady, nil
case "no slot available":
return ServerStatusNoSlotsAvailable, nil
return ServerStatusNoSlotsAvaialble, nil
case "loading model":
return ServerStatusLoadingModel, nil
default:
@@ -422,29 +420,6 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
}
}
// getServerStatusRetry will retry if ServerStatusNoSlotsAvailable is received
func (s *llmServer) getServerStatusRetry(ctx context.Context) (ServerStatus, error) {
var retries int
for {
status, err := s.getServerStatus(ctx)
if err != nil {
return status, err
}
if status == ServerStatusNoSlotsAvailable {
if retries >= 10 {
return status, fmt.Errorf("no slots available after %d retries", retries)
}
time.Sleep(5 * time.Millisecond)
retries++
continue
}
return status, nil
}
}
func (s *llmServer) Ping(ctx context.Context) error {
_, err := s.getServerStatus(ctx)
if err != nil {
@@ -467,7 +442,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
select {
case <-ctx.Done():
slog.Info("context expired before server started")
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
return fmt.Errorf("timed out waiting for llama runner to start")
case err := <-s.done:
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
@@ -542,6 +517,7 @@ ws ::= ([ \t\n] ws)?
`
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 3
type ImageData struct {
Data []byte `json:"data"`
@@ -584,13 +560,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return err
}
defer s.sem.Release(1)
// only allow maximum 10 "context shifts" to avoid infinite generation
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
}
request := map[string]any{
"prompt": req.Prompt,
"stream": true,
@@ -617,7 +586,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
status, err := s.getServerStatus(ctx)
if err != nil {
return err
} else if status != ServerStatusReady {
@@ -631,113 +600,133 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
serverReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
if err != nil {
return fmt.Errorf("error creating POST request: %v", err)
}
serverReq.Header.Set("Content-Type", "application/json")
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
res, err := http.DefaultClient.Do(serverReq)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer res.Body.Close()
if res.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(res.Body)
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
if err != nil {
return fmt.Errorf("failed reading llm error response: %w", err)
return fmt.Errorf("error creating POST request: %v", err)
}
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
req.Header.Set("Content-Type", "application/json")
scanner := bufio.NewScanner(res.Body)
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer resp.Body.Close()
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
if resp.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed reading llm error response: %w", err)
}
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
scanner := bufio.NewScanner(resp.Body)
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(c.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
}
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
}
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
})
}
// try again on slot unavailable
if bytes.Contains(line, []byte("slot unavailable")) {
retryNeeded = true
break
}
if c.Stop {
fn(CompletionResponse{
Done: true,
PromptEvalCount: c.Timings.PromptN,
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
EvalCount: c.Timings.PredictedN,
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
})
return nil
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(c.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return ctx.Err()
}
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
})
}
if c.Stop {
fn(CompletionResponse{
Done: true,
PromptEvalCount: c.Timings.PromptN,
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
EvalCount: c.Timings.PredictedN,
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
})
return nil
}
}
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
s.Close()
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
}
return fmt.Errorf("error reading llm response: %v", err)
}
if !retryNeeded {
return nil // success
}
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
s.Close()
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
}
return fmt.Errorf("error reading llm response: %v", err)
}
return nil
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
type EmbeddingRequest struct {
@@ -754,9 +743,8 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
return nil, err
}
defer s.sem.Release(1)
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
@@ -811,7 +799,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
}
@@ -863,7 +851,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
status, err := s.getServerStatus(ctx)
if err != nil {
return "", err
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
return "", fmt.Errorf("unexpected server status: %s", status.ToString())
}
@@ -905,13 +893,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
func (s *llmServer) Close() error {
if s.cmd != nil {
slog.Debug("stopping llama server")
if err := s.cmd.Process.Kill(); err != nil {
return err
}
_ = s.cmd.Wait()
slog.Debug("llama server stopped")
return s.cmd.Process.Kill()
}
return nil

View File

@@ -19,7 +19,7 @@ export default function () {
const [step, setStep] = useState<Step>(Step.WELCOME)
const [commandCopied, setCommandCopied] = useState<boolean>(false)
const command = 'ollama run llama3'
const command = 'ollama run llama2'
return (
<div className='drag'>

132
parser/parser.go Normal file
View File

@@ -0,0 +1,132 @@
package parser
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"log/slog"
"slices"
)
type Command struct {
Name string
Args string
}
func (c *Command) Reset() {
c.Name = ""
c.Args = ""
}
func Parse(reader io.Reader) ([]Command, error) {
var commands []Command
var command, modelCommand Command
scanner := bufio.NewScanner(reader)
scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), bufio.MaxScanTokenSize)
scanner.Split(scanModelfile)
for scanner.Scan() {
line := scanner.Bytes()
fields := bytes.SplitN(line, []byte(" "), 2)
if len(fields) == 0 || len(fields[0]) == 0 {
continue
}
switch string(bytes.ToUpper(fields[0])) {
case "FROM":
command.Name = "model"
command.Args = string(bytes.TrimSpace(fields[1]))
// copy command for validation
modelCommand = command
case "ADAPTER":
command.Name = string(bytes.ToLower(fields[0]))
command.Args = string(bytes.TrimSpace(fields[1]))
case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT":
command.Name = string(bytes.ToLower(fields[0]))
command.Args = string(fields[1])
case "PARAMETER":
fields = bytes.SplitN(fields[1], []byte(" "), 2)
if len(fields) < 2 {
return nil, fmt.Errorf("missing value for %s", fields)
}
command.Name = string(fields[0])
command.Args = string(bytes.TrimSpace(fields[1]))
case "EMBED":
return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
case "MESSAGE":
command.Name = string(bytes.ToLower(fields[0]))
fields = bytes.SplitN(fields[1], []byte(" "), 2)
if len(fields) < 2 {
return nil, fmt.Errorf("should be in the format <role> <message>")
}
if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
}
command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
default:
if !bytes.HasPrefix(fields[0], []byte("#")) {
// log a warning for unknown commands
slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0]))
}
continue
}
commands = append(commands, command)
command.Reset()
}
if modelCommand.Args == "" {
return nil, errors.New("no FROM line for the model was specified")
}
return commands, scanner.Err()
}
func scanModelfile(data []byte, atEOF bool) (advance int, token []byte, err error) {
advance, token, err = scan([]byte(`"""`), []byte(`"""`), data, atEOF)
if err != nil {
return 0, nil, err
}
if advance > 0 && token != nil {
return advance, token, nil
}
advance, token, err = scan([]byte(`"`), []byte(`"`), data, atEOF)
if err != nil {
return 0, nil, err
}
if advance > 0 && token != nil {
return advance, token, nil
}
return bufio.ScanLines(data, atEOF)
}
func scan(openBytes, closeBytes, data []byte, atEOF bool) (advance int, token []byte, err error) {
newline := bytes.IndexByte(data, '\n')
if start := bytes.Index(data, openBytes); start >= 0 && start < newline {
end := bytes.Index(data[start+len(openBytes):], closeBytes)
if end < 0 {
if atEOF {
return 0, nil, fmt.Errorf("unterminated %s: expecting %s", openBytes, closeBytes)
} else {
return 0, nil, nil
}
}
n := start + len(openBytes) + end + len(closeBytes)
newData := data[:start]
newData = append(newData, data[start+len(openBytes):n-len(closeBytes)]...)
return n, newData, nil
}
return 0, nil, nil
}

98
parser/parser_test.go Normal file
View File

@@ -0,0 +1,98 @@
package parser
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
func Test_Parser(t *testing.T) {
input := `
FROM model1
ADAPTER adapter1
LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
`
reader := strings.NewReader(input)
commands, err := Parse(reader)
assert.Nil(t, err)
expectedCommands := []Command{
{Name: "model", Args: "model1"},
{Name: "adapter", Args: "adapter1"},
{Name: "license", Args: "MIT"},
{Name: "param1", Args: "value1"},
{Name: "param2", Args: "value2"},
{Name: "template", Args: "template1"},
}
assert.Equal(t, expectedCommands, commands)
}
func Test_Parser_NoFromLine(t *testing.T) {
input := `
PARAMETER param1 value1
PARAMETER param2 value2
`
reader := strings.NewReader(input)
_, err := Parse(reader)
assert.ErrorContains(t, err, "no FROM line")
}
func Test_Parser_MissingValue(t *testing.T) {
input := `
FROM foo
PARAMETER param1
`
reader := strings.NewReader(input)
_, err := Parse(reader)
assert.ErrorContains(t, err, "missing value for [param1]")
}
func Test_Parser_Messages(t *testing.T) {
input := `
FROM foo
MESSAGE system You are a Parser. Always Parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`
reader := strings.NewReader(input)
commands, err := Parse(reader)
assert.Nil(t, err)
expectedCommands := []Command{
{Name: "model", Args: "foo"},
{Name: "message", Args: "system: You are a Parser. Always Parse things."},
{Name: "message", Args: "user: Hey there!"},
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
}
assert.Equal(t, expectedCommands, commands)
}
func Test_Parser_Messages_BadRole(t *testing.T) {
input := `
FROM foo
MESSAGE badguy I'm a bad guy!
`
reader := strings.NewReader(input)
_, err := Parse(reader)
assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"")
}

View File

@@ -7,8 +7,6 @@
$ErrorActionPreference = "Stop"
function checkEnv() {
$script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
Write-host "Building for ${script:TARGET_ARCH}"
write-host "Locating required tools and paths"
$script:SRC_DIR=$PWD
if (!$env:VCToolsRedistDir) {
@@ -32,7 +30,7 @@ function checkEnv() {
$script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-amd64"
$env:CGO_ENABLED="1"
echo "Checking version"
if (!$env:VERSION) {
@@ -83,8 +81,8 @@ function buildOllama() {
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
New-Item -ItemType Directory -Path .\dist\windows-amd64\ -Force
cp .\ollama.exe .\dist\windows-amd64\ollama-windows-amd64.exe
}
function buildApp() {
@@ -111,6 +109,9 @@ function gatherDependencies() {
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
cp "${script:NVIDIA_DIR}\cudart64_*.dll" "${script:DEPS_DIR}\"
cp "${script:NVIDIA_DIR}\cublas64_*.dll" "${script:DEPS_DIR}\"
cp "${script:NVIDIA_DIR}\cublasLt64_*.dll" "${script:DEPS_DIR}\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
@@ -122,6 +123,15 @@ function gatherDependencies() {
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
if ($null -ne $env:HIP_PATH) {
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:DEPS_DIR}\rocm\"
md "${script:DEPS_DIR}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:DEPS_DIR}\rocm\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:DEPS_DIR}\rocm\"
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:DEPS_DIR}\rocm\rocblas\library\"
}
}
function buildInstaller() {
@@ -129,16 +139,16 @@ function buildInstaller() {
cd "${script:SRC_DIR}\app"
$env:PKG_VERSION=$script:PKG_VERSION
if ("${env:KEY_CONTAINER}") {
& "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
& "${script:INNO_SETUP_DIR}\ISCC.exe" /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
} else {
& "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH .\ollama.iss
& "${script:INNO_SETUP_DIR}\ISCC.exe" .\ollama.iss
}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function distZip() {
write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip"
Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force
write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-amd64.zip"
Compress-Archive -Path "${script:SRC_DIR}\dist\windows-amd64\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-amd64.zip" -Force
}
try {

View File

@@ -166,8 +166,8 @@ fi
if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
# Look for pre-existing ROCm v6 before downloading the dependencies
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm"; do
if [ -n "${search}" ] && [ -e "${search}/lib/libhipblas.so.2" ]; then
status "Compatible AMD GPU ROCm library detected at ${search}"
install_success
exit 0

View File

@@ -1,174 +0,0 @@
package envconfig
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
)
var (
// Set via OLLAMA_ORIGINS in the environment
AllowOrigins []string
// Set via OLLAMA_DEBUG in the environment
Debug bool
// Set via OLLAMA_LLM_LIBRARY in the environment
LLMLibrary string
// Set via OLLAMA_MAX_LOADED_MODELS in the environment
MaxRunners int
// Set via OLLAMA_MAX_QUEUE in the environment
MaxQueuedRequests int
// Set via OLLAMA_MAX_VRAM in the environment
MaxVRAM uint64
// Set via OLLAMA_NOPRUNE in the environment
NoPrune bool
// Set via OLLAMA_NUM_PARALLEL in the environment
NumParallel int
// Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string
// Set via OLLAMA_TMPDIR in the environment
TmpDir string
)
func AsMap() map[string]string {
return map[string]string{
"OLLAMA_ORIGINS": fmt.Sprintf("%v", AllowOrigins),
"OLLAMA_DEBUG": fmt.Sprintf("%v", Debug),
"OLLAMA_LLM_LIBRARY": fmt.Sprintf("%v", LLMLibrary),
"OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
"OLLAMA_MAX_QUEUE": fmt.Sprintf("%v", MaxQueuedRequests),
"OLLAMA_MAX_VRAM": fmt.Sprintf("%v", MaxVRAM),
"OLLAMA_NOPRUNE": fmt.Sprintf("%v", NoPrune),
"OLLAMA_NUM_PARALLEL": fmt.Sprintf("%v", NumParallel),
"OLLAMA_RUNNERS_DIR": fmt.Sprintf("%v", RunnersDir),
"OLLAMA_TMPDIR": fmt.Sprintf("%v", TmpDir),
}
}
var defaultAllowOrigins = []string{
"localhost",
"127.0.0.1",
"0.0.0.0",
}
// Clean quotes and spaces from the value
func clean(key string) string {
return strings.Trim(os.Getenv(key), "\"' ")
}
func init() {
// default values
NumParallel = 1
MaxRunners = 1
MaxQueuedRequests = 512
LoadConfig()
}
func LoadConfig() {
if debug := clean("OLLAMA_DEBUG"); debug != "" {
d, err := strconv.ParseBool(debug)
if err == nil {
Debug = d
} else {
Debug = true
}
}
RunnersDir = clean("OLLAMA_RUNNERS_DIR")
if runtime.GOOS == "windows" && RunnersDir == "" {
// On Windows we do not carry the payloads inside the main executable
appExe, err := os.Executable()
if err != nil {
slog.Error("failed to lookup executable path", "error", err)
}
cwd, err := os.Getwd()
if err != nil {
slog.Error("failed to lookup working directory", "error", err)
}
var paths []string
for _, root := range []string{filepath.Dir(appExe), cwd} {
paths = append(paths,
filepath.Join(root),
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, p := range paths {
candidate := filepath.Join(p, "ollama_runners")
_, err := os.Stat(candidate)
if err == nil {
RunnersDir = candidate
break
}
}
if RunnersDir == "" {
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
}
}
TmpDir = clean("OLLAMA_TMPDIR")
userLimit := clean("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseUint(userLimit, 10, 64)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
} else {
MaxVRAM = avail
}
}
LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
val, err := strconv.Atoi(onp)
if err != nil || val <= 0 {
slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
} else {
NumParallel = val
}
}
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
NoPrune = true
}
if origins := clean("OLLAMA_ORIGINS"); origins != "" {
AllowOrigins = strings.Split(origins, ",")
}
for _, allowOrigin := range defaultAllowOrigins {
AllowOrigins = append(AllowOrigins,
fmt.Sprintf("http://%s", allowOrigin),
fmt.Sprintf("https://%s", allowOrigin),
fmt.Sprintf("http://%s:*", allowOrigin),
fmt.Sprintf("https://%s:*", allowOrigin),
)
}
maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
if maxRunners != "" {
m, err := strconv.Atoi(maxRunners)
if err != nil {
slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
} else {
MaxRunners = m
}
}
if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
p, err := strconv.Atoi(onp)
if err != nil || p <= 0 {
slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
} else {
MaxQueuedRequests = p
}
}
}

View File

@@ -1,20 +0,0 @@
package envconfig
import (
"os"
"testing"
"github.com/stretchr/testify/require"
)
func TestConfig(t *testing.T) {
os.Setenv("OLLAMA_DEBUG", "")
LoadConfig()
require.False(t, Debug)
os.Setenv("OLLAMA_DEBUG", "false")
LoadConfig()
require.False(t, Debug)
os.Setenv("OLLAMA_DEBUG", "1")
LoadConfig()
require.True(t, Debug)
}

View File

@@ -5,7 +5,6 @@ import (
"bytes"
"context"
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"encoding/json"
"errors"
@@ -21,17 +20,15 @@ import (
"runtime"
"strconv"
"strings"
"text/template"
"golang.org/x/exp/slices"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/server/envconfig"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/version"
)
@@ -63,76 +60,6 @@ func (m *Model) IsEmbedding() bool {
return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
}
func (m *Model) String() string {
var modelfile model.File
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "model",
Args: m.ModelPath,
})
if m.Template != "" {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "template",
Args: m.Template,
})
}
if m.System != "" {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "system",
Args: m.System,
})
}
for _, adapter := range m.AdapterPaths {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "adapter",
Args: adapter,
})
}
for _, projector := range m.ProjectorPaths {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "projector",
Args: projector,
})
}
for k, v := range m.Options {
switch v := v.(type) {
case []any:
for _, s := range v {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: k,
Args: fmt.Sprintf("%v", s),
})
}
default:
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: k,
Args: fmt.Sprintf("%v", v),
})
}
}
for _, license := range m.License {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "license",
Args: license,
})
}
for _, msg := range m.Messages {
modelfile.Commands = append(modelfile.Commands, model.Command{
Name: "message",
Args: fmt.Sprintf("%s %s", msg.Role, msg.Content),
})
}
return modelfile.String()
}
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
@@ -357,7 +284,7 @@ func realpath(mfDir, from string) string {
return abspath
}
func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) error {
func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
deleteMap := make(map[string]struct{})
if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
for _, layer := range append(manifest.Layers, manifest.Config) {
@@ -379,7 +306,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
params := make(map[string][]string)
fromParams := make(map[string]any)
for _, c := range modelfile.Commands {
for _, c := range commands {
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
switch c.Name {
@@ -696,7 +623,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
return err
}
if !envconfig.NoPrune {
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
return err
}
@@ -774,43 +701,36 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
return path, nil
}
func CopyModel(src, dst model.Name) error {
if !dst.IsFullyQualified() {
return model.Unqualified(dst)
}
if !src.IsFullyQualified() {
return model.Unqualified(src)
}
if src.Filepath() == dst.Filepath() {
return nil
}
manifests, err := GetManifestPath()
func CopyModel(src, dest string) error {
srcModelPath := ParseModelPath(src)
srcPath, err := srcModelPath.GetManifestPath()
if err != nil {
return err
}
dstpath := filepath.Join(manifests, dst.Filepath())
if err := os.MkdirAll(filepath.Dir(dstpath), 0o755); err != nil {
return err
}
srcpath := filepath.Join(manifests, src.Filepath())
srcfile, err := os.Open(srcpath)
destModelPath := ParseModelPath(dest)
destPath, err := destModelPath.GetManifestPath()
if err != nil {
return err
}
defer srcfile.Close()
dstfile, err := os.Create(dstpath)
if err != nil {
if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil {
return err
}
defer dstfile.Close()
_, err = io.Copy(dstfile, srcfile)
return err
// copy the file
input, err := os.ReadFile(srcPath)
if err != nil {
fmt.Println("Error reading file:", err)
return err
}
err = os.WriteFile(destPath, input, 0o644)
if err != nil {
fmt.Println("Error reading file:", err)
return err
}
return nil
}
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
@@ -970,6 +890,67 @@ func DeleteModel(name string) error {
return nil
}
func ShowModelfile(model *Model) (string, error) {
var mt struct {
*Model
From string
Parameters map[string][]any
}
mt.Parameters = make(map[string][]any)
for k, v := range model.Options {
if s, ok := v.([]any); ok {
mt.Parameters[k] = s
continue
}
mt.Parameters[k] = []any{v}
}
mt.Model = model
mt.From = model.ModelPath
if model.ParentModel != "" {
mt.From = model.ParentModel
}
modelFile := `# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM {{ .ShortName }}
FROM {{ .From }}
TEMPLATE """{{ .Template }}"""
{{- if .System }}
SYSTEM """{{ .System }}"""
{{- end }}
{{- range $adapter := .AdapterPaths }}
ADAPTER {{ $adapter }}
{{- end }}
{{- range $k, $v := .Parameters }}
{{- range $parameter := $v }}
PARAMETER {{ $k }} {{ printf "%#v" $parameter }}
{{- end }}
{{- end }}`
tmpl, err := template.New("").Parse(modelFile)
if err != nil {
slog.Info(fmt.Sprintf("error parsing template: %q", err))
return "", err
}
var buf bytes.Buffer
if err = tmpl.Execute(&buf, mt); err != nil {
slog.Info(fmt.Sprintf("error executing template: %q", err))
return "", err
}
return buf.String(), nil
}
func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
fn(api.ProgressResponse{Status: "retrieving manifest"})
@@ -991,6 +972,9 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
for _, layer := range layers {
if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
slog.Info(fmt.Sprintf("error uploading blob: %v", err))
if errors.Is(err, errUnauthorized) {
return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository())
}
return err
}
}
@@ -1027,7 +1011,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
// build deleteMap to prune unused layers
deleteMap := make(map[string]struct{})
if !envconfig.NoPrune {
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
@@ -1153,40 +1137,9 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}
var errUnauthorized = fmt.Errorf("unauthorized: access denied")
// getTokenSubject returns the subject of a JWT token, it does not validate the token
func getTokenSubject(token string) string {
parts := strings.Split(token, ".")
if len(parts) != 3 {
slog.Error("jwt token does not contain 3 parts")
return ""
}
payload := parts[1]
payloadBytes, err := base64.RawURLEncoding.DecodeString(payload)
if err != nil {
slog.Error(fmt.Sprintf("failed to decode jwt payload: %v", err))
return ""
}
var payloadMap map[string]interface{}
if err := json.Unmarshal(payloadBytes, &payloadMap); err != nil {
slog.Error(fmt.Sprintf("failed to unmarshal payload JSON: %v", err))
return ""
}
sub, ok := payloadMap["sub"]
if !ok {
slog.Error("jwt does not contain 'sub' field")
return ""
}
return fmt.Sprintf("%s", sub)
}
var errUnauthorized = errors.New("unauthorized")
func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
anonymous := true // access will default to anonymous if no user is found associated with the public key
for i := 0; i < 2; i++ {
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
if err != nil {
@@ -1205,7 +1158,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
if err != nil {
return nil, err
}
anonymous = getTokenSubject(token) == "anonymous"
regOpts.Token = token
if body != nil {
_, err = body.Seek(0, io.SeekStart)
@@ -1226,16 +1178,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
}
}
if anonymous {
// no user is associated with the public key, and the request requires non-anonymous access
pubKey, nestedErr := auth.GetPublicKey()
if nestedErr != nil {
slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr))
return nil, errUnauthorized
}
return nil, &errtypes.UnknownOllamaKey{Key: pubKey}
}
// user is associated with the public key, but is not authorized to make the request
return nil, errUnauthorized
}

View File

@@ -6,7 +6,6 @@ import (
"net/url"
"os"
"path/filepath"
"regexp"
"strings"
)
@@ -26,10 +25,9 @@ const (
)
var (
ErrInvalidImageFormat = errors.New("invalid image format")
ErrInvalidProtocol = errors.New("invalid protocol scheme")
ErrInsecureProtocol = errors.New("insecure protocol http")
ErrInvalidDigestFormat = errors.New("invalid digest format")
ErrInvalidImageFormat = errors.New("invalid image format")
ErrInvalidProtocol = errors.New("invalid protocol scheme")
ErrInsecureProtocol = errors.New("insecure protocol http")
)
func ParseModelPath(name string) ModelPath {
@@ -151,17 +149,6 @@ func GetBlobsPath(digest string) (string, error) {
return "", err
}
// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"
re := regexp.MustCompile(pattern)
if err != nil {
return "", err
}
if digest != "" && !re.MatchString(digest) {
return "", ErrInvalidDigestFormat
}
digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(dir, "blobs", digest)
dirPath := filepath.Dir(path)

View File

@@ -1,73 +1,6 @@
package server
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
)
func TestGetBlobsPath(t *testing.T) {
// GetBlobsPath expects an actual directory to exist
dir, err := os.MkdirTemp("", "ollama-test")
assert.Nil(t, err)
defer os.RemoveAll(dir)
tests := []struct {
name string
digest string
expected string
err error
}{
{
"empty digest",
"",
filepath.Join(dir, "blobs"),
nil,
},
{
"valid with colon",
"sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
nil,
},
{
"valid with dash",
"sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
nil,
},
{
"digest too short",
"sha256-45640291",
"",
ErrInvalidDigestFormat,
},
{
"digest too long",
"sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9aaaaaaaaaa",
"",
ErrInvalidDigestFormat,
},
{
"digest invalid chars",
"../sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7a",
"",
ErrInvalidDigestFormat,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("OLLAMA_MODELS", dir)
got, err := GetBlobsPath(tc.digest)
assert.ErrorIs(t, tc.err, err, tc.name)
assert.Equal(t, tc.expected, got, tc.name)
})
}
}
import "testing"
func TestParseModelPath(t *testing.T) {
tests := []struct {

View File

@@ -1,7 +1,6 @@
package server
import (
"cmp"
"context"
"encoding/json"
"errors"
@@ -29,8 +28,7 @@ import (
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/server/envconfig"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/version"
)
@@ -147,7 +145,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
select {
case runner = <-rCh:
case err = <-eCh:
handleErrorResponse(c, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -390,7 +388,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
select {
case runner = <-rCh:
case err = <-eCh:
handleErrorResponse(c, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -513,17 +511,28 @@ func (s *Server) PushModelHandler(c *gin.Context) {
func (s *Server) CreateModelHandler(c *gin.Context) {
var req api.CreateRequest
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
name := model.ParseName(cmp.Or(req.Model, req.Name))
if !name.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
var model string
if req.Model != "" {
model = req.Model
} else if req.Name != "" {
model = req.Name
} else {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
if err := ParseModelPath(model).Validate(); err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
@@ -532,19 +541,19 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
return
}
var r io.Reader = strings.NewReader(req.Modelfile)
var modelfile io.Reader = strings.NewReader(req.Modelfile)
if req.Path != "" && req.Modelfile == "" {
f, err := os.Open(req.Path)
mf, err := os.Open(req.Path)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
return
}
defer f.Close()
defer mf.Close()
r = f
modelfile = mf
}
modelfile, err := model.ParseFile(r)
commands, err := parser.Parse(modelfile)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
@@ -560,7 +569,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), req.Quantization, modelfile, fn); err != nil {
if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -708,12 +717,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
}
}
var sb strings.Builder
fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
fmt.Fprint(&sb, model.String())
resp.Modelfile = sb.String()
mf, err := ShowModelfile(model)
if err != nil {
return nil, err
}
resp.Modelfile = mf
return resp, nil
}
@@ -779,31 +788,34 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
}
func (s *Server) CopyModelHandler(c *gin.Context) {
var r api.CopyRequest
if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
var req api.CopyRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
} else if err != nil {
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
src := model.ParseName(r.Source)
if !src.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
if req.Source == "" || req.Destination == "" {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "source add destination are required"})
return
}
dst := model.ParseName(r.Destination)
if !dst.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Source)})
if err := ParseModelPath(req.Destination).Validate(); err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if err := CopyModel(src, dst); errors.Is(err, os.ErrNotExist) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found", r.Source)})
} else if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
if err := CopyModel(req.Source, req.Destination); err != nil {
if os.IsNotExist(err) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Source)})
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
}
@@ -860,6 +872,12 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
c.Status(http.StatusCreated)
}
var defaultAllowOrigins = []string{
"localhost",
"127.0.0.1",
"0.0.0.0",
}
func isLocalIP(ip netip.Addr) bool {
if interfaces, err := net.Interfaces(); err == nil {
for _, iface := range interfaces {
@@ -943,7 +961,19 @@ func (s *Server) GenerateRoutes() http.Handler {
config := cors.DefaultConfig()
config.AllowWildcard = true
config.AllowBrowserExtensions = true
config.AllowOrigins = envconfig.AllowOrigins
if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
config.AllowOrigins = strings.Split(allowedOrigins, ",")
}
for _, allowOrigin := range defaultAllowOrigins {
config.AllowOrigins = append(config.AllowOrigins,
fmt.Sprintf("http://%s", allowOrigin),
fmt.Sprintf("https://%s", allowOrigin),
fmt.Sprintf("http://%s:*", allowOrigin),
fmt.Sprintf("https://%s:*", allowOrigin),
)
}
r := gin.Default()
r.Use(
@@ -982,11 +1012,10 @@ func (s *Server) GenerateRoutes() http.Handler {
func Serve(ln net.Listener) error {
level := slog.LevelInfo
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
level = slog.LevelDebug
}
slog.Info("server config", "env", envconfig.AsMap())
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
@@ -1010,7 +1039,7 @@ func Serve(ln net.Listener) error {
return err
}
if !envconfig.NoPrune {
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
@@ -1041,7 +1070,6 @@ func Serve(ln net.Listener) error {
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-signals
srvr.Close()
done()
sched.unloadAllRunners()
gpu.Cleanup()
@@ -1187,7 +1215,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
select {
case runner = <-rCh:
case err = <-eCh:
handleErrorResponse(c, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -1308,15 +1336,3 @@ func (s *Server) ChatHandler(c *gin.Context) {
streamResponse(c, ch)
}
func handleErrorResponse(c *gin.Context, err error) {
if errors.Is(err, context.Canceled) {
c.JSON(499, gin.H{"error": "request canceled"})
return
}
if errors.Is(err, ErrMaxQueue) {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}

View File

@@ -17,7 +17,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/version"
)
@@ -55,13 +55,13 @@ func Test_Routes(t *testing.T) {
createTestModel := func(t *testing.T, name string) {
fname := createTestFile(t, "ollama-model")
r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
modelfile, err := model.ParseFile(r)
modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
commands, err := parser.Parse(modelfile)
assert.Nil(t, err)
fn := func(resp api.ProgressResponse) {
t.Logf("Status: %s", resp.Status)
}
err = CreateModel(context.TODO(), name, "", "", modelfile, fn)
err = CreateModel(context.TODO(), name, "", "", commands, fn)
assert.Nil(t, err)
}
@@ -238,5 +238,6 @@ func Test_Routes(t *testing.T) {
if tc.Expected != nil {
tc.Expected(t, resp)
}
}
}

View File

@@ -5,8 +5,10 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -15,13 +17,13 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/server/envconfig"
"golang.org/x/exp/slices"
)
type LlmRequest struct {
ctx context.Context //nolint:containedctx
model *Model
ggml *llm.GGML // TODO - how large is this, and do we need to free it after we've finished loading?
opts api.Options
sessionDuration time.Duration
successCh chan *runnerRef
@@ -37,19 +39,31 @@ type Scheduler struct {
loaded map[string]*runnerRef
loadedMu sync.Mutex
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
loadFn func(req *LlmRequest, gpus gpu.GpuInfoList)
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
getGpuFn func() gpu.GpuInfoList
}
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
// TODO set this to zero after a release or two, to enable multiple models by default
var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
var maxQueuedRequests = 10 // TODO configurable
func InitScheduler(ctx context.Context) *Scheduler {
maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
if maxRunners != "" {
m, err := strconv.Atoi(maxRunners)
if err != nil {
slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
} else {
loadedMax = m
}
}
sched := &Scheduler{
pendingReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
expiredCh: make(chan *runnerRef, envconfig.MaxQueuedRequests),
unloadedCh: make(chan interface{}, envconfig.MaxQueuedRequests),
pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
expiredCh: make(chan *runnerRef, maxQueuedRequests),
unloadedCh: make(chan interface{}, maxQueuedRequests),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo,
@@ -60,22 +74,24 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
// allocate a large enough kv cache for all parallel requests
opts.NumCtx = opts.NumCtx * envconfig.NumParallel
ggml, err := llm.LoadModel(model.ModelPath)
req := &LlmRequest{
ctx: c,
model: model,
ggml: ggml,
opts: opts,
sessionDuration: sessionDuration,
successCh: make(chan *runnerRef),
errCh: make(chan error, 1),
}
if err != nil {
req.errCh <- err
return req.successCh, req.errCh
}
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
req.errCh <- fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
}
return req.successCh, req.errCh
}
@@ -100,12 +116,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
return
case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running
if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
for {
var runnerToExpire *runnerRef
s.loadedMu.Lock()
@@ -120,50 +130,31 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
} else if loadedCount == 0 {
slog.Debug("loading first model", "model", pending.model.ModelPath)
gpus := s.getGpuFn()
g := pickBestFitGPUs(pending, gpus)
if g != nil {
gpus = g
}
s.loadFn(pending, gpus)
break
} else if loadedMax > 0 && loadedCount >= loadedMax {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
runnerToExpire = s.findRunnerToUnload(pending)
} else {
// Either no models are loaded or below envconfig.MaxRunners
// More than one loaded model, so we have to see if the new one fits
// Get a refreshed GPU list
gpus := s.getGpuFn()
// Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath)
if err != nil {
pending.errCh <- err
break
}
// If we're CPU only mode, just limit by envconfig.MaxRunners above
// TODO handle system memory exhaustion
if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
slog.Debug("cpu mode with existing models, loading")
s.loadFn(pending, ggml, gpus)
break
}
// No models loaded. Load the model but prefer the best fit.
if loadedCount == 0 {
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus)
if g != nil {
gpus = g
}
s.loadFn(pending, ggml, gpus)
break
}
// More than one loaded model, so we have to see if the new one fits
// Update free memory from currently loaded models
s.updateFreeSpace(gpus)
gpus = pickBestFitGPUs(pending, ggml, gpus)
gpus = pickBestFitGPUs(pending, gpus)
if gpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, gpus)
s.loadFn(pending, gpus)
break
}
runnerToExpire = s.findRunnerToUnload()
runnerToExpire = s.findRunnerToUnload(pending)
}
if runnerToExpire == nil {
@@ -236,7 +227,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
defer runner.refMu.Unlock()
if runner.expireTimer != nil {
runner.expireTimer.Stop()
runner.expireTimer = nil
}
s.expiredCh <- runner
})
@@ -263,9 +253,9 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
continue
}
s.loadedMu.Lock()
slog.Debug("got lock to unload", "model", runner.model)
runner.unload()
s.loadedMu.Lock()
delete(s.loaded, runner.model)
s.loadedMu.Unlock()
slog.Debug("runner released", "model", runner.model)
@@ -283,10 +273,6 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
runner.refMu.Lock()
defer runner.refMu.Unlock()
runner.refCount++
if runner.expireTimer != nil {
runner.expireTimer.Stop()
runner.expireTimer = nil
}
runner.sessionDuration = pending.sessionDuration
pending.successCh <- runner
go func() {
@@ -296,8 +282,8 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}()
}
func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
func (s *Scheduler) load(req *LlmRequest, gpus gpu.GpuInfoList) {
llama, err := s.newServerFn(gpus, req.model.ModelPath, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
@@ -417,10 +403,6 @@ type runnerRef struct {
// The refMu must already be held when calling unload
func (runner *runnerRef) unload() {
if runner.expireTimer != nil {
runner.expireTimer.Stop()
runner.expireTimer = nil
}
if runner.llama != nil {
runner.llama.Close()
}
@@ -435,25 +417,16 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
slog.Debug("evaluating already loaded", "model", req.model.ModelPath)
runner.refMu.Lock()
defer runner.refMu.Unlock()
// Ignore the NumGPU settings for comparison
optsExisting := runner.Options.Runner
optsExisting.NumGPU = -1
optsNew := req.opts.Runner
optsNew.NumGPU = -1
timeout := 10 * time.Second
if runner.loading {
timeout = 2 * time.Minute // Initial load can take a long time for big models on slow systems...
}
if runner.Options == nil {
return true
}
// Don't reload runner if num_gpu=-1 was provided
optsExisting := runner.Options.Runner
optsNew := req.opts.Runner
if optsNew.NumGPU < 0 {
optsExisting.NumGPU = -1
optsNew.NumGPU = -1
}
ctx, cancel := context.WithTimeout(ctx, timeout)
ctx, cancel := context.WithTimeout(ctx, timeout) // BUG -
defer cancel()
if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
@@ -461,7 +434,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
runner.llama.Ping(ctx) != nil {
return true
}
return false
}
@@ -482,7 +454,7 @@ func (a ByDuration) Less(i, j int) bool {
// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
func pickBestFitGPUs(req *LlmRequest, gpus gpu.GpuInfoList) gpu.GpuInfoList {
var estimatedVRAM uint64
for _, gl := range gpus.ByLibrary() {
var ok bool
@@ -494,7 +466,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
// First attempt to fit the model into a single GPU
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
return []gpu.GpuInfo{g}
}
@@ -505,7 +477,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUs
if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
if ok, estimatedVRAM = llm.PredictServerFit(gl, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
return gl
}
@@ -514,7 +486,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
}
// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef {
s.loadedMu.Lock()
runnerList := make([]*runnerRef, 0, len(s.loaded))
for _, r := range s.loaded {

View File

@@ -15,7 +15,6 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/server/envconfig"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -28,21 +27,29 @@ func init() {
func TestInitScheduler(t *testing.T) {
ctx, done := context.WithCancel(context.Background())
defer done()
initialMax := loadedMax
s := InitScheduler(ctx)
s.loadedMu.Lock()
require.Equal(t, initialMax, loadedMax)
require.NotNil(t, s.loaded)
os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
s = InitScheduler(ctx)
require.Equal(t, initialMax, loadedMax)
require.NotNil(t, s.loaded)
os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
s = InitScheduler(ctx)
require.Equal(t, 0, loadedMax)
require.NotNil(t, s.loaded)
s.loadedMu.Unlock()
}
func TestLoad(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
defer done()
s := InitScheduler(ctx)
var ggml *llm.GGML // value not used in tests
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
opts: api.DefaultOptions(),
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
sessionDuration: 2,
@@ -52,12 +59,10 @@ func TestLoad(t *testing.T) {
return nil, fmt.Errorf("something failed to load model blah")
}
gpus := gpu.GpuInfoList{}
s.load(req, ggml, gpus)
s.load(req, gpus)
require.Len(t, req.successCh, 0)
require.Len(t, req.errCh, 1)
s.loadedMu.Lock()
require.Len(t, s.loaded, 0)
s.loadedMu.Unlock()
err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible")
@@ -65,30 +70,26 @@ func TestLoad(t *testing.T) {
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
return server, nil
}
s.load(req, ggml, gpus)
s.load(req, gpus)
select {
case err := <-req.errCh:
require.NoError(t, err)
case resp := <-req.successCh:
require.Equal(t, uint64(10), resp.estimatedVRAM)
require.Equal(t, uint(1), resp.refCount)
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
s.loadedMu.Unlock()
}
req.model.ModelPath = "dummy_model_path"
server.waitResp = fmt.Errorf("wait failure")
s.load(req, ggml, gpus)
s.load(req, gpus)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")
case resp := <-req.successCh:
t.Errorf("unexpected success %v", resp)
}
s.loadedMu.Lock()
runner := s.loaded["dummy_model_path"]
s.loadedMu.Unlock()
require.NotNil(t, runner)
require.Equal(t, uint(0), runner.refCount)
time.Sleep(1 * time.Millisecond)
@@ -100,7 +101,6 @@ type bundle struct {
ctxDone func()
srv *mockLlm
req *LlmRequest
ggml *llm.GGML
}
func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
@@ -132,16 +132,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
})
assert.Nil(t, err)
fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname}
scenario.ggml, err = llm.LoadModel(model.ModelPath)
ggml, err := llm.LoadModel(model.ModelPath)
require.NoError(t, err)
scenario.req = &LlmRequest{
ctx: scenario.ctx,
model: model,
opts: api.DefaultOptions(),
ggml: ggml,
sessionDuration: 5 * time.Millisecond,
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
@@ -151,7 +149,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
}
func TestRequests(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
// Same model, same request
@@ -159,20 +157,18 @@ func TestRequests(t *testing.T) {
scenario1a.req.sessionDuration = 0
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
scenario1b.req.model = scenario1a.req.model
scenario1b.ggml = scenario1a.ggml
scenario1b.req.ggml = scenario1a.req.ggml
scenario1b.req.sessionDuration = 0
// simple reload of same model
scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
scenario2a.req.model = scenario1a.req.model
scenario2a.ggml = scenario1a.ggml
scenario2a.req.ggml = scenario1a.req.ggml
// Multiple loaded models
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
scenario3c := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
s := InitScheduler(ctx)
s.getGpuFn = func() gpu.GpuInfoList {
@@ -226,7 +222,7 @@ func TestRequests(t *testing.T) {
t.Errorf("timeout")
}
envconfig.MaxRunners = 1
loadedMax = 1
s.newServerFn = scenario3a.newServer
slog.Info("scenario3a")
s.pendingReqCh <- scenario3a.req
@@ -241,11 +237,9 @@ func TestRequests(t *testing.T) {
case <-ctx.Done():
t.Errorf("timeout")
}
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
s.loadedMu.Unlock()
envconfig.MaxRunners = 0
loadedMax = 0
s.newServerFn = scenario3b.newServer
slog.Info("scenario3b")
s.pendingReqCh <- scenario3b.req
@@ -257,14 +251,19 @@ func TestRequests(t *testing.T) {
case <-ctx.Done():
t.Errorf("timeout")
}
s.loadedMu.Lock()
require.Len(t, s.loaded, 2)
s.loadedMu.Unlock()
// This is a CPU load with NumGPU = 0 so it should load
// Try to load a model that wont fit
s.newServerFn = scenario3c.newServer
slog.Info("scenario3c")
require.Len(t, s.loaded, 2)
scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
time.Sleep(2 * time.Millisecond)
s.pendingReqCh <- scenario3c.req
// finish prior request, so new model can load
time.Sleep(6 * time.Millisecond)
require.Len(t, s.loaded, 1)
scenario3b.ctxDone()
select {
case resp := <-scenario3c.req.successCh:
require.Equal(t, resp.llama, scenario3c.srv)
@@ -273,40 +272,11 @@ func TestRequests(t *testing.T) {
case <-ctx.Done():
t.Errorf("timeout")
}
s.loadedMu.Lock()
require.Len(t, s.loaded, 3)
s.loadedMu.Unlock()
// Try to load a model that wont fit
s.newServerFn = scenario3d.newServer
slog.Info("scenario3d")
s.loadedMu.Lock()
require.Len(t, s.loaded, 3)
s.loadedMu.Unlock()
scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
time.Sleep(2 * time.Millisecond)
s.pendingReqCh <- scenario3d.req
// finish prior request, so new model can load
time.Sleep(6 * time.Millisecond)
s.loadedMu.Lock()
require.Len(t, s.loaded, 2)
s.loadedMu.Unlock()
scenario3b.ctxDone()
select {
case resp := <-scenario3d.req.successCh:
require.Equal(t, resp.llama, scenario3d.srv)
require.Len(t, s.pendingReqCh, 0)
require.Len(t, scenario3d.req.errCh, 0)
case <-ctx.Done():
t.Errorf("timeout")
}
s.loadedMu.Lock()
require.Len(t, s.loaded, 2)
s.loadedMu.Unlock()
require.Len(t, s.loaded, 1)
}
func TestGetRunner(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
defer done()
// Same model, same request
@@ -316,7 +286,7 @@ func TestGetRunner(t *testing.T) {
scenario1b.req.sessionDuration = 0
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
scenario1c.req.sessionDuration = 0
envconfig.MaxQueuedRequests = 1
maxQueuedRequests = 1
s := InitScheduler(ctx)
s.getGpuFn = func() gpu.GpuInfoList {
g := gpu.GpuInfo{Library: "metal"}
@@ -345,28 +315,25 @@ func TestGetRunner(t *testing.T) {
t.Errorf("timeout")
}
scenario1a.ctxDone()
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
s.loadedMu.Unlock()
scenario1c.req.model.ModelPath = "bad path"
slog.Info("scenario1c")
successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
// Starts in pending channel, then should be quickly processsed to return an error
time.Sleep(5 * time.Millisecond)
require.Len(t, s.pendingReqCh, 0)
require.Len(t, successCh1c, 0)
s.loadedMu.Lock()
require.Len(t, s.loaded, 0)
s.loadedMu.Unlock()
require.Len(t, errCh1c, 1)
err = <-errCh1c
require.Contains(t, err.Error(), "bad path")
scenario1b.ctxDone()
time.Sleep(5 * time.Millisecond)
require.Len(t, s.loaded, 0)
}
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
// Same model, same request
@@ -387,9 +354,7 @@ func TestPrematureExpired(t *testing.T) {
require.Equal(t, resp.llama, scenario1a.srv)
require.Len(t, s.pendingReqCh, 0)
require.Len(t, errCh1a, 0)
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
s.loadedMu.Unlock()
slog.Info("sending premature expired event now")
s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
case <-ctx.Done():
@@ -401,9 +366,7 @@ func TestPrematureExpired(t *testing.T) {
require.LessOrEqual(t, len(s.finishedReqCh), 1)
time.Sleep(10 * time.Millisecond)
require.Len(t, s.finishedReqCh, 0)
s.loadedMu.Lock()
require.Len(t, s.loaded, 0)
s.loadedMu.Unlock()
// also shouldn't happen in real life
s.finishedReqCh <- scenario1a.req
@@ -411,10 +374,9 @@ func TestPrematureExpired(t *testing.T) {
}
func TestUseLoadedRunner(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
req := &LlmRequest{
ctx: ctx,
opts: api.DefaultOptions(),
successCh: make(chan *runnerRef, 1),
sessionDuration: 2,
}
@@ -436,7 +398,7 @@ func TestUseLoadedRunner(t *testing.T) {
}
func TestUpdateFreeSpace(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
defer done()
gpus := gpu.GpuInfoList{
{
@@ -458,47 +420,43 @@ func TestUpdateFreeSpace(t *testing.T) {
r2 := &runnerRef{llama: llm2, gpus: gpus}
s := InitScheduler(ctx)
s.loadedMu.Lock()
s.loaded["a"] = r1
s.loaded["b"] = r2
s.loadedMu.Unlock()
s.updateFreeSpace(gpus)
require.Equal(t, uint64(850), gpus[0].FreeMemory)
require.Equal(t, uint64(1850), gpus[1].FreeMemory)
}
func TestFindRunnerToUnload(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
defer done()
req := &LlmRequest{ctx: ctx}
r1 := &runnerRef{refCount: 1, sessionDuration: 1}
r2 := &runnerRef{sessionDuration: 2}
s := InitScheduler(ctx)
s.loadedMu.Lock()
s.loaded["a"] = r1
s.loaded["b"] = r2
s.loadedMu.Unlock()
resp := s.findRunnerToUnload()
resp := s.findRunnerToUnload(req)
require.Equal(t, r2, resp)
r2.refCount = 1
resp = s.findRunnerToUnload()
resp = s.findRunnerToUnload(req)
require.Equal(t, r1, resp)
}
func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
defer done()
llm := &mockLlm{}
do := api.DefaultOptions()
runner := &runnerRef{
adapters: []string{"adapter1"},
projectors: []string{"projector1"},
Options: &do,
Options: &api.Options{},
llama: llm,
}
req := &LlmRequest{
@@ -506,7 +464,7 @@ func TestNeedsReload(t *testing.T) {
AdapterPaths: []string{"adapter2"},
ProjectorPaths: []string{"projector2"},
},
opts: api.DefaultOptions(),
opts: api.Options{},
}
resp := runner.needsReload(ctx, req)
require.True(t, resp)
@@ -527,14 +485,11 @@ func TestNeedsReload(t *testing.T) {
require.False(t, resp)
req.opts.NumGPU = 99
resp = runner.needsReload(ctx, req)
require.True(t, resp)
req.opts.NumGPU = -1
resp = runner.needsReload(ctx, req)
require.False(t, resp)
}
func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
defer done()
llm1 := &mockLlm{}
@@ -545,10 +500,8 @@ func TestUnloadAllRunners(t *testing.T) {
r1 := &runnerRef{llama: llm1}
r2 := &runnerRef{llama: llm2}
s.loadedMu.Lock()
s.loaded["a"] = r1
s.loaded["b"] = r2
s.loadedMu.Unlock()
s.unloadAllRunners()
require.True(t, llm1.closeCalled)

View File

@@ -1,18 +0,0 @@
// Package errtypes contains custom error types
package errtypes
import (
"fmt"
"strings"
)
const UnknownOllamaKeyErrMsg = "unknown ollama key"
// TODO: This should have a structured response from the API
type UnknownOllamaKey struct {
Key string
}
func (e *UnknownOllamaKey) Error() string {
return fmt.Sprintf("unauthorized: %s %q", UnknownOllamaKeyErrMsg, strings.TrimSpace(e.Key))
}

87
types/model/digest.go Normal file
View File

@@ -0,0 +1,87 @@
package model
import (
"fmt"
"log/slog"
"strings"
"unicode"
)
// Digest represents a digest of a model Manifest. It is a comparable value
// type and is immutable.
//
// The zero Digest is not a valid digest.
type Digest struct {
s string
}
// Split returns the digest type and the digest value.
func (d Digest) Split() (typ, digest string) {
typ, digest, _ = strings.Cut(d.s, "-")
return
}
// String returns the digest in the form of "<digest-type>-<digest>", or the
// empty string if the digest is invalid.
func (d Digest) String() string { return d.s }
// IsValid returns true if the digest is valid (not zero).
//
// A valid digest may be created only by ParseDigest, or
// ParseName(name).Digest().
func (d Digest) IsValid() bool { return d.s != "" }
// LogValue implements slog.Value.
func (d Digest) LogValue() slog.Value {
return slog.StringValue(d.String())
}
var (
_ slog.LogValuer = Digest{}
)
// ParseDigest parses a string in the form of "<digest-type>-<digest>" into a
// Digest.
func ParseDigest(s string) Digest {
typ, digest, ok := strings.Cut(s, "-")
if !ok {
typ, digest, ok = strings.Cut(s, ":")
}
if ok && isValidDigestType(typ) && isValidHex(digest) && len(digest) >= 2 {
return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
}
return Digest{}
}
func MustParseDigest(s string) Digest {
d := ParseDigest(s)
if !d.IsValid() {
panic(fmt.Sprintf("invalid digest: %q", s))
}
return d
}
func isValidDigestType(s string) bool {
if len(s) == 0 {
return false
}
for _, r := range s {
if !unicode.IsLower(r) && !unicode.IsDigit(r) {
return false
}
}
return true
}
func isValidHex(s string) bool {
if len(s) == 0 {
return false
}
for i := range s {
c := s[i]
if c < '0' || c > '9' && c < 'a' || c > 'f' {
return false
}
}
return true
}

View File

@@ -0,0 +1,46 @@
package model
import "testing"
var testDigests = map[string]Digest{
"": {},
"sha256-1234": {s: "sha256-1234"},
"sha256-5678": {s: "sha256-5678"},
"blake2-9abc": {s: "blake2-9abc"},
"-1234": {},
"sha256-": {},
"sha256-1234-5678": {},
"sha256-P": {}, // invalid hex
"sha256-1234P": {},
"---": {},
}
func TestDigestParse(t *testing.T) {
// Test cases.
for s, want := range testDigests {
got := ParseDigest(s)
t.Logf("ParseDigest(%q) = %#v", s, got)
if got != want {
t.Errorf("ParseDigest(%q) = %q; want %q", s, got, want)
}
}
}
func TestDigestString(t *testing.T) {
// Test cases.
for s, d := range testDigests {
want := s
if !d.IsValid() {
want = ""
}
got := d.String()
if got != want {
t.Errorf("ParseDigest(%q).String() = %q; want %q", s, got, want)
}
got = ParseDigest(s).String()
if got != want {
t.Errorf("roundtrip ParseDigest(%q).String() = %q; want %q", s, got, want)
}
}
}

View File

@@ -1,299 +0,0 @@
package model
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"strconv"
"strings"
)
type File struct {
Commands []Command
}
func (f File) String() string {
var sb strings.Builder
for _, cmd := range f.Commands {
fmt.Fprintln(&sb, cmd.String())
}
return sb.String()
}
type Command struct {
Name string
Args string
}
func (c Command) String() string {
var sb strings.Builder
switch c.Name {
case "model":
fmt.Fprintf(&sb, "FROM %s", c.Args)
case "license", "template", "system", "adapter":
fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
case "message":
role, message, _ := strings.Cut(c.Args, ": ")
fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
default:
fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
}
return sb.String()
}
type state int
const (
stateNil state = iota
stateName
stateValue
stateParameter
stateMessage
stateComment
)
var (
errMissingFrom = errors.New("no FROM line")
errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
)
func ParseFile(r io.Reader) (*File, error) {
var cmd Command
var curr state
var b bytes.Buffer
var role string
var f File
br := bufio.NewReader(r)
for {
r, _, err := br.ReadRune()
if errors.Is(err, io.EOF) {
break
} else if err != nil {
return nil, err
}
next, r, err := parseRuneForState(r, curr)
if errors.Is(err, io.ErrUnexpectedEOF) {
return nil, fmt.Errorf("%w: %s", err, b.String())
} else if err != nil {
return nil, err
}
// process the state transition, some transitions need to be intercepted and redirected
if next != curr {
switch curr {
case stateName:
if !isValidCommand(b.String()) {
return nil, errInvalidCommand
}
// next state sometimes depends on the current buffer value
switch s := strings.ToLower(b.String()); s {
case "from":
cmd.Name = "model"
case "parameter":
// transition to stateParameter which sets command name
next = stateParameter
case "message":
// transition to stateMessage which validates the message role
next = stateMessage
fallthrough
default:
cmd.Name = s
}
case stateParameter:
cmd.Name = b.String()
case stateMessage:
if !isValidMessageRole(b.String()) {
return nil, errInvalidMessageRole
}
role = b.String()
case stateComment, stateNil:
// pass
case stateValue:
s, ok := unquote(b.String())
if !ok || isSpace(r) {
if _, err := b.WriteRune(r); err != nil {
return nil, err
}
continue
}
if role != "" {
s = role + ": " + s
role = ""
}
cmd.Args = s
f.Commands = append(f.Commands, cmd)
}
b.Reset()
curr = next
}
if strconv.IsPrint(r) {
if _, err := b.WriteRune(r); err != nil {
return nil, err
}
}
}
// flush the buffer
switch curr {
case stateComment, stateNil:
// pass; nothing to flush
case stateValue:
s, ok := unquote(b.String())
if !ok {
return nil, io.ErrUnexpectedEOF
}
if role != "" {
s = role + ": " + s
}
cmd.Args = s
f.Commands = append(f.Commands, cmd)
default:
return nil, io.ErrUnexpectedEOF
}
for _, cmd := range f.Commands {
if cmd.Name == "model" {
return &f, nil
}
}
return nil, errMissingFrom
}
func parseRuneForState(r rune, cs state) (state, rune, error) {
switch cs {
case stateNil:
switch {
case r == '#':
return stateComment, 0, nil
case isSpace(r), isNewline(r):
return stateNil, 0, nil
default:
return stateName, r, nil
}
case stateName:
switch {
case isAlpha(r):
return stateName, r, nil
case isSpace(r):
return stateValue, 0, nil
default:
return stateNil, 0, errInvalidCommand
}
case stateValue:
switch {
case isNewline(r):
return stateNil, r, nil
case isSpace(r):
return stateNil, r, nil
default:
return stateValue, r, nil
}
case stateParameter:
switch {
case isAlpha(r), isNumber(r), r == '_':
return stateParameter, r, nil
case isSpace(r):
return stateValue, 0, nil
default:
return stateNil, 0, io.ErrUnexpectedEOF
}
case stateMessage:
switch {
case isAlpha(r):
return stateMessage, r, nil
case isSpace(r):
return stateValue, 0, nil
default:
return stateNil, 0, io.ErrUnexpectedEOF
}
case stateComment:
switch {
case isNewline(r):
return stateNil, 0, nil
default:
return stateComment, 0, nil
}
default:
return stateNil, 0, errors.New("")
}
}
func quote(s string) string {
if strings.Contains(s, "\n") || strings.HasPrefix(s, " ") || strings.HasSuffix(s, " ") {
if strings.Contains(s, "\"") {
return `"""` + s + `"""`
}
return `"` + s + `"`
}
return s
}
func unquote(s string) (string, bool) {
// TODO: single quotes
if len(s) >= 3 && s[:3] == `"""` {
if len(s) >= 6 && s[len(s)-3:] == `"""` {
return s[3 : len(s)-3], true
}
return "", false
}
if len(s) >= 1 && s[0] == '"' {
if len(s) >= 2 && s[len(s)-1] == '"' {
return s[1 : len(s)-1], true
}
return "", false
}
return s, true
}
func isAlpha(r rune) bool {
return r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z'
}
func isNumber(r rune) bool {
return r >= '0' && r <= '9'
}
func isSpace(r rune) bool {
return r == ' ' || r == '\t'
}
func isNewline(r rune) bool {
return r == '\r' || r == '\n'
}
func isValidMessageRole(role string) bool {
return role == "system" || role == "user" || role == "assistant"
}
func isValidCommand(cmd string) bool {
switch strings.ToLower(cmd) {
case "from", "license", "template", "system", "adapter", "parameter", "message":
return true
default:
return false
}
}

View File

@@ -1,511 +0,0 @@
package model
import (
"bytes"
"fmt"
"io"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
func TestParseFileFile(t *testing.T) {
input := `
FROM model1
ADAPTER adapter1
LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
`
reader := strings.NewReader(input)
modelfile, err := ParseFile(reader)
assert.NoError(t, err)
expectedCommands := []Command{
{Name: "model", Args: "model1"},
{Name: "adapter", Args: "adapter1"},
{Name: "license", Args: "MIT"},
{Name: "param1", Args: "value1"},
{Name: "param2", Args: "value2"},
{Name: "template", Args: "template1"},
}
assert.Equal(t, expectedCommands, modelfile.Commands)
}
func TestParseFileFrom(t *testing.T) {
var cases = []struct {
input string
expected []Command
err error
}{
{
"FROM foo",
[]Command{{Name: "model", Args: "foo"}},
nil,
},
{
"FROM /path/to/model",
[]Command{{Name: "model", Args: "/path/to/model"}},
nil,
},
{
"FROM /path/to/model/fp16.bin",
[]Command{{Name: "model", Args: "/path/to/model/fp16.bin"}},
nil,
},
{
"FROM llama3:latest",
[]Command{{Name: "model", Args: "llama3:latest"}},
nil,
},
{
"FROM llama3:7b-instruct-q4_K_M",
[]Command{{Name: "model", Args: "llama3:7b-instruct-q4_K_M"}},
nil,
},
{
"", nil, errMissingFrom,
},
{
"PARAMETER param1 value1",
nil,
errMissingFrom,
},
{
"PARAMETER param1 value1\nFROM foo",
[]Command{{Name: "param1", Args: "value1"}, {Name: "model", Args: "foo"}},
nil,
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.input))
assert.ErrorIs(t, err, c.err)
if modelfile != nil {
assert.Equal(t, c.expected, modelfile.Commands)
}
})
}
}
func TestParseFileParametersMissingValue(t *testing.T) {
input := `
FROM foo
PARAMETER param1
`
reader := strings.NewReader(input)
_, err := ParseFile(reader)
assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
}
func TestParseFileBadCommand(t *testing.T) {
input := `
FROM foo
BADCOMMAND param1 value1
`
_, err := ParseFile(strings.NewReader(input))
assert.ErrorIs(t, err, errInvalidCommand)
}
func TestParseFileMessages(t *testing.T) {
var cases = []struct {
input string
expected []Command
err error
}{
{
`
FROM foo
MESSAGE system You are a file parser. Always parse things.
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "message", Args: "system: You are a file parser. Always parse things."},
},
nil,
},
{
`
FROM foo
MESSAGE system You are a file parser. Always parse things.`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "message", Args: "system: You are a file parser. Always parse things."},
},
nil,
},
{
`
FROM foo
MESSAGE system You are a file parser. Always parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "message", Args: "system: You are a file parser. Always parse things."},
{Name: "message", Args: "user: Hey there!"},
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
},
nil,
},
{
`
FROM foo
MESSAGE system """
You are a multiline file parser. Always parse things.
"""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "message", Args: "system: \nYou are a multiline file parser. Always parse things.\n"},
},
nil,
},
{
`
FROM foo
MESSAGE badguy I'm a bad guy!
`,
nil,
errInvalidMessageRole,
},
{
`
FROM foo
MESSAGE system
`,
nil,
io.ErrUnexpectedEOF,
},
{
`
FROM foo
MESSAGE system`,
nil,
io.ErrUnexpectedEOF,
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.input))
assert.ErrorIs(t, err, c.err)
if modelfile != nil {
assert.Equal(t, c.expected, modelfile.Commands)
}
})
}
}
func TestParseFileQuoted(t *testing.T) {
var cases = []struct {
multiline string
expected []Command
err error
}{
{
`
FROM foo
SYSTEM """
This is a
multiline system.
"""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: "\nThis is a\nmultiline system.\n"},
},
nil,
},
{
`
FROM foo
SYSTEM """
This is a
multiline system."""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: "\nThis is a\nmultiline system."},
},
nil,
},
{
`
FROM foo
SYSTEM """This is a
multiline system."""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: "This is a\nmultiline system."},
},
nil,
},
{
`
FROM foo
SYSTEM """This is a multiline system."""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: "This is a multiline system."},
},
nil,
},
{
`
FROM foo
SYSTEM """This is a multiline system.""
`,
nil,
io.ErrUnexpectedEOF,
},
{
`
FROM foo
SYSTEM "
`,
nil,
io.ErrUnexpectedEOF,
},
{
`
FROM foo
SYSTEM """
This is a multiline system with "quotes".
"""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: "\nThis is a multiline system with \"quotes\".\n"},
},
nil,
},
{
`
FROM foo
SYSTEM """"""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: ""},
},
nil,
},
{
`
FROM foo
SYSTEM ""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: ""},
},
nil,
},
{
`
FROM foo
SYSTEM "'"
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: "'"},
},
nil,
},
{
`
FROM foo
SYSTEM """''"'""'""'"'''''""'""'"""
`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "system", Args: `''"'""'""'"'''''""'""'`},
},
nil,
},
{
`
FROM foo
TEMPLATE """
{{ .Prompt }}
"""`,
[]Command{
{Name: "model", Args: "foo"},
{Name: "template", Args: "\n{{ .Prompt }}\n"},
},
nil,
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.multiline))
assert.ErrorIs(t, err, c.err)
if modelfile != nil {
assert.Equal(t, c.expected, modelfile.Commands)
}
})
}
}
func TestParseFileParameters(t *testing.T) {
var cases = map[string]struct {
name, value string
}{
"numa true": {"numa", "true"},
"num_ctx 1": {"num_ctx", "1"},
"num_batch 1": {"num_batch", "1"},
"num_gqa 1": {"num_gqa", "1"},
"num_gpu 1": {"num_gpu", "1"},
"main_gpu 1": {"main_gpu", "1"},
"low_vram true": {"low_vram", "true"},
"f16_kv true": {"f16_kv", "true"},
"logits_all true": {"logits_all", "true"},
"vocab_only true": {"vocab_only", "true"},
"use_mmap true": {"use_mmap", "true"},
"use_mlock true": {"use_mlock", "true"},
"num_thread 1": {"num_thread", "1"},
"num_keep 1": {"num_keep", "1"},
"seed 1": {"seed", "1"},
"num_predict 1": {"num_predict", "1"},
"top_k 1": {"top_k", "1"},
"top_p 1.0": {"top_p", "1.0"},
"tfs_z 1.0": {"tfs_z", "1.0"},
"typical_p 1.0": {"typical_p", "1.0"},
"repeat_last_n 1": {"repeat_last_n", "1"},
"temperature 1.0": {"temperature", "1.0"},
"repeat_penalty 1.0": {"repeat_penalty", "1.0"},
"presence_penalty 1.0": {"presence_penalty", "1.0"},
"frequency_penalty 1.0": {"frequency_penalty", "1.0"},
"mirostat 1": {"mirostat", "1"},
"mirostat_tau 1.0": {"mirostat_tau", "1.0"},
"mirostat_eta 1.0": {"mirostat_eta", "1.0"},
"penalize_newline true": {"penalize_newline", "true"},
"stop ### User:": {"stop", "### User:"},
"stop ### User: ": {"stop", "### User: "},
"stop \"### User:\"": {"stop", "### User:"},
"stop \"### User: \"": {"stop", "### User: "},
"stop \"\"\"### User:\"\"\"": {"stop", "### User:"},
"stop \"\"\"### User:\n\"\"\"": {"stop", "### User:\n"},
"stop <|endoftext|>": {"stop", "<|endoftext|>"},
"stop <|eot_id|>": {"stop", "<|eot_id|>"},
"stop </s>": {"stop", "</s>"},
}
for k, v := range cases {
t.Run(k, func(t *testing.T) {
var b bytes.Buffer
fmt.Fprintln(&b, "FROM foo")
fmt.Fprintln(&b, "PARAMETER", k)
modelfile, err := ParseFile(&b)
assert.NoError(t, err)
assert.Equal(t, []Command{
{Name: "model", Args: "foo"},
{Name: v.name, Args: v.value},
}, modelfile.Commands)
})
}
}
func TestParseFileComments(t *testing.T) {
var cases = []struct {
input string
expected []Command
}{
{
`
# comment
FROM foo
`,
[]Command{
{Name: "model", Args: "foo"},
},
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.input))
assert.NoError(t, err)
assert.Equal(t, c.expected, modelfile.Commands)
})
}
}
func TestParseFileFormatParseFile(t *testing.T) {
var cases = []string{
`
FROM foo
ADAPTER adapter1
LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
MESSAGE system You are a file parser. Always parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
`
FROM foo
ADAPTER adapter1
LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
MESSAGE system """
You are a store greeter. Always responsed with "Hello!".
"""
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
`
FROM foo
ADAPTER adapter1
LICENSE """
Very long and boring legal text.
Blah blah blah.
"Oh look, a quote!"
"""
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
MESSAGE system """
You are a store greeter. Always responsed with "Hello!".
"""
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
`
FROM foo
SYSTEM ""
`,
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c))
assert.NoError(t, err)
modelfile2, err := ParseFile(strings.NewReader(modelfile.String()))
assert.NoError(t, err)
assert.Equal(t, modelfile, modelfile2)
})
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,2 +1,2 @@
go test fuzz v1
string("00@")
string("/0")

View File

@@ -0,0 +1,2 @@
go test fuzz v1
string("0//0")

View File

@@ -0,0 +1,2 @@
go test fuzz v1
string("0 /0")

View File

@@ -0,0 +1,2 @@
go test fuzz v1
string("+0/00000")

View File

@@ -0,0 +1,2 @@
go test fuzz v1
string(":")

View File

@@ -0,0 +1,2 @@
go test fuzz v1
string("0+.\xf2\x80\xf6\x9d00000\xe5\x99\xe6\xd900\xd90\xa60\x91\xdc0\xff\xbf\x99\xe800\xb9\xdc\xd6\xc300\x970\xfb\xfd0\xe0\x8a\xe1\xad\xd40\x9700\xa80\x980\xdd0000\xb00\x91000\xfe0\x89\x9b\x90\x93\x9f0\xe60\xf7\x84\xb0\x87\xa5\xff0\xa000\x9a\x85\xf6\x85\xfe\xa9\xf9\xe9\xde00\xf4\xe0\x8f\x81\xad\xde00\xd700\xaa\xe000000\xb1\xee0\x91")

15
types/structs/structs.go Normal file
View File

@@ -0,0 +1,15 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Package structs contains the Incomparable type.
package structs
// Incomparable is a zero-width incomparable type. If added as the
// first field in a struct, it marks that struct as not comparable
// (can't do == or be a map key) and usually doesn't add any width to
// the struct (unless the struct has only small fields).
//
// By making a struct incomparable, you can prevent misuse (prevent
// people from using ==), but also you can shrink generated binaries,
// as the compiler can omit equality funcs from the binary.
type Incomparable [0]func()