Compare commits: 2 commits (v0.1.34...)

| Author | SHA1 | Date |
|---|---|---|
|  | 39a199bb3e |  |
|  | 1b21a22d0e |  |
.github/workflows/release.yaml (vendored): 13 changes
@@ -311,18 +311,29 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
            llm/build
            dist/windows-amd64
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cuda
          path: |
            llm/build
            dist/windows-amd64
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps
          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
            llm/build
            dist/windows-amd64
      - run: dir llm/build
      - run: |
          $gopath=(get-command go).source | split-path -parent
@@ -331,6 +342,8 @@ jobs:
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_GENERATE="1"
          $env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
          $env:HIP_PATH=$(resolve-path ".\dist\deps")
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
.github/workflows/test.yaml (vendored): 15 changes
@@ -1,15 +1,5 @@
name: test

concurrency:
  # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
  # cancels running CI jobs and starts all new ones.
  #
  # For non-PR pushes, concurrency.group needs to be unique for every distinct
  # CI run we want to have happen. Use run_id, which in practice means all
  # non-PR CI runs will be allowed to run without preempting each other.
  group: ${{ github.workflow }}-$${{ github.pull_request.number || github.run_id }}
  cancel-in-progress: true

on:
  pull_request:
    paths:
@@ -31,9 +21,7 @@ jobs:
      - id: changes
        run: |
          changed() {
            git diff-tree -r --no-commit-id --name-only \
              $(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
              ${{ github.event.pull_request.head.sha }} \
            git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
              | xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
          }

@@ -295,6 +283,7 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
      - run: go get
      - run: |
          case ${{ matrix.arch }} in
            amd64) echo ARCH=x86_64 ;;
.gitignore (vendored): 3 changes
@@ -11,5 +11,4 @@ ggml-metal.metal
.idea
test_data
*.crt
llm/build
__debug_bin*
llm/build
README.md: 30 changes
@@ -1,5 +1,5 @@
<div align="center">
  <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
  <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>

# Ollama

@@ -51,7 +51,7 @@ Here are some example models that can be downloaded:
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi-3 | 3.8B | 2.3GB | `ollama run phi3` |
| Phi-3 | 3,8B | 2.3GB | `ollama run phi3` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
The image features a yellow smiley face, which is likely the central focus of the picture.
```

### Pass the prompt as an argument
### Pass in prompt as arguments

```
$ ollama run llama3 "Summarize this file: $(cat README.md)"
@@ -284,19 +284,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
- [StreamDeploy](https://github.com/StreamDeploy-DevRel/streamdeploy-llm-app-scaffold) (LLM Application Scaffold)
- [chat](https://github.com/swuecho/chat) (chat web app for teams)
- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
- [chat: chat web app for teams](https://github.com/swuecho/chat)
- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)

### Terminal

@@ -350,11 +348,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Elixir LangChain](https://github.com/brainlid/langchain)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama for R - ollama-r](https://github.com/hauselin/ollama-r)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)

### Mobile

@@ -374,13 +370,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -389,5 +384,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)

### Supported backends

- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
@@ -18,7 +18,6 @@ import (
	"net/url"
	"os"
	"runtime"
	"strconv"
	"strings"

	"github.com/ollama/ollama/format"
@@ -58,36 +57,12 @@ func checkError(resp *http.Response, body []byte) error {
// If the variable is not specified, a default ollama host and port will be
// used.
func ClientFromEnvironment() (*Client, error) {
	ollamaHost, err := GetOllamaHost()
	if err != nil {
		return nil, err
	}

	return &Client{
		base: &url.URL{
			Scheme: ollamaHost.Scheme,
			Host:   net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
		},
		http: http.DefaultClient,
	}, nil
}

type OllamaHost struct {
	Scheme string
	Host   string
	Port   string
}

func GetOllamaHost() (OllamaHost, error) {
	defaultPort := "11434"

	hostVar := os.Getenv("OLLAMA_HOST")
	hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))

	scheme, hostport, ok := strings.Cut(hostVar, "://")
	scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
	switch {
	case !ok:
		scheme, hostport = "http", hostVar
		scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
	case scheme == "http":
		defaultPort = "80"
	case scheme == "https":
@@ -107,14 +82,12 @@ func GetOllamaHost() (OllamaHost, error) {
		}
	}

	if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
		return OllamaHost{}, ErrInvalidHostPort
	}

	return OllamaHost{
		Scheme: scheme,
		Host:   host,
		Port:   port,
	return &Client{
		base: &url.URL{
			Scheme: scheme,
			Host:   net.JoinHostPort(host, port),
		},
		http: http.DefaultClient,
	}, nil
}
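The hunk above introduces `GetOllamaHost` and routes `ClientFromEnvironment` through it. As a rough sketch, not part of this diff, a caller only needs the `api.ClientFromEnvironment` signature shown above; the host value and printed output here are illustrative assumptions.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/ollama/ollama/api"
)

func main() {
	// Illustrative value; OLLAMA_HOST may also be unset, in which case the
	// default host and port (127.0.0.1:11434 per the test table below) apply.
	os.Setenv("OLLAMA_HOST", "example.com:1234")

	// ClientFromEnvironment returns (*api.Client, error) as shown in the hunk.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("client configured:", client != nil)
}
```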
@@ -1,12 +1,6 @@
package api

import (
	"fmt"
	"net"
	"testing"

	"github.com/stretchr/testify/assert"
)
import "testing"

func TestClientFromEnvironment(t *testing.T) {
	type testCase struct {
@@ -46,40 +40,4 @@ func TestClientFromEnvironment(t *testing.T) {
		}
		})
	}

	hostTestCases := map[string]*testCase{
		"empty": {value: "", expect: "127.0.0.1:11434"},
		"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
		"only port": {value: ":1234", expect: ":1234"},
		"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
		"hostname": {value: "example.com", expect: "example.com:11434"},
		"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
		"zero port": {value: ":0", expect: ":0"},
		"too large port": {value: ":66000", err: ErrInvalidHostPort},
		"too small port": {value: ":-1", err: ErrInvalidHostPort},
		"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
		"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
		"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
		"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
		"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
		"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
		"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
		"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
	}

	for k, v := range hostTestCases {
		t.Run(k, func(t *testing.T) {
			t.Setenv("OLLAMA_HOST", v.value)

			oh, err := GetOllamaHost()
			if err != v.err {
				t.Fatalf("expected %s, got %s", v.err, err)
			}

			if err == nil {
				host := net.JoinHostPort(oh.Host, oh.Port)
				assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
			}
		})
	}
}
api/types.go: 18 changes
@@ -309,7 +309,6 @@ func (m *Metrics) Summary() {
}

var ErrInvalidOpts = errors.New("invalid options")
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")

func (opts *Options) FromMap(m map[string]interface{}) error {
	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
@@ -397,10 +396,8 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
func DefaultOptions() Options {
	return Options{
		// options set on request to runner
		NumPredict: -1,

		// set a minimal num_keep to avoid issues on context shifts
		NumKeep: 4,
		NumPredict: -1,
		NumKeep: 0,
		Temperature: 0.8,
		TopK: 40,
		TopP: 0.9,
@@ -436,13 +433,6 @@ type Duration struct {
	time.Duration
}

func (d Duration) MarshalJSON() ([]byte, error) {
	if d.Duration < 0 {
		return []byte("-1"), nil
	}
	return []byte("\"" + d.Duration.String() + "\""), nil
}

func (d *Duration) UnmarshalJSON(b []byte) (err error) {
	var v any
	if err := json.Unmarshal(b, &v); err != nil {
@@ -456,7 +446,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
		if t < 0 {
			d.Duration = time.Duration(math.MaxInt64)
		} else {
			d.Duration = time.Duration(int(t) * int(time.Second))
			d.Duration = time.Duration(t * float64(time.Second))
		}
	case string:
		d.Duration, err = time.ParseDuration(t)
@@ -466,8 +456,6 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
		if d.Duration < 0 {
			d.Duration = time.Duration(math.MaxInt64)
		}
	default:
		return fmt.Errorf("Unsupported type: '%s'", reflect.TypeOf(v))
	}

	return nil
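The `UnmarshalJSON` hunk above switches between an integer-based and a float-based conversion for numeric `keep_alive` values. A minimal sketch, not taken from the diff, of what that means for fractional seconds:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	t := 42.5 // a numeric keep_alive decoded from JSON as a float64

	truncating := time.Duration(int(t) * int(time.Second)) // int(42.5) == 42, so 42s
	fractional := time.Duration(t * float64(time.Second))  // keeps the half second: 42.5s

	fmt.Println(truncating, fractional) // 42s 42.5s
}
```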
@@ -21,11 +21,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
			req: `{ "keep_alive": 42 }`,
			exp: &Duration{42 * time.Second},
		},
		{
			name: "Positive Float",
			req: `{ "keep_alive": 42.5 }`,
			exp: &Duration{42 * time.Second},
		},
		{
			name: "Positive Integer String",
			req: `{ "keep_alive": "42m" }`,
@@ -36,11 +31,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
			req: `{ "keep_alive": -1 }`,
			exp: &Duration{math.MaxInt64},
		},
		{
			name: "Negative Float",
			req: `{ "keep_alive": -3.14 }`,
			exp: &Duration{math.MaxInt64},
		},
		{
			name: "Negative Integer String",
			req: `{ "keep_alive": "-1m" }`,
@@ -58,50 +48,3 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
		})
	}
}

func TestDurationMarshalUnmarshal(t *testing.T) {
	tests := []struct {
		name     string
		input    time.Duration
		expected time.Duration
	}{
		{
			"negative duration",
			time.Duration(-1),
			time.Duration(math.MaxInt64),
		},
		{
			"positive duration",
			time.Duration(42 * time.Second),
			time.Duration(42 * time.Second),
		},
		{
			"another positive duration",
			time.Duration(42 * time.Minute),
			time.Duration(42 * time.Minute),
		},
		{
			"zero duration",
			time.Duration(0),
			time.Duration(0),
		},
		{
			"max duration",
			time.Duration(math.MaxInt64),
			time.Duration(math.MaxInt64),
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			b, err := json.Marshal(Duration{test.input})
			require.NoError(t, err)

			var d Duration
			err = json.Unmarshal(b, &d)
			require.NoError(t, err)

			assert.Equal(t, test.expected, d.Duration, "input %v, marshalled %v, got %v", test.input, string(b), d.Duration)
		})
	}
}
@@ -5,14 +5,12 @@ import (
	"log/slog"
	"os"
	"path/filepath"

	"github.com/ollama/ollama/server/envconfig"
)

func InitLogging() {
	level := slog.LevelInfo

	if envconfig.Debug {
	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
		level = slog.LevelDebug
	}
@@ -43,36 +43,37 @@ func getCLIFullPath(command string) string {
	return command
}

func start(ctx context.Context, command string) (*exec.Cmd, error) {
func SpawnServer(ctx context.Context, command string) (chan int, error) {
	done := make(chan int)

	logDir := filepath.Dir(ServerLogFile)
	_, err := os.Stat(logDir)
	if errors.Is(err, os.ErrNotExist) {
		if err := os.MkdirAll(logDir, 0o755); err != nil {
			return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
		}
	}

	cmd := getCmd(ctx, getCLIFullPath(command))
	// send stdout and stderr to a file
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
		return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
	}
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
		return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
	}
	stdin, err := cmd.StdinPipe()
	if err != nil {
		return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
	}

	// TODO - rotation
	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
	if err != nil {
		return nil, fmt.Errorf("failed to create server log: %w", err)
		return done, fmt.Errorf("failed to create server log %w", err)
	}

	logDir := filepath.Dir(ServerLogFile)
	_, err = os.Stat(logDir)
	if err != nil {
		if !errors.Is(err, os.ErrNotExist) {
			return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
		}

		if err := os.MkdirAll(logDir, 0o755); err != nil {
			return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
		}
	}

	go func() {
		defer logFile.Close()
		io.Copy(logFile, stdout) //nolint:errcheck
@@ -116,33 +117,19 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {

	// run the command and wait for it to finish
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("failed to start server %w", err)
		return done, fmt.Errorf("failed to start server %w", err)
	}
	if cmd.Process != nil {
		slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
	}
	slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))

	return cmd, nil
}

func SpawnServer(ctx context.Context, command string) (chan int, error) {
	done := make(chan int)

	go func() {
		// Keep the server running unless we're shuttind down the app
		crashCount := 0
		for {
			slog.Info("starting server...")
			cmd, err := start(ctx, command)
			if err != nil {
				crashCount++
				slog.Error(fmt.Sprintf("failed to start server %s", err))
				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
				continue
			}

			cmd.Wait() //nolint:errcheck
			stdin.Close()
			var code int
			if cmd.ProcessState != nil {
				code = cmd.ProcessState.ExitCode()
@@ -156,12 +143,15 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
			default:
				crashCount++
				slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
				break
				time.Sleep(500 * time.Millisecond)
				if err := cmd.Start(); err != nil {
					slog.Error(fmt.Sprintf("failed to restart server %s", err))
					// Keep trying, but back off if we keep failing
					time.Sleep(time.Duration(crashCount) * time.Second)
				}
			}
		}
	}()

	return done, nil
}
@@ -31,13 +31,16 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
		"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
	}
	// make the upgrade as quiet as possible (no GUI, no prompts)
	// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
	// TODO - temporarily disable since we're pinning in debug mode for the preview
	// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
	installArgs = append(installArgs,
		"/SP", // Skip the "This will install... Do you wish to continue" prompt
		"/SUPPRESSMSGBOXES",
		"/SILENT",
		"/VERYSILENT",
	)
	// }

	// Safeguard in case we have requests in flight that need to drain...
	slog.Info("Waiting for server to shutdown")
@@ -88,12 +88,16 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
; Assumes v5.7, may need adjustments for v6
#if GetEnv("HIP_PATH") != ""
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
; amdhip64.dll dependency comes from the driver and must be installed already
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
#endif

@@ -129,7 +133,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

;FinishedHeadingLabel=Run your first model
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama2
;ClickFinish=%n

[Registry]
@@ -1,71 +1,71 @@
//go:build windows

package wintray

import (
	"fmt"
	"log/slog"
	"unsafe"

	"golang.org/x/sys/windows"
)

const (
	updatAvailableMenuID = 1
	updateMenuID         = updatAvailableMenuID + 1
	separatorMenuID      = updateMenuID + 1
	diagLogsMenuID       = separatorMenuID + 1
	diagSeparatorMenuID  = diagLogsMenuID + 1
	quitMenuID           = diagSeparatorMenuID + 1
)

func (t *winTray) initMenus() error {
	if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
		return fmt.Errorf("unable to create menu entries %w\n", err)
	}
	if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
		return fmt.Errorf("unable to create menu entries %w", err)
	}
	if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
		return fmt.Errorf("unable to create menu entries %w\n", err)
	}
	return nil
}

func (t *winTray) UpdateAvailable(ver string) error {
	if !t.updateNotified {
		slog.Debug("updating menu and sending notification for new update")
		if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
			return fmt.Errorf("unable to create menu entries %w", err)
		}
		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
			return fmt.Errorf("unable to create menu entries %w", err)
		}
		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
			return fmt.Errorf("unable to create menu entries %w", err)
		}
		iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
		if err != nil {
			return fmt.Errorf("unable to write icon data to temp file: %w", err)
		}
		if err := wt.setIcon(iconFilePath); err != nil {
			return fmt.Errorf("unable to set icon: %w", err)
		}
		t.updateNotified = true

		t.pendingUpdate = true
		// Now pop up the notification
		t.muNID.Lock()
		defer t.muNID.Unlock()
		copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
		copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
		t.nid.Flags |= NIF_INFO
		t.nid.Timeout = 10
		t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
		err = t.nid.modify()
		if err != nil {
			return err
		}
	}
	return nil
}
auth/auth.go: 36 changes
@@ -10,44 +10,12 @@ import (
	"log/slog"
	"os"
	"path/filepath"
	"strings"

	"golang.org/x/crypto/ssh"
)

const defaultPrivateKey = "id_ed25519"

func keyPath() (string, error) {
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	return filepath.Join(home, ".ollama", defaultPrivateKey), nil
}

func GetPublicKey() (string, error) {
	keyPath, err := keyPath()
	if err != nil {
		return "", err
	}

	privateKeyFile, err := os.ReadFile(keyPath)
	if err != nil {
		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
		return "", err
	}

	privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
	if err != nil {
		return "", err
	}

	publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())

	return strings.TrimSpace(string(publicKey)), nil
}

func NewNonce(r io.Reader, length int) (string, error) {
	nonce := make([]byte, length)
	if _, err := io.ReadFull(r, nonce); err != nil {
@@ -58,11 +26,13 @@ func NewNonce(r io.Reader, length int) (string, error) {
}

func Sign(ctx context.Context, bts []byte) (string, error) {
	keyPath, err := keyPath()
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)

	privateKeyFile, err := os.ReadFile(keyPath)
	if err != nil {
		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
cmd/cmd.go: 296 changes
@@ -17,7 +17,6 @@ import (
	"os"
	"os/signal"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
	"syscall"
@@ -32,12 +31,10 @@ import (
	"golang.org/x/term"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/auth"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/parser"
	"github.com/ollama/ollama/progress"
	"github.com/ollama/ollama/server"
	"github.com/ollama/ollama/types/errtypes"
	"github.com/ollama/ollama/types/model"
	"github.com/ollama/ollama/version"
)

@@ -56,13 +53,14 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	p := progress.NewProgress(os.Stderr)
	defer p.Stop()

	f, err := os.Open(filename)
	bars := make(map[string]*progress.Bar)

	modelfile, err := os.ReadFile(filename)
	if err != nil {
		return err
	}
	defer f.Close()

	modelfile, err := model.ParseFile(f)
	commands, err := parser.Parse(bytes.NewReader(modelfile))
	if err != nil {
		return err
	}
@@ -76,10 +74,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	spinner := progress.NewSpinner(status)
	p.Add(status, spinner)

	for i := range modelfile.Commands {
		switch modelfile.Commands[i].Name {
	for _, c := range commands {
		switch c.Name {
		case "model", "adapter":
			path := modelfile.Commands[i].Args
			path := c.Args
			if path == "~" {
				path = home
			} else if strings.HasPrefix(path, "~/") {
@@ -91,22 +89,101 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
			}

			fi, err := os.Stat(path)
			if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
			if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
				continue
			} else if err != nil {
				return err
			}

			// TODO make this work w/ adapters
			if fi.IsDir() {
				// this is likely a safetensors or pytorch directory
				// TODO make this work w/ adapters
				tempfile, err := tempZipFiles(path)
				tf, err := os.CreateTemp("", "ollama-tf")
				if err != nil {
					return err
				}
				defer os.RemoveAll(tempfile)
				defer os.RemoveAll(tf.Name())

				path = tempfile
				zf := zip.NewWriter(tf)

				files := []string{}

				tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
				if err != nil {
					return err
				} else if len(tfiles) == 0 {
					tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
					if err != nil {
						return err
					}
				}

				files = append(files, tfiles...)

				if len(files) == 0 {
					return fmt.Errorf("no models were found in '%s'", path)
				}

				// add the safetensor/torch config file + tokenizer
				files = append(files, filepath.Join(path, "config.json"))
				files = append(files, filepath.Join(path, "params.json"))
				files = append(files, filepath.Join(path, "added_tokens.json"))
				files = append(files, filepath.Join(path, "tokenizer.model"))

				for _, fn := range files {
					f, err := os.Open(fn)

					// just skip whatever files aren't there
					if os.IsNotExist(err) {
						if strings.HasSuffix(fn, "tokenizer.model") {
							// try the parent dir before giving up
							parentDir := filepath.Dir(path)
							newFn := filepath.Join(parentDir, "tokenizer.model")
							f, err = os.Open(newFn)
							if os.IsNotExist(err) {
								continue
							} else if err != nil {
								return err
							}
						} else {
							continue
						}
					} else if err != nil {
						return err
					}

					fi, err := f.Stat()
					if err != nil {
						return err
					}

					h, err := zip.FileInfoHeader(fi)
					if err != nil {
						return err
					}

					h.Name = filepath.Base(fn)
					h.Method = zip.Store

					w, err := zf.CreateHeader(h)
					if err != nil {
						return err
					}

					_, err = io.Copy(w, f)
					if err != nil {
						return err
					}

				}

				if err := zf.Close(); err != nil {
					return err
				}

				if err := tf.Close(); err != nil {
					return err
				}
				path = tf.Name()
			}

			digest, err := createBlob(cmd, client, path)
@@ -114,11 +191,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
				return err
			}

			modelfile.Commands[i].Args = "@" + digest
			modelfile = bytes.ReplaceAll(modelfile, []byte(c.Args), []byte("@"+digest))
		}
	}

	bars := make(map[string]*progress.Bar)
	fn := func(resp api.ProgressResponse) error {
		if resp.Digest != "" {
			spinner.Stop()
@@ -144,7 +220,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {

	quantization, _ := cmd.Flags().GetString("quantization")

	request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
	if err := client.Create(cmd.Context(), &request, fn); err != nil {
		return err
	}
@@ -152,114 +228,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	return nil
}
func tempZipFiles(path string) (string, error) {
	tempfile, err := os.CreateTemp("", "ollama-tf")
	if err != nil {
		return "", err
	}
	defer tempfile.Close()

	zipfile := zip.NewWriter(tempfile)
	defer zipfile.Close()

	detectContentType := func(path string) (string, error) {
		f, err := os.Open(path)
		if err != nil {
			return "", err
		}
		defer f.Close()

		var b bytes.Buffer
		b.Grow(512)

		if _, err := io.CopyN(&b, f, 512); err != nil && !errors.Is(err, io.EOF) {
			return "", err
		}

		contentType, _, _ := strings.Cut(http.DetectContentType(b.Bytes()), ";")
		return contentType, nil
	}

	glob := func(pattern, contentType string) ([]string, error) {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			return nil, err
		}

		for _, safetensor := range matches {
			if ct, err := detectContentType(safetensor); err != nil {
				return nil, err
			} else if ct != contentType {
				return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor)
			}
		}

		return matches, nil
	}

	var files []string
	if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 {
		// safetensors files might be unresolved git lfs references; skip if they are
		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
		files = append(files, st...)
	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
		// pytorch files might also be unresolved git lfs references; skip if they are
		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
		files = append(files, pt...)
	} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
		// pytorch files might also be unresolved git lfs references; skip if they are
		// covers consolidated.x.pth, consolidated.pth
		files = append(files, pt...)
	} else {
		return "", errors.New("no safetensors or torch files found")
	}

	// add configuration files, json files are detected as text/plain
	js, err := glob(filepath.Join(path, "*.json"), "text/plain")
	if err != nil {
		return "", err
	}
	files = append(files, js...)

	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
		// tokenizer.model might be a unresolved git lfs reference; error if it is
		files = append(files, tks...)
	} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
		// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
		files = append(files, tks...)
	}

	for _, file := range files {
		f, err := os.Open(file)
		if err != nil {
			return "", err
		}
		defer f.Close()

		fi, err := f.Stat()
		if err != nil {
			return "", err
		}

		zfi, err := zip.FileInfoHeader(fi)
		if err != nil {
			return "", err
		}

		zf, err := zipfile.CreateHeader(zfi)
		if err != nil {
			return "", err
		}

		if _, err := io.Copy(zf, f); err != nil {
			return "", err
		}
	}

	return tempfile.Name(), nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
	bin, err := os.Open(path)
	if err != nil {
@@ -354,47 +322,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
	return generateInteractive(cmd, opts)
}

func errFromUnknownKey(unknownKeyErr error) error {
	// find SSH public key in the error message
	sshKeyPattern := `ssh-\w+ [^\s"]+`
	re := regexp.MustCompile(sshKeyPattern)
	matches := re.FindStringSubmatch(unknownKeyErr.Error())

	if len(matches) > 0 {
		serverPubKey := matches[0]

		localPubKey, err := auth.GetPublicKey()
		if err != nil {
			return unknownKeyErr
		}

		if runtime.GOOS == "linux" && serverPubKey != localPubKey {
			// try the ollama service public key
			svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
			if err != nil {
				return unknownKeyErr
			}
			localPubKey = strings.TrimSpace(string(svcPubKey))
		}

		// check if the returned public key matches the local public key, this prevents adding a remote key to the user's account
		if serverPubKey != localPubKey {
			return unknownKeyErr
		}

		var msg strings.Builder
		msg.WriteString(unknownKeyErr.Error())
		msg.WriteString("\n\nYour ollama key is:\n")
		msg.WriteString(localPubKey)
		msg.WriteString("\nAdd your key at:\n")
		msg.WriteString("https://ollama.com/settings/keys")

		return errors.New(msg.String())
	}

	return unknownKeyErr
}

func PushHandler(cmd *cobra.Command, args []string) error {
	client, err := api.ClientFromEnvironment()
	if err != nil {
@@ -442,20 +369,6 @@ func PushHandler(cmd *cobra.Command, args []string) error {

	request := api.PushRequest{Name: args[0], Insecure: insecure}
	if err := client.Push(cmd.Context(), &request, fn); err != nil {
		if spinner != nil {
			spinner.Stop()
		}
		if strings.Contains(err.Error(), "access denied") {
			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
		}
		host := model.ParseName(args[0]).Host
		isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
		if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
			// the user has not added their ollama key to ollama.com
			// re-throw an error with a more user-friendly message
			return errFromUnknownKey(err)
		}

		return err
	}

@@ -883,27 +796,24 @@ func generate(cmd *cobra.Command, opts runOptions) error {
}

func RunServer(cmd *cobra.Command, _ []string) error {
	// retrieve the OLLAMA_HOST environment variable
	ollamaHost, err := api.GetOllamaHost()
	host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
	if err != nil {
		return err
		host, port = "127.0.0.1", "11434"
		if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
			host = ip.String()
		}
	}

	if err := initializeKeypair(); err != nil {
		return err
	}

	ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
	ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
	if err != nil {
		return err
	}

	err = server.Serve(ln)
	if errors.Is(err, http.ErrServerClosed) {
		return nil
	}

	return err
	return server.Serve(ln)
}

func initializeKeypair() error {
@@ -1124,7 +1034,7 @@ Environment Variables:
		RunE: ListHandler,
	}
	copyCmd := &cobra.Command{
		Use: "cp SOURCE DESTINATION",
		Use: "cp SOURCE TARGET",
		Short: "Copy a model",
		Args: cobra.ExactArgs(2),
		PreRunE: checkServerHeartbeat,

@@ -94,7 +94,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
	fmt.Fprintln(os.Stderr, " /show Show model information")
	fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
	fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
	fmt.Fprintln(os.Stderr, " /clear Clear session context")
	fmt.Fprintln(os.Stderr, " /bye Exit")
	fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
	fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
@@ -281,10 +280,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
			}
			fmt.Printf("Created new model '%s'\n", args[1])
			continue
		case strings.HasPrefix(line, "/clear"):
			opts.Messages = []api.Message{}
			fmt.Println("Cleared session context")
			continue
		case strings.HasPrefix(line, "/set"):
			args := strings.Fields(line)
			if len(args) > 1 {

@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
		var err error
		t, offset, err = m.readTensors(f, offset, params)
		if err != nil {
			slog.Error(err.Error())
			slog.Error("%v", err)
			return nil, err
		}
		tensors = append(tensors, t...)
@@ -122,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)

	ggufName, err := m.GetLayerName(k)
	if err != nil {
		slog.Error(err.Error())
		slog.Error("%v", err)
		return nil, 0, err
	}

@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,

	ggufName, err := tf.GetLayerName(k.(string))
	if err != nil {
		slog.Error(err.Error())
		slog.Error("%v", err)
		return nil, err
	}
	slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
docs/api.md: 60 changes
@@ -17,7 +17,7 @@

### Model names

Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

### Durations

@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "model": "llama2",
  "prompt": "Why is the sky blue?"
}'
```
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "response": "The",
  "done": false
@@ -95,11 +95,11 @@ The final response in the stream also includes additional data about the generat
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
- `response`: empty if the response was streamed, if not streamed, this will contain the full response

To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`.
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
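The two wordings above differ on the `* 10^9` factor; `eval_duration` is reported in nanoseconds (compare the nanosecond-scale `total_duration` values elsewhere in this diff), so converting to tokens per second needs that factor. A small worked sketch with made-up numbers, not taken from the diff:

```go
package main

import "fmt"

func main() {
	// Hypothetical values for illustration only.
	evalCount := 290              // tokens generated
	evalDuration := 4_709_213_000 // nanoseconds spent generating them

	tokensPerSecond := float64(evalCount) / float64(evalDuration) * 1e9
	fmt.Printf("%.1f token/s\n", tokensPerSecond) // ~61.6 token/s
}
```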
```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "",
  "done": true,
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "model": "llama2",
  "prompt": "Why is the sky blue?",
  "stream": false
}'
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "model": "llama2",
  "prompt": "What color is the sky at different times of the day? Respond using JSON",
  "format": "json",
  "stream": false
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-11-09T21:07:55.186497Z",
  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
  "done": true,
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "model": "llama2",
  "prompt": "Why is the sky blue?",
  "stream": false,
  "options": {
@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3"
  "model": "llama2"
}'
```

@@ -364,7 +364,7 @@ A single JSON object is returned:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-12-18T19:52:07.071755Z",
  "response": "",
  "done": true
@@ -407,7 +407,7 @@ Send a chat message with a streaming response.

```shell
curl http://localhost:11434/api/chat -d '{
  "model": "llama3",
  "model": "llama2",
  "messages": [
    {
      "role": "user",
@@ -423,7 +423,7 @@ A stream of JSON objects is returned:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -438,7 +438,7 @@ Final response:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 4883583458,
@@ -456,7 +456,7 @@ Final response:

```shell
curl http://localhost:11434/api/chat -d '{
  "model": "llama3",
  "model": "llama2",
  "messages": [
    {
      "role": "user",
@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{

```json
{
  "model": "registry.ollama.ai/library/llama3:latest",
  "model": "registry.ollama.ai/library/llama2:latest",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach

```shell
curl http://localhost:11434/api/chat -d '{
  "model": "llama3",
  "model": "llama2",
  "messages": [
    {
      "role": "user",
@@ -519,7 +519,7 @@ A stream of JSON objects is returned:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -533,7 +533,7 @@ Final response:

```json
{
  "model": "llama3",
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 8113331500,
@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{

```shell
curl http://localhost:11434/api/chat -d '{
  "model": "llama3",
  "model": "llama2",
  "messages": [
    {
      "role": "user",
@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{

```json
{
  "model": "registry.ollama.ai/library/llama3:latest",
  "model": "registry.ollama.ai/library/llama2:latest",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`.
```shell
curl http://localhost:11434/api/create -d '{
  "name": "mario",
  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
  "modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
}'
```

@@ -758,7 +758,7 @@ A single JSON object will be returned.
      }
    },
    {
      "name": "llama3:latest",
      "name": "llama2:latest",
      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
      "size": 3825819519,
      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter

```shell
curl http://localhost:11434/api/show -d '{
  "name": "llama3"
  "name": "llama2"
}'
```

@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.

```shell
curl http://localhost:11434/api/copy -d '{
  "source": "llama3",
  "destination": "llama3-backup"
  "source": "llama2",
  "destination": "llama2-backup"
}'
```

@@ -854,7 +854,7 @@ Delete a model and its data.

```shell
curl -X DELETE http://localhost:11434/api/delete -d '{
  "name": "llama3:13b"
  "name": "llama2:13b"
}'
```

@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

```shell
curl http://localhost:11434/api/pull -d '{
  "name": "llama3"
  "name": "llama2"
}'
```
@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")

Then generate dependencies:

@@ -142,4 +142,4 @@ In addition to the common Windows development tools described above, install AMD
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)

Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
18
docs/faq.md
18
docs/faq.md
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:

```
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096

@@ -88,9 +88,9 @@ On windows, Ollama inherits your user and system environment variables.

3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.

4. Click OK/Apply to save
4. Click OK/Apply to save

5. Run `ollama` from a new terminal window
5. Run `ollama` from a new terminal window

## How can I expose Ollama on my network?

@@ -140,7 +140,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

- macOS: `~/.ollama/models`
- Linux: `/usr/share/ollama/.ollama/models`
- Windows: `C:\Users\%username%\.ollama\models`
- Windows: `C:\Users\<username>\.ollama\models`

### How do I set them to a different location?

@@ -221,20 +221,14 @@ The `keep_alive` parameter can be set to:

For example, to preload a model and leave it in memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
```

To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
```

Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.

If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.

## How do I manage the maximum number of requests the server can queue

If too many requests are sent to the server, it will respond with a 503 error
indicating the server is overloaded. You can adjust how many requests may be
queue by setting `OLLAMA_MAX_QUEUE`
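
A sketch of what that can look like when starting the server by hand (the specific values below are arbitrary examples, not documented defaults):

```shell
# Queue at most 512 requests and keep loaded models in memory for 24 hours
OLLAMA_MAX_QUEUE=512 OLLAMA_KEEP_ALIVE=24h ollama serve
```
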
@@ -125,7 +125,7 @@ Publishing models is in early alpha. If you'd like to publish your model to shar

1. Create [an account](https://ollama.com/signup)
2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
- macOS: `cat ~/.ollama/id_ed25519.pub`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)

@@ -136,8 +136,6 @@ Next, copy your model to your username's namespace:
ollama cp example <your username>/example
```

> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.

Then push the model:

```

@@ -105,7 +105,7 @@ sudo chmod +x /usr/bin/ollama
To view logs of Ollama running as a startup service, run:

```bash
journalctl -e -u ollama
journalctl -u ollama
```

## Uninstall

@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples)
- [Instructions](#instructions)
- [FROM (Required)](#from-required)
- [Build from llama3](#build-from-llama3)
- [Build from llama2](#build-from-llama2)
- [Build from a bin file](#build-from-a-bin-file)
- [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values)

@@ -48,7 +48,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint:

```modelfile
FROM llama3
FROM llama2
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token

@@ -67,25 +67,33 @@ To use this:

More examples are available in the [examples directory](../examples).

To view the Modelfile of a given model, use the `ollama show --modelfile` command.
### `Modelfile`s in [ollama.com/library][1]

There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:

- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:

```bash
> ollama show --modelfile llama3
> ollama show --modelfile llama2:13b
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM llama3:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
# FROM llama2:13b

{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
FROM /root/.ollama/models/blobs/sha256:123abc
TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>

{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|reserved_special_token"
{{ end }}{{ .Prompt }} [/INST] """
SYSTEM """"""
PARAMETER stop [INST]
PARAMETER stop [/INST]
PARAMETER stop <<SYS>>
PARAMETER stop <</SYS>>
```

## Instructions

@@ -98,10 +106,10 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag>
```

#### Build from llama3
#### Build from llama2

```modelfile
FROM llama3
FROM llama2
```

A list of available base models:

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test',
}
],
model='llama3',
model='llama2',
)
```

@@ -43,7 +43,7 @@ const openai = new OpenAI({

const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3',
model: 'llama2',
})
```

@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "system",

@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
Before using a model, pull it locally `ollama pull`:

```shell
ollama pull llama3
ollama pull llama2
```

### Default model names

@@ -121,7 +121,7 @@ ollama pull llama3
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:

```
ollama cp llama3 gpt-3.5-turbo
ollama cp llama2 gpt-3.5-turbo
```

Afterwards, this new model name can be specified the `model` field:

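For instance, a request sketch that reuses the `/v1/chat/completions` call shown earlier with the copied name:

```shell
curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "gpt-3.5-turbo",
        "messages": [{ "role": "user", "content": "Say this is a test" }]
    }'
```
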
@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama";

const ollama = new Ollama({
baseUrl: "http://localhost:11434",
model: "llama3",
model: "llama2",
});

const answer = await ollama.invoke(`why is the sky blue?`);

@@ -23,10 +23,10 @@ const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer);
```

That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.

```bash
npm install cheerio
npm install cheerio
```

```javascript

@@ -17,12 +17,10 @@ Let's start by asking a simple question that we can get an answer to from the **
Then we can create a model and ask the question:

```python
from langchain_community.llms import Ollama
ollama = Ollama(
base_url='http://localhost:11434',
model="llama3"
)
print(ollama.invoke("why is the sky blue"))
from langchain.llms import Ollama
ollama = Ollama(base_url='http://localhost:11434',
model="llama2")
print(ollama("why is the sky blue"))
```

Notice that we are defining the model and the base URL for Ollama.

108
docs/windows.md
@@ -1,61 +1,47 @@

# Ollama Windows Preview

Welcome to the Ollama Windows preview.

No more WSL required!

Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.

As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in diagnosing the problem (see
[Troubleshooting](#troubleshooting) below)

## System Requirements

* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card

## API Access

Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```

## Troubleshooting

While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.

Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories


## Standalone CLI

The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
installer. It installs in your account without requiring Administrator rights.
We update Ollama regularly to support the latest models, and this installer will
help you keep up to date.

If you'd like to install or integrate Ollama as a service, a standalone
`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
and GPU library dependencies for Nvidia and AMD. This allows for embedding
Ollama in existing applications, or running it as a system service via `ollama
serve` with tools such as [NSSM](https://nssm.cc/).
# Ollama Windows Preview

Welcome to the Ollama Windows preview.

No more WSL required!

Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.

As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in dianosing the problem (see
[Troubleshooting](#troubleshooting) below)

## System Requirements

* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card

## API Access

Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```

## Troubleshooting

While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.

Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories

@@ -2,7 +2,7 @@

When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:

`ollama run llama3 < sourcequestions.txt`
`ollama run llama2 < sourcequestions.txt`

This concept is used in the following example.

1
examples/flyio/.gitignore
vendored
@@ -1 +0,0 @@
fly.toml
@@ -1,67 +0,0 @@
|
||||
# Deploy Ollama to Fly.io
|
||||
|
||||
> Note: this example exposes a public endpoint and does not configure authentication. Use with care.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Ollama: https://ollama.com/download
|
||||
- Fly.io account. Sign up for a free account: https://fly.io/app/sign-up
|
||||
|
||||
## Steps
|
||||
|
||||
1. Login to Fly.io
|
||||
|
||||
```bash
|
||||
fly auth login
|
||||
```
|
||||
|
||||
1. Create a new Fly app
|
||||
|
||||
```bash
|
||||
fly launch --name <name> --image ollama/ollama --internal-port 11434 --vm-size shared-cpu-8x --now
|
||||
```
|
||||
|
||||
1. Pull and run `orca-mini:3b`
|
||||
|
||||
```bash
|
||||
OLLAMA_HOST=https://<name>.fly.dev ollama run orca-mini:3b
|
||||
```
|
||||
|
||||
`shared-cpu-8x` is a free-tier eligible machine type. For better performance, switch to a `performance` or `dedicated` machine type or attach a GPU for hardware acceleration (see below).
|
||||
|
||||
## (Optional) Persistent Volume
|
||||
|
||||
By default Fly Machines use ephemeral storage which is problematic if you want to use the same model across restarts without pulling it again. Create and attach a persistent volume to store the downloaded models:
|
||||
|
||||
1. Create the Fly Volume
|
||||
|
||||
```bash
|
||||
fly volume create ollama
|
||||
```
|
||||
|
||||
1. Update `fly.toml` and add `[mounts]`
|
||||
|
||||
```toml
|
||||
[mounts]
|
||||
source = "ollama"
|
||||
destination = "/mnt/ollama/models"
|
||||
```
|
||||
|
||||
1. Update `fly.toml` and add `[env]`
|
||||
|
||||
```toml
|
||||
[env]
|
||||
OLLAMA_MODELS = "/mnt/ollama/models"
|
||||
```
|
||||
|
||||
1. Deploy your app
|
||||
|
||||
```bash
|
||||
fly deploy
|
||||
```
|
||||
|
||||
## (Optional) Hardware Acceleration
|
||||
|
||||
Fly.io GPU is currently in waitlist. Sign up for the waitlist: https://fly.io/gpu
|
||||
|
||||
Once you've been accepted, create the app with the additional flags `--vm-gpu-kind a100-pcie-40gb` or `--vm-gpu-kind a100-pcie-80gb`.
|
||||
@@ -35,7 +35,7 @@ func main() {
|
||||
|
||||
ctx := context.Background()
|
||||
req := &api.ChatRequest{
|
||||
Model: "llama3",
|
||||
Model: "llama2",
|
||||
Messages: messages,
|
||||
}
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ func main() {
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
|
||||
responseData, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
|
||||
@@ -7,24 +7,12 @@
|
||||
|
||||
## Steps
|
||||
|
||||
1. Create the Ollama namespace, deployment, and service
|
||||
1. Create the Ollama namespace, daemon set, and service
|
||||
|
||||
```bash
|
||||
kubectl apply -f cpu.yaml
|
||||
```
|
||||
|
||||
## (Optional) Hardware Acceleration
|
||||
|
||||
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin) which is deployed in Kubernetes in form of daemonset. Follow the link for more details.
|
||||
|
||||
Once configured, create a GPU enabled Ollama deployment.
|
||||
|
||||
```bash
|
||||
kubectl apply -f gpu.yaml
|
||||
```
|
||||
|
||||
## Test
|
||||
|
||||
1. Port forward the Ollama service to connect and use it locally
|
||||
|
||||
```bash
|
||||
@@ -35,4 +23,14 @@ kubectl apply -f gpu.yaml
|
||||
|
||||
```bash
|
||||
ollama run orca-mini:3b
|
||||
```
|
||||
```
|
||||
|
||||
## (Optional) Hardware Acceleration
|
||||
|
||||
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.
|
||||
|
||||
Once configured, create a GPU enabled Ollama deployment.
|
||||
|
||||
```bash
|
||||
kubectl apply -f gpu.yaml
|
||||
```
|
||||
|
||||
@@ -40,9 +40,9 @@ while True:
|
||||
continue
|
||||
|
||||
# Prompt
|
||||
template = """Use the following pieces of context to answer the question at the end.
|
||||
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
||||
Use three sentences maximum and keep the answer as concise as possible.
|
||||
template = """Use the following pieces of context to answer the question at the end.
|
||||
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
||||
Use three sentences maximum and keep the answer as concise as possible.
|
||||
{context}
|
||||
Question: {question}
|
||||
Helpful Answer:"""
|
||||
@@ -51,11 +51,11 @@ while True:
|
||||
template=template,
|
||||
)
|
||||
|
||||
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
qa_chain = RetrievalQA.from_chain_type(
|
||||
llm,
|
||||
retriever=vectorstore.as_retriever(),
|
||||
chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
|
||||
)
|
||||
|
||||
result = qa_chain({"query": query})
|
||||
result = qa_chain({"query": query})
|
||||
@@ -1,12 +1,12 @@
|
||||
from langchain_community.llms import Ollama
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
from langchain.llms import Ollama
|
||||
from langchain.document_loaders import WebBaseLoader
|
||||
from langchain.chains.summarize import load_summarize_chain
|
||||
|
||||
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
|
||||
docs = loader.load()
|
||||
|
||||
llm = Ollama(model="llama3")
|
||||
llm = Ollama(model="llama2")
|
||||
chain = load_summarize_chain(llm, chain_type="stuff")
|
||||
|
||||
result = chain.invoke(docs)
|
||||
result = chain.run(docs)
|
||||
print(result)
|
||||
|
||||
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama2
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
@@ -21,3 +21,4 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from langchain.llms import Ollama
|
||||
|
||||
input = input("What is your question?")
|
||||
llm = Ollama(model="llama3")
|
||||
llm = Ollama(model="llama2")
|
||||
res = llm.predict(input)
|
||||
print (res)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM llama3
|
||||
FROM llama2
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from super mario bros, acting as an assistant.
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
# Example character: Mario
|
||||
|
||||
This example shows how to create a basic character using Llama3 as the base model.
|
||||
This example shows how to create a basic character using Llama2 as the base model.
|
||||
|
||||
To run this example:
|
||||
|
||||
1. Download the Modelfile
|
||||
2. `ollama pull llama3` to get the base model used in the model file.
|
||||
2. `ollama pull llama2` to get the base model used in the model file.
|
||||
3. `ollama create NAME -f ./Modelfile`
|
||||
4. `ollama run NAME`
|
||||
|
||||
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
|
||||
What the model file looks like:
|
||||
|
||||
```
|
||||
FROM llama3
|
||||
FROM llama2
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from Super Mario Bros, acting as an assistant.
|
||||
|
||||
@@ -2,16 +2,16 @@ import requests
|
||||
import json
|
||||
import random
|
||||
|
||||
model = "llama3"
|
||||
model = "llama2"
|
||||
template = {
|
||||
"firstName": "",
|
||||
"lastName": "",
|
||||
"firstName": "",
|
||||
"lastName": "",
|
||||
"address": {
|
||||
"street": "",
|
||||
"city": "",
|
||||
"state": "",
|
||||
"street": "",
|
||||
"city": "",
|
||||
"state": "",
|
||||
"zipCode": ""
|
||||
},
|
||||
},
|
||||
"phoneNumber": ""
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ countries = [
|
||||
"France",
|
||||
]
|
||||
country = random.choice(countries)
|
||||
model = "llama3"
|
||||
model = "llama2"
|
||||
|
||||
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
|
||||
|
||||
|
||||
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama2
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import requests
|
||||
|
||||
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
|
||||
model = "llama3" # TODO: update this for whatever model you wish to use
|
||||
model = "llama2" # TODO: update this for whatever model you wish to use
|
||||
|
||||
|
||||
def chat(messages):
|
||||
|
||||
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama2
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
|
||||
|
||||
## Usage
|
||||
|
||||
1. Add llama3 to have the mentors ask your questions:
|
||||
1. Add llama2 to have the mentors ask your questions:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama2
|
||||
```
|
||||
|
||||
2. Install prerequisites:
|
||||
|
||||
@@ -15,7 +15,7 @@ async function characterGenerator() {
|
||||
ollama.setModel("stablebeluga2:70b-q4_K_M");
|
||||
const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);
|
||||
|
||||
const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
|
||||
const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
|
||||
|
||||
fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
|
||||
if (err) throw err;
|
||||
@@ -23,4 +23,4 @@ async function characterGenerator() {
|
||||
});
|
||||
}
|
||||
|
||||
characterGenerator();
|
||||
characterGenerator();
|
||||
@@ -1,6 +1,6 @@
|
||||
import * as readline from "readline";
|
||||
|
||||
const model = "llama3";
|
||||
const model = "llama2";
|
||||
type Message = {
|
||||
role: "assistant" | "user" | "system";
|
||||
content: string;
|
||||
@@ -74,4 +74,4 @@ async function main() {
|
||||
|
||||
}
|
||||
|
||||
main();
|
||||
main();
|
||||
@@ -81,10 +81,8 @@ func commonAMDValidateLibDir() (string, error) {
|
||||
}
|
||||
|
||||
// Well known location(s)
|
||||
for _, path := range RocmStandardLocations {
|
||||
if rocmLibUsable(path) {
|
||||
return path, nil
|
||||
}
|
||||
if rocmLibUsable(RocmStandardLocation) {
|
||||
return RocmStandardLocation, nil
|
||||
}
|
||||
|
||||
// Installer payload location if we're running the installed binary
|
||||
|
||||
@@ -25,12 +25,12 @@ const (
|
||||
// Prefix with the node dir
|
||||
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
||||
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
||||
RocmStandardLocation = "/opt/rocm/lib"
|
||||
)
|
||||
|
||||
var (
|
||||
// Used to validate if the given ROCm lib is usable
|
||||
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
|
||||
RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
|
||||
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
|
||||
)
|
||||
|
||||
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
|
||||
|
||||
// TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
|
||||
iGPUName = "AMD Radeon(TM) Graphics"
|
||||
@@ -21,8 +22,7 @@ const (
|
||||
|
||||
var (
|
||||
// Used to validate if the given ROCm lib is usable
|
||||
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
|
||||
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
|
||||
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
|
||||
)
|
||||
|
||||
func AMDGetGPUInfo() []GpuInfo {
|
||||
|
||||
@@ -12,8 +12,6 @@ import (
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/server/envconfig"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -26,8 +24,29 @@ func PayloadsDir() (string, error) {
|
||||
defer lock.Unlock()
|
||||
var err error
|
||||
if payloadsDir == "" {
|
||||
runnersDir := envconfig.RunnersDir
|
||||
|
||||
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
|
||||
// On Windows we do not carry the payloads inside the main executable
|
||||
if runtime.GOOS == "windows" && runnersDir == "" {
|
||||
appExe, err := os.Executable()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup executable path", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, d := range []string{".", "windows-" + runtime.GOARCH, "dist\\windows-" + runtime.GOARCH} {
|
||||
candidate := filepath.Join(filepath.Dir(appExe), d, "ollama_runners")
|
||||
_, err := os.Stat(candidate)
|
||||
if err == nil {
|
||||
runnersDir = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
if runnersDir == "" {
|
||||
err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
|
||||
slog.Error("incomplete distribution", "error", err)
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
if runnersDir != "" {
|
||||
payloadsDir = runnersDir
|
||||
return payloadsDir, nil
|
||||
@@ -35,7 +54,7 @@ func PayloadsDir() (string, error) {
|
||||
|
||||
// The remainder only applies on non-windows where we still carry payloads in the main executable
|
||||
cleanupTmpDirs()
|
||||
tmpDir := envconfig.TmpDir
|
||||
tmpDir := os.Getenv("OLLAMA_TMPDIR")
|
||||
if tmpDir == "" {
|
||||
tmpDir, err = os.MkdirTemp("", "ollama")
|
||||
if err != nil {
|
||||
@@ -98,7 +117,7 @@ func cleanupTmpDirs() {
|
||||
func Cleanup() {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
runnersDir := envconfig.RunnersDir
|
||||
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
|
||||
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
|
||||
// We want to fully clean up the tmpdir parent of the payloads dir
|
||||
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
|
||||
|
||||
85
gpu/gpu.go
@@ -21,18 +21,16 @@ import (
|
||||
"unsafe"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/server/envconfig"
|
||||
)
|
||||
|
||||
type handles struct {
|
||||
deviceCount int
|
||||
cudart *C.cudart_handle_t
|
||||
nvcuda *C.nvcuda_handle_t
|
||||
}
|
||||
|
||||
const (
|
||||
cudaMinimumMemory = 256 * format.MebiByte
|
||||
rocmMinimumMemory = 256 * format.MebiByte
|
||||
cudaMinimumMemory = 457 * format.MebiByte
|
||||
rocmMinimumMemory = 457 * format.MebiByte
|
||||
)
|
||||
|
||||
var gpuMutex sync.Mutex
|
||||
@@ -64,22 +62,6 @@ var CudartWindowsGlobs = []string{
|
||||
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
||||
}
|
||||
|
||||
var NvcudaLinuxGlobs = []string{
|
||||
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
|
||||
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
|
||||
"/usr/lib/*-linux-gnu/libcuda.so*",
|
||||
"/usr/lib/wsl/lib/libcuda.so*",
|
||||
"/usr/lib/wsl/drivers/*/libcuda.so*",
|
||||
"/opt/cuda/lib*/libcuda.so*",
|
||||
"/usr/local/cuda/lib*/libcuda.so*",
|
||||
"/usr/lib*/libcuda.so*",
|
||||
"/usr/local/lib*/libcuda.so*",
|
||||
}
|
||||
|
||||
var NvcudaWindowsGlobs = []string{
|
||||
"c:\\windows\\system*\\nvcuda.dll",
|
||||
}
|
||||
|
||||
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||
@@ -92,8 +74,6 @@ func initGPUHandles() *handles {
|
||||
gpuHandles := &handles{}
|
||||
var cudartMgmtName string
|
||||
var cudartMgmtPatterns []string
|
||||
var nvcudaMgmtName string
|
||||
var nvcudaMgmtPatterns []string
|
||||
|
||||
tmpDir, _ := PayloadsDir()
|
||||
switch runtime.GOOS {
|
||||
@@ -102,9 +82,6 @@ func initGPUHandles() *handles {
|
||||
localAppData := os.Getenv("LOCALAPPDATA")
|
||||
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
|
||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
|
||||
// Aligned with driver, we can't carry as payloads
|
||||
nvcudaMgmtName = "nvcuda.dll"
|
||||
nvcudaMgmtPatterns = NvcudaWindowsGlobs
|
||||
case "linux":
|
||||
cudartMgmtName = "libcudart.so*"
|
||||
if tmpDir != "" {
|
||||
@@ -112,25 +89,11 @@ func initGPUHandles() *handles {
|
||||
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
|
||||
}
|
||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
|
||||
// Aligned with driver, we can't carry as payloads
|
||||
nvcudaMgmtName = "libcuda.so*"
|
||||
nvcudaMgmtPatterns = NvcudaLinuxGlobs
|
||||
default:
|
||||
return gpuHandles
|
||||
}
|
||||
|
||||
slog.Info("Detecting GPUs")
|
||||
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
|
||||
if len(nvcudaLibPaths) > 0 {
|
||||
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
|
||||
if nvcuda != nil {
|
||||
slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
|
||||
gpuHandles.nvcuda = nvcuda
|
||||
gpuHandles.deviceCount = deviceCount
|
||||
return gpuHandles
|
||||
}
|
||||
}
|
||||
|
||||
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
|
||||
if len(cudartLibPaths) > 0 {
|
||||
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
|
||||
@@ -155,9 +118,6 @@ func GetGPUInfo() GpuInfoList {
|
||||
if gpuHandles.cudart != nil {
|
||||
C.cudart_release(*gpuHandles.cudart)
|
||||
}
|
||||
if gpuHandles.nvcuda != nil {
|
||||
C.nvcuda_release(*gpuHandles.nvcuda)
|
||||
}
|
||||
}()
|
||||
|
||||
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
|
||||
@@ -166,12 +126,6 @@ func GetGPUInfo() GpuInfoList {
|
||||
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
|
||||
}
|
||||
|
||||
// On windows we bundle the nvidia library one level above the runner dir
|
||||
depPath := ""
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
|
||||
depPath = filepath.Dir(envconfig.RunnersDir)
|
||||
}
|
||||
|
||||
var memInfo C.mem_info_t
|
||||
resp := []GpuInfo{}
|
||||
|
||||
@@ -184,11 +138,7 @@ func GetGPUInfo() GpuInfoList {
|
||||
gpuInfo := GpuInfo{
|
||||
Library: "cuda",
|
||||
}
|
||||
if gpuHandles.cudart != nil {
|
||||
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
|
||||
} else {
|
||||
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
|
||||
}
|
||||
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
@@ -204,7 +154,6 @@ func GetGPUInfo() GpuInfoList {
|
||||
gpuInfo.Major = int(memInfo.major)
|
||||
gpuInfo.Minor = int(memInfo.minor)
|
||||
gpuInfo.MinimumMemory = cudaMinimumMemory
|
||||
gpuInfo.DependencyPath = depPath
|
||||
|
||||
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
||||
resp = append(resp, gpuInfo)
|
||||
@@ -247,10 +196,9 @@ func GetCPUMem() (memInfo, error) {
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
||||
func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
||||
var ldPaths []string
|
||||
var patterns []string
|
||||
gpuLibPaths := []string{}
|
||||
slog.Debug("Searching for GPU library", "name", baseLibName)
|
||||
|
||||
@@ -270,14 +218,8 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
||||
}
|
||||
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
|
||||
}
|
||||
patterns = append(patterns, defaultPatterns...)
|
||||
slog.Debug("gpu library search", "globs", patterns)
|
||||
for _, pattern := range patterns {
|
||||
|
||||
// Nvidia PhysX known to return bogus results
|
||||
if strings.Contains(pattern, "PhysX") {
|
||||
slog.Debug("skipping PhysX cuda library path", "path", pattern)
|
||||
}
|
||||
// Ignore glob discovery errors
|
||||
matches, _ := filepath.Glob(pattern)
|
||||
for _, match := range matches {
|
||||
@@ -325,25 +267,8 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
|
||||
return 0, nil, ""
|
||||
}
|
||||
|
||||
func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
|
||||
var resp C.nvcuda_init_resp_t
|
||||
resp.ch.verbose = getVerboseState()
|
||||
for _, libPath := range nvcudaLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.nvcuda_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return int(resp.num_devices), &resp.ch, libPath
|
||||
}
|
||||
}
|
||||
return 0, nil, ""
|
||||
}
|
||||
|
||||
func getVerboseState() C.uint16_t {
|
||||
if envconfig.Debug {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
return C.uint16_t(1)
|
||||
}
|
||||
return C.uint16_t(0)
|
||||
|
||||
@@ -10,12 +10,6 @@ package gpu
|
||||
import "C"
|
||||
import (
|
||||
"runtime"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
const (
|
||||
metalMinimumMemory = 384 * format.MebiByte
|
||||
)
|
||||
|
||||
func GetGPUInfo() GpuInfoList {
|
||||
@@ -38,7 +32,7 @@ func GetGPUInfo() GpuInfoList {
|
||||
// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
|
||||
info.FreeMemory = info.TotalMemory
|
||||
|
||||
info.MinimumMemory = metalMinimumMemory
|
||||
info.MinimumMemory = 0
|
||||
return []GpuInfo{info}
|
||||
}
|
||||
|
||||
|
||||
@@ -58,7 +58,6 @@ void cpu_check_ram(mem_info_t *resp);
|
||||
#endif
|
||||
|
||||
#include "gpu_info_cudart.h"
|
||||
#include "gpu_info_nvcuda.h"
|
||||
|
||||
#endif // __GPU_INFO_H__
|
||||
#endif // __APPLE__
|
||||
@@ -6,9 +6,9 @@
|
||||
// Just enough typedef's to dlopen/dlsym for memory information
|
||||
typedef enum cudartReturn_enum {
|
||||
CUDART_SUCCESS = 0,
|
||||
CUDART_ERROR_INVALID_VALUE = 1,
|
||||
CUDART_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
// Other values omitted for now...
|
||||
} cudartReturn_t;
|
||||
|
||||
|
||||
@@ -1,203 +0,0 @@
|
||||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
||||
|
||||
#include <string.h>
|
||||
#include "gpu_info_nvcuda.h"
|
||||
|
||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
CUresult ret;
|
||||
resp->err = NULL;
|
||||
resp->num_devices = 0;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
struct lookup {
|
||||
char *s;
|
||||
void **p;
|
||||
} l[] = {
|
||||
|
||||
{"cuInit", (void *)&resp->ch.cuInit},
|
||||
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
|
||||
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
|
||||
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
|
||||
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
|
||||
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
|
||||
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
|
||||
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
|
||||
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
|
||||
if (!resp->ch.handle) {
|
||||
char *msg = LOAD_ERR();
|
||||
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
||||
nvcuda_lib_path, msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; l[i].s != NULL; i++) {
|
||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||
if (!*l[i].p) {
|
||||
char *msg = LOAD_ERR();
|
||||
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
||||
msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
ret = (*resp->ch.cuInit)(0);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
|
||||
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
|
||||
return;
|
||||
}
|
||||
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
int version = 0;
|
||||
nvcudaDriverVersion_t driverVersion;
|
||||
driverVersion.major = 0;
|
||||
driverVersion.minor = 0;
|
||||
|
||||
// Report driver version if we're in verbose mode, ignore errors
|
||||
ret = (*resp->ch.cuDriverGetVersion)(&version);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
|
||||
} else {
|
||||
driverVersion.major = version / 1000;
|
||||
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
|
||||
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
|
||||
}
|
||||
|
||||
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const int buflen = 256;
|
||||
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
|
||||
resp->err = NULL;
|
||||
nvcudaMemory_t memInfo = {0,0};
|
||||
CUresult ret;
|
||||
CUdevice device = -1;
|
||||
CUcontext ctx = NULL;
|
||||
char buf[buflen + 1];
|
||||
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("nvcuda handle isn't initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuDeviceGet)(&device, i);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda device failed to initialize");
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
|
||||
} else {
|
||||
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
|
||||
} else {
|
||||
resp->minor = minor;
|
||||
resp->major = major;
|
||||
}
|
||||
}
|
||||
|
||||
ret = (*h.cuDeviceGetUuid)(&uuid, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
||||
} else {
|
||||
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
uuid.bytes[0],
|
||||
uuid.bytes[1],
|
||||
uuid.bytes[2],
|
||||
uuid.bytes[3],
|
||||
uuid.bytes[4],
|
||||
uuid.bytes[5],
|
||||
uuid.bytes[6],
|
||||
uuid.bytes[7],
|
||||
uuid.bytes[8],
|
||||
uuid.bytes[9],
|
||||
uuid.bytes[10],
|
||||
uuid.bytes[11],
|
||||
uuid.bytes[12],
|
||||
uuid.bytes[13],
|
||||
uuid.bytes[14],
|
||||
uuid.bytes[15]
|
||||
);
|
||||
}
|
||||
|
||||
// To get memory we have to set (and release) a context
|
||||
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
// Best effort on failure...
|
||||
(*h.cuCtxDestroy)(ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->total = memInfo.total;
|
||||
resp->free = memInfo.free;
|
||||
|
||||
LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
|
||||
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
|
||||
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
|
||||
|
||||
|
||||
|
||||
ret = (*h.cuCtxDestroy)(ctx);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda failed to release primary device context %d", ret);
|
||||
}
|
||||
}
|
||||
|
||||
void nvcuda_release(nvcuda_handle_t h) {
|
||||
LOG(h.verbose, "releasing nvcuda library\n");
|
||||
UNLOAD_LIBRARY(h.handle);
|
||||
// TODO and other context release logic?
|
||||
h.handle = NULL;
|
||||
}
|
||||
|
||||
#endif // __APPLE__
|
||||
@@ -1,71 +0,0 @@
|
||||
#ifndef __APPLE__
|
||||
#ifndef __GPU_INFO_NVCUDA_H__
|
||||
#define __GPU_INFO_NVCUDA_H__
|
||||
#include "gpu_info.h"
|
||||
|
||||
// Just enough typedef's to dlopen/dlsym for memory information
|
||||
typedef enum cudaError_enum {
|
||||
CUDA_SUCCESS = 0,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDA_ERROR_NOT_INITIALIZED = 3,
|
||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
// Other values omitted for now...
|
||||
} CUresult;
|
||||
|
||||
typedef enum CUdevice_attribute_enum {
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
|
||||
|
||||
// TODO - not yet wired up but may be useful for Jetson or other
|
||||
// integrated GPU scenarios with shared memory
|
||||
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
|
||||
|
||||
} CUdevice_attribute;
|
||||
|
||||
typedef void *nvcudaDevice_t; // Opaque is sufficient
|
||||
typedef struct nvcudaMemory_st {
|
||||
uint64_t total;
|
||||
uint64_t free;
|
||||
} nvcudaMemory_t;
|
||||
|
||||
typedef struct nvcudaDriverVersion {
|
||||
int major;
|
||||
int minor;
|
||||
} nvcudaDriverVersion_t;
|
||||
|
||||
typedef struct CUuuid_st {
|
||||
unsigned char bytes[16];
|
||||
} CUuuid;
|
||||
|
||||
typedef int CUdevice;
|
||||
typedef void* CUcontext;
|
||||
|
||||
typedef struct nvcuda_handle {
|
||||
void *handle;
|
||||
uint16_t verbose;
|
||||
CUresult (*cuInit)(unsigned int Flags);
|
||||
CUresult (*cuDriverGetVersion)(int *driverVersion);
|
||||
CUresult (*cuDeviceGetCount)(int *);
|
||||
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
|
||||
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
|
||||
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
|
||||
|
||||
// Context specific aspects
|
||||
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
|
||||
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
|
||||
CUresult (*cuCtxDestroy)(CUcontext ctx);
|
||||
} nvcuda_handle_t;
|
||||
|
||||
typedef struct nvcuda_init_resp {
|
||||
char *err; // If err is non-null handle is invalid
|
||||
nvcuda_handle_t ch;
|
||||
int num_devices;
|
||||
} nvcuda_init_resp_t;
|
||||
|
||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
|
||||
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
|
||||
void nvcuda_release(nvcuda_handle_t ch);
|
||||
|
||||
#endif // __GPU_INFO_NVCUDA_H__
|
||||
#endif // __APPLE__
|
||||
@@ -1,117 +0,0 @@
|
||||
//go:build integration
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMaxQueue(t *testing.T) {
|
||||
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
|
||||
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
|
||||
threadCount := 32
|
||||
mq := os.Getenv("OLLAMA_MAX_QUEUE")
|
||||
if mq != "" {
|
||||
var err error
|
||||
threadCount, err = strconv.Atoi(mq)
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
|
||||
}
|
||||
|
||||
req := api.GenerateRequest{
|
||||
Model: "orca-mini",
|
||||
Prompt: "write a long historical fiction story about christopher columbus. use at least 10 facts from his actual journey",
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}
|
||||
resp := []string{"explore", "discover", "ocean"}
|
||||
|
||||
// CPU mode takes much longer at the limit with a large queue setting
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
require.NoError(t, PullIfMissing(ctx, client, req.Model))
|
||||
|
||||
// Context for the worker threads so we can shut them down
|
||||
// embedCtx, embedCancel := context.WithCancel(ctx)
|
||||
embedCtx := ctx
|
||||
|
||||
var genwg sync.WaitGroup
|
||||
go func() {
|
||||
genwg.Add(1)
|
||||
defer genwg.Done()
|
||||
slog.Info("Starting generate request")
|
||||
DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
|
||||
slog.Info("generate completed")
|
||||
}()
|
||||
|
||||
// Give the generate a chance to get started before we start hammering on embed requests
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
|
||||
threadCount += 10 // Add a few extra to ensure we push the queue past its limit
|
||||
busyCount := 0
|
||||
resetByPeerCount := 0
|
||||
canceledCount := 0
|
||||
succesCount := 0
|
||||
counterMu := sync.Mutex{}
|
||||
var embedwg sync.WaitGroup
|
||||
for i := 0; i < threadCount; i++ {
|
||||
go func(i int) {
|
||||
embedwg.Add(1)
|
||||
defer embedwg.Done()
|
||||
slog.Info("embed started", "id", i)
|
||||
embedReq := api.EmbeddingRequest{
|
||||
Model: req.Model,
|
||||
Prompt: req.Prompt,
|
||||
Options: req.Options,
|
||||
}
|
||||
// Fresh client for every request
|
||||
client, _ = GetTestEndpoint()
|
||||
|
||||
resp, genErr := client.Embeddings(embedCtx, &embedReq)
|
||||
counterMu.Lock()
|
||||
defer counterMu.Unlock()
|
||||
switch {
|
||||
case genErr == nil:
|
||||
succesCount++
|
||||
require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
|
||||
case errors.Is(genErr, context.Canceled):
|
||||
canceledCount++
|
||||
case strings.Contains(genErr.Error(), "busy"):
|
||||
busyCount++
|
||||
case strings.Contains(genErr.Error(), "connection reset by peer"):
|
||||
resetByPeerCount++
|
||||
default:
|
||||
require.NoError(t, genErr, "%d request failed", i)
|
||||
}
|
||||
|
||||
slog.Info("embed finished", "id", i)
|
||||
}(i)
|
||||
}
|
||||
genwg.Wait()
|
||||
slog.Info("generate done, waiting for embeds")
|
||||
embedwg.Wait()
|
||||
|
||||
require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
|
||||
require.True(t, busyCount > 0, "no requests hit busy error but some should have")
|
||||
require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
|
||||
|
||||
slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
|
||||
}
|
||||
17
llm/ext_server/server.cpp
vendored
@@ -1032,7 +1032,7 @@ struct llama_server_context
|
||||
slot.has_next_token = false;
|
||||
}
|
||||
|
||||
if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
|
||||
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
|
||||
{
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
@@ -1144,15 +1144,12 @@ struct llama_server_context
|
||||
|
||||
res.result_json = json
|
||||
{
|
||||
{"content", tkn.text_to_send},
|
||||
{"stop", false},
|
||||
{"slot_id", slot.id},
|
||||
{"multimodal", multimodal}
|
||||
};
|
||||
|
||||
if (!llama_token_is_eog(model, tkn.tok)) {
|
||||
res.result_json["content"] = tkn.text_to_send;
|
||||
}
|
||||
|
||||
if (slot.sparams.n_probs > 0)
|
||||
{
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
@@ -1186,6 +1183,8 @@ struct llama_server_context
|
||||
{"model", params.model_alias},
|
||||
{"tokens_predicted", slot.n_decoded},
|
||||
{"tokens_evaluated", slot.n_prompt_tokens},
|
||||
{"generation_settings", get_formated_generation(slot)},
|
||||
{"prompt", slot.prompt},
|
||||
{"truncated", slot.truncated},
|
||||
{"stopped_eos", slot.stopped_eos},
|
||||
{"stopped_word", slot.stopped_word},
|
||||
@@ -2645,18 +2644,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = std::atol(sep);
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.val_f64 = std::atof(sep);
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.val_bool = true;
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.val_bool = false;
|
||||
kvo.bool_value = false;
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
|
||||
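The hunk above parses KV overrides whose values carry an `int:`, `float:`, or `bool:` prefix. As a rough illustration of that same convention (not code from this change, and assuming the usual `fmt`/`strconv`/`strings` imports), the Go sketch below parses a `key=type:value` string into a typed override; note it is looser than the C++ version, which only accepts the literal strings `true` and `false` for booleans.

```go
// parseKVOverride is an illustrative sketch of the key=type:value convention
// handled by server_params_parse above; the struct and names are hypothetical.
type kvOverride struct {
	Key        string
	IntValue   int64
	FloatValue float64
	BoolValue  bool
}

func parseKVOverride(arg string) (kvOverride, error) {
	key, val, ok := strings.Cut(arg, "=")
	if !ok {
		return kvOverride{}, fmt.Errorf("missing '=' in %q", arg)
	}
	o := kvOverride{Key: key}
	var err error
	switch {
	case strings.HasPrefix(val, "int:"):
		o.IntValue, err = strconv.ParseInt(strings.TrimPrefix(val, "int:"), 10, 64)
	case strings.HasPrefix(val, "float:"):
		o.FloatValue, err = strconv.ParseFloat(strings.TrimPrefix(val, "float:"), 64)
	case strings.HasPrefix(val, "bool:"):
		o.BoolValue, err = strconv.ParseBool(strings.TrimPrefix(val, "bool:"))
	default:
		err = fmt.Errorf("unknown override type in %q", val)
	}
	return o, err
}
```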
@@ -21,7 +21,7 @@ init_vars() {
|
||||
# TODO - add additional optimization flags...
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
|
||||
fi
|
||||
case $(uname -s) in
|
||||
case $(uname -s) in
|
||||
"Darwin")
|
||||
LIB_EXT="dylib"
|
||||
WHOLE_ARCHIVE="-Wl,-force_load"
|
||||
|
||||
@@ -165,11 +165,11 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
|
||||
fi
|
||||
if [ "${ARCH}" == "arm64" ]; then
|
||||
echo "ARM CPU detected - disabling unsupported AVX instructions"
|
||||
|
||||
|
||||
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
|
||||
#
|
||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||
# Disabling has minimal performance effect while maintaining compatibility.
|
||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||
# Disabling has minimal performance effect while maintaining compatibility.
|
||||
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
|
||||
fi
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
||||
|
||||
@@ -26,25 +26,16 @@ function amdGPUs {
|
||||
$GPU_LIST -join ';'
|
||||
}
|
||||
|
||||
|
||||
function init_vars {
|
||||
if (!$script:SRC_DIR) {
|
||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
||||
}
|
||||
if (!$script:llamacppDir) {
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
}
|
||||
if (!$script:cmakeTargets) {
|
||||
$script:cmakeTargets = @("ollama_llama_server")
|
||||
}
|
||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
$script:cmakeDefs = @(
|
||||
"-DBUILD_SHARED_LIBS=on",
|
||||
"-DLLAMA_NATIVE=off"
|
||||
)
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
$script:cmakeTargets = @("ollama_llama_server")
|
||||
$script:ARCH = "amd64" # arm not yet supported.
|
||||
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
|
||||
md "$script:DIST_BASE" -ea 0 > $null
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
|
||||
$script:config = "RelWithDebInfo"
|
||||
@@ -175,195 +166,137 @@ function cleanup {
|
||||
}
|
||||
}
|
||||
|
||||
init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
|
||||
function build_static() {
|
||||
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
|
||||
# GCC build for direct linking into the Go binary
|
||||
init_vars
|
||||
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
|
||||
# as we need this to be compiled by gcc for golang to be able to link with it
|
||||
write-host "Checking for MinGW..."
|
||||
# error action ensures we exit on failure
|
||||
get-command gcc
|
||||
get-command mingw32-make
|
||||
$oldTargets = $script:cmakeTargets
|
||||
$script:cmakeTargets = @("llama", "ggml")
|
||||
$script:cmakeDefs = @(
|
||||
"-G", "MinGW Makefiles"
|
||||
"-DCMAKE_C_COMPILER=gcc.exe",
|
||||
"-DCMAKE_CXX_COMPILER=g++.exe",
|
||||
"-DBUILD_SHARED_LIBS=off",
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_AVX=off",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
$script:cmakeTargets = $oldTargets
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu($gen_arch) {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
|
||||
# remaining llama.cpp builds use MSVC
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
||||
$script:distDir="$script:DIST_BASE\cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu_avx() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU AVX generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu_avx2() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU AVX2 generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cuda() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
||||
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
|
||||
write-host "building custom CUDA GPU"
|
||||
}
|
||||
build
|
||||
sign
|
||||
install
|
||||
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
} else {
|
||||
write-host "Skipping CUDA generation step"
|
||||
}
|
||||
}
|
||||
|
||||
function build_rocm() {
|
||||
if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
|
||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
||||
if ($null -ne $script:ROCM_VERSION) {
|
||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
||||
}
|
||||
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
|
||||
$script:cmakeDefs += @(
|
||||
"-G", "Ninja",
|
||||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DLLAMA_HIPBLAS=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
||||
"-DGPU_TARGETS=$(amdGPUs)"
|
||||
)
|
||||
|
||||
# Make sure the ROCm binary dir is first in the path
|
||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
||||
|
||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
||||
$env:LIB=""
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
|
||||
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
|
||||
write-host "building custom ROCM GPU"
|
||||
}
|
||||
write-host "Building ROCm"
|
||||
build
|
||||
# Ninja doesn't prefix with config name
|
||||
${script:config}=""
|
||||
if ($null -ne $script:DUMPBIN) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
||||
}
|
||||
sign
|
||||
install
|
||||
|
||||
# Assumes v5.7, may need adjustments for v6
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
|
||||
} else {
|
||||
write-host "Skipping ROCm generation step"
|
||||
}
|
||||
}
|
||||
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
|
||||
|
||||
# GCC build for direct linking into the Go binary
|
||||
init_vars
|
||||
if ($($args.count) -eq 0) {
|
||||
git_module_setup
|
||||
apply_patches
|
||||
build_static
|
||||
if ($script:ARCH -eq "arm64") {
|
||||
build_cpu("ARM64")
|
||||
} else { # amd64
|
||||
build_cpu("x64")
|
||||
build_cpu_avx
|
||||
build_cpu_avx2
|
||||
build_cuda
|
||||
build_rocm
|
||||
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
|
||||
# as we need this to be compiled by gcc for golang to be able to link with it
|
||||
write-host "Checking for MinGW..."
|
||||
# error action ensures we exit on failure
|
||||
get-command gcc
|
||||
get-command mingw32-make
|
||||
$script:cmakeTargets = @("llama", "ggml")
|
||||
$script:cmakeDefs = @(
|
||||
"-G", "MinGW Makefiles"
|
||||
"-DCMAKE_C_COMPILER=gcc.exe",
|
||||
"-DCMAKE_CXX_COMPILER=g++.exe",
|
||||
"-DBUILD_SHARED_LIBS=off",
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_AVX=off",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
|
||||
# remaining llama.cpp builds use MSVC
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
||||
$script:distDir="$script:DIST_BASE\cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
|
||||
if ($null -ne $script:CUDA_LIB_DIR) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
||||
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
|
||||
write-host "building custom CUDA GPU"
|
||||
}
|
||||
build
|
||||
sign
|
||||
install
|
||||
}
|
||||
|
||||
if ($null -ne $env:HIP_PATH) {
|
||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
||||
if ($null -ne $script:ROCM_VERSION) {
|
||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
||||
}
|
||||
|
||||
cleanup
|
||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
|
||||
} else {
|
||||
for ( $i = 0; $i -lt $args.count; $i++ ) {
|
||||
write-host "performing $($args[$i])"
|
||||
& $($args[$i])
|
||||
}
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
|
||||
$script:cmakeDefs += @(
|
||||
"-G", "Ninja",
|
||||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DLLAMA_HIPBLAS=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
||||
"-DGPU_TARGETS=$(amdGPUs)"
|
||||
)
|
||||
|
||||
# Make sure the ROCm binary dir is first in the path
|
||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
||||
|
||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
||||
$env:LIB=""
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
|
||||
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
|
||||
write-host "building custom ROCM GPU"
|
||||
}
|
||||
write-host "Building ROCm"
|
||||
build
|
||||
# Ninja doesn't prefix with config name
|
||||
${script:config}=""
|
||||
if ($null -ne $script:DUMPBIN) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
||||
}
|
||||
sign
|
||||
install
|
||||
}
|
||||
|
||||
|
||||
cleanup
|
||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
|
||||
|
||||
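The script above produces several CPU runner variants (cpu, cpu_avx, cpu_avx2) alongside the CUDA and ROCm builds, so something at run time has to pick the most capable variant the host actually supports. The sketch below only illustrates that idea using golang.org/x/sys/cpu; it is not the project's actual runner-selection code, and the variant names simply mirror the build directories above.

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// pickCPURunner chooses the most capable CPU runner variant the host supports.
func pickCPURunner() string {
	switch {
	case cpu.X86.HasAVX2:
		return "cpu_avx2"
	case cpu.X86.HasAVX:
		return "cpu_avx"
	default:
		return "cpu"
	}
}

func main() {
	fmt.Println("selected runner variant:", pickCPURunner())
}
```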
Submodule llm/llama.cpp updated: 952d03dbea...7593639ce3
@@ -4,7 +4,6 @@ package llm
|
||||
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
|
||||
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
|
||||
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
|
||||
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
|
||||
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
|
||||
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
|
||||
// #include <stdlib.h>
|
||||
|
||||
@@ -3,11 +3,13 @@ package llm
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/server/envconfig"
|
||||
)
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
@@ -49,8 +51,15 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
for _, info := range gpus {
|
||||
memoryAvailable += info.FreeMemory
|
||||
}
|
||||
if envconfig.MaxVRAM > 0 {
|
||||
memoryAvailable = envconfig.MaxVRAM
|
||||
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
if userLimit != "" {
|
||||
avail, err := strconv.ParseUint(userLimit, 10, 64)
|
||||
if err != nil {
|
||||
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
|
||||
} else {
|
||||
slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
|
||||
memoryAvailable = avail
|
||||
}
|
||||
}
|
||||
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
|
||||
@@ -80,42 +89,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
graphFullOffload *= uint64(len(gpus))
|
||||
graphPartialOffload *= uint64(len(gpus))
|
||||
|
||||
// on metal there's no partial offload overhead
|
||||
if gpus[0].Library == "metal" {
|
||||
graphPartialOffload = graphFullOffload
|
||||
}
|
||||
|
||||
layers := ggml.Tensors().Layers()
|
||||
|
||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
||||
|
||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
||||
|
||||
if memoryRequiredPartial > memoryAvailable {
|
||||
slog.Debug("insufficient VRAM to load any model layers")
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
var memoryLayerOutput uint64
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if layer, ok := layers["output"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
} else if layer, ok := layers["token_embd"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if gpus[0].Library == "metal" && opts.UseMMap {
|
||||
// memory is preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
memoryRequiredPartial += memoryLayerOutput
|
||||
}
|
||||
|
||||
var layerCount int
|
||||
layers := ggml.Tensors().Layers()
|
||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
|
||||
|
||||
@@ -129,11 +115,15 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
}
|
||||
}
|
||||
|
||||
if gpus[0].Library != "metal" || !opts.UseMMap {
|
||||
// memory was not preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
var memoryLayerOutput uint64
|
||||
for k, v := range layers {
|
||||
if !strings.HasPrefix(k, "blk.") {
|
||||
memoryLayerOutput += v.size()
|
||||
}
|
||||
}
|
||||
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
|
||||
if memoryAvailable > memoryRequiredTotal {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
|
||||
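The memory.go hunk above estimates offload by reserving a fixed minimum plus graph memory and then fitting per-layer sizes into what remains. The self-contained sketch below walks through that arithmetic with invented sizes; real models have individually sized layer tensors and an output layer, so the project's estimate is more involved.

```go
package main

import "fmt"

// estimateLayers counts how many equally sized layers fit once the fixed
// minimum and graph memory are reserved. All sizes are in bytes and are
// invented purely for illustration.
func estimateLayers(available, minimum, graph, layerSize uint64, blockCount int) int {
	required := minimum + graph
	if required > available {
		return 0 // not even a partial offload fits
	}
	layers := 0
	for i := 0; i < blockCount; i++ {
		if required+layerSize > available {
			break
		}
		required += layerSize
		layers++
	}
	return layers
}

func main() {
	// e.g. 8 GiB free, 512 MiB minimum, 1 GiB graph, 160 MiB per layer, 32 layers
	fmt.Println(estimateLayers(8<<30, 512<<20, 1<<30, 160<<20, 32))
}
```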
@@ -1,12 +0,0 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index e431c7f7..f077e688 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -3,6 +3,7 @@
|
||||
// I'll gradually clean and extend it
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
+#include "common.h"
|
||||
#include "log.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
@@ -1,45 +0,0 @@
|
||||
diff --git a/ggml-metal.m b/ggml-metal.m
|
||||
index 0207b787..b5e9884b 100644
|
||||
--- a/ggml-metal.m
|
||||
+++ b/ggml-metal.m
|
||||
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
-#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
- switch (src0t) {
|
||||
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q4_0:
|
||||
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
- case GGML_TYPE_Q5_0: // not tested yet
|
||||
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
- default: ne11_mm_min = 1; break;
|
||||
- }
|
||||
+ switch (src0t) {
|
||||
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q4_0:
|
||||
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
+ case GGML_TYPE_Q5_0: // not tested yet
|
||||
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
+ default: ne11_mm_min = 1; break;
|
||||
}
|
||||
-#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
@@ -1,24 +0,0 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index e3c9bcd4..b43f892d 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
struct ggml_tensor * embeddings = inp;
|
||||
if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
+ }
|
||||
+ ggml_set_name(embeddings, "embeddings");
|
||||
+ ggml_set_input(embeddings);
|
||||
+
|
||||
+ if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
- ggml_set_name(embeddings, "embeddings");
|
||||
- ggml_set_input(embeddings);
|
||||
-
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_set_name(positions, "positions");
|
||||
326
llm/server.go
@@ -26,7 +26,6 @@ import (
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/server/envconfig"
|
||||
)
|
||||
|
||||
type LlamaServer interface {
|
||||
@@ -74,7 +73,8 @@ func LoadModel(model string) (*GGML, error) {
|
||||
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
|
||||
var err error
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
|
||||
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
|
||||
opts.NumCtx = int(ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
@@ -125,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
} else {
|
||||
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
}
|
||||
demandLib := envconfig.LLMLibrary
|
||||
demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
|
||||
if demandLib != "" {
|
||||
serverPath := availableServers[demandLib]
|
||||
if serverPath == "" {
|
||||
@@ -146,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||
"--embedding",
|
||||
}
|
||||
if envconfig.Debug {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
params = append(params, "--log-format", "json")
|
||||
} else {
|
||||
params = append(params, "--log-disable")
|
||||
@@ -156,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
|
||||
}
|
||||
|
||||
if envconfig.Debug {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
params = append(params, "--verbose")
|
||||
}
|
||||
|
||||
@@ -194,15 +194,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
params = append(params, "--numa")
|
||||
}
|
||||
|
||||
numParallel := envconfig.NumParallel
|
||||
|
||||
// TODO (jmorganca): multimodal models don't support parallel yet
|
||||
// see https://github.com/ollama/ollama/issues/4165
|
||||
if len(projectors) > 0 {
|
||||
numParallel = 1
|
||||
slog.Warn("multimodal models don't support parallel requests yet")
|
||||
// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
|
||||
numParallel := 1
|
||||
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||
numParallel, err = strconv.Atoi(onp)
|
||||
if err != nil || numParallel <= 0 {
|
||||
err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
|
||||
slog.Error("misconfiguration", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
|
||||
|
||||
for i := 0; i < len(servers); i++ {
|
||||
@@ -233,13 +234,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
if runtime.GOOS == "windows" {
|
||||
pathEnv = "PATH"
|
||||
}
|
||||
// prepend the server directory to LD_LIBRARY_PATH/PATH
|
||||
// append the server directory to LD_LIBRARY_PATH/PATH
|
||||
libraryPaths := []string{dir}
|
||||
|
||||
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
||||
// Append our runner directory to the path
|
||||
// This will favor system libraries over our bundled library dependencies
|
||||
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
|
||||
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
|
||||
}
|
||||
|
||||
// Note: we always put the dependency path first
|
||||
@@ -275,31 +276,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
}
|
||||
|
||||
s.cmd.Env = os.Environ()
|
||||
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
|
||||
s.cmd.Env = append(os.Environ(), libEnv)
|
||||
s.cmd.Stdout = os.Stdout
|
||||
s.cmd.Stderr = s.status
|
||||
|
||||
visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
|
||||
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
||||
|
||||
// Update or add the path and visible devices variable with our adjusted version
|
||||
pathNeeded := true
|
||||
devicesNeeded := visibleDevicesEnv != ""
|
||||
for i := range s.cmd.Env {
|
||||
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
|
||||
if strings.EqualFold(cmp[0], pathEnv) {
|
||||
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
|
||||
pathNeeded = false
|
||||
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
|
||||
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
|
||||
devicesNeeded = false
|
||||
}
|
||||
}
|
||||
if pathNeeded {
|
||||
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
|
||||
}
|
||||
if devicesNeeded {
|
||||
s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
|
||||
// TODO - multiple GPU selection logic...
|
||||
key, val := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
|
||||
if key != "" {
|
||||
s.cmd.Env = append(s.cmd.Env, key+"="+val)
|
||||
}
|
||||
|
||||
slog.Info("starting llama server", "cmd", s.cmd.String())
|
||||
@@ -316,6 +301,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
continue
|
||||
}
|
||||
|
||||
// reap subprocess when it exits
|
||||
go func() {
|
||||
// Exit status managed via getServerStatus
|
||||
_ = s.cmd.Wait()
|
||||
}()
|
||||
|
||||
// TODO - make sure this is all wired up correctly
|
||||
// if err = s.WaitUntilRunning(); err != nil {
|
||||
// slog.Error("error starting llama server", "server", servers[i], "error", err)
|
||||
// s.Close()
|
||||
// finalErr = err
|
||||
// continue
|
||||
// }
|
||||
return s, nil
|
||||
}
|
||||
|
||||
@@ -347,7 +345,7 @@ type ServerStatus int
|
||||
|
||||
const ( // iota is reset to 0
|
||||
ServerStatusReady ServerStatus = iota
|
||||
ServerStatusNoSlotsAvailable
|
||||
ServerStatusNoSlotsAvaialble
|
||||
ServerStatusLoadingModel
|
||||
ServerStatusNotResponding
|
||||
ServerStatusError
|
||||
@@ -357,7 +355,7 @@ func (s ServerStatus) ToString() string {
|
||||
switch s {
|
||||
case ServerStatusReady:
|
||||
return "llm server ready"
|
||||
case ServerStatusNoSlotsAvailable:
|
||||
case ServerStatusNoSlotsAvaialble:
|
||||
return "llm busy - no slots available"
|
||||
case ServerStatusLoadingModel:
|
||||
return "llm server loading model"
|
||||
@@ -414,7 +412,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
case "ok":
|
||||
return ServerStatusReady, nil
|
||||
case "no slot available":
|
||||
return ServerStatusNoSlotsAvailable, nil
|
||||
return ServerStatusNoSlotsAvaialble, nil
|
||||
case "loading model":
|
||||
return ServerStatusLoadingModel, nil
|
||||
default:
|
||||
@@ -422,29 +420,6 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// getServerStatusRetry will retry if ServerStatusNoSlotsAvailable is received
|
||||
func (s *llmServer) getServerStatusRetry(ctx context.Context) (ServerStatus, error) {
|
||||
var retries int
|
||||
for {
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return status, err
|
||||
}
|
||||
|
||||
if status == ServerStatusNoSlotsAvailable {
|
||||
if retries >= 10 {
|
||||
return status, fmt.Errorf("no slots available after %d retries", retries)
|
||||
}
|
||||
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
retries++
|
||||
continue
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *llmServer) Ping(ctx context.Context) error {
|
||||
_, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
@@ -467,7 +442,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Info("context expired before server started")
|
||||
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
|
||||
return fmt.Errorf("timed out waiting for llama runner to start")
|
||||
case err := <-s.done:
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
@@ -542,6 +517,7 @@ ws ::= ([ \t\n] ws)?
|
||||
`
|
||||
|
||||
const maxBufferSize = 512 * format.KiloByte
|
||||
const maxRetries = 3
|
||||
|
||||
type ImageData struct {
|
||||
Data []byte `json:"data"`
|
||||
@@ -584,13 +560,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
return err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// only allow maximum 10 "context shifts" to avoid infinite generation
|
||||
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
|
||||
req.Options.NumPredict = 10 * s.options.NumCtx
|
||||
slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
|
||||
}
|
||||
|
||||
request := map[string]any{
|
||||
"prompt": req.Prompt,
|
||||
"stream": true,
|
||||
@@ -617,7 +586,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
}
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if status != ServerStatusReady {
|
||||
@@ -631,113 +600,133 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
}
|
||||
}
|
||||
|
||||
// Handling JSON marshaling with special characters unescaped.
|
||||
buffer := &bytes.Buffer{}
|
||||
enc := json.NewEncoder(buffer)
|
||||
enc.SetEscapeHTML(false)
|
||||
retryDelay := 100 * time.Microsecond
|
||||
for retries := 0; retries < maxRetries; retries++ {
|
||||
if retries > 0 {
|
||||
time.Sleep(retryDelay) // wait before retrying
|
||||
retryDelay *= 2 // exponential backoff
|
||||
}
|
||||
|
||||
if err := enc.Encode(request); err != nil {
|
||||
return fmt.Errorf("failed to marshal data: %v", err)
|
||||
}
|
||||
// Handling JSON marshaling with special characters unescaped.
|
||||
buffer := &bytes.Buffer{}
|
||||
enc := json.NewEncoder(buffer)
|
||||
enc.SetEscapeHTML(false)
|
||||
|
||||
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
|
||||
serverReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating POST request: %v", err)
|
||||
}
|
||||
serverReq.Header.Set("Content-Type", "application/json")
|
||||
if err := enc.Encode(request); err != nil {
|
||||
return fmt.Errorf("failed to marshal data: %v", err)
|
||||
}
|
||||
|
||||
res, err := http.DefaultClient.Do(serverReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("POST predict: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode >= 400 {
|
||||
bodyBytes, err := io.ReadAll(res.Body)
|
||||
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading llm error response: %w", err)
|
||||
return fmt.Errorf("error creating POST request: %v", err)
|
||||
}
|
||||
log.Printf("llm predict error: %s", bodyBytes)
|
||||
return fmt.Errorf("%s", bodyBytes)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
buf := make([]byte, 0, maxBufferSize)
|
||||
scanner.Buffer(buf, maxBufferSize)
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("POST predict: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// keep track of the last token generated, this is used to abort if the model starts looping
|
||||
var lastToken string
|
||||
var tokenRepeat int
|
||||
|
||||
for scanner.Scan() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// This handles the request cancellation
|
||||
return ctx.Err()
|
||||
default:
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
if resp.StatusCode >= 400 {
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading llm error response: %w", err)
|
||||
}
|
||||
log.Printf("llm predict error: %s", bodyBytes)
|
||||
return fmt.Errorf("%s", bodyBytes)
|
||||
}
|
||||
|
||||
evt, ok := bytes.CutPrefix(line, []byte("data: "))
|
||||
if !ok {
|
||||
return fmt.Errorf("error parsing llm response stream: %s", line)
|
||||
}
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
buf := make([]byte, 0, maxBufferSize)
|
||||
scanner.Buffer(buf, maxBufferSize)
|
||||
|
||||
var c completion
|
||||
if err := json.Unmarshal(evt, &c); err != nil {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
||||
}
|
||||
retryNeeded := false
|
||||
// keep track of the last token generated, this is used to abort if the model starts looping
|
||||
var lastToken string
|
||||
var tokenRepeat int
|
||||
|
||||
switch {
|
||||
case strings.TrimSpace(c.Content) == lastToken:
|
||||
tokenRepeat++
|
||||
default:
|
||||
lastToken = strings.TrimSpace(c.Content)
|
||||
tokenRepeat = 0
|
||||
}
|
||||
|
||||
// 30 picked as an arbitrary max token repeat limit, modify as needed
|
||||
if tokenRepeat > 30 {
|
||||
slog.Debug("prediction aborted, token repeat limit reached")
|
||||
for scanner.Scan() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// This handles the request cancellation
|
||||
return ctx.Err()
|
||||
}
|
||||
default:
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if c.Content != "" {
|
||||
fn(CompletionResponse{
|
||||
Content: c.Content,
|
||||
})
|
||||
}
|
||||
// try again on slot unavailable
|
||||
if bytes.Contains(line, []byte("slot unavailable")) {
|
||||
retryNeeded = true
|
||||
break
|
||||
}
|
||||
|
||||
if c.Stop {
|
||||
fn(CompletionResponse{
|
||||
Done: true,
|
||||
PromptEvalCount: c.Timings.PromptN,
|
||||
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
|
||||
EvalCount: c.Timings.PredictedN,
|
||||
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
|
||||
})
|
||||
return nil
|
||||
evt, ok := bytes.CutPrefix(line, []byte("data: "))
|
||||
if !ok {
|
||||
return fmt.Errorf("error parsing llm response stream: %s", line)
|
||||
}
|
||||
|
||||
var c completion
|
||||
if err := json.Unmarshal(evt, &c); err != nil {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
||||
}
|
||||
|
||||
switch {
|
||||
case strings.TrimSpace(c.Content) == lastToken:
|
||||
tokenRepeat++
|
||||
default:
|
||||
lastToken = strings.TrimSpace(c.Content)
|
||||
tokenRepeat = 0
|
||||
}
|
||||
|
||||
// 30 picked as an arbitrary max token repeat limit, modify as needed
|
||||
if tokenRepeat > 30 {
|
||||
slog.Debug("prediction aborted, token repeat limit reached")
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
if c.Content != "" {
|
||||
fn(CompletionResponse{
|
||||
Content: c.Content,
|
||||
})
|
||||
}
|
||||
|
||||
if c.Stop {
|
||||
fn(CompletionResponse{
|
||||
Done: true,
|
||||
PromptEvalCount: c.Timings.PromptN,
|
||||
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
|
||||
EvalCount: c.Timings.PredictedN,
|
||||
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
|
||||
})
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
if strings.Contains(err.Error(), "unexpected EOF") {
|
||||
s.Close()
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
|
||||
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
|
||||
}
|
||||
return fmt.Errorf("error reading llm response: %v", err)
|
||||
}
|
||||
|
||||
if !retryNeeded {
|
||||
return nil // success
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
if strings.Contains(err.Error(), "unexpected EOF") {
|
||||
s.Close()
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
|
||||
}
|
||||
|
||||
return fmt.Errorf("error reading llm response: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
// should never reach here ideally
|
||||
return fmt.Errorf("max retries exceeded")
|
||||
}
|
||||
|
||||
type EmbeddingRequest struct {
|
||||
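The completion path above retries when the runner reports "slot unavailable", doubling the delay between attempts. Stripped of the HTTP details, the retry shape looks like the sketch below; the helper name, attempt limit, and starting delay are placeholders rather than values from this change, and the usual `context`, `fmt`, and `time` imports are assumed.

```go
// withRetry is a generic retry-with-exponential-backoff sketch mirroring the
// shape of the completion retry loop above. op reports whether another attempt
// is wanted; any error stops the loop immediately.
func withRetry(ctx context.Context, attempts int, op func() (retry bool, err error)) error {
	delay := 100 * time.Microsecond
	for i := 0; i < attempts; i++ {
		if i > 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(delay):
			}
			delay *= 2 // back off between attempts
		}
		retry, err := op()
		if err != nil || !retry {
			return err
		}
	}
	return fmt.Errorf("max retries exceeded")
}
```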
@@ -754,9 +743,8 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
|
||||
return nil, err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady {
|
||||
@@ -811,7 +799,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
@@ -863,7 +851,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
|
||||
return "", fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
@@ -905,13 +893,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
|
||||
func (s *llmServer) Close() error {
|
||||
if s.cmd != nil {
|
||||
slog.Debug("stopping llama server")
|
||||
if err := s.cmd.Process.Kill(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_ = s.cmd.Wait()
|
||||
|
||||
slog.Debug("llama server stopped")
|
||||
return s.cmd.Process.Kill()
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -19,7 +19,7 @@ export default function () {
|
||||
const [step, setStep] = useState<Step>(Step.WELCOME)
|
||||
const [commandCopied, setCommandCopied] = useState<boolean>(false)
|
||||
|
||||
const command = 'ollama run llama3'
|
||||
const command = 'ollama run llama2'
|
||||
|
||||
return (
|
||||
<div className='drag'>
|
||||
|
||||
132
parser/parser.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"slices"
|
||||
)
|
||||
|
||||
type Command struct {
|
||||
Name string
|
||||
Args string
|
||||
}
|
||||
|
||||
func (c *Command) Reset() {
|
||||
c.Name = ""
|
||||
c.Args = ""
|
||||
}
|
||||
|
||||
func Parse(reader io.Reader) ([]Command, error) {
|
||||
var commands []Command
|
||||
var command, modelCommand Command
|
||||
|
||||
scanner := bufio.NewScanner(reader)
|
||||
scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), bufio.MaxScanTokenSize)
|
||||
scanner.Split(scanModelfile)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
|
||||
fields := bytes.SplitN(line, []byte(" "), 2)
|
||||
if len(fields) == 0 || len(fields[0]) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
switch string(bytes.ToUpper(fields[0])) {
|
||||
case "FROM":
|
||||
command.Name = "model"
|
||||
command.Args = string(bytes.TrimSpace(fields[1]))
|
||||
// copy command for validation
|
||||
modelCommand = command
|
||||
case "ADAPTER":
|
||||
command.Name = string(bytes.ToLower(fields[0]))
|
||||
command.Args = string(bytes.TrimSpace(fields[1]))
|
||||
case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT":
|
||||
command.Name = string(bytes.ToLower(fields[0]))
|
||||
command.Args = string(fields[1])
|
||||
case "PARAMETER":
|
||||
fields = bytes.SplitN(fields[1], []byte(" "), 2)
|
||||
if len(fields) < 2 {
|
||||
return nil, fmt.Errorf("missing value for %s", fields)
|
||||
}
|
||||
|
||||
command.Name = string(fields[0])
|
||||
command.Args = string(bytes.TrimSpace(fields[1]))
|
||||
case "EMBED":
|
||||
return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
|
||||
case "MESSAGE":
|
||||
command.Name = string(bytes.ToLower(fields[0]))
|
||||
fields = bytes.SplitN(fields[1], []byte(" "), 2)
|
||||
if len(fields) < 2 {
|
||||
return nil, fmt.Errorf("should be in the format <role> <message>")
|
||||
}
|
||||
if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
|
||||
return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
|
||||
}
|
||||
command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
|
||||
default:
|
||||
if !bytes.HasPrefix(fields[0], []byte("#")) {
|
||||
// log a warning for unknown commands
|
||||
slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0]))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
commands = append(commands, command)
|
||||
command.Reset()
|
||||
}
|
||||
|
||||
if modelCommand.Args == "" {
|
||||
return nil, errors.New("no FROM line for the model was specified")
|
||||
}
|
||||
|
||||
return commands, scanner.Err()
|
||||
}
|
||||
|
||||
func scanModelfile(data []byte, atEOF bool) (advance int, token []byte, err error) {
|
||||
advance, token, err = scan([]byte(`"""`), []byte(`"""`), data, atEOF)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if advance > 0 && token != nil {
|
||||
return advance, token, nil
|
||||
}
|
||||
|
||||
advance, token, err = scan([]byte(`"`), []byte(`"`), data, atEOF)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if advance > 0 && token != nil {
|
||||
return advance, token, nil
|
||||
}
|
||||
|
||||
return bufio.ScanLines(data, atEOF)
|
||||
}
|
||||
|
||||
func scan(openBytes, closeBytes, data []byte, atEOF bool) (advance int, token []byte, err error) {
|
||||
newline := bytes.IndexByte(data, '\n')
|
||||
|
||||
if start := bytes.Index(data, openBytes); start >= 0 && start < newline {
|
||||
end := bytes.Index(data[start+len(openBytes):], closeBytes)
|
||||
if end < 0 {
|
||||
if atEOF {
|
||||
return 0, nil, fmt.Errorf("unterminated %s: expecting %s", openBytes, closeBytes)
|
||||
} else {
|
||||
return 0, nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
n := start + len(openBytes) + end + len(closeBytes)
|
||||
|
||||
newData := data[:start]
|
||||
newData = append(newData, data[start+len(openBytes):n-len(closeBytes)]...)
|
||||
return n, newData, nil
|
||||
}
|
||||
|
||||
return 0, nil, nil
|
||||
}
|
||||
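One behaviour of the parser above that the tests that follow do not exercise is the triple-quote handling in scan/scanModelfile, which lets a value span several lines. A hypothetical extra test in the same style (the Modelfile content is made up for illustration, and the surrounding test file's imports are assumed) would look like this:

```go
func Test_Parser_MultilineTemplate(t *testing.T) {
	// The """ block is collapsed into a single token by scanModelfile, so the
	// TEMPLATE command receives the full multi-line value without the quotes.
	input := `
FROM foo
TEMPLATE """{{ .System }}
{{ .Prompt }}"""
`

	commands, err := Parse(strings.NewReader(input))
	assert.Nil(t, err)

	expectedCommands := []Command{
		{Name: "model", Args: "foo"},
		{Name: "template", Args: "{{ .System }}\n{{ .Prompt }}"},
	}

	assert.Equal(t, expectedCommands, commands)
}
```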
98
parser/parser_test.go
Normal file
@@ -0,0 +1,98 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func Test_Parser(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM model1
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
commands, err := Parse(reader)
|
||||
assert.Nil(t, err)
|
||||
|
||||
expectedCommands := []Command{
|
||||
{Name: "model", Args: "model1"},
|
||||
{Name: "adapter", Args: "adapter1"},
|
||||
{Name: "license", Args: "MIT"},
|
||||
{Name: "param1", Args: "value1"},
|
||||
{Name: "param2", Args: "value2"},
|
||||
{Name: "template", Args: "template1"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, commands)
|
||||
}
|
||||
|
||||
func Test_Parser_NoFromLine(t *testing.T) {
|
||||
|
||||
input := `
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "no FROM line")
|
||||
}
|
||||
|
||||
func Test_Parser_MissingValue(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
PARAMETER param1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "missing value for [param1]")
|
||||
|
||||
}
|
||||
|
||||
func Test_Parser_Messages(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
MESSAGE system You are a Parser. Always Parse things.
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
commands, err := Parse(reader)
|
||||
assert.Nil(t, err)
|
||||
|
||||
expectedCommands := []Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a Parser. Always Parse things."},
|
||||
{Name: "message", Args: "user: Hey there!"},
|
||||
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, commands)
|
||||
}
|
||||
|
||||
func Test_Parser_Messages_BadRole(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
MESSAGE badguy I'm a bad guy!
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"")
|
||||
}
|
||||
@@ -7,8 +7,6 @@
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
function checkEnv() {
|
||||
$script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
Write-host "Building for ${script:TARGET_ARCH}"
|
||||
write-host "Locating required tools and paths"
|
||||
$script:SRC_DIR=$PWD
|
||||
if (!$env:VCToolsRedistDir) {
|
||||
@@ -32,7 +30,7 @@ function checkEnv() {
|
||||
|
||||
$script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
|
||||
|
||||
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
|
||||
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-amd64"
|
||||
$env:CGO_ENABLED="1"
|
||||
echo "Checking version"
|
||||
if (!$env:VERSION) {
|
||||
@@ -83,8 +81,8 @@ function buildOllama() {
|
||||
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
|
||||
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
|
||||
New-Item -ItemType Directory -Path .\dist\windows-amd64\ -Force
|
||||
cp .\ollama.exe .\dist\windows-amd64\ollama-windows-amd64.exe
|
||||
}
|
||||
|
||||
function buildApp() {
|
||||
@@ -111,6 +109,9 @@ function gatherDependencies() {
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
|
||||
|
||||
cp "${script:NVIDIA_DIR}\cudart64_*.dll" "${script:DEPS_DIR}\"
|
||||
cp "${script:NVIDIA_DIR}\cublas64_*.dll" "${script:DEPS_DIR}\"
|
||||
cp "${script:NVIDIA_DIR}\cublasLt64_*.dll" "${script:DEPS_DIR}\"
|
||||
|
||||
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
|
||||
if ("${env:KEY_CONTAINER}") {
|
||||
@@ -122,6 +123,15 @@ function gatherDependencies() {
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
}
|
||||
if ($null -ne $env:HIP_PATH) {
|
||||
# Assumes v5.7, may need adjustments for v6
|
||||
rm -ea 0 -recurse -force -path "${script:DEPS_DIR}\rocm\"
|
||||
md "${script:DEPS_DIR}\rocm\rocblas\library\" -ea 0 > $null
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:DEPS_DIR}\rocm\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:DEPS_DIR}\rocm\"
|
||||
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:DEPS_DIR}\rocm\rocblas\library\"
|
||||
}
|
||||
}
|
||||
|
||||
function buildInstaller() {
|
||||
@@ -129,16 +139,16 @@ function buildInstaller() {
|
||||
cd "${script:SRC_DIR}\app"
|
||||
$env:PKG_VERSION=$script:PKG_VERSION
|
||||
if ("${env:KEY_CONTAINER}") {
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
|
||||
} else {
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH .\ollama.iss
|
||||
& "${script:INNO_SETUP_DIR}\ISCC.exe" .\ollama.iss
|
||||
}
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
|
||||
function distZip() {
|
||||
write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip"
|
||||
Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force
|
||||
write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-amd64.zip"
|
||||
Compress-Archive -Path "${script:SRC_DIR}\dist\windows-amd64\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-amd64.zip" -Force
|
||||
}
|
||||
|
||||
try {
|
||||
|
||||
@@ -166,8 +166,8 @@ fi
|
||||
|
||||
if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
|
||||
# Look for pre-existing ROCm v6 before downloading the dependencies
|
||||
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
|
||||
if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
|
||||
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm"; do
|
||||
if [ -n "${search}" ] && [ -e "${search}/lib/libhipblas.so.2" ]; then
|
||||
status "Compatible AMD GPU ROCm library detected at ${search}"
|
||||
install_success
|
||||
exit 0
|
||||
|
||||
@@ -1,174 +0,0 @@
package envconfig

import (
    "fmt"
    "log/slog"
    "os"
    "path/filepath"
    "runtime"
    "strconv"
    "strings"
)

var (
    // Set via OLLAMA_ORIGINS in the environment
    AllowOrigins []string
    // Set via OLLAMA_DEBUG in the environment
    Debug bool
    // Set via OLLAMA_LLM_LIBRARY in the environment
    LLMLibrary string
    // Set via OLLAMA_MAX_LOADED_MODELS in the environment
    MaxRunners int
    // Set via OLLAMA_MAX_QUEUE in the environment
    MaxQueuedRequests int
    // Set via OLLAMA_MAX_VRAM in the environment
    MaxVRAM uint64
    // Set via OLLAMA_NOPRUNE in the environment
    NoPrune bool
    // Set via OLLAMA_NUM_PARALLEL in the environment
    NumParallel int
    // Set via OLLAMA_RUNNERS_DIR in the environment
    RunnersDir string
    // Set via OLLAMA_TMPDIR in the environment
    TmpDir string
)

func AsMap() map[string]string {
    return map[string]string{
        "OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
        "OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
        "OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
        "OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
        "OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
        "OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
        "OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
        "OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
        "OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
        "OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
    }
}

var defaultAllowOrigins = []string{
    "localhost",
    "127.0.0.1",
    "0.0.0.0",
}

// Clean quotes and spaces from the value
func clean(key string) string {
    return strings.Trim(os.Getenv(key), "\"' ")
}

func init() {
    // default values
    NumParallel = 1
    MaxRunners = 1
    MaxQueuedRequests = 512

    LoadConfig()
}

func LoadConfig() {
    if debug := clean("OLLAMA_DEBUG"); debug != "" {
        d, err := strconv.ParseBool(debug)
        if err == nil {
            Debug = d
        } else {
            Debug = true
        }
    }

    RunnersDir = clean("OLLAMA_RUNNERS_DIR")
    if runtime.GOOS == "windows" && RunnersDir == "" {
        // On Windows we do not carry the payloads inside the main executable
        appExe, err := os.Executable()
        if err != nil {
            slog.Error("failed to lookup executable path", "error", err)
        }

        cwd, err := os.Getwd()
        if err != nil {
            slog.Error("failed to lookup working directory", "error", err)
        }

        var paths []string
        for _, root := range []string{filepath.Dir(appExe), cwd} {
            paths = append(paths,
                filepath.Join(root),
                filepath.Join(root, "windows-"+runtime.GOARCH),
                filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
            )
        }

        // Try a few variations to improve developer experience when building from source in the local tree
        for _, p := range paths {
            candidate := filepath.Join(p, "ollama_runners")
            _, err := os.Stat(candidate)
            if err == nil {
                RunnersDir = candidate
                break
            }
        }
        if RunnersDir == "" {
            slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
        }
    }

    TmpDir = clean("OLLAMA_TMPDIR")

    userLimit := clean("OLLAMA_MAX_VRAM")
    if userLimit != "" {
        avail, err := strconv.ParseUint(userLimit, 10, 64)
        if err != nil {
            slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
        } else {
            MaxVRAM = avail
        }
    }

    LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

    if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
        val, err := strconv.Atoi(onp)
        if err != nil || val <= 0 {
            slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
        } else {
            NumParallel = val
        }
    }

    if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
        NoPrune = true
    }

    if origins := clean("OLLAMA_ORIGINS"); origins != "" {
        AllowOrigins = strings.Split(origins, ",")
    }
    for _, allowOrigin := range defaultAllowOrigins {
        AllowOrigins = append(AllowOrigins,
            fmt.Sprintf("http://%s", allowOrigin),
            fmt.Sprintf("https://%s", allowOrigin),
            fmt.Sprintf("http://%s:*", allowOrigin),
            fmt.Sprintf("https://%s:*", allowOrigin),
        )
    }

    maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
    if maxRunners != "" {
        m, err := strconv.Atoi(maxRunners)
        if err != nil {
            slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
        } else {
            MaxRunners = m
        }
    }

    if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
        p, err := strconv.Atoi(onp)
        if err != nil || p <= 0 {
            slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
        } else {
            MaxQueuedRequests = p
        }
    }
}
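The deleted file above centralizes how settings are read from the environment: trim quotes, parse, and fall back to a default on bad input. The following standalone Go sketch illustrates that pattern; the variable and key names (`EXAMPLE_DEBUG`, `cleanEnv`) are hypothetical stand-ins, not part of this diff.

```go
// Illustrative sketch only, mirroring the removed envconfig-style lookup.
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// cleanEnv strips surrounding quotes and spaces, as the deleted clean() helper did.
func cleanEnv(key string) string {
	return strings.Trim(os.Getenv(key), "\"' ")
}

func main() {
	// Parse a boolean-ish flag the way LoadConfig handled OLLAMA_DEBUG:
	// any non-empty value that fails to parse still counts as "enabled".
	debug := false
	if v := cleanEnv("EXAMPLE_DEBUG"); v != "" {
		if d, err := strconv.ParseBool(v); err == nil {
			debug = d
		} else {
			debug = true
		}
	}
	fmt.Println("debug:", debug)
}
```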
@@ -1,20 +0,0 @@
package envconfig

import (
    "os"
    "testing"

    "github.com/stretchr/testify/require"
)

func TestConfig(t *testing.T) {
    os.Setenv("OLLAMA_DEBUG", "")
    LoadConfig()
    require.False(t, Debug)
    os.Setenv("OLLAMA_DEBUG", "false")
    LoadConfig()
    require.False(t, Debug)
    os.Setenv("OLLAMA_DEBUG", "1")
    LoadConfig()
    require.True(t, Debug)
}
240
server/images.go
@@ -5,7 +5,6 @@ import (
    "bytes"
    "context"
    "crypto/sha256"
    "encoding/base64"
    "encoding/hex"
    "encoding/json"
    "errors"
@@ -21,17 +20,15 @@ import (
    "runtime"
    "strconv"
    "strings"
    "text/template"

    "golang.org/x/exp/slices"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/auth"
    "github.com/ollama/ollama/convert"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/server/envconfig"
    "github.com/ollama/ollama/types/errtypes"
    "github.com/ollama/ollama/types/model"
    "github.com/ollama/ollama/parser"
    "github.com/ollama/ollama/version"
)

@@ -63,76 +60,6 @@ func (m *Model) IsEmbedding() bool {
    return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
}

func (m *Model) String() string {
    var modelfile model.File

    modelfile.Commands = append(modelfile.Commands, model.Command{
        Name: "model",
        Args: m.ModelPath,
    })

    if m.Template != "" {
        modelfile.Commands = append(modelfile.Commands, model.Command{
            Name: "template",
            Args: m.Template,
        })
    }

    if m.System != "" {
        modelfile.Commands = append(modelfile.Commands, model.Command{
            Name: "system",
            Args: m.System,
        })
    }

    for _, adapter := range m.AdapterPaths {
        modelfile.Commands = append(modelfile.Commands, model.Command{
            Name: "adapter",
            Args: adapter,
        })
    }

    for _, projector := range m.ProjectorPaths {
        modelfile.Commands = append(modelfile.Commands, model.Command{
            Name: "projector",
            Args: projector,
        })
    }

    for k, v := range m.Options {
        switch v := v.(type) {
        case []any:
            for _, s := range v {
                modelfile.Commands = append(modelfile.Commands, model.Command{
                    Name: k,
                    Args: fmt.Sprintf("%v", s),
                })
            }
        default:
            modelfile.Commands = append(modelfile.Commands, model.Command{
                Name: k,
                Args: fmt.Sprintf("%v", v),
            })
        }
    }

    for _, license := range m.License {
        modelfile.Commands = append(modelfile.Commands, model.Command{
            Name: "license",
            Args: license,
        })
    }

    for _, msg := range m.Messages {
        modelfile.Commands = append(modelfile.Commands, model.Command{
            Name: "message",
            Args: fmt.Sprintf("%s %s", msg.Role, msg.Content),
        })
    }

    return modelfile.String()
}

type Message struct {
    Role    string `json:"role"`
    Content string `json:"content"`
@@ -357,7 +284,7 @@ func realpath(mfDir, from string) string {
    return abspath
}

func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) error {
func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
    deleteMap := make(map[string]struct{})
    if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
        for _, layer := range append(manifest.Layers, manifest.Config) {
@@ -379,7 +306,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
    params := make(map[string][]string)
    fromParams := make(map[string]any)

    for _, c := range modelfile.Commands {
    for _, c := range commands {
        mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)

        switch c.Name {
@@ -696,7 +623,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
        return err
    }

    if !envconfig.NoPrune {
    if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
        if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
            return err
        }
@@ -774,43 +701,36 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
    return path, nil
}

func CopyModel(src, dst model.Name) error {
    if !dst.IsFullyQualified() {
        return model.Unqualified(dst)
    }
    if !src.IsFullyQualified() {
        return model.Unqualified(src)
    }

    if src.Filepath() == dst.Filepath() {
        return nil
    }

    manifests, err := GetManifestPath()
func CopyModel(src, dest string) error {
    srcModelPath := ParseModelPath(src)
    srcPath, err := srcModelPath.GetManifestPath()
    if err != nil {
        return err
    }

    dstpath := filepath.Join(manifests, dst.Filepath())
    if err := os.MkdirAll(filepath.Dir(dstpath), 0o755); err != nil {
        return err
    }

    srcpath := filepath.Join(manifests, src.Filepath())
    srcfile, err := os.Open(srcpath)
    destModelPath := ParseModelPath(dest)
    destPath, err := destModelPath.GetManifestPath()
    if err != nil {
        return err
    }
    defer srcfile.Close()

    dstfile, err := os.Create(dstpath)
    if err != nil {
    if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil {
        return err
    }
    defer dstfile.Close()

    _, err = io.Copy(dstfile, srcfile)
    return err
    // copy the file
    input, err := os.ReadFile(srcPath)
    if err != nil {
        fmt.Println("Error reading file:", err)
        return err
    }

    err = os.WriteFile(destPath, input, 0o644)
    if err != nil {
        fmt.Println("Error reading file:", err)
        return err
    }

    return nil
}

func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
@@ -970,6 +890,67 @@ func DeleteModel(name string) error {
    return nil
}

func ShowModelfile(model *Model) (string, error) {
    var mt struct {
        *Model
        From       string
        Parameters map[string][]any
    }

    mt.Parameters = make(map[string][]any)
    for k, v := range model.Options {
        if s, ok := v.([]any); ok {
            mt.Parameters[k] = s
            continue
        }

        mt.Parameters[k] = []any{v}
    }

    mt.Model = model
    mt.From = model.ModelPath

    if model.ParentModel != "" {
        mt.From = model.ParentModel
    }

    modelFile := `# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM {{ .ShortName }}

FROM {{ .From }}
TEMPLATE """{{ .Template }}"""

{{- if .System }}
SYSTEM """{{ .System }}"""
{{- end }}

{{- range $adapter := .AdapterPaths }}
ADAPTER {{ $adapter }}
{{- end }}

{{- range $k, $v := .Parameters }}
{{- range $parameter := $v }}
PARAMETER {{ $k }} {{ printf "%#v" $parameter }}
{{- end }}
{{- end }}`

    tmpl, err := template.New("").Parse(modelFile)
    if err != nil {
        slog.Info(fmt.Sprintf("error parsing template: %q", err))
        return "", err
    }

    var buf bytes.Buffer

    if err = tmpl.Execute(&buf, mt); err != nil {
        slog.Info(fmt.Sprintf("error executing template: %q", err))
        return "", err
    }

    return buf.String(), nil
}

func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
    mp := ParseModelPath(name)
    fn(api.ProgressResponse{Status: "retrieving manifest"})
@@ -991,6 +972,9 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
    for _, layer := range layers {
        if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
            slog.Info(fmt.Sprintf("error uploading blob: %v", err))
            if errors.Is(err, errUnauthorized) {
                return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository())
            }
            return err
        }
    }
@@ -1027,7 +1011,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
    // build deleteMap to prune unused layers
    deleteMap := make(map[string]struct{})

    if !envconfig.NoPrune {
    if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
        manifest, _, err = GetManifest(mp)
        if err != nil && !errors.Is(err, os.ErrNotExist) {
            return err
@@ -1153,40 +1137,9 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
    return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}

var errUnauthorized = fmt.Errorf("unauthorized: access denied")

// getTokenSubject returns the subject of a JWT token, it does not validate the token
func getTokenSubject(token string) string {
    parts := strings.Split(token, ".")
    if len(parts) != 3 {
        slog.Error("jwt token does not contain 3 parts")
        return ""
    }

    payload := parts[1]
    payloadBytes, err := base64.RawURLEncoding.DecodeString(payload)
    if err != nil {
        slog.Error(fmt.Sprintf("failed to decode jwt payload: %v", err))
        return ""
    }

    var payloadMap map[string]interface{}
    if err := json.Unmarshal(payloadBytes, &payloadMap); err != nil {
        slog.Error(fmt.Sprintf("failed to unmarshal payload JSON: %v", err))
        return ""
    }

    sub, ok := payloadMap["sub"]
    if !ok {
        slog.Error("jwt does not contain 'sub' field")
        return ""
    }

    return fmt.Sprintf("%s", sub)
}
var errUnauthorized = errors.New("unauthorized")

func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
    anonymous := true // access will default to anonymous if no user is found associated with the public key
    for i := 0; i < 2; i++ {
        resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
        if err != nil {
@@ -1205,7 +1158,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
            if err != nil {
                return nil, err
            }
            anonymous = getTokenSubject(token) == "anonymous"
            regOpts.Token = token
            if body != nil {
                _, err = body.Seek(0, io.SeekStart)
@@ -1226,16 +1178,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
        }
    }

    if anonymous {
        // no user is associated with the public key, and the request requires non-anonymous access
        pubKey, nestedErr := auth.GetPublicKey()
        if nestedErr != nil {
            slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr))
            return nil, errUnauthorized
        }
        return nil, &errtypes.UnknownOllamaKey{Key: pubKey}
    }
    // user is associated with the public key, but is not authorized to make the request
    return nil, errUnauthorized
}
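The `ShowModelfile` hunk above renders a Modelfile by executing a `text/template` into a buffer. Here is a minimal, self-contained sketch of that same technique; the struct, template text, and function name are simplified stand-ins rather than the code in the diff.

```go
// Minimal sketch of template rendering into a buffer, as ShowModelfile does.
package main

import (
	"bytes"
	"fmt"
	"text/template"
)

func renderModelfile(from, system string) (string, error) {
	// A cut-down template with the same shape as the one in the diff.
	const tmplText = "FROM {{ .From }}\n{{- if .System }}\nSYSTEM \"\"\"{{ .System }}\"\"\"\n{{- end }}\n"
	tmpl, err := template.New("modelfile").Parse(tmplText)
	if err != nil {
		return "", err
	}
	var buf bytes.Buffer
	if err := tmpl.Execute(&buf, struct{ From, System string }{from, system}); err != nil {
		return "", err
	}
	return buf.String(), nil
}

func main() {
	out, err := renderModelfile("llama3", "You are a helpful assistant.")
	if err != nil {
		panic(err)
	}
	fmt.Print(out)
}
```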
@@ -6,7 +6,6 @@ import (
    "net/url"
    "os"
    "path/filepath"
    "regexp"
    "strings"
)

@@ -26,10 +25,9 @@ const (
)

var (
    ErrInvalidImageFormat  = errors.New("invalid image format")
    ErrInvalidProtocol     = errors.New("invalid protocol scheme")
    ErrInsecureProtocol    = errors.New("insecure protocol http")
    ErrInvalidDigestFormat = errors.New("invalid digest format")
    ErrInvalidImageFormat = errors.New("invalid image format")
    ErrInvalidProtocol    = errors.New("invalid protocol scheme")
    ErrInsecureProtocol   = errors.New("insecure protocol http")
)

func ParseModelPath(name string) ModelPath {
@@ -151,17 +149,6 @@ func GetBlobsPath(digest string) (string, error) {
        return "", err
    }

    // only accept actual sha256 digests
    pattern := "^sha256[:-][0-9a-fA-F]{64}$"
    re := regexp.MustCompile(pattern)
    if err != nil {
        return "", err
    }

    if digest != "" && !re.MatchString(digest) {
        return "", ErrInvalidDigestFormat
    }

    digest = strings.ReplaceAll(digest, ":", "-")
    path := filepath.Join(dir, "blobs", digest)
    dirPath := filepath.Dir(path)
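The hunk above removes a sha256 digest check from `GetBlobsPath`. The sketch below shows that removed behaviour in isolation so it is easier to see what is lost: the regexp pattern is copied from the deleted lines, while the surrounding function and error message are illustrative only.

```go
// Standalone sketch of the digest validation that this hunk deletes.
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Only accept actual sha256 digests, in either "sha256:..." or "sha256-..." form.
var digestRe = regexp.MustCompile("^sha256[:-][0-9a-fA-F]{64}$")

func blobFileName(digest string) (string, error) {
	if digest != "" && !digestRe.MatchString(digest) {
		return "", fmt.Errorf("invalid digest format")
	}
	// Blobs are stored with '-' rather than ':' in the file name.
	return strings.ReplaceAll(digest, ":", "-"), nil
}

func main() {
	name, err := blobFileName("sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9")
	fmt.Println(name, err)

	// A path traversal attempt is rejected by the pattern.
	_, err = blobFileName("../sha256-45640291")
	fmt.Println(err)
}
```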
@@ -1,73 +1,6 @@
package server

import (
    "os"
    "path/filepath"
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestGetBlobsPath(t *testing.T) {
    // GetBlobsPath expects an actual directory to exist
    dir, err := os.MkdirTemp("", "ollama-test")
    assert.Nil(t, err)
    defer os.RemoveAll(dir)

    tests := []struct {
        name     string
        digest   string
        expected string
        err      error
    }{
        {
            "empty digest",
            "",
            filepath.Join(dir, "blobs"),
            nil,
        },
        {
            "valid with colon",
            "sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
            filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
            nil,
        },
        {
            "valid with dash",
            "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
            filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
            nil,
        },
        {
            "digest too short",
            "sha256-45640291",
            "",
            ErrInvalidDigestFormat,
        },
        {
            "digest too long",
            "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9aaaaaaaaaa",
            "",
            ErrInvalidDigestFormat,
        },
        {
            "digest invalid chars",
            "../sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7a",
            "",
            ErrInvalidDigestFormat,
        },
    }
    for _, tc := range tests {
        t.Run(tc.name, func(t *testing.T) {
            t.Setenv("OLLAMA_MODELS", dir)

            got, err := GetBlobsPath(tc.digest)

            assert.ErrorIs(t, tc.err, err, tc.name)
            assert.Equal(t, tc.expected, got, tc.name)
        })
    }
}
import "testing"

func TestParseModelPath(t *testing.T) {
    tests := []struct {
122
server/routes.go
@@ -1,7 +1,6 @@
package server

import (
    "cmp"
    "context"
    "encoding/json"
    "errors"
@@ -29,8 +28,7 @@ import (
    "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/openai"
    "github.com/ollama/ollama/server/envconfig"
    "github.com/ollama/ollama/types/model"
    "github.com/ollama/ollama/parser"
    "github.com/ollama/ollama/version"
)

@@ -147,7 +145,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
    select {
    case runner = <-rCh:
    case err = <-eCh:
        handleErrorResponse(c, err)
        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
        return
    }

@@ -390,7 +388,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
    select {
    case runner = <-rCh:
    case err = <-eCh:
        handleErrorResponse(c, err)
        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
        return
    }

@@ -513,17 +511,28 @@ func (s *Server) PushModelHandler(c *gin.Context) {

func (s *Server) CreateModelHandler(c *gin.Context) {
    var req api.CreateRequest
    if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
    err := c.ShouldBindJSON(&req)
    switch {
    case errors.Is(err, io.EOF):
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
        return
    } else if err != nil {
    case err != nil:
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
        return
    }

    name := model.ParseName(cmp.Or(req.Model, req.Name))
    if !name.IsValid() {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
    var model string
    if req.Model != "" {
        model = req.Model
    } else if req.Name != "" {
        model = req.Name
    } else {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
        return
    }

    if err := ParseModelPath(model).Validate(); err != nil {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
        return
    }

@@ -532,19 +541,19 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
        return
    }

    var r io.Reader = strings.NewReader(req.Modelfile)
    var modelfile io.Reader = strings.NewReader(req.Modelfile)
    if req.Path != "" && req.Modelfile == "" {
        f, err := os.Open(req.Path)
        mf, err := os.Open(req.Path)
        if err != nil {
            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
            return
        }
        defer f.Close()
        defer mf.Close()

        r = f
        modelfile = mf
    }

    modelfile, err := model.ParseFile(r)
    commands, err := parser.Parse(modelfile)
    if err != nil {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
        return
@@ -560,7 +569,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
        ctx, cancel := context.WithCancel(c.Request.Context())
        defer cancel()

        if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), req.Quantization, modelfile, fn); err != nil {
        if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
            ch <- gin.H{"error": err.Error()}
        }
    }()
@@ -708,12 +717,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
        }
    }

    var sb strings.Builder
    fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
    fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
    fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
    fmt.Fprint(&sb, model.String())
    resp.Modelfile = sb.String()
    mf, err := ShowModelfile(model)
    if err != nil {
        return nil, err
    }

    resp.Modelfile = mf

    return resp, nil
}
@@ -779,31 +788,34 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
}

func (s *Server) CopyModelHandler(c *gin.Context) {
    var r api.CopyRequest
    if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
    var req api.CopyRequest
    err := c.ShouldBindJSON(&req)
    switch {
    case errors.Is(err, io.EOF):
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
        return
    } else if err != nil {
    case err != nil:
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
        return
    }

    src := model.ParseName(r.Source)
    if !src.IsValid() {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
    if req.Source == "" || req.Destination == "" {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "source add destination are required"})
        return
    }

    dst := model.ParseName(r.Destination)
    if !dst.IsValid() {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Source)})
    if err := ParseModelPath(req.Destination).Validate(); err != nil {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
        return
    }

    if err := CopyModel(src, dst); errors.Is(err, os.ErrNotExist) {
        c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found", r.Source)})
    } else if err != nil {
        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
    if err := CopyModel(req.Source, req.Destination); err != nil {
        if os.IsNotExist(err) {
            c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Source)})
        } else {
            c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
        }
        return
    }
}

@@ -860,6 +872,12 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
    c.Status(http.StatusCreated)
}

var defaultAllowOrigins = []string{
    "localhost",
    "127.0.0.1",
    "0.0.0.0",
}

func isLocalIP(ip netip.Addr) bool {
    if interfaces, err := net.Interfaces(); err == nil {
        for _, iface := range interfaces {
@@ -943,7 +961,19 @@ func (s *Server) GenerateRoutes() http.Handler {
    config := cors.DefaultConfig()
    config.AllowWildcard = true
    config.AllowBrowserExtensions = true
    config.AllowOrigins = envconfig.AllowOrigins

    if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
        config.AllowOrigins = strings.Split(allowedOrigins, ",")
    }

    for _, allowOrigin := range defaultAllowOrigins {
        config.AllowOrigins = append(config.AllowOrigins,
            fmt.Sprintf("http://%s", allowOrigin),
            fmt.Sprintf("https://%s", allowOrigin),
            fmt.Sprintf("http://%s:*", allowOrigin),
            fmt.Sprintf("https://%s:*", allowOrigin),
        )
    }

    r := gin.Default()
    r.Use(
@@ -982,11 +1012,10 @@ func (s *Server) GenerateRoutes() http.Handler {

func Serve(ln net.Listener) error {
    level := slog.LevelInfo
    if envconfig.Debug {
    if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
        level = slog.LevelDebug
    }

    slog.Info("server config", "env", envconfig.AsMap())
    handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
        Level:     level,
        AddSource: true,
@@ -1010,7 +1039,7 @@ func Serve(ln net.Listener) error {
        return err
    }

    if !envconfig.NoPrune {
    if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
        // clean up unused layers and manifests
        if err := PruneLayers(); err != nil {
            return err
@@ -1041,7 +1070,6 @@ func Serve(ln net.Listener) error {
    signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
    go func() {
        <-signals
        srvr.Close()
        done()
        sched.unloadAllRunners()
        gpu.Cleanup()
@@ -1187,7 +1215,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
    select {
    case runner = <-rCh:
    case err = <-eCh:
        handleErrorResponse(c, err)
        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
        return
    }

@@ -1308,15 +1336,3 @@ func (s *Server) ChatHandler(c *gin.Context) {

    streamResponse(c, ch)
}

func handleErrorResponse(c *gin.Context, err error) {
    if errors.Is(err, context.Canceled) {
        c.JSON(499, gin.H{"error": "request canceled"})
        return
    }
    if errors.Is(err, ErrMaxQueue) {
        c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()})
        return
    }
    c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}

@@ -17,7 +17,7 @@ import (
    "github.com/stretchr/testify/assert"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/types/model"
    "github.com/ollama/ollama/parser"
    "github.com/ollama/ollama/version"
)

@@ -55,13 +55,13 @@ func Test_Routes(t *testing.T) {
    createTestModel := func(t *testing.T, name string) {
        fname := createTestFile(t, "ollama-model")

        r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
        modelfile, err := model.ParseFile(r)
        modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
        commands, err := parser.Parse(modelfile)
        assert.Nil(t, err)
        fn := func(resp api.ProgressResponse) {
            t.Logf("Status: %s", resp.Status)
        }
        err = CreateModel(context.TODO(), name, "", "", modelfile, fn)
        err = CreateModel(context.TODO(), name, "", "", commands, fn)
        assert.Nil(t, err)
    }

@@ -238,5 +238,6 @@ func Test_Routes(t *testing.T) {
        if tc.Expected != nil {
            tc.Expected(t, resp)
        }

    }
}
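The `GenerateRoutes` hunk above restores a loop that expands each default host into http/https origins with and without a wildcard port for CORS. The sketch below shows just that expansion step on its own; the function name and the local slice are illustrative, not taken from the diff.

```go
// Sketch of the default-origin expansion added back into GenerateRoutes.
package main

import "fmt"

func expandOrigins(hosts []string) []string {
	var origins []string
	for _, h := range hosts {
		// Each host becomes four allowed origins.
		origins = append(origins,
			fmt.Sprintf("http://%s", h),
			fmt.Sprintf("https://%s", h),
			fmt.Sprintf("http://%s:*", h),
			fmt.Sprintf("https://%s:*", h),
		)
	}
	return origins
}

func main() {
	fmt.Println(expandOrigins([]string{"localhost", "127.0.0.1", "0.0.0.0"}))
}
```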
140
server/sched.go
@@ -5,8 +5,10 @@ import (
    "errors"
    "fmt"
    "log/slog"
    "os"
    "reflect"
    "sort"
    "strconv"
    "strings"
    "sync"
    "time"
@@ -15,13 +17,13 @@ import (
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/server/envconfig"
    "golang.org/x/exp/slices"
)

type LlmRequest struct {
    ctx             context.Context //nolint:containedctx
    model           *Model
    ggml            *llm.GGML // TODO - how large is this, and do we need to free it after we've finished loading?
    opts            api.Options
    sessionDuration time.Duration
    successCh       chan *runnerRef
@@ -37,19 +39,31 @@ type Scheduler struct {
    loaded   map[string]*runnerRef
    loadedMu sync.Mutex

    loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
    loadFn      func(req *LlmRequest, gpus gpu.GpuInfoList)
    newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
    getGpuFn    func() gpu.GpuInfoList
}

var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
// TODO set this to zero after a release or two, to enable multiple models by default
var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
var maxQueuedRequests = 10 // TODO configurable

func InitScheduler(ctx context.Context) *Scheduler {
    maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
    if maxRunners != "" {
        m, err := strconv.Atoi(maxRunners)
        if err != nil {
            slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
        } else {
            loadedMax = m
        }
    }

    sched := &Scheduler{
        pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
        finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
        expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
        unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
        pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
        finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
        expiredCh:     make(chan *runnerRef, maxQueuedRequests),
        unloadedCh:    make(chan interface{}, maxQueuedRequests),
        loaded:        make(map[string]*runnerRef),
        newServerFn:   llm.NewLlamaServer,
        getGpuFn:      gpu.GetGPUInfo,
@@ -60,22 +74,24 @@ func InitScheduler(ctx context.Context) *Scheduler {

// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
    // allocate a large enough kv cache for all parallel requests
    opts.NumCtx = opts.NumCtx * envconfig.NumParallel

    ggml, err := llm.LoadModel(model.ModelPath)
    req := &LlmRequest{
        ctx:             c,
        model:           model,
        ggml:            ggml,
        opts:            opts,
        sessionDuration: sessionDuration,
        successCh:       make(chan *runnerRef),
        errCh:           make(chan error, 1),
    }

    if err != nil {
        req.errCh <- err
        return req.successCh, req.errCh
    }
    select {
    case s.pendingReqCh <- req:
    default:
        req.errCh <- ErrMaxQueue
        req.errCh <- fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
    }
    return req.successCh, req.errCh
}
@@ -100,12 +116,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
            return
        case pending := <-s.pendingReqCh:
            // Block other requests until we get this pending request running

            if pending.ctx.Err() != nil {
                slog.Debug("pending request cancelled or timed out, skipping scheduling")
                continue
            }

            for {
                var runnerToExpire *runnerRef
                s.loadedMu.Lock()
@@ -120,50 +130,31 @@ func (s *Scheduler) processPending(ctx context.Context) {
                    pending.useLoadedRunner(runner, s.finishedReqCh)
                    break
                }
                } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
                } else if loadedCount == 0 {
                    slog.Debug("loading first model", "model", pending.model.ModelPath)
                    gpus := s.getGpuFn()
                    g := pickBestFitGPUs(pending, gpus)
                    if g != nil {
                        gpus = g
                    }
                    s.loadFn(pending, gpus)
                    break
                } else if loadedMax > 0 && loadedCount >= loadedMax {
                    slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
                    runnerToExpire = s.findRunnerToUnload()
                    runnerToExpire = s.findRunnerToUnload(pending)
                } else {
                    // Either no models are loaded or below envconfig.MaxRunners
                    // More than one loaded model, so we have to see if the new one fits
                    // Get a refreshed GPU list
                    gpus := s.getGpuFn()

                    // Load model for fitting
                    ggml, err := llm.LoadModel(pending.model.ModelPath)
                    if err != nil {
                        pending.errCh <- err
                        break
                    }

                    // If we're CPU only mode, just limit by envconfig.MaxRunners above
                    // TODO handle system memory exhaustion
                    if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
                        slog.Debug("cpu mode with existing models, loading")
                        s.loadFn(pending, ggml, gpus)
                        break
                    }

                    // No models loaded. Load the model but prefer the best fit.
                    if loadedCount == 0 {
                        slog.Debug("loading first model", "model", pending.model.ModelPath)
                        g := pickBestFitGPUs(pending, ggml, gpus)
                        if g != nil {
                            gpus = g
                        }
                        s.loadFn(pending, ggml, gpus)
                        break
                    }

                    // More than one loaded model, so we have to see if the new one fits
                    // Update free memory from currently loaded models
                    s.updateFreeSpace(gpus)
                    gpus = pickBestFitGPUs(pending, ggml, gpus)
                    gpus = pickBestFitGPUs(pending, gpus)
                    if gpus != nil {
                        slog.Debug("new model fits with existing models, loading")
                        s.loadFn(pending, ggml, gpus)
                        s.loadFn(pending, gpus)
                        break
                    }
                    runnerToExpire = s.findRunnerToUnload()
                    runnerToExpire = s.findRunnerToUnload(pending)
                }

                if runnerToExpire == nil {
@@ -236,7 +227,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
                    defer runner.refMu.Unlock()
                    if runner.expireTimer != nil {
                        runner.expireTimer.Stop()
                        runner.expireTimer = nil
                    }
                    s.expiredCh <- runner
                })
@@ -263,9 +253,9 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
                continue
            }

            s.loadedMu.Lock()
            slog.Debug("got lock to unload", "model", runner.model)
            runner.unload()
            s.loadedMu.Lock()
            delete(s.loaded, runner.model)
            s.loadedMu.Unlock()
            slog.Debug("runner released", "model", runner.model)
@@ -283,10 +273,6 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
    runner.refMu.Lock()
    defer runner.refMu.Unlock()
    runner.refCount++
    if runner.expireTimer != nil {
        runner.expireTimer.Stop()
        runner.expireTimer = nil
    }
    runner.sessionDuration = pending.sessionDuration
    pending.successCh <- runner
    go func() {
@@ -296,8 +282,8 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
    }()
}

func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
    llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
func (s *Scheduler) load(req *LlmRequest, gpus gpu.GpuInfoList) {
    llama, err := s.newServerFn(gpus, req.model.ModelPath, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
    if err != nil {
        // some older models are not compatible with newer versions of llama.cpp
        // show a generalized compatibility error until there is a better way to
@@ -417,10 +403,6 @@ type runnerRef struct {

// The refMu must already be held when calling unload
func (runner *runnerRef) unload() {
    if runner.expireTimer != nil {
        runner.expireTimer.Stop()
        runner.expireTimer = nil
    }
    if runner.llama != nil {
        runner.llama.Close()
    }
@@ -435,25 +417,16 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
    slog.Debug("evaluating already loaded", "model", req.model.ModelPath)
    runner.refMu.Lock()
    defer runner.refMu.Unlock()

    // Ignore the NumGPU settings for comparison
    optsExisting := runner.Options.Runner
    optsExisting.NumGPU = -1
    optsNew := req.opts.Runner
    optsNew.NumGPU = -1
    timeout := 10 * time.Second
    if runner.loading {
        timeout = 2 * time.Minute // Initial load can take a long time for big models on slow systems...
    }

    if runner.Options == nil {
        return true
    }

    // Don't reload runner if num_gpu=-1 was provided
    optsExisting := runner.Options.Runner
    optsNew := req.opts.Runner
    if optsNew.NumGPU < 0 {
        optsExisting.NumGPU = -1
        optsNew.NumGPU = -1
    }

    ctx, cancel := context.WithTimeout(ctx, timeout)
    ctx, cancel := context.WithTimeout(ctx, timeout) // BUG -
    defer cancel()
    if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
        !reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
@@ -461,7 +434,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
        runner.llama.Ping(ctx) != nil {
        return true
    }

    return false
}

@@ -482,7 +454,7 @@ func (a ByDuration) Less(i, j int) bool {

// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
func pickBestFitGPUs(req *LlmRequest, gpus gpu.GpuInfoList) gpu.GpuInfoList {
    var estimatedVRAM uint64
    for _, gl := range gpus.ByLibrary() {
        var ok bool
@@ -494,7 +466,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.

        // First attempt to fit the model into a single GPU
        for _, g := range sgl {
            if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
            if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
                slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
                return []gpu.GpuInfo{g}
            }
@@ -505,7 +477,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
        // - try subsets of GPUs instead of just falling back to 1 or all in a family

        // Now try all the GPUs
        if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
        if ok, estimatedVRAM = llm.PredictServerFit(gl, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
            slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
            return gl
        }
@@ -514,7 +486,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
}

// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef {
    s.loadedMu.Lock()
    runnerList := make([]*runnerRef, 0, len(s.loaded))
    for _, r := range s.loaded {
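`pickBestFitGPUs` above prefers a single GPU that can hold the whole model and only then falls back to spreading the load. The snippet below is a rough, self-contained sketch of that placement idea under simplifying assumptions: the `gpuInfo` type, the fixed VRAM estimate, and the summed-memory fallback are all hypothetical, whereas the real code asks `llm.PredictServerFit` for each candidate set.

```go
// Rough sketch of the placement idea behind pickBestFitGPUs.
package main

import "fmt"

type gpuInfo struct {
	ID         string
	FreeMemory uint64
}

func pickPlacement(gpus []gpuInfo, estimatedVRAM uint64) []gpuInfo {
	// First attempt to fit the model into a single GPU.
	for _, g := range gpus {
		if g.FreeMemory >= estimatedVRAM {
			return []gpuInfo{g}
		}
	}
	// Otherwise see whether the whole set together has enough room.
	var total uint64
	for _, g := range gpus {
		total += g.FreeMemory
	}
	if total >= estimatedVRAM {
		return gpus
	}
	return nil // caller must unload a runner to make room
}

func main() {
	gpus := []gpuInfo{{ID: "0", FreeMemory: 8 << 30}, {ID: "1", FreeMemory: 24 << 30}}
	fmt.Println(pickPlacement(gpus, 16<<30))
}
```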
@@ -15,7 +15,6 @@ import (
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/server/envconfig"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)
@@ -28,21 +27,29 @@ func init() {
func TestInitScheduler(t *testing.T) {
    ctx, done := context.WithCancel(context.Background())
    defer done()
    initialMax := loadedMax
    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    require.Equal(t, initialMax, loadedMax)
    require.NotNil(t, s.loaded)

    os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
    s = InitScheduler(ctx)
    require.Equal(t, initialMax, loadedMax)
    require.NotNil(t, s.loaded)

    os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
    s = InitScheduler(ctx)
    require.Equal(t, 0, loadedMax)
    require.NotNil(t, s.loaded)
    s.loadedMu.Unlock()
}

func TestLoad(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
    defer done()
    s := InitScheduler(ctx)
    var ggml *llm.GGML // value not used in tests
    req := &LlmRequest{
        ctx:             ctx,
        model:           &Model{ModelPath: "foo"},
        opts:            api.DefaultOptions(),
        successCh:       make(chan *runnerRef, 1),
        errCh:           make(chan error, 1),
        sessionDuration: 2,
@@ -52,12 +59,10 @@ func TestLoad(t *testing.T) {
        return nil, fmt.Errorf("something failed to load model blah")
    }
    gpus := gpu.GpuInfoList{}
    s.load(req, ggml, gpus)
    s.load(req, gpus)
    require.Len(t, req.successCh, 0)
    require.Len(t, req.errCh, 1)
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 0)
    s.loadedMu.Unlock()
    err := <-req.errCh
    require.Contains(t, err.Error(), "this model may be incompatible")

@@ -65,30 +70,26 @@ func TestLoad(t *testing.T) {
    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
        return server, nil
    }
    s.load(req, ggml, gpus)
    s.load(req, gpus)
    select {
    case err := <-req.errCh:
        require.NoError(t, err)
    case resp := <-req.successCh:
        require.Equal(t, uint64(10), resp.estimatedVRAM)
        require.Equal(t, uint(1), resp.refCount)
        s.loadedMu.Lock()
        require.Len(t, s.loaded, 1)
        s.loadedMu.Unlock()
    }

    req.model.ModelPath = "dummy_model_path"
    server.waitResp = fmt.Errorf("wait failure")
    s.load(req, ggml, gpus)
    s.load(req, gpus)
    select {
    case err := <-req.errCh:
        require.Contains(t, err.Error(), "wait failure")
    case resp := <-req.successCh:
        t.Errorf("unexpected success %v", resp)
    }
    s.loadedMu.Lock()
    runner := s.loaded["dummy_model_path"]
    s.loadedMu.Unlock()
    require.NotNil(t, runner)
    require.Equal(t, uint(0), runner.refCount)
    time.Sleep(1 * time.Millisecond)
@@ -100,7 +101,6 @@ type bundle struct {
    ctxDone func()
    srv     *mockLlm
    req     *LlmRequest
    ggml    *llm.GGML
}

func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
@@ -132,16 +132,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
        {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
    })
    assert.Nil(t, err)

    fname := f.Name()
    model := &Model{Name: modelName, ModelPath: fname}
    scenario.ggml, err = llm.LoadModel(model.ModelPath)
    ggml, err := llm.LoadModel(model.ModelPath)
    require.NoError(t, err)

    scenario.req = &LlmRequest{
        ctx:             scenario.ctx,
        model:           model,
        opts:            api.DefaultOptions(),
        ggml:            ggml,
        sessionDuration: 5 * time.Millisecond,
        successCh:       make(chan *runnerRef, 1),
        errCh:           make(chan error, 1),
@@ -151,7 +149,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
}

func TestRequests(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()

    // Same model, same request
@@ -159,20 +157,18 @@ func TestRequests(t *testing.T) {
    scenario1a.req.sessionDuration = 0
    scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
    scenario1b.req.model = scenario1a.req.model
    scenario1b.ggml = scenario1a.ggml
    scenario1b.req.ggml = scenario1a.req.ggml
    scenario1b.req.sessionDuration = 0

    // simple reload of same model
    scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
    scenario2a.req.model = scenario1a.req.model
    scenario2a.ggml = scenario1a.ggml
    scenario2a.req.ggml = scenario1a.req.ggml

    // Multiple loaded models
    scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
    scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
    scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
    scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
    scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
    scenario3c := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded

    s := InitScheduler(ctx)
    s.getGpuFn = func() gpu.GpuInfoList {
@@ -226,7 +222,7 @@ func TestRequests(t *testing.T) {
        t.Errorf("timeout")
    }

    envconfig.MaxRunners = 1
    loadedMax = 1
    s.newServerFn = scenario3a.newServer
    slog.Info("scenario3a")
    s.pendingReqCh <- scenario3a.req
@@ -241,11 +237,9 @@ func TestRequests(t *testing.T) {
    case <-ctx.Done():
        t.Errorf("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 1)
    s.loadedMu.Unlock()

    envconfig.MaxRunners = 0
    loadedMax = 0
    s.newServerFn = scenario3b.newServer
    slog.Info("scenario3b")
    s.pendingReqCh <- scenario3b.req
@@ -257,14 +251,19 @@ func TestRequests(t *testing.T) {
    case <-ctx.Done():
        t.Errorf("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 2)
    s.loadedMu.Unlock()

    // This is a CPU load with NumGPU = 0 so it should load
    // Try to load a model that wont fit
    s.newServerFn = scenario3c.newServer
    slog.Info("scenario3c")
    require.Len(t, s.loaded, 2)
    scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
    time.Sleep(2 * time.Millisecond)
    s.pendingReqCh <- scenario3c.req
    // finish prior request, so new model can load
    time.Sleep(6 * time.Millisecond)
    require.Len(t, s.loaded, 1)
    scenario3b.ctxDone()
    select {
    case resp := <-scenario3c.req.successCh:
        require.Equal(t, resp.llama, scenario3c.srv)
@@ -273,40 +272,11 @@ func TestRequests(t *testing.T) {
    case <-ctx.Done():
        t.Errorf("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 3)
    s.loadedMu.Unlock()

    // Try to load a model that wont fit
    s.newServerFn = scenario3d.newServer
    slog.Info("scenario3d")
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 3)
    s.loadedMu.Unlock()
    scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
    time.Sleep(2 * time.Millisecond)
    s.pendingReqCh <- scenario3d.req
    // finish prior request, so new model can load
    time.Sleep(6 * time.Millisecond)
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 2)
    s.loadedMu.Unlock()
    scenario3b.ctxDone()
    select {
    case resp := <-scenario3d.req.successCh:
        require.Equal(t, resp.llama, scenario3d.srv)
        require.Len(t, s.pendingReqCh, 0)
        require.Len(t, scenario3d.req.errCh, 0)
    case <-ctx.Done():
        t.Errorf("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 2)
    s.loadedMu.Unlock()
    require.Len(t, s.loaded, 1)
}

func TestGetRunner(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
    defer done()

    // Same model, same request
@@ -316,7 +286,7 @@ func TestGetRunner(t *testing.T) {
    scenario1b.req.sessionDuration = 0
    scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
    scenario1c.req.sessionDuration = 0
    envconfig.MaxQueuedRequests = 1
    maxQueuedRequests = 1
    s := InitScheduler(ctx)
    s.getGpuFn = func() gpu.GpuInfoList {
        g := gpu.GpuInfo{Library: "metal"}
@@ -345,28 +315,25 @@ func TestGetRunner(t *testing.T) {
        t.Errorf("timeout")
    }
    scenario1a.ctxDone()
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 1)
    s.loadedMu.Unlock()

    scenario1c.req.model.ModelPath = "bad path"
    slog.Info("scenario1c")
    successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
    // Starts in pending channel, then should be quickly processsed to return an error
    time.Sleep(5 * time.Millisecond)
    require.Len(t, s.pendingReqCh, 0)
    require.Len(t, successCh1c, 0)
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 0)
    s.loadedMu.Unlock()
    require.Len(t, errCh1c, 1)
    err = <-errCh1c
    require.Contains(t, err.Error(), "bad path")
    scenario1b.ctxDone()

    time.Sleep(5 * time.Millisecond)
    require.Len(t, s.loaded, 0)
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()

    // Same model, same request
@@ -387,9 +354,7 @@ func TestPrematureExpired(t *testing.T) {
        require.Equal(t, resp.llama, scenario1a.srv)
        require.Len(t, s.pendingReqCh, 0)
        require.Len(t, errCh1a, 0)
        s.loadedMu.Lock()
        require.Len(t, s.loaded, 1)
        s.loadedMu.Unlock()
        slog.Info("sending premature expired event now")
        s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
    case <-ctx.Done():
@@ -401,9 +366,7 @@ func TestPrematureExpired(t *testing.T) {
    require.LessOrEqual(t, len(s.finishedReqCh), 1)
    time.Sleep(10 * time.Millisecond)
    require.Len(t, s.finishedReqCh, 0)
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 0)
    s.loadedMu.Unlock()

    // also shouldn't happen in real life
    s.finishedReqCh <- scenario1a.req
@@ -411,10 +374,9 @@ func TestPrematureExpired(t *testing.T) {
}

func TestUseLoadedRunner(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
    req := &LlmRequest{
        ctx:             ctx,
        opts:            api.DefaultOptions(),
        successCh:       make(chan *runnerRef, 1),
        sessionDuration: 2,
    }
@@ -436,7 +398,7 @@ func TestUseLoadedRunner(t *testing.T) {
}

func TestUpdateFreeSpace(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
    defer done()
    gpus := gpu.GpuInfoList{
        {
@@ -458,47 +420,43 @@ func TestUpdateFreeSpace(t *testing.T) {
    r2 := &runnerRef{llama: llm2, gpus: gpus}

    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loaded["b"] = r2
    s.loadedMu.Unlock()

    s.updateFreeSpace(gpus)
    require.Equal(t, uint64(850), gpus[0].FreeMemory)
    require.Equal(t, uint64(1850), gpus[1].FreeMemory)

}

func TestFindRunnerToUnload(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
    defer done()

    req := &LlmRequest{ctx: ctx}
    r1 := &runnerRef{refCount: 1, sessionDuration: 1}
    r2 := &runnerRef{sessionDuration: 2}

    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loaded["b"] = r2
    s.loadedMu.Unlock()

    resp := s.findRunnerToUnload()
    resp := s.findRunnerToUnload(req)
    require.Equal(t, r2, resp)
    r2.refCount = 1
    resp = s.findRunnerToUnload()
    resp = s.findRunnerToUnload(req)
    require.Equal(t, r1, resp)

}

func TestNeedsReload(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
    defer done()

    llm := &mockLlm{}
    do := api.DefaultOptions()
    runner := &runnerRef{
        adapters:   []string{"adapter1"},
        projectors: []string{"projector1"},
        Options:    &do,
        Options:    &api.Options{},
        llama:      llm,
    }
    req := &LlmRequest{
@@ -506,7 +464,7 @@ func TestNeedsReload(t *testing.T) {
            AdapterPaths:   []string{"adapter2"},
            ProjectorPaths: []string{"projector2"},
        },
        opts: api.DefaultOptions(),
        opts: api.Options{},
    }
    resp := runner.needsReload(ctx, req)
    require.True(t, resp)
@@ -527,14 +485,11 @@ func TestNeedsReload(t *testing.T) {
    require.False(t, resp)
    req.opts.NumGPU = 99
    resp = runner.needsReload(ctx, req)
    require.True(t, resp)
    req.opts.NumGPU = -1
    resp = runner.needsReload(ctx, req)
    require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
    defer done()

    llm1 := &mockLlm{}
@@ -545,10 +500,8 @@ func TestUnloadAllRunners(t *testing.T) {
    r1 := &runnerRef{llama: llm1}
    r2 := &runnerRef{llama: llm2}

    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loaded["b"] = r2
    s.loadedMu.Unlock()
    s.unloadAllRunners()

    require.True(t, llm1.closeCalled)
@@ -1,18 +0,0 @@
|
||||
// Package errtypes contains custom error types
|
||||
package errtypes
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const UnknownOllamaKeyErrMsg = "unknown ollama key"
|
||||
|
||||
// TODO: This should have a structured response from the API
|
||||
type UnknownOllamaKey struct {
|
||||
Key string
|
||||
}
|
||||
|
||||
func (e *UnknownOllamaKey) Error() string {
|
||||
return fmt.Sprintf("unauthorized: %s %q", UnknownOllamaKeyErrMsg, strings.TrimSpace(e.Key))
|
||||
}
|
||||
87
types/model/digest.go
Normal file
87
types/model/digest.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Digest represents a digest of a model Manifest. It is a comparable value
|
||||
// type and is immutable.
|
||||
//
|
||||
// The zero Digest is not a valid digest.
|
||||
type Digest struct {
|
||||
s string
|
||||
}
|
||||
|
||||
// Split returns the digest type and the digest value.
|
||||
func (d Digest) Split() (typ, digest string) {
|
||||
typ, digest, _ = strings.Cut(d.s, "-")
|
||||
return
|
||||
}
|
||||
|
||||
// String returns the digest in the form of "<digest-type>-<digest>", or the
|
||||
// empty string if the digest is invalid.
|
||||
func (d Digest) String() string { return d.s }
|
||||
|
||||
// IsValid returns true if the digest is valid (not zero).
|
||||
//
|
||||
// A valid digest may be created only by ParseDigest, or
|
||||
// ParseName(name).Digest().
|
||||
func (d Digest) IsValid() bool { return d.s != "" }
|
||||
|
||||
// LogValue implements slog.Value.
|
||||
func (d Digest) LogValue() slog.Value {
|
||||
return slog.StringValue(d.String())
|
||||
}
|
||||
|
||||
var (
|
||||
_ slog.LogValuer = Digest{}
|
||||
)
|
||||
|
||||
// ParseDigest parses a string in the form of "<digest-type>-<digest>" into a
|
||||
// Digest.
|
||||
func ParseDigest(s string) Digest {
|
||||
typ, digest, ok := strings.Cut(s, "-")
|
||||
if !ok {
|
||||
typ, digest, ok = strings.Cut(s, ":")
|
||||
}
|
||||
if ok && isValidDigestType(typ) && isValidHex(digest) && len(digest) >= 2 {
|
||||
return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
|
||||
}
|
||||
return Digest{}
|
||||
}
|
||||
|
||||
func MustParseDigest(s string) Digest {
|
||||
d := ParseDigest(s)
|
||||
if !d.IsValid() {
|
||||
panic(fmt.Sprintf("invalid digest: %q", s))
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func isValidDigestType(s string) bool {
|
||||
if len(s) == 0 {
|
||||
return false
|
||||
}
|
||||
for _, r := range s {
|
||||
if !unicode.IsLower(r) && !unicode.IsDigit(r) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func isValidHex(s string) bool {
|
||||
if len(s) == 0 {
|
||||
return false
|
||||
}
|
||||
for i := range s {
|
||||
c := s[i]
|
||||
if c < '0' || c > '9' && c < 'a' || c > 'f' {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
46
types/model/digest_test.go
Normal file
46
types/model/digest_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package model
|
||||
|
||||
import "testing"
|
||||
|
||||
var testDigests = map[string]Digest{
|
||||
"": {},
|
||||
"sha256-1234": {s: "sha256-1234"},
|
||||
"sha256-5678": {s: "sha256-5678"},
|
||||
"blake2-9abc": {s: "blake2-9abc"},
|
||||
"-1234": {},
|
||||
"sha256-": {},
|
||||
"sha256-1234-5678": {},
|
||||
"sha256-P": {}, // invalid hex
|
||||
"sha256-1234P": {},
|
||||
"---": {},
|
||||
}
|
||||
|
||||
func TestDigestParse(t *testing.T) {
|
||||
// Test cases.
|
||||
for s, want := range testDigests {
|
||||
got := ParseDigest(s)
|
||||
t.Logf("ParseDigest(%q) = %#v", s, got)
|
||||
if got != want {
|
||||
t.Errorf("ParseDigest(%q) = %q; want %q", s, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDigestString(t *testing.T) {
|
||||
// Test cases.
|
||||
for s, d := range testDigests {
|
||||
want := s
|
||||
if !d.IsValid() {
|
||||
want = ""
|
||||
}
|
||||
got := d.String()
|
||||
if got != want {
|
||||
t.Errorf("ParseDigest(%q).String() = %q; want %q", s, got, want)
|
||||
}
|
||||
|
||||
got = ParseDigest(s).String()
|
||||
if got != want {
|
||||
t.Errorf("roundtrip ParseDigest(%q).String() = %q; want %q", s, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,299 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type File struct {
|
||||
Commands []Command
|
||||
}
|
||||
|
||||
func (f File) String() string {
|
||||
var sb strings.Builder
|
||||
for _, cmd := range f.Commands {
|
||||
fmt.Fprintln(&sb, cmd.String())
|
||||
}
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
type Command struct {
|
||||
Name string
|
||||
Args string
|
||||
}
|
||||
|
||||
func (c Command) String() string {
|
||||
var sb strings.Builder
|
||||
switch c.Name {
|
||||
case "model":
|
||||
fmt.Fprintf(&sb, "FROM %s", c.Args)
|
||||
case "license", "template", "system", "adapter":
|
||||
fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
|
||||
case "message":
|
||||
role, message, _ := strings.Cut(c.Args, ": ")
|
||||
fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
|
||||
default:
|
||||
fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
|
||||
}
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
type state int
|
||||
|
||||
const (
|
||||
stateNil state = iota
|
||||
stateName
|
||||
stateValue
|
||||
stateParameter
|
||||
stateMessage
|
||||
stateComment
|
||||
)
|
||||
|
||||
var (
|
||||
errMissingFrom = errors.New("no FROM line")
|
||||
errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
|
||||
errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
|
||||
)
|
||||
|
||||
func ParseFile(r io.Reader) (*File, error) {
|
||||
var cmd Command
|
||||
var curr state
|
||||
var b bytes.Buffer
|
||||
var role string
|
||||
|
||||
var f File
|
||||
|
||||
br := bufio.NewReader(r)
|
||||
for {
|
||||
r, _, err := br.ReadRune()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
next, r, err := parseRuneForState(r, curr)
|
||||
if errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
return nil, fmt.Errorf("%w: %s", err, b.String())
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// process the state transition, some transitions need to be intercepted and redirected
|
||||
if next != curr {
|
||||
switch curr {
|
||||
case stateName:
|
||||
if !isValidCommand(b.String()) {
|
||||
return nil, errInvalidCommand
|
||||
}
|
||||
|
||||
// next state sometimes depends on the current buffer value
|
||||
switch s := strings.ToLower(b.String()); s {
|
||||
case "from":
|
||||
cmd.Name = "model"
|
||||
case "parameter":
|
||||
// transition to stateParameter which sets command name
|
||||
next = stateParameter
|
||||
case "message":
|
||||
// transition to stateMessage which validates the message role
|
||||
next = stateMessage
|
||||
fallthrough
|
||||
default:
|
||||
cmd.Name = s
|
||||
}
|
||||
case stateParameter:
|
||||
cmd.Name = b.String()
|
||||
case stateMessage:
|
||||
if !isValidMessageRole(b.String()) {
|
||||
return nil, errInvalidMessageRole
|
||||
}
|
||||
|
||||
role = b.String()
|
||||
case stateComment, stateNil:
|
||||
// pass
|
||||
case stateValue:
|
||||
s, ok := unquote(b.String())
|
||||
if !ok || isSpace(r) {
|
||||
if _, err := b.WriteRune(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if role != "" {
|
||||
s = role + ": " + s
|
||||
role = ""
|
||||
}
|
||||
|
||||
cmd.Args = s
|
||||
f.Commands = append(f.Commands, cmd)
|
||||
}
|
||||
|
||||
b.Reset()
|
||||
curr = next
|
||||
}
|
||||
|
||||
if strconv.IsPrint(r) {
|
||||
if _, err := b.WriteRune(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// flush the buffer
|
||||
switch curr {
|
||||
case stateComment, stateNil:
|
||||
// pass; nothing to flush
|
||||
case stateValue:
|
||||
s, ok := unquote(b.String())
|
||||
if !ok {
|
||||
return nil, io.ErrUnexpectedEOF
|
||||
}
|
||||
|
||||
if role != "" {
|
||||
s = role + ": " + s
|
||||
}
|
||||
|
||||
cmd.Args = s
|
||||
f.Commands = append(f.Commands, cmd)
|
||||
default:
|
||||
return nil, io.ErrUnexpectedEOF
|
||||
}
|
||||
|
||||
for _, cmd := range f.Commands {
|
||||
if cmd.Name == "model" {
|
||||
return &f, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errMissingFrom
|
||||
}
|
||||
|
||||
func parseRuneForState(r rune, cs state) (state, rune, error) {
|
||||
switch cs {
|
||||
case stateNil:
|
||||
switch {
|
||||
case r == '#':
|
||||
return stateComment, 0, nil
|
||||
case isSpace(r), isNewline(r):
|
||||
return stateNil, 0, nil
|
||||
default:
|
||||
return stateName, r, nil
|
||||
}
|
||||
case stateName:
|
||||
switch {
|
||||
case isAlpha(r):
|
||||
return stateName, r, nil
|
||||
case isSpace(r):
|
||||
return stateValue, 0, nil
|
||||
default:
|
||||
return stateNil, 0, errInvalidCommand
|
||||
}
|
||||
case stateValue:
|
||||
switch {
|
||||
case isNewline(r):
|
||||
return stateNil, r, nil
|
||||
case isSpace(r):
|
||||
return stateNil, r, nil
|
||||
default:
|
||||
return stateValue, r, nil
|
||||
}
|
||||
case stateParameter:
|
||||
switch {
|
||||
case isAlpha(r), isNumber(r), r == '_':
|
||||
return stateParameter, r, nil
|
||||
case isSpace(r):
|
||||
return stateValue, 0, nil
|
||||
default:
|
||||
return stateNil, 0, io.ErrUnexpectedEOF
|
||||
}
|
||||
case stateMessage:
|
||||
switch {
|
||||
case isAlpha(r):
|
||||
return stateMessage, r, nil
|
||||
case isSpace(r):
|
||||
return stateValue, 0, nil
|
||||
default:
|
||||
return stateNil, 0, io.ErrUnexpectedEOF
|
||||
}
|
||||
case stateComment:
|
||||
switch {
|
||||
case isNewline(r):
|
||||
return stateNil, 0, nil
|
||||
default:
|
||||
return stateComment, 0, nil
|
||||
}
|
||||
default:
|
||||
return stateNil, 0, errors.New("")
|
||||
}
|
||||
}
|
||||
|
||||
func quote(s string) string {
|
||||
if strings.Contains(s, "\n") || strings.HasPrefix(s, " ") || strings.HasSuffix(s, " ") {
|
||||
if strings.Contains(s, "\"") {
|
||||
return `"""` + s + `"""`
|
||||
}
|
||||
|
||||
return `"` + s + `"`
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
func unquote(s string) (string, bool) {
|
||||
// TODO: single quotes
|
||||
if len(s) >= 3 && s[:3] == `"""` {
|
||||
if len(s) >= 6 && s[len(s)-3:] == `"""` {
|
||||
return s[3 : len(s)-3], true
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
if len(s) >= 1 && s[0] == '"' {
|
||||
if len(s) >= 2 && s[len(s)-1] == '"' {
|
||||
return s[1 : len(s)-1], true
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
return s, true
|
||||
}
|
||||
|
||||
func isAlpha(r rune) bool {
|
||||
return r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z'
|
||||
}
|
||||
|
||||
func isNumber(r rune) bool {
|
||||
return r >= '0' && r <= '9'
|
||||
}
|
||||
|
||||
func isSpace(r rune) bool {
|
||||
return r == ' ' || r == '\t'
|
||||
}
|
||||
|
||||
func isNewline(r rune) bool {
|
||||
return r == '\r' || r == '\n'
|
||||
}
|
||||
|
||||
func isValidMessageRole(role string) bool {
|
||||
return role == "system" || role == "user" || role == "assistant"
|
||||
}
|
||||
|
||||
func isValidCommand(cmd string) bool {
|
||||
switch strings.ToLower(cmd) {
|
||||
case "from", "license", "template", "system", "adapter", "parameter", "message":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
@@ -1,511 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestParseFileFile(t *testing.T) {
|
||||
input := `
|
||||
FROM model1
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
modelfile, err := ParseFile(reader)
|
||||
assert.NoError(t, err)
|
||||
|
||||
expectedCommands := []Command{
|
||||
{Name: "model", Args: "model1"},
|
||||
{Name: "adapter", Args: "adapter1"},
|
||||
{Name: "license", Args: "MIT"},
|
||||
{Name: "param1", Args: "value1"},
|
||||
{Name: "param2", Args: "value2"},
|
||||
{Name: "template", Args: "template1"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, modelfile.Commands)
|
||||
}
|
||||
|
||||
func TestParseFileFrom(t *testing.T) {
|
||||
var cases = []struct {
|
||||
input string
|
||||
expected []Command
|
||||
err error
|
||||
}{
|
||||
{
|
||||
"FROM foo",
|
||||
[]Command{{Name: "model", Args: "foo"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM /path/to/model",
|
||||
[]Command{{Name: "model", Args: "/path/to/model"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM /path/to/model/fp16.bin",
|
||||
[]Command{{Name: "model", Args: "/path/to/model/fp16.bin"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM llama3:latest",
|
||||
[]Command{{Name: "model", Args: "llama3:latest"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"FROM llama3:7b-instruct-q4_K_M",
|
||||
[]Command{{Name: "model", Args: "llama3:7b-instruct-q4_K_M"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"", nil, errMissingFrom,
|
||||
},
|
||||
{
|
||||
"PARAMETER param1 value1",
|
||||
nil,
|
||||
errMissingFrom,
|
||||
},
|
||||
{
|
||||
"PARAMETER param1 value1\nFROM foo",
|
||||
[]Command{{Name: "param1", Args: "value1"}, {Name: "model", Args: "foo"}},
|
||||
nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||
assert.ErrorIs(t, err, c.err)
|
||||
if modelfile != nil {
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileParametersMissingValue(t *testing.T) {
|
||||
input := `
|
||||
FROM foo
|
||||
PARAMETER param1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := ParseFile(reader)
|
||||
assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
|
||||
}
|
||||
|
||||
func TestParseFileBadCommand(t *testing.T) {
|
||||
input := `
|
||||
FROM foo
|
||||
BADCOMMAND param1 value1
|
||||
`
|
||||
_, err := ParseFile(strings.NewReader(input))
|
||||
assert.ErrorIs(t, err, errInvalidCommand)
|
||||
|
||||
}
|
||||
|
||||
func TestParseFileMessages(t *testing.T) {
|
||||
var cases = []struct {
|
||||
input string
|
||||
expected []Command
|
||||
err error
|
||||
}{
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system You are a file parser. Always parse things.
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a file parser. Always parse things."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system You are a file parser. Always parse things.`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a file parser. Always parse things."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system You are a file parser. Always parse things.
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: You are a file parser. Always parse things."},
|
||||
{Name: "message", Args: "user: Hey there!"},
|
||||
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system """
|
||||
You are a multiline file parser. Always parse things.
|
||||
"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "message", Args: "system: \nYou are a multiline file parser. Always parse things.\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE badguy I'm a bad guy!
|
||||
`,
|
||||
nil,
|
||||
errInvalidMessageRole,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system
|
||||
`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
MESSAGE system`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||
assert.ErrorIs(t, err, c.err)
|
||||
if modelfile != nil {
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileQuoted(t *testing.T) {
|
||||
var cases = []struct {
|
||||
multiline string
|
||||
expected []Command
|
||||
err error
|
||||
}{
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """
|
||||
This is a
|
||||
multiline system.
|
||||
"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "\nThis is a\nmultiline system.\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """
|
||||
This is a
|
||||
multiline system."""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "\nThis is a\nmultiline system."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """This is a
|
||||
multiline system."""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "This is a\nmultiline system."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """This is a multiline system."""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "This is a multiline system."},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """This is a multiline system.""
|
||||
`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM "
|
||||
`,
|
||||
nil,
|
||||
io.ErrUnexpectedEOF,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """
|
||||
This is a multiline system with "quotes".
|
||||
"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "\nThis is a multiline system with \"quotes\".\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: ""},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM ""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: ""},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM "'"
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: "'"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM """''"'""'""'"'''''""'""'"""
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "system", Args: `''"'""'""'"'''''""'""'`},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
`
|
||||
FROM foo
|
||||
TEMPLATE """
|
||||
{{ .Prompt }}
|
||||
"""`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: "template", Args: "\n{{ .Prompt }}\n"},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.multiline))
|
||||
assert.ErrorIs(t, err, c.err)
|
||||
if modelfile != nil {
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileParameters(t *testing.T) {
|
||||
var cases = map[string]struct {
|
||||
name, value string
|
||||
}{
|
||||
"numa true": {"numa", "true"},
|
||||
"num_ctx 1": {"num_ctx", "1"},
|
||||
"num_batch 1": {"num_batch", "1"},
|
||||
"num_gqa 1": {"num_gqa", "1"},
|
||||
"num_gpu 1": {"num_gpu", "1"},
|
||||
"main_gpu 1": {"main_gpu", "1"},
|
||||
"low_vram true": {"low_vram", "true"},
|
||||
"f16_kv true": {"f16_kv", "true"},
|
||||
"logits_all true": {"logits_all", "true"},
|
||||
"vocab_only true": {"vocab_only", "true"},
|
||||
"use_mmap true": {"use_mmap", "true"},
|
||||
"use_mlock true": {"use_mlock", "true"},
|
||||
"num_thread 1": {"num_thread", "1"},
|
||||
"num_keep 1": {"num_keep", "1"},
|
||||
"seed 1": {"seed", "1"},
|
||||
"num_predict 1": {"num_predict", "1"},
|
||||
"top_k 1": {"top_k", "1"},
|
||||
"top_p 1.0": {"top_p", "1.0"},
|
||||
"tfs_z 1.0": {"tfs_z", "1.0"},
|
||||
"typical_p 1.0": {"typical_p", "1.0"},
|
||||
"repeat_last_n 1": {"repeat_last_n", "1"},
|
||||
"temperature 1.0": {"temperature", "1.0"},
|
||||
"repeat_penalty 1.0": {"repeat_penalty", "1.0"},
|
||||
"presence_penalty 1.0": {"presence_penalty", "1.0"},
|
||||
"frequency_penalty 1.0": {"frequency_penalty", "1.0"},
|
||||
"mirostat 1": {"mirostat", "1"},
|
||||
"mirostat_tau 1.0": {"mirostat_tau", "1.0"},
|
||||
"mirostat_eta 1.0": {"mirostat_eta", "1.0"},
|
||||
"penalize_newline true": {"penalize_newline", "true"},
|
||||
"stop ### User:": {"stop", "### User:"},
|
||||
"stop ### User: ": {"stop", "### User: "},
|
||||
"stop \"### User:\"": {"stop", "### User:"},
|
||||
"stop \"### User: \"": {"stop", "### User: "},
|
||||
"stop \"\"\"### User:\"\"\"": {"stop", "### User:"},
|
||||
"stop \"\"\"### User:\n\"\"\"": {"stop", "### User:\n"},
|
||||
"stop <|endoftext|>": {"stop", "<|endoftext|>"},
|
||||
"stop <|eot_id|>": {"stop", "<|eot_id|>"},
|
||||
"stop </s>": {"stop", "</s>"},
|
||||
}
|
||||
|
||||
for k, v := range cases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
fmt.Fprintln(&b, "FROM foo")
|
||||
fmt.Fprintln(&b, "PARAMETER", k)
|
||||
modelfile, err := ParseFile(&b)
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, []Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
{Name: v.name, Args: v.value},
|
||||
}, modelfile.Commands)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileComments(t *testing.T) {
|
||||
var cases = []struct {
|
||||
input string
|
||||
expected []Command
|
||||
}{
|
||||
{
|
||||
`
|
||||
# comment
|
||||
FROM foo
|
||||
`,
|
||||
[]Command{
|
||||
{Name: "model", Args: "foo"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, c.expected, modelfile.Commands)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFileFormatParseFile(t *testing.T) {
|
||||
var cases = []string{
|
||||
`
|
||||
FROM foo
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
MESSAGE system You are a file parser. Always parse things.
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
`
|
||||
FROM foo
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
MESSAGE system """
|
||||
You are a store greeter. Always responsed with "Hello!".
|
||||
"""
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
`
|
||||
FROM foo
|
||||
ADAPTER adapter1
|
||||
LICENSE """
|
||||
Very long and boring legal text.
|
||||
Blah blah blah.
|
||||
"Oh look, a quote!"
|
||||
"""
|
||||
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
MESSAGE system """
|
||||
You are a store greeter. Always responsed with "Hello!".
|
||||
"""
|
||||
MESSAGE user Hey there!
|
||||
MESSAGE assistant Hello, I want to parse all the things!
|
||||
`,
|
||||
`
|
||||
FROM foo
|
||||
SYSTEM ""
|
||||
`,
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run("", func(t *testing.T) {
|
||||
modelfile, err := ParseFile(strings.NewReader(c))
|
||||
assert.NoError(t, err)
|
||||
|
||||
modelfile2, err := ParseFile(strings.NewReader(modelfile.String()))
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, modelfile, modelfile2)
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,2 +1,2 @@
|
||||
go test fuzz v1
|
||||
string("00@")
|
||||
string("/0")
|
||||
2
types/model/testdata/fuzz/FuzzParseRef/27fd759314f0e6d6
vendored
Normal file
2
types/model/testdata/fuzz/FuzzParseRef/27fd759314f0e6d6
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
go test fuzz v1
|
||||
string("0//0")
|
||||
2
types/model/testdata/fuzz/FuzzParseRef/3e3b70dba384074d
vendored
Normal file
2
types/model/testdata/fuzz/FuzzParseRef/3e3b70dba384074d
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
go test fuzz v1
|
||||
string("0 /0")
|
||||
2
types/model/testdata/fuzz/FuzzParseRef/71f1fdff711b6dab
vendored
Normal file
2
types/model/testdata/fuzz/FuzzParseRef/71f1fdff711b6dab
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
go test fuzz v1
|
||||
string("+0/00000")
|
||||
2
types/model/testdata/fuzz/FuzzParseRef/82c2975c430ac608
vendored
Normal file
2
types/model/testdata/fuzz/FuzzParseRef/82c2975c430ac608
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
go test fuzz v1
|
||||
string(":")
|
||||
2
types/model/testdata/fuzz/FuzzParseRef/b51b1c875e61a948
vendored
Normal file
2
types/model/testdata/fuzz/FuzzParseRef/b51b1c875e61a948
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
go test fuzz v1
|
||||
string("0+.\xf2\x80\xf6\x9d00000\xe5\x99\xe6\xd900\xd90\xa60\x91\xdc0\xff\xbf\x99\xe800\xb9\xdc\xd6\xc300\x970\xfb\xfd0\xe0\x8a\xe1\xad\xd40\x9700\xa80\x980\xdd0000\xb00\x91000\xfe0\x89\x9b\x90\x93\x9f0\xe60\xf7\x84\xb0\x87\xa5\xff0\xa000\x9a\x85\xf6\x85\xfe\xa9\xf9\xe9\xde00\xf4\xe0\x8f\x81\xad\xde00\xd700\xaa\xe000000\xb1\xee0\x91")
|
||||
15
types/structs/structs.go
Normal file
15
types/structs/structs.go
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright (c) Tailscale Inc & AUTHORS
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
// Package structs contains the Incomparable type.
|
||||
package structs
|
||||
|
||||
// Incomparable is a zero-width incomparable type. If added as the
|
||||
// first field in a struct, it marks that struct as not comparable
|
||||
// (can't do == or be a map key) and usually doesn't add any width to
|
||||
// the struct (unless the struct has only small fields).
|
||||
//
|
||||
// By making a struct incomparable, you can prevent misuse (prevent
|
||||
// people from using ==), but also you can shrink generated binaries,
|
||||
// as the compiler can omit equality funcs from the binary.
|
||||
type Incomparable [0]func()
|
||||
Reference in New Issue
Block a user