Compare commits

..

1 Commits

Author SHA1 Message Date
jmorganca
9b5b69c00f llm: update llama.cpp submodule to 7c26775 2024-06-17 13:46:02 -04:00
58 changed files with 850 additions and 1930 deletions

View File

@@ -437,7 +437,6 @@ jobs:
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -461,20 +460,15 @@ jobs:
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- name: Create or update Release
run: |
echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
if [ -n "$OLD_TAG" ]; then
echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
else
echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
gh release create ${GITHUB_REF_NAME} \
--title ${{ env.RELEASE_VERSION }} \
--draft \
--generate-notes \
--prerelease
fi
echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
gh release upload ${GITHUB_REF_NAME} dist/* --clobber
- uses: ncipollo/release-action@v1
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: 'dist/*'
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true

View File

@@ -73,12 +73,12 @@ jobs:
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
$env:GOARCH=""; $env:OLLAMA_BUILD_TARGET_ARCH="${{ matrix.arch }}"; go generate -x ./...
go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
name: 'Windows Generate'
- run: GOARCH= OLLAMA_BUILD_TARGET_ARCH=${{ matrix.arch }} go generate -x ./...
name: 'Windows Go Generate'
- run: go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: 'Unix Generate'
name: 'Unix Go Generate'
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
@@ -184,7 +184,7 @@ jobs:
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./...
name: go generate -x ./...
name: go generate
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
# TODO - do we need any artifacts?
@@ -217,7 +217,7 @@ jobs:
- name: 'Verify CUDA'
run: nvcc -V
- run: go get ./...
- name: go generate -x ./...
- name: go generate
run: |
$gopath=(get-command go).source | split-path -parent
$cudabin=(get-command nvcc).source | split-path
@@ -312,10 +312,7 @@ jobs:
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'macos-') }}
shell: bash
- run: $env:GOARCH=""; $env:OLLAMA_BUILD_TARGET_ARCH="${{ matrix.arch }}"; go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
- run: GOARCH= OLLAMA_BUILD_TARGET_ARCH=${{ matrix.arch }} go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
- run: go generate ./...
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4

View File

@@ -1,12 +1,12 @@
<div align="center">
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
 <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>
# Ollama
[![Discord](https://dcbadge.vercel.app/api/server/ollama?style=flat&compact=true)](https://discord.gg/ollama)
Get up and running with large language models locally.
Get up and running with large language models.
### macOS
@@ -51,15 +51,17 @@ Here are some example models that can be downloaded:
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi-3 | 3,8B | 2.3GB | `ollama run phi3` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Solar | 10.7B | 6.1GB | `ollama run solar` |
> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -173,7 +175,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
The image features a yellow smiley face, which is likely the central focus of the picture.
```
### Pass in prompt as arguments
### Pass the prompt as an argument
```
$ ollama run llama3 "Summarize this file: $(cat README.md)"
@@ -192,19 +194,7 @@ ollama list
## Building
Install `cmake` and `go`:
```
brew install cmake go
```
Then build the binary:
```
go run build.go
```
More detailed instructions can be found in the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
See the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
### Running local builds
@@ -252,6 +242,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Open WebUI](https://github.com/open-webui/open-webui)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
- [Hollama](https://github.com/fmaclen/hollama)
- [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
@@ -278,17 +269,23 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
- [chat: chat web app for teams](https://github.com/swuecho/chat)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
- [StreamDeploy](https://github.com/StreamDeploy-DevRel/streamdeploy-llm-app-scaffold) (LLM Application Scaffold)
- [chat](https://github.com/swuecho/chat) (chat web app for teams)
- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
### Terminal
@@ -311,6 +308,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ShellOracle](https://github.com/djcopley/ShellOracle)
- [tlm](https://github.com/yusufcanb/tlm)
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
- [gollama](https://github.com/sammcj/gollama)
### Database
@@ -321,17 +319,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
@@ -342,9 +343,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Elixir LangChain](https://github.com/brainlid/langchain)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama for R - ollama-r](https://github.com/hauselin/ollama-r)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
- [LlamaScript](https://github.com/Project-Llama/llamascript)
### Mobile
@@ -364,18 +369,23 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -159,49 +159,18 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap TriState `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
// TriState is a boolean setting that can also be left unset, so callers can
// tell "explicitly false" apart from "not specified".
type TriState int

const (
	// TriStateUndefined means no value was provided.
	TriStateUndefined TriState = -1
	// TriStateFalse means the value was explicitly set to false.
	TriStateFalse TriState = 0
	// TriStateTrue means the value was explicitly set to true.
	TriStateTrue TriState = 1
)

// UnmarshalJSON decodes a JSON boolean into b: true becomes TriStateTrue and
// false becomes TriStateFalse. If data is not a valid JSON boolean the decode
// error is returned and b is left unchanged.
func (b *TriState) UnmarshalJSON(data []byte) error {
	var v bool
	if err := json.Unmarshal(data, &v); err != nil {
		return err
	}
	// The two assignments must be mutually exclusive; an unconditional
	// assignment after the if would overwrite a decoded true with false.
	if v {
		*b = TriStateTrue
	} else {
		*b = TriStateFalse
	}
	return nil
}
func (b *TriState) MarshalJSON() ([]byte, error) {
if *b == TriStateUndefined {
return nil, nil
}
var v bool
if *b == TriStateTrue {
v = true
}
return json.Marshal(v)
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -253,7 +222,6 @@ type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
Verbose bool `json:"verbose"`
Options map[string]interface{} `json:"options"`
@@ -263,16 +231,14 @@ type ShowRequest struct {
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
ModelInfo map[string]any `json:"model_info,omitempty"`
ProjectorInfo map[string]any `json:"projector_info,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
@@ -437,19 +403,6 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
continue
}
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
if val {
field.SetInt(int64(TriStateTrue))
} else {
field.SetInt(int64(TriStateFalse))
}
continue
}
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
@@ -538,7 +491,7 @@ func DefaultOptions() Options {
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: TriStateUndefined,
UseMMap: true,
UseNUMA: false,
},
}
@@ -608,19 +561,6 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
if boolVal {
out[key] = TriStateTrue
} else {
out[key] = TriStateFalse
}
continue
}
switch field.Kind() {
case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32)

View File

@@ -2,7 +2,6 @@ package api
import (
"encoding/json"
"fmt"
"math"
"testing"
"time"
@@ -106,101 +105,3 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
})
}
}
// TestUseMmapParsingFromJSON decodes request bodies into a generic map, applies
// them on top of DefaultOptions via FromMap, and checks the resulting UseMMap
// value for the absent, true and false cases.
func TestUseMmapParsingFromJSON(t *testing.T) {
	cases := []struct {
		name string
		req  string
		exp  TriState
	}{
		{name: "Undefined", req: `{ }`, exp: TriStateUndefined},
		{name: "True", req: `{ "use_mmap": true }`, exp: TriStateTrue},
		{name: "False", req: `{ "use_mmap": false }`, exp: TriStateFalse},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			var raw map[string]interface{}
			require.NoError(t, json.Unmarshal([]byte(tc.req), &raw))

			// Start from the defaults so an absent key keeps the default value.
			opts := DefaultOptions()
			require.NoError(t, opts.FromMap(raw))

			assert.Equal(t, tc.exp, opts.UseMMap)
		})
	}
}
// TestUseMmapFormatParams feeds string-valued "use_mmap" parameters through
// FormatParams and checks both the returned error and, when a value is
// expected, the TriState stored under the "use_mmap" key.
func TestUseMmapFormatParams(t *testing.T) {
	tests := []struct {
		name string
		req  map[string][]string
		exp  TriState
		err  error
	}{
		{
			name: "True",
			req:  map[string][]string{"use_mmap": {"true"}},
			exp:  TriStateTrue,
			err:  nil,
		},
		{
			name: "False",
			req:  map[string][]string{"use_mmap": {"false"}},
			exp:  TriStateFalse,
			err:  nil,
		},
		{
			name: "Numeric True",
			req:  map[string][]string{"use_mmap": {"1"}},
			exp:  TriStateTrue,
			err:  nil,
		},
		{
			name: "Numeric False",
			req:  map[string][]string{"use_mmap": {"0"}},
			exp:  TriStateFalse,
			err:  nil,
		},
		{
			name: "invalid string",
			req:  map[string][]string{"use_mmap": {"foo"}},
			exp:  TriStateUndefined,
			err:  fmt.Errorf("invalid bool value [foo]"),
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			resp, err := FormatParams(test.req)
			// Expected value goes first so failure output labels the two
			// sides correctly.
			require.Equal(t, test.err, err)
			respVal, ok := resp["use_mmap"]
			// TriStateUndefined marks cases where no value is expected back.
			if test.exp != TriStateUndefined {
				assert.True(t, ok, "resp: %v", resp)
				assert.Equal(t, test.exp, respVal)
			}
		})
	}
}

View File

@@ -5,8 +5,6 @@ import (
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
)
@@ -26,7 +24,6 @@ func InitLogging() {
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -49,32 +46,3 @@ func InitLogging() {
slog.Info("ollama app started")
}
// rotateLogs shifts existing copies of logFile up by one generation so that a
// fresh log can be started at logFile. The current file becomes "<base>-1<ext>",
// "<base>-1<ext>" becomes "<base>-2<ext>", and so on up to LogRotationCount;
// a copy already occupying the highest generation is removed. It is a no-op
// when logFile does not exist. Failures to remove or rename are logged as
// warnings and do not stop the remaining generations from being processed.
func rotateLogs(logFile string) {
	if _, err := os.Stat(logFile); os.IsNotExist(err) {
		return
	}

	// Split on the extension of the final path element only. Searching the
	// whole path for "." would panic on a path containing no dot and would
	// split inside a directory name such as "/var/app.d/server".
	post := filepath.Ext(logFile)
	pre := strings.TrimSuffix(logFile, post)

	// Walk from the oldest generation down so each rename target is free.
	for i := LogRotationCount; i > 0; i-- {
		older := pre + "-" + strconv.Itoa(i) + post
		newer := pre + "-" + strconv.Itoa(i-1) + post
		if i == 1 {
			// Generation zero is the live log file itself.
			newer = pre + post
		}

		if _, err := os.Stat(newer); err != nil {
			continue
		}
		if _, err := os.Stat(older); err == nil {
			if err := os.Remove(older); err != nil {
				slog.Warn("Failed to remove older log", "older", older, "error", err)
				continue
			}
		}
		if err := os.Rename(newer, older); err != nil {
			slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
		}
	}
}

View File

@@ -1,44 +0,0 @@
package lifecycle
import (
"os"
"path/filepath"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestRotateLogs drives rotateLogs through an empty directory, a first
// rotation, a repeated rotation with no new log, and then enough further
// rotations to exceed LogRotationCount, checking which files exist each time.
func TestRotateLogs(t *testing.T) {
	dir := t.TempDir()
	active := filepath.Join(dir, "testlog.log")
	// generation returns the path of the n-th rotated copy.
	generation := func(n int) string {
		return filepath.Join(dir, "testlog-"+strconv.Itoa(n)+".log")
	}

	// Nothing to rotate yet.
	rotateLogs(active)

	require.NoError(t, os.WriteFile(active, []byte("1"), 0644))
	assert.FileExists(t, active)

	// The first rotation moves the live log to generation 1.
	rotateLogs(active)
	assert.FileExists(t, generation(1))
	assert.NoFileExists(t, generation(2))
	assert.NoFileExists(t, active)

	// With no new live log, rotating again changes nothing.
	rotateLogs(active)
	assert.FileExists(t, generation(1))
	assert.NoFileExists(t, generation(2))
	assert.NoFileExists(t, active)

	for round := 2; round <= LogRotationCount+1; round++ {
		require.NoError(t, os.WriteFile(active, []byte(strconv.Itoa(round)), 0644))
		assert.FileExists(t, active)

		rotateLogs(active)
		assert.NoFileExists(t, active)
		for gen := 1; gen < round; gen++ {
			assert.FileExists(t, generation(gen))
		}
		assert.NoFileExists(t, generation(round+1))
	}
}

View File

@@ -16,12 +16,11 @@ var (
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
LogRotationCount = 5
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
)
func init() {

View File

@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
}
rotateLogs(ServerLogFile)
// TODO - rotation
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)

View File

@@ -88,15 +88,10 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\cuda")
Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\oneapi")
Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif

199
build.go
View File

@@ -1,199 +0,0 @@
//go:build ignore
package main
import (
"cmp"
"errors"
"flag"
"log"
"os"
"os/exec"
"path/filepath"
"runtime"
)
// Command-line flags controlling dependency generation and the Go build.
var (
flagRegenerateDestroy = flag.Bool("d", false, "force regenerate the dependencies (destructive)")
flagRegenerateGently = flag.Bool("g", false, "regenerate the dependencies (non-destructive)")
flagSkipBuild = flag.Bool("s", false, "generate dependencies only (e.g. skip 'go build .')")
// flagGOARCH sets GOARCH explicitly for cross-platform builds,
// e.g., in CI to target a different platform than the build matrix
// default. This allows us to run generate without a separate build
// step for building the script binary for the host ARCH and then
// running the generate script for the target ARCH. Instead, we can
// just run `go run build.go -target=$GOARCH` to generate the
// deps.
flagGOARCH = flag.String("target", "", "sets GOARCH to use when generating dependencies and building")
)
// buildEnv returns the current process environment with a GOARCH entry
// appended. The architecture is the first non-empty value among the -target
// flag, the OLLAMA_BUILD_TARGET_ARCH environment variable, and the GOARCH this
// program was compiled for.
func buildEnv() []string {
	env := os.Environ()
	goarch := cmp.Or(*flagGOARCH, os.Getenv("OLLAMA_BUILD_TARGET_ARCH"), runtime.GOARCH)
	return append(env, "GOARCH="+goarch)
}
// main parses the command-line flags, verifies that it is running from the
// project root and that the required tools respond, generates the llama.cpp
// dependencies, and finally builds the ollama binary. Any failing step ends
// the program through log.Fatalf.
func main() {
// Drop the timestamp prefix so output reads like plain script output.
log.SetFlags(0)
// Usage prints the help text and then exits with status 1.
flag.Usage = func() {
log.Printf("Usage: go run build.go [flags]")
log.Println()
log.Println("Flags:")
flag.PrintDefaults()
log.Println()
log.Println("This script builds the Ollama server binary and generates the llama.cpp")
log.Println("bindings for the current platform. It assumes that the current working")
log.Println("directory is the root directory of the Ollama project.")
log.Println()
log.Println("If the -d flag is provided, the script will force regeneration of the")
log.Println("dependencies; removing the 'llm/build' directory before starting.")
log.Println()
log.Println("If the -g flag is provided, the script will regenerate the dependencies")
log.Println("without removing the 'llm/build' directory.")
log.Println()
log.Println("If the -s flag is provided, the script will skip building the Ollama binary")
log.Println()
log.Println("If the -target flag is provided, the script will set GOARCH to the value")
log.Println("of the flag. This is useful for cross-platform builds.")
log.Println()
log.Println("The script will check for the required dependencies (cmake, gcc) and")
log.Println("print their version.")
log.Println()
log.Println("The script will also check if it is being run from the root directory of")
log.Println("the Ollama project.")
log.Println()
os.Exit(1)
}
flag.Parse()
log.Printf("=== Building Ollama ===")
// The closing banner only appears when main returns normally: the
// log.Fatalf calls below and the os.Exit in flag.Usage end the process
// without running deferred functions.
defer func() {
log.Printf("=== Done building Ollama ===")
if !*flagSkipBuild {
log.Println()
log.Println("To run the Ollama server, use:")
log.Println()
log.Println(" ./ollama serve")
log.Println()
}
}()
// Positional arguments are not accepted; show usage (which exits).
if flag.NArg() > 0 {
flag.Usage()
}
if !inRootDir() {
log.Fatalf("Please run this script from the root directory of the Ollama project.")
}
if err := checkDependencies(); err != nil {
log.Fatalf("Failed dependency check: %v", err)
}
if err := buildLlammaCPP(); err != nil {
log.Fatalf("Failed to build llama.cpp: %v", err)
}
if err := goBuildOllama(); err != nil {
log.Fatalf("Failed to build ollama Go binary: %v", err)
}
}
// checkDependencies runs "cmake --version" and "gcc --version", streaming their
// output to this process's stdout and stderr, as a quick probe that both tools
// are installed and able to start. Every tool is tried even if an earlier one
// fails; the failures are combined into the returned error, which is nil when
// all commands succeed.
//
// TODO(bmizerany): Check the actual version of the dependencies? Seems a
// little daunting given diff versions might print diff things. This should
// be good enough for now.
func checkDependencies() error {
	tools := []struct {
		name string
		args []string
	}{
		{name: "cmake", args: []string{"--version"}},
		{name: "gcc", args: []string{"--version"}},
	}

	var failures error
	for _, tool := range tools {
		log.Printf("=== Checking for %s ===", tool.name)

		probe := exec.Command(tool.name, tool.args...)
		probe.Stdout = os.Stdout
		probe.Stderr = os.Stderr
		failures = errors.Join(failures, probe.Run())

		log.Printf("=== Done checking for %s ===\n\n", tool.name)
	}
	return failures
}
// goBuildOllama runs "go build -o ollama ." with the environment from buildEnv,
// forwarding the compiler's output to this process. When the -s flag is set it
// only logs that the build is being skipped and returns nil.
func goBuildOllama() error {
	log.Println("=== Building Ollama binary ===")
	defer log.Printf("=== Done building Ollama binary ===\n\n")

	if skip := *flagSkipBuild; skip {
		log.Println("Skipping 'go build -o ollama .'")
		return nil
	}

	build := exec.Command("go", "build", "-o", "ollama", ".")
	build.Stdout, build.Stderr = os.Stdout, os.Stderr
	build.Env = buildEnv()
	return build.Run()
}
// buildLlammaCPP generates the llama.cpp bindings for the current platform by
// running the platform's generate script from llm/generate.
//
// It assumes that the current working directory is the root directory of the
// Ollama project. With -d the llm/build directory is removed first. If
// llm/build exists and -g was not given, generation is skipped and nil is
// returned. An unsupported runtime.GOOS is reported as an error rather than
// terminating the process, so the caller decides how to exit.
func buildLlammaCPP() error {
	log.Println("=== Generating dependencies ===")
	defer log.Printf("=== Done generating dependencies ===\n\n")

	buildDir := filepath.Join("llm", "build")
	if *flagRegenerateDestroy {
		if err := os.RemoveAll(buildDir); err != nil {
			return err
		}
	}
	if isDirectory(buildDir) && !*flagRegenerateGently {
		log.Println("llm/build already exists; skipping. Use -d or -g to re-generate.")
		return nil
	}

	// The script path is made absolute because the command runs with its
	// working directory set to llm/generate.
	scriptDir, err := filepath.Abs(filepath.Join("llm", "generate"))
	if err != nil {
		return err
	}

	var cmd *exec.Cmd
	switch runtime.GOOS {
	case "windows":
		script := filepath.Join(scriptDir, "gen_windows.ps1")
		cmd = exec.Command("powershell", "-ExecutionPolicy", "Bypass", "-File", script)
	case "linux":
		script := filepath.Join(scriptDir, "gen_linux.sh")
		cmd = exec.Command("bash", script)
	case "darwin":
		script := filepath.Join(scriptDir, "gen_darwin.sh")
		cmd = exec.Command("bash", script)
	default:
		return errors.New("unsupported OS: " + runtime.GOOS)
	}

	cmd.Dir = filepath.Join("llm", "generate")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.Env = buildEnv()

	// NOTE: this logs the GOARCH the script was compiled for; the GOARCH the
	// child process actually sees is the one appended by buildEnv.
	log.Printf("Running GOOS=%s GOARCH=%s %s", runtime.GOOS, runtime.GOARCH, cmd.Args)
	return cmd.Run()
}
// isDirectory reports whether path can be stat'ed and refers to a directory.
// Any stat error, including the path not existing, yields false.
func isDirectory(path string) bool {
	if fi, err := os.Stat(path); err == nil {
		return fi.IsDir()
	}
	return false
}
// inRootDir reports whether the current working directory looks like the root
// of the Ollama project, which it decides by checking that an entry named
// "go.mod" can be stat'ed there.
func inRootDir() bool {
	if _, err := os.Stat("go.mod"); err != nil {
		return false
	}
	return true
}

View File

@@ -162,6 +162,9 @@ func tempZipFiles(path string) (string, error) {
}
defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
@@ -230,9 +233,6 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...)
}
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
for _, file := range files {
f, err := os.Open(file)
if err != nil {
@@ -287,12 +287,38 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
}
func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
}
format, err := cmd.Flags().GetString("format")
@@ -336,38 +362,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
opts.WordWrap = !nowrap
// Fill out the rest of the options based on information about the
// model.
client, err := api.ClientFromEnvironment()
if err != nil {
return err
if !interactive {
return generate(cmd, opts)
}
name := args[0]
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
var se api.StatusError
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
if err := PullHandler(cmd, []string{name}); err != nil {
return nil, err
}
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
}
return info, err
}()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
return generateInteractive(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
@@ -580,6 +579,10 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
if len(args) != 1 {
return errors.New("missing model name")
}
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -622,6 +625,8 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
} else if flagsSet == 0 {
return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
}
req := api.ShowRequest{Name: args[0]}
@@ -630,141 +635,22 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
if flagsSet == 1 {
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
}
return nil
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
}
showInfo(resp)
return nil
}
// showInfo prints a human-readable summary of a model to stdout.
//
// The summary is a borderless table with a "Model" section (architecture,
// parameter size, quantization, context length and embedding length),
// followed by optional "Projector", "Parameters", "System" and "License"
// sections. Each optional section is emitted only when the corresponding
// part of resp is populated.
//
// Metadata lookups are tolerant: a key that is missing from ModelInfo or
// ProjectorInfo, or whose value has an unexpected type, renders as an empty
// cell instead of panicking on a failed type assertion.
func showInfo(resp *api.ShowResponse) {
	// metaValue formats a metadata entry with %v. For the string and float64
	// values this function reads, that produces the same text as formatting
	// the type-asserted value, but it cannot panic on absent or oddly typed
	// entries.
	metaValue := func(info map[string]interface{}, key string) string {
		v, ok := info[key]
		if !ok || v == nil {
			return ""
		}
		return fmt.Sprintf("%v", v)
	}

	// The architecture name is the prefix of the per-architecture keys below.
	arch, _ := resp.ModelInfo["general.architecture"].(string)

	modelData := [][]string{
		{"arch", arch},
		{"parameters", resp.Details.ParameterSize},
		{"quantization", resp.Details.QuantizationLevel},
		{"context length", metaValue(resp.ModelInfo, fmt.Sprintf("%s.context_length", arch))},
		{"embedding length", metaValue(resp.ModelInfo, fmt.Sprintf("%s.embedding_length", arch))},
	}

	mainTableData := [][]string{
		{"Model"},
		{renderSubTable(modelData, false)},
	}

	if resp.ProjectorInfo != nil {
		// The parameter count is only humanized when it is a float64, the
		// type the original assertion expected.
		paramCount := ""
		if n, ok := resp.ProjectorInfo["general.parameter_count"].(float64); ok {
			paramCount = format.HumanNumber(uint64(n))
		}

		projectorData := [][]string{
			{"arch", "clip"},
			{"parameters", paramCount},
		}

		// The projector type row is shown only when the key is present.
		if _, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
			projectorData = append(projectorData, []string{"projector type", metaValue(resp.ProjectorInfo, "clip.projector_type")})
		}

		projectorData = append(projectorData,
			[]string{"embedding length", metaValue(resp.ProjectorInfo, "clip.vision.embedding_length")},
			[]string{"projection dimensionality", metaValue(resp.ProjectorInfo, "clip.vision.projection_dim")},
		)

		mainTableData = append(mainTableData,
			[]string{"Projector"},
			[]string{renderSubTable(projectorData, false)},
		)
	}

	if resp.Parameters != "" {
		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
	}

	// System prompt and license are abbreviated to their first two non-blank
	// lines and rendered without wrapping.
	if resp.System != "" {
		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
	}

	if resp.License != "" {
		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
	}

	table := tablewriter.NewWriter(os.Stdout)
	table.SetAutoWrapText(false)
	table.SetBorder(false)
	table.SetAlignment(tablewriter.ALIGN_LEFT)

	for _, v := range mainTableData {
		table.Append(v)
	}

	table.Render()
}
// renderSubTable renders data as a borderless, tab-padded, left-aligned
// table and returns the rendered text with every line prefixed by one tab,
// so it can be embedded as a cell of an outer table.
//
// Automatic text wrapping is enabled unless file is true.
func renderSubTable(data [][]string, file bool) string {
	var out bytes.Buffer

	sub := tablewriter.NewWriter(&out)
	sub.SetAutoWrapText(!file)
	sub.SetBorder(false)
	sub.SetNoWhiteSpace(true)
	sub.SetTablePadding("\t")
	sub.SetAlignment(tablewriter.ALIGN_LEFT)

	for i := range data {
		sub.Append(data[i])
	}
	sub.Render()

	// Putting a tab at the start of the text and after every newline indents
	// each line, including the empty segment after a trailing newline.
	return "\t" + strings.ReplaceAll(out.String(), "\n", "\n\t")
}
// twoLines returns at most the first two non-blank lines of s.
//
// Each line is trimmed of surrounding whitespace and wrapped in its own
// single-cell row. Blank lines are skipped and do not count towards the
// limit. The result is never nil; it is empty when s has no non-blank lines.
func twoLines(s string) [][]string {
	const limit = 2

	rows := make([][]string, 0, limit)
	for _, raw := range strings.Split(s, "\n") {
		text := strings.TrimSpace(raw)
		if text == "" {
			continue
		}

		rows = append(rows, []string{text})
		if len(rows) == limit {
			break
		}
	}

	return rows
}
// formatParams renders a newline-separated parameter listing as an indented
// sub-table. Every line of s becomes one row whose cells are the line's
// whitespace-separated fields; wrapping is left enabled for the sub-table.
func formatParams(s string) string {
	entries := strings.Split(s, "\n")

	rows := make([][]string, 0, len(entries))
	for i := range entries {
		rows = append(rows, strings.Fields(entries[i]))
	}

	return renderSubTable(rows, false)
}
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {

View File

@@ -31,40 +31,65 @@ const (
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
client, err := api.ClientFromEnvironment()
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
if err != nil {
return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
chatReq := &api.ChatRequest{
Model: opts.Model,
KeepAlive: opts.KeepAlive,
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
}
if opts.KeepAlive != nil {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
}
return nil
})
if err != nil {
return err
}
return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts)
if err != nil {
return err
@@ -404,7 +429,15 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch args[1] {
case "info":
showInfo(resp)
fmt.Println("Model details:")
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")

View File

@@ -26,7 +26,7 @@ All durations are returned in nanoseconds.
### Streaming responses
Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing `{"stream": false}` for these endpoints.
Certain endpoints stream responses as JSON objects and can optionally return non-streamed responses.
## Generate a completion
@@ -777,12 +777,11 @@ A single JSON object will be returned.
POST /api/show
```
Show information about a model including details, modelfile, template, parameters, license, system prompt.
Show information about a model including details, modelfile, template, parameters, license, and system prompt.
### Parameters
- `name`: name of the model to show
- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -799,40 +798,14 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
"parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
"template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
"template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": [
"llama"
],
"parameter_size": "8.0B",
"families": ["llama", "clip"],
"parameter_size": "7B",
"quantization_level": "Q4_0"
},
"model_info": {
"general.architecture": "llama",
"general.file_type": 2,
"general.parameter_count": 8030261248,
"general.quantization_version": 2,
"llama.attention.head_count": 32,
"llama.attention.head_count_kv": 8,
"llama.attention.layer_norm_rms_epsilon": 0.00001,
"llama.block_count": 32,
"llama.context_length": 8192,
"llama.embedding_length": 4096,
"llama.feed_forward_length": 14336,
"llama.rope.dimension_count": 128,
"llama.rope.freq_base": 500000,
"llama.vocab_size": 128256,
"tokenizer.ggml.bos_token_id": 128000,
"tokenizer.ggml.eos_token_id": 128009,
"tokenizer.ggml.merges": [], // populates if `verbose=true`
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.pre": "llama-bpe",
"tokenizer.ggml.token_type": [], // populates if `verbose=true`
"tokenizer.ggml.tokens": [] // populates if `verbose=true`
}
}
```

View File

@@ -25,7 +25,13 @@ export OLLAMA_DEBUG=1
Get the required libraries and build the native LLM code:
```bash
go run build.go
go generate ./...
```
Then build ollama:
```bash
go build .
```
Now you can run `ollama`:
@@ -34,16 +40,6 @@ Now you can run `ollama`:
./ollama
```
### Rebuilding the native code
If at any point you need to rebuild the native code, you can run the
build.go script again using the `-f` flag to force a rebuild, and,
optionally, the `-d` flag to skip building the Go binary:
```bash
go run build.go -d -s
```
### Linux
#### Linux CUDA (NVIDIA)
@@ -59,10 +55,16 @@ specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go run build.go
go build .
```
#### Linux ROCm (AMD)
@@ -78,17 +80,21 @@ install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
```
go generate ./...
```
Then build the binary:
```
go run build.go
go build .
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Advanced CPU Settings
By default, running `go run build.go` will compile a few different variations
By default, running `go generate ./...` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
@@ -98,7 +104,8 @@ like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go run build.go
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
go build .
```
#### Containerized Linux Build
@@ -122,7 +129,8 @@ Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"
go run build.go
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)

View File

@@ -18,7 +18,7 @@ Check your compute compatibility to see if your card is supported:
| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` |
| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` |
| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` |
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050` |
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050` |
| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
| | Tesla | `P40` `P4` |
| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` |

View File

@@ -47,13 +47,19 @@ success
### Supported Quantizations
<details>
<summary>Legacy Quantization</summary>
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
#### K-means Quantizations
</details>
<details>
<summary>K-means Quantization</summary>
- `Q3_K_S`
- `Q3_K_M`
@@ -64,6 +70,11 @@ success
- `Q5_K_M`
- `Q6_K`
</details>
> [!NOTE]
> Activation-aware Weight Quantization (i.e. IQ) is not currently supported for automatic quantization; however, you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
## Template Detection
> [!NOTE]

View File

@@ -104,6 +104,7 @@ curl http://localhost:11434/v1/chat/completions \
#### Notes
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
## Models

View File

@@ -22,7 +22,7 @@ docker logs <container-name>
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
- `explorer %LOCALAPPDATA%\Ollama` to view logs
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration are stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories

View File

@@ -39,8 +39,8 @@ server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains the most recent logs from the GUI application
- *server.log* contains the most recent server logs
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration

View File

@@ -77,27 +77,20 @@ func cleanupTmpDirs() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Warn("failed to read ollama.pid", "path", d, "error", err)
// No pid, ignore this tmpdir
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("failed to parse pid", "path", d, "error", err)
continue
}
proc, err := os.FindProcess(pid)
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("found running ollama", "pid", pid, "path", d)
// Another running ollama, ignore this tmpdir
continue
}
if err := os.Remove(d); err != nil {
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
}
}
}

View File

@@ -231,7 +231,7 @@ func GetGPUInfo() GpuInfoList {
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
depPath = filepath.Dir(envconfig.RunnersDir)
}
// Load ALL libraries
@@ -282,12 +282,6 @@ func GetGPUInfo() GpuInfoList {
// Intel
if envconfig.IntelGpu {
oHandles = initOneAPIHandles()
// On windows we bundle the oneapi library one level above the runner dir
depPath = ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
}
for d := range oHandles.oneapi.num_drivers {
if oHandles.oneapi == nil {
// shouldn't happen
@@ -312,7 +306,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = depPath
// TODO dependency path?
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}

View File

@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
if (!l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
if (!*l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
if (!l[i].p) {
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);

View File

@@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!*(l[i].p)) {
if (!l[i].p) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -98,7 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {

View File

@@ -56,6 +56,7 @@ struct server_params {
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
@@ -426,6 +427,16 @@ struct llama_server_context
return true;
}
// Probes whether llama_chat_apply_template accepts a minimal one-message
// conversation for the loaded model. A negative result is treated as
// "template not supported": an error is logged and sparams.chat_template is
// set to "chatml". Otherwise sparams is left untouched.
void validate_model_chat_template(server_params & sparams) {
    llama_chat_message probe[] = {{"user", "test"}};
    std::vector<char> scratch(1);

    // NOTE(review): the nullptr template argument presumably selects the
    // template stored in the model itself — confirm against llama.h.
    const int res = llama_chat_apply_template(model, nullptr, probe, 1, true, scratch.data(), scratch.size());
    if (res >= 0) {
        return;
    }

    LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
    sparams.chat_template = "chatml";
}
void initialize() {
// create slots
all_slots_are_idle = true;
@@ -1650,41 +1661,26 @@ struct llama_server_context
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
char buf[256];
llama_model_meta_val_str(model, "general.architecture", buf, 256);
bool gemma2 = strcmp(buf, "gemma2") == 0;
int32_t truncate_at = slot.n_ctx;
// truncate at 2/3 of the context length for gemma2 models
// as they do not support context shifts (from the sliding window implementation).
// this way, prompts that almost fit the context length can still generate a full
// response without a sudden stop from hitting the context limit
if (gemma2) {
truncate_at = 2 * slot.n_ctx / 3;
}
// if input prompt is too big, truncate it, if group attention self-extend is disabled
if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_shift = n_left / 2;
const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
std::vector<llama_token> new_tokens(
prompt_tokens.begin(),
prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert(
new_tokens.end(),
prompt_tokens.begin() + slot.params.n_keep + n_erase,
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
prompt_tokens.end());
LOG_INFO("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"n_shift", n_shift},
{"n_erase", n_erase},
LOG_VERBOSE("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
});
slot.truncated = true;
prompt_tokens = new_tokens;
@@ -1693,19 +1689,6 @@ struct llama_server_context
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
}
// Models with sliding window attention do not work with context shifts, so
// limit their prediction to the context length
if (gemma2) {
int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
slot.n_predict = limit;
slot.params.n_predict = limit;
LOG_INFO("model does not support sliding window, limiting generation", {
{"n_ctx", slot.n_ctx},
{"n_prompt_tokens", slot.n_prompt_tokens},
{"n_predict", slot.n_predict}
});
}
if (!slot.params.cache_prompt)
{
llama_sampling_reset(slot.ctx_sampling);
@@ -2552,6 +2535,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
sparams.chat_template = argv[i];
}
else if (arg == "--override-kv")
{
@@ -3024,6 +3008,11 @@ int main(int argc, char **argv) {
}
const auto model_meta = llama.model_meta();
if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us
llama.validate_model_chat_template(sparams);
}
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation

View File

@@ -1,6 +1,6 @@
#!/bin/bash
# This script is intended to run inside the `go run build.go` script, which
# sets the working directory to the correct location: ./llm/generate/.
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/
# TODO - add hardening to detect missing tools (cmake, etc.)
@@ -18,7 +18,7 @@ sign() {
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
case "${GOARCH}" in
"amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
@@ -92,10 +92,10 @@ case "${GOARCH}" in
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within 'go run build.go'"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
cleanup
echo "code generation completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -1,6 +1,6 @@
#!/bin/bash
# This script is intended to run with the `go run build.go` script, which
# sets the working directory to the correct location: ./llm/generate/.
# This script is intended to run inside the go generate
# working directory must be llm/generate/
# First we build one or more CPU based LLM libraries
#
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
@@ -281,4 +281,4 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
fi
cleanup
echo "code generation completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -1,7 +1,5 @@
#!powershell
$ErrorActionPreference = "Stop"
function amdGPUs {
if ($env:AMDGPU_TARGETS) {
return $env:AMDGPU_TARGETS
@@ -26,15 +24,25 @@ function amdGPUs {
$GPU_LIST -join ';'
}
function init_vars {
$script:SRC_DIR = $(resolve-path "..\..\")
$script:llamacppDir = "../llama.cpp"
if (!$script:SRC_DIR) {
$script:SRC_DIR = $(resolve-path "..\..\")
}
if (!$script:llamacppDir) {
$script:llamacppDir = "../llama.cpp"
}
if (!$script:cmakeTargets) {
$script:cmakeTargets = @("ollama_llama_server")
}
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
)
$script:cmakeTargets = @("ollama_llama_server")
$script:ARCH = "amd64" # arm not yet supported.
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
md "$script:DIST_BASE" -ea 0 > $null
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
$script:config = "RelWithDebInfo"
@@ -55,7 +63,6 @@ function init_vars {
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
@@ -76,9 +83,9 @@ function init_vars {
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
}
function apply_patches {
@@ -112,10 +119,15 @@ function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
if ($cmakeDefs -contains "-G") {
$extra=@("-j8")
} else {
$extra= @("--", "/p:CL_MPcount=8")
}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
@@ -129,26 +141,23 @@ function sign {
foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
}
}
}
function compress {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing binaries..."
function install {
write-host "Installing binaries to dist dir ${script:distDir}"
mkdir ${script:distDir} -ErrorAction SilentlyContinue
$binaries = dir "${script:buildDir}/bin/*.exe"
foreach ($file in $binaries) {
& "$script:GZIP" --best -f $file
copy-item -Path $file -Destination ${script:distDir} -Force
}
write-host "Compressing dlls..."
write-host "Installing dlls to dist dir ${script:distDir}"
$dlls = dir "${script:buildDir}/bin/*.dll"
foreach ($file in $dlls) {
& "$script:GZIP" --best -f $file
copy-item -Path $file -Destination ${script:distDir} -Force
}
}
@@ -169,132 +178,266 @@ function cleanup {
}
}
init_vars
git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
# GCC build for direct linking into the Go binary
init_vars
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with it
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
sign
compress
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
sign
compress
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
sign
compress
} else {
write-host "Skipping CPU generation step as requested"
# Builds the llama.cpp "llama" and "ggml" targets as static libraries using the
# MinGW gcc toolchain so they can be linked directly into the Go binary.
#
# The step is skipped when OLLAMA_SKIP_STATIC_GENERATE is set, or when
# OLLAMA_CPU_TARGET is set to anything other than "static".
# The previously configured $script:cmakeTargets are restored after the build.
function build_static() {
    if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
        # GCC build for direct linking into the Go binary
        init_vars
        # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
        # as we need this to be compiled by gcc for golang to be able to link with it
        write-host "Checking for MinGW..."
        # error action ensures we exit on failure
        get-command gcc
        get-command mingw32-make
        # Remember the configured targets; the static build only needs the two libraries.
        $oldTargets = $script:cmakeTargets
        $script:cmakeTargets = @("llama", "ggml")
        $script:cmakeDefs = @(
            "-G", "MinGW Makefiles",
            "-DCMAKE_C_COMPILER=gcc.exe",
            "-DCMAKE_CXX_COMPILER=g++.exe",
            "-DBUILD_SHARED_LIBS=off",
            "-DLLAMA_NATIVE=off",
            "-DLLAMA_AVX=off",
            "-DLLAMA_AVX2=off",
            "-DLLAMA_AVX512=off",
            "-DLLAMA_F16C=off",
            "-DLLAMA_FMA=off")
        $script:buildDir="../build/windows/${script:ARCH}_static"
        write-host "Building static library"
        build
        $script:cmakeTargets = $oldTargets
    } else {
        # Distinct from the CPU runner's skip message so logs show which step was skipped.
        write-host "Skipping static generation step as requested"
    }
}
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
function build_cpu() {
if ($script:ARCH -eq "arm64") {
$gen_arch = "ARM64"
} else { # amd64
$gen_arch = "x64"
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
write-host "building custom CUDA GPU"
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU"
build
sign
install
} else {
write-host "Skipping CPU generation step as requested"
}
build
sign
compress
}
if ($null -ne $env:HIP_PATH) {
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
if ($null -ne $script:ROCM_VERSION) {
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
# Builds, signs and installs the CPU runner variant with AVX enabled and
# AVX2/AVX512/FMA/F16C disabled. Output goes to $script:DIST_BASE\cpu_avx.
#
# Skipped when OLLAMA_SKIP_CPU_GENERATE is set, or when OLLAMA_CPU_TARGET is
# set to anything other than "cpu_avx".
function build_cpu_avx() {
    $skipAllCpu = [bool]"${env:OLLAMA_SKIP_CPU_GENERATE}"
    $cpuTargetFilter = "${env:OLLAMA_CPU_TARGET}"
    $variantSelected = (-not $cpuTargetFilter) -or ($cpuTargetFilter -eq "cpu_avx")
    if ($skipAllCpu -or (-not $variantSelected)) {
        write-host "Skipping CPU AVX generation step as requested"
        return
    }

    init_vars
    $avxVariantFlags = @(
        "-A", "x64",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=off",
        "-DLLAMA_AVX512=off",
        "-DLLAMA_FMA=off",
        "-DLLAMA_F16C=off"
    )
    # Variant flags sit between the common CPU defs and the defaults from init_vars.
    $script:cmakeDefs = $script:commonCpuDefs + $avxVariantFlags + $script:cmakeDefs
    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
    $script:distDir="$script:DIST_BASE\cpu_avx"
    write-host "Building AVX CPU"
    build
    sign
    install
}
# Builds, signs and installs the CPU runner variant with AVX, AVX2, FMA and
# F16C enabled (AVX512 disabled). Output goes to $script:DIST_BASE\cpu_avx2.
#
# Skipped when OLLAMA_SKIP_CPU_GENERATE is set, or when OLLAMA_CPU_TARGET is
# set to anything other than "cpu_avx2".
function build_cpu_avx2() {
    $skipAllCpu = [bool]"${env:OLLAMA_SKIP_CPU_GENERATE}"
    $cpuTargetFilter = "${env:OLLAMA_CPU_TARGET}"
    $variantSelected = (-not $cpuTargetFilter) -or ($cpuTargetFilter -eq "cpu_avx2")
    if ($skipAllCpu -or (-not $variantSelected)) {
        write-host "Skipping CPU AVX2 generation step as requested"
        return
    }

    init_vars
    $avx2VariantFlags = @(
        "-A", "x64",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=on",
        "-DLLAMA_AVX512=off",
        "-DLLAMA_FMA=on",
        "-DLLAMA_F16C=on"
    )
    # Variant flags sit between the common CPU defs and the defaults from init_vars.
    $script:cmakeDefs = $script:commonCpuDefs + $avx2VariantFlags + $script:cmakeDefs
    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
    $script:distDir="$script:DIST_BASE\cpu_avx2"
    write-host "Building AVX2 CPU"
    build
    sign
    install
}
# Builds, signs and installs the CUDA runner, then copies the cudart, cublas and
# cublasLt DLLs from $script:CUDA_LIB_DIR into dist\windows-<arch>\.
#
# Skipped when OLLAMA_SKIP_CUDA_GENERATE is set or $script:CUDA_LIB_DIR is empty.
# Extra cmake definitions can be supplied through OLLAMA_CUSTOM_CUDA_DEFS.
function build_cuda() {
    if ("${env:OLLAMA_SKIP_CUDA_GENERATE}" -or (-not "${script:CUDA_LIB_DIR}")) {
        write-host "Skipping CUDA generation step"
        return
    }

    # Then build cuda as a dynamically loaded library
    # The variant suffix is the name of the directory two levels above nvcc.exe.
    $nvccPath = "$script:CUDA_LIB_DIR\nvcc.exe"
    $script:CUDA_VERSION=(get-item ($nvccPath | split-path | split-path)).Basename
    if ($null -ne $script:CUDA_VERSION) {
        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }

    init_vars
    $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
    $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
    $script:cmakeDefs += @(
        "-A", "x64",
        "-DLLAMA_CUDA=ON",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=off",
        "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
        "-DCMAKE_CUDA_FLAGS=-t8",
        "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
    )
    if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
        write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
        $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
        write-host "building custom CUDA GPU"
    }

    build
    sign
    install

    $cudaDepsDest = "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
    write-host "copying CUDA dependencies to $cudaDepsDest"
    foreach ($dllPattern in @("cudart64_*.dll", "cublas64_*.dll", "cublasLt64_*.dll")) {
        cp "${script:CUDA_LIB_DIR}\${dllPattern}" $cudaDepsDest
    }
}
function build_oneapi() {
if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
# Get oneAPI version
$script:ONEAPI_VERSION = icpx --version
$script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
if ($null -ne $script:ONEAPI_VERSION) {
$script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
$script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DHIP_PLATFORM=amd",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
)
"-G", "MinGW Makefiles",
"-DLLAMA_SYCL=ON",
"-DCMAKE_C_COMPILER=icx",
"-DCMAKE_CXX_COMPILER=icx",
"-DCMAKE_BUILD_TYPE=Release"
)
# Make sure the ROCm binary dir is first in the path
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
# We have to clobber the LIB var from the developer shell for clang to work properly
$env:LIB=""
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
write-host "building custom ROCM GPU"
}
write-host "Building ROCm"
Write-Host "Building oneAPI"
build
# Ninja doesn't prefix with config name
${script:config}=""
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
}
sign
compress
install
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
} else {
Write-Host "Skipping oneAPI generation step"
}
}
function build_rocm() {
if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
if ($null -ne $script:ROCM_VERSION) {
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
}
cleanup
write-host "`code generation completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
init_vars
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DHIP_PLATFORM=amd",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
)
# Make sure the ROCm binary dir is first in the path
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
# We have to clobber the LIB var from the developer shell for clang to work properly
$env:LIB=""
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
write-host "building custom ROCM GPU"
}
write-host "Building ROCm"
build
# Ninja doesn't prefix with config name
${script:config}=""
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
}
sign
install
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
} else {
write-host "Skipping ROCm generation step"
}
}
# Script entry point.
# - With no arguments: prepare the llama.cpp submodule, then run every build task
#   as a parallel thread job and report each job's output and final state.
# - With arguments: treat each argument as a command name and invoke it in order.
init_vars
if ($($args.count) -eq 0) {
git_module_setup
apply_patches
$tasks = @("build_static", "build_cpu")
$jobs = @()
# The AVX and GPU runner tasks are only added when the architecture is not arm64.
if ($script:ARCH -ne "arm64") {
$tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm")
}
# Each task becomes a thread job that runs .\gen_windows.ps1 with the task name as its
# only argument (presumably this same script, which would then take the else branch below).
foreach ($t in $tasks) {
$jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t)
}
get-job
# Jobs are drained in start order; receive-job -wait blocks until that job has finished.
foreach ($job in $jobs) {
write-host "----" $job.Name output follows
receive-job -wait -job $job
write-host "----" $job.Name $job.State
write-host ""
# A single failed job aborts the whole run: clean up, remove every job, exit non-zero.
if ($job.State -contains 'Failed') {
cleanup
write-host "Terminating remaining jobs (this takes a while, you can ^C)"
# TODO find some way to kill the spawned cmake processes faster
remove-job -force -job $jobs
exit(-1)
}
get-job
}
cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
} else {
# Each argument is invoked by name with the call operator, e.g. one of the build_* functions.
for ( $i = 0; $i -lt $args.count; $i++ ) {
write-host "performing $($args[$i])"
& $($args[$i])
}
}

View File

@@ -0,0 +1,3 @@
package generate
//go:generate bash ./gen_darwin.sh

View File

@@ -0,0 +1,3 @@
package generate
//go:generate bash ./gen_linux.sh

View File

@@ -0,0 +1,3 @@
package generate
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1

View File

@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors
}
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
func (llm *ggla) decode(rs io.ReadSeeker) error {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
@@ -69,18 +69,9 @@ func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
for {
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return err
}
defer func() {
if errors.Is(retErr, io.EOF) {
retErr = io.ErrUnexpectedEOF
}
}()
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
@@ -117,7 +108,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
return err
}
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
return err
}

View File

@@ -6,8 +6,6 @@ import (
"fmt"
"io"
"strings"
"github.com/ollama/ollama/util/bufioutil"
)
type GGML struct {
@@ -71,30 +69,6 @@ func (kv KV) HeadCountKV() uint64 {
return 1
}
// EmbeddingHeadCount returns the embedding length divided by the head count
// (integer division), or 0 when the head count is 0.
func (kv KV) EmbeddingHeadCount() uint64 {
	heads := kv.HeadCount()
	if heads == 0 {
		return 0
	}

	return kv.EmbeddingLength() / heads
}
// EmbeddingHeadCountK returns the value stored under
// "<architecture>.attention.key_length" when it is non-zero, and otherwise
// falls back to EmbeddingHeadCount.
func (kv KV) EmbeddingHeadCountK() uint64 {
	key := fmt.Sprintf("%s.attention.key_length", kv.Architecture())
	keyLength := kv.u64(key)
	if keyLength == 0 {
		return kv.EmbeddingHeadCount()
	}

	return keyLength
}
// EmbeddingHeadCountV returns the value stored under
// "<architecture>.attention.value_length" when it is non-zero, and otherwise
// falls back to EmbeddingHeadCount.
func (kv KV) EmbeddingHeadCountV() uint64 {
	key := fmt.Sprintf("%s.attention.value_length", kv.Architecture())
	valueLength := kv.u64(key)
	if valueLength == 0 {
		return kv.EmbeddingHeadCount()
	}

	return valueLength
}
// GQA returns HeadCount divided by HeadCountKV using integer division.
func (kv KV) GQA() uint64 {
	heads, headsKV := kv.HeadCount(), kv.HeadCountKV()
	return heads / headsKV
}
@@ -280,18 +254,7 @@ func DetectGGMLType(b []byte) string {
}
}
// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
@@ -304,15 +267,17 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
c = &containerGGUF{ByteOrder: binary.LittleEndian}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
c = &containerGGUF{ByteOrder: binary.BigEndian}
default:
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if err != nil {
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
return nil, 0, err
}
@@ -332,10 +297,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
layers := llm.Tensors().Layers()
@@ -346,7 +308,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding
partialOffload += max(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
@@ -354,30 +316,21 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
case "gemma", "gemma2":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
)
partialOffload = max(
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
4*embeddingHeadsK*context*8+
embedding*embeddingHeadsK*heads*9/16,
)
case "gemma":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),
@@ -414,16 +367,6 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(vocab+2*embedding),
fullOffload,
)
case "deepseek2":
fullOffload = max(
4*batch*(3*embedding+vocab),
4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
)
partialOffload = max(
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
)
}
return

View File

@@ -1 +0,0 @@
package llm

View File

@@ -3,10 +3,11 @@ package llm
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"strings"
"log/slog"
)
type containerGGUF struct {
@@ -28,12 +29,6 @@ type containerGGUF struct {
NumTensor uint64
NumKV uint64
}
maxArraySize int
}
// canCollectArray reports whether an array of the given size should have its
// values kept. A negative maxArraySize means there is no limit; otherwise the
// size must not exceed maxArraySize.
func (c *containerGGUF) canCollectArray(size int) bool {
	if c.maxArraySize < 0 {
		return true
	}

	return size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
@@ -59,6 +54,7 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
}
model := newGGUF(c)
slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil {
return nil, err
}
@@ -89,8 +85,6 @@ type gguf struct {
tensors []*Tensor
parameters uint64
scratch [16 << 10]byte
}
func newGGUF(container *containerGGUF) *gguf {
@@ -187,34 +181,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
// decode tensors
for range llm.numTensor() {
for i := 0; uint64(i) < llm.numTensor(); i++ {
name, err := readGGUFString(llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor name: %w", err)
return err
}
// dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor dimensions: %w", err)
return err
}
shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor shape: %w", err)
return err
}
}
kind, err := readGGUF[uint32](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor kind: %w", err)
return err
}
offset, err := readGGUF[uint64](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor offset: %w", err)
return err
}
tensor := Tensor{
@@ -236,19 +230,24 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors {
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to init padding: %w", err)
}
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to tensor: %w", err)
return err
}
padding := llm.padding(int64(tensor.Size()), int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
}
@@ -286,48 +285,22 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil
}
// discardGGUFString skips one length-prefixed GGUF string in r without keeping
// its contents. The 8-byte length prefix is decoded with llm.ByteOrder.
//
// It returns any error from reading the prefix, and io.EOF (or the reader's
// own error) when r ends before the whole string has been consumed.
func discardGGUFString(llm *gguf, r io.Reader) error {
	buf := llm.scratch[:8]
	if _, err := io.ReadFull(r, buf); err != nil {
		return err
	}

	// io.CopyN succeeds as soon as exactly size bytes have been drained. A
	// hand-written Read loop would wrongly fail when the reader hands back the
	// last bytes together with io.EOF, and would spin on a reader that
	// returns (0, nil).
	size := int64(llm.ByteOrder.Uint64(buf))
	_, err := io.CopyN(io.Discard, r, size)
	return err
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 {
return readGGUFV1String(llm, r)
}
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
var length uint64
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
return "", err
}
length := int(llm.ByteOrder.Uint64(buf))
if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
return "", err
}
return string(buf), nil
return b.String(), nil
}
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -343,16 +316,7 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err
}
// array is the decoded form of a GGUF array value.
type array struct {
// size is the element count read from the file.
size int
// values holds the decoded elements; it is encoded as-is by MarshalJSON.
values []any
}
// MarshalJSON encodes only the values slice; size is not part of the JSON output.
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -363,12 +327,7 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
return nil, err
}
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
for i := 0; uint32(i) < n; i++ {
var e any
switch t {
case ggufTypeUint8:
@@ -402,15 +361,13 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
return nil, err
}
if a.values != nil {
a.values[i] = e
}
a = append(a, e)
}
return a, nil
return
}
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
@@ -425,12 +382,7 @@ func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
return nil, err
}
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
for i := 0; uint64(i) < n; i++ {
var e any
switch t {
case ggufTypeUint8:
@@ -456,11 +408,7 @@ func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
e, err = readGGUFString(llm, r)
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
@@ -468,12 +416,10 @@ func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
return nil, err
}
if a.values != nil {
a.values[i] = e
}
a = append(a, e)
}
return a, nil
return
}
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {

View File

@@ -1,7 +1,6 @@
package llm
import (
"fmt"
"log/slog"
"strconv"
"strings"
@@ -50,18 +49,6 @@ type MemoryEstimate struct {
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -115,8 +102,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
slog.Warn("model missing blk.0 layer size")
}
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
@@ -180,11 +167,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
}
memoryWeights += layerSize
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
@@ -270,86 +252,78 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
allocationsList = append(allocationsList, format.HumanBytes2(a))
}
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
}
if gpus[0].Library == "cpu" {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) log() {
slog.Info(
"offload to "+m.inferenceLibrary,
"offload to gpu",
slog.Group(
"layers",
// requested number of layers to offload
"requested", m.layersRequested,
"requested", opts.NumGPU,
// The number of layers the model has (including output)
"model", m.layersModel,
"model", int(ggml.KV().BlockCount())+1,
// estimated number of layers that can be offloaded
"offload", m.Layers,
// multi-gpu split for tensors
"split", m.TensorSplit,
"offload", layerCount,
// multi-gpu split for tensors
"split", tensorSplit,
),
slog.Group(
"memory",
// memory available by GPU for offloading
"available", m.availableList,
"available", availableList,
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(m.TotalSize),
"full", format.HumanBytes2(memoryRequiredTotal),
// memory required to offload layers.estimate layers
"partial", format.HumanBytes2(m.VRAMSize),
"partial", format.HumanBytes2(memoryRequiredPartial),
// memory of KV cache
"kv", format.HumanBytes2(m.kv),
"kv", format.HumanBytes2(kv),
// Allocations across the GPUs
"allocations", m.allocationsList,
"allocations", allocationsList,
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(m.memoryWeights),
"total", format.HumanBytes2(memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(m.graphFullOffload),
"full", format.HumanBytes2(graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(m.graphPartialOffload),
"partial", format.HumanBytes2(graphPartialOffload),
),
),
)
if gpus[0].Library == "cpu" {
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
return MemoryEstimate{
Layers: layerCount,
Graph: graphOffload,
VRAMSize: memoryRequiredPartial,
TotalSize: memoryRequiredTotal,
TensorSplit: tensorSplit,
GPUSizes: gpuAllocations,
}
}

View File

@@ -22,14 +22,13 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5
tensors := []Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
}
assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{
@@ -46,10 +45,8 @@ func TestEstimateGPULayers(t *testing.T) {
}, tensors)
require.NoError(t, err)
ggml, err := LoadModel(f.Name(), 0)
if err != nil {
t.Fatal(err)
}
ggml, err := LoadModel(f.Name())
require.NoError(t, err)
// Simple CPU scenario
gpus := []gpu.GpuInfo{

View File

@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
index 73ff0e85..6adb1a92 100644
index ba1ecf0e..cead57cc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index 73ff0e85..6adb1a92 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index 58ed72f4..0bb2605e 100644
index d80344f2..71e84834 100644
--- a/common/common.h
+++ b/common/common.h
@@ -180,6 +180,13 @@ struct gpt_params {
@@ -174,6 +174,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
+
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
};
void gpt_params_handle_model_default(gpt_params & params);

View File

@@ -1,8 +1,8 @@
diff --git a/llama.cpp b/llama.cpp
index 61948751..4b72a293 100644
index 40d2ec2c..74f3ee9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 61948751..4b72a293 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
- } else if (
+ if (
tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
tokenizer_pre == "smaug-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

View File

@@ -1,305 +0,0 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2

View File

@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
pattern := filepath.Join(payloadsDir, "*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
servers[filepath.Base(file)] = file
}
return servers

View File

@@ -60,12 +60,7 @@ type llmServer struct {
sem *semaphore.Weighted
}
// LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
func LoadModel(model string) (*GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -76,7 +71,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
}
defer f.Close()
ggml, _, err := DecodeGGML(f, maxArraySize)
ggml, _, err := DecodeGGML(f)
return ggml, err
}
@@ -86,17 +81,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var err error
var cpuRunner string
var estimate MemoryEstimate
var systemTotalMemory uint64
var systemFreeMemory uint64
systemMemInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemTotalMemory = systemMemInfo.TotalMemory
systemFreeMemory = systemMemInfo.FreeMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
}
var systemMemory uint64
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
@@ -106,10 +91,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemMemory = memInfo.TotalMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
}
}
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
@@ -122,8 +116,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
estimate.log()
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -208,7 +200,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = api.TriStateFalse
opts.UseMMap = false
}
}
@@ -216,11 +208,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
opts.UseMMap == api.TriStateFalse {
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
@@ -283,8 +271,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
libraryPaths := []string{dir, filepath.Dir(dir)}
// prepend the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
@@ -417,7 +405,7 @@ func projectorMemoryRequirements(filename string) uint64 {
}
defer file.Close()
ggml, _, err := DecodeGGML(file, 0)
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}

View File

@@ -1,7 +1,5 @@
package main
//go:generate go run build.go -g -s
import (
"context"

View File

@@ -103,19 +103,19 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
md "${script:DEPS_DIR}" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file

View File

@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;

View File

@@ -414,22 +414,17 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
layer, err := NewLayer(temp, baseLayer.MediaType)
layers, err := parseFromFile(ctx, temp, "", fn)
if err != nil {
return err
}
if _, err := temp.Seek(0, io.SeekStart); err != nil {
return err
if len(layers) != 1 {
return errors.New("quantization failed")
}
ggml, _, err := llm.DecodeGGML(temp, 0)
if err != nil {
return err
}
baseLayer.Layer = layer
baseLayer.GGML = ggml
baseLayer.Layer = layers[0].Layer
baseLayer.GGML = layers[0].GGML
}
}

View File

@@ -11,7 +11,6 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
@@ -64,7 +63,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
ggml, _, err := llm.DecodeGGML(blob, 0)
ggml, _, err := llm.DecodeGGML(blob)
if err != nil {
return nil, err
}
@@ -78,80 +77,62 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil
}
func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
stat, err := file.Stat()
if err != nil {
return err
return nil, err
}
r, err := zip.NewReader(file, stat.Size())
if err != nil {
return err
return nil, err
}
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempdir)
fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File {
n := filepath.Join(p, f.Name)
if !strings.HasPrefix(n, p) {
slog.Warn("skipped extracting file outside of context", "name", f.Name)
continue
}
if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
return err
}
// TODO(mxyng): this should not write out all files to disk
outfile, err := os.Create(n)
outfile, err := os.Create(filepath.Join(tempdir, f.Name))
if err != nil {
return err
return nil, err
}
defer outfile.Close()
infile, err := f.Open()
if err != nil {
return err
return nil, err
}
defer infile.Close()
if _, err = io.Copy(outfile, infile); err != nil {
return err
return nil, err
}
if err := outfile.Close(); err != nil {
return err
return nil, err
}
if err := infile.Close(); err != nil {
return err
return nil, err
}
}
return nil
}
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempDir)
if err := extractFromZipFile(tempDir, file, fn); err != nil {
return nil, err
}
mf, err := convert.GetModelFormat(tempDir)
mf, err := convert.GetModelFormat(tempdir)
if err != nil {
return nil, err
}
params, err := mf.GetParams(tempDir)
params, err := mf.GetParams(tempdir)
if err != nil {
return nil, err
}
mArch, err := mf.GetModelArch("", tempDir, params)
mArch, err := mf.GetModelArch("", tempdir, params)
if err != nil {
return nil, err
}
@@ -169,7 +150,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
// TODO(mxyng): this should write directly into a layer
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
temp, err := os.CreateTemp(tempDir, "fp16")
temp, err := os.CreateTemp(tempdir, "fp16")
if err != nil {
return nil, err
}
@@ -195,7 +176,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
}
defer bin.Close()
ggml, _, err := llm.DecodeGGML(bin, 0)
ggml, _, err := llm.DecodeGGML(bin)
if err != nil {
return nil, err
}
@@ -229,7 +210,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
var offset int64
for offset < stat.Size() {
ggml, n, err := llm.DecodeGGML(file, 0)
ggml, n, err := llm.DecodeGGML(file)
if errors.Is(err, io.EOF) {
break
} else if err != nil {

View File

@@ -1,92 +0,0 @@
package server
import (
"archive/zip"
"bytes"
"io"
"os"
"path/filepath"
"slices"
"testing"
"github.com/ollama/ollama/api"
)
// createZipFile writes a zip archive holding a single, empty entry called
// name into a new temporary file under t.TempDir() and returns that file,
// still open. Any failure aborts the calling test.
func createZipFile(t *testing.T, name string) *os.File {
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), "")
	if err != nil {
		t.Fatal(err)
	}

	zf := zip.NewWriter(f)

	zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
	if err != nil {
		t.Fatal(err)
	}

	// The entry body is intentionally empty.
	if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
		t.Fatal(err)
	}

	// Close writes the archive's central directory. Check its error instead
	// of deferring and discarding it, so a truncated archive is never handed
	// back to the caller as if it were valid.
	if err := zf.Close(); err != nil {
		t.Fatal(err)
	}

	return f
}
// TestExtractFromZipFile extracts single-entry archives into a fresh
// temporary directory and compares the regular files found beneath that
// directory (as paths relative to it) with the expected list. The second case
// uses an entry name made of many ".." segments and expects no files at all
// under the destination.
func TestExtractFromZipFile(t *testing.T) {
	cases := []struct {
		name   string
		expect []string
	}{
		{
			name:   "good",
			expect: []string{"good"},
		},
		{
			// expect is left nil: nothing should appear inside the destination.
			name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			f := createZipFile(t, tt.name)
			defer f.Close()

			tempDir := t.TempDir()
			if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
				t.Fatal(err)
			}

			// Collect every non-directory path below the destination.
			var matches []string
			if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
				if err != nil {
					return err
				}

				if !fi.IsDir() {
					matches = append(matches, p)
				}

				return nil
			}); err != nil {
				t.Fatal(err)
			}

			var actual []string
			for _, match := range matches {
				rel, err := filepath.Rel(tempDir, match)
				if err != nil {
					t.Error(err)
				}

				actual = append(actual, rel)
			}

			// Report the lists themselves: the comparison is on contents, so
			// a count-only message is misleading when lengths match.
			if !slices.Equal(actual, tt.expect) {
				t.Fatalf("expected files %v, got %v", tt.expect, actual)
			}
		})
	}
}

View File

@@ -734,48 +734,9 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
kvData, err := getKVData(m.ModelPath, req.Verbose)
if err != nil {
return nil, err
}
delete(kvData, "general.name")
delete(kvData, "tokenizer.chat_template")
resp.ModelInfo = kvData
if len(m.ProjectorPaths) > 0 {
projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
if err != nil {
return nil, err
}
resp.ProjectorInfo = projectorData
}
return resp, nil
}
// getKVData loads the model at digest and returns its key/value metadata.
// With verbose set, LoadModel is called with an array limit of -1 and the
// metadata is returned untouched. Otherwise the limit is 0 and every []any
// value holding more than five elements is replaced by an empty slice.
func getKVData(digest string, verbose bool) (llm.KV, error) {
	arrayLimit := 0
	if verbose {
		arrayLimit = -1
	}

	model, err := llm.LoadModel(digest, arrayLimit)
	if err != nil {
		return nil, err
	}

	metadata := model.KV()
	if verbose {
		return metadata, nil
	}

	// Blank out long arrays so the non-verbose result stays small.
	for key, value := range metadata {
		if items, isSlice := value.([]any); isSlice && len(items) > 5 {
			metadata[key] = []any{}
		}
	}

	return metadata, nil
}
func (s *Server) ListModelsHandler(c *gin.Context) {
ms, err := Manifests()
if err != nil {
@@ -1105,20 +1066,11 @@ func Serve(ln net.Listener) error {
schedCtx, schedDone := context.WithCancel(ctx)
sched := InitScheduler(schedCtx)
s := &Server{addr: ln.Addr(), sched: sched}
http.Handle("/", s.GenerateRoutes())
r := s.GenerateRoutes()
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
srvr := &http.Server{
// Use http.DefaultServeMux so we get net/http/pprof for
// free.
//
// TODO(bmizerany): Decide if we want to make this
// configurable so it is not exposed by default, or allow
// users to bind it to a different port. This was a quick
// and easy way to get pprof, but it may not be the best
// way.
Handler: nil,
Handler: r,
}
// listen for a ctrl+c and stop any loaded llm

View File

@@ -19,7 +19,6 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
@@ -213,7 +212,6 @@ func Test_Routes(t *testing.T) {
"top_p 0.9",
}
assert.Equal(t, expectedParams, params)
assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
},
},
}
@@ -327,40 +325,3 @@ func TestCase(t *testing.T) {
})
}
}
// TestShow creates "show-model" from a Modelfile with two FROM lines (files
// produced by createBinFile with architectures "test" and "clip"), then calls
// the show handler and checks that the response reports "test" as the model
// architecture and "clip" as the projector architecture.
func TestShow(t *testing.T) {
	t.Setenv("OLLAMA_MODELS", t.TempDir())
	envconfig.LoadConfig()

	var s Server

	modelfile := fmt.Sprintf(
		"FROM %s\nFROM %s",
		createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
		createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
	)
	createRequest(t, s.CreateModelHandler, api.CreateRequest{
		Name:      "show-model",
		Modelfile: modelfile,
	})

	rec := createRequest(t, s.ShowModelHandler, api.ShowRequest{Name: "show-model"})
	if rec.Code != http.StatusOK {
		t.Fatalf("expected status code 200, actual %d", rec.Code)
	}

	var show api.ShowResponse
	if err := json.NewDecoder(rec.Body).Decode(&show); err != nil {
		t.Fatal(err)
	}

	if arch := show.ModelInfo["general.architecture"]; arch != "test" {
		t.Fatal("Expected model architecture to be 'test', but got", arch)
	}

	if arch := show.ProjectorInfo["general.architecture"]; arch != "clip" {
		t.Fatal("Expected projector architecture to be 'clip', but got", arch)
	}
}

View File

@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
// Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
ggml, err := llm.LoadModel(pending.model.ModelPath)
if err != nil {
pending.errCh <- err
break

View File

@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
})
require.NoError(t, err)
fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname}
scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
scenario.ggml, err = llm.LoadModel(model.ModelPath)
require.NoError(t, err)
scenario.req = &LlmRequest{

View File

@@ -4,6 +4,7 @@ package model
import (
"cmp"
"encoding/hex"
"errors"
"fmt"
"log/slog"
@@ -370,3 +371,57 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
}
return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
}
// DigestType identifies the hash algorithm that produced a Digest.
type DigestType byte

const (
	// DigestTypeInvalid is the zero value; a Digest with this type is not valid.
	DigestTypeInvalid DigestType = iota
	// DigestTypeSHA256 marks a Digest whose Sum is a SHA-256 checksum.
	DigestTypeSHA256
)

// String returns "sha256" for DigestTypeSHA256 and "invalid" for every other
// value.
func (t DigestType) String() string {
	switch t {
	case DigestTypeSHA256:
		return "sha256"
	default:
		return "invalid"
	}
}

// Digest is a parsed checksum: the algorithm that produced it and the raw
// 32-byte sum. The zero value is an invalid digest.
type Digest struct {
	Type DigestType
	Sum  [32]byte
}

// ParseDigest parses s, which must have the form "sha256-<hex>" or
// "sha256:<hex>" where <hex> is exactly 64 hexadecimal characters.
//
// It returns the zero Digest and a non-nil error when the separator is
// missing, the type is not "sha256", the sum has the wrong length, or the sum
// contains non-hexadecimal characters.
func ParseDigest(s string) (Digest, error) {
	i := strings.IndexAny(s, "-:")
	if i < 0 {
		return Digest{}, fmt.Errorf("invalid digest %q", s)
	}

	typ, encSum := s[:i], s[i+1:]
	if typ != "sha256" {
		return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
	}

	var sum [32]byte

	// Validate the length before decoding: hex.Decode writes one byte per
	// input pair into dst and panics if dst is too short, so an over-long
	// sum must be rejected here rather than handed to the decoder.
	if want := hex.EncodedLen(len(sum)); len(encSum) != want {
		return Digest{}, fmt.Errorf("digest %q has %d hex characters; want %d", encSum, len(encSum), want)
	}

	if _, err := hex.Decode(sum[:], []byte(encSum)); err != nil {
		return Digest{}, fmt.Errorf("invalid digest %q: %w", s, err)
	}

	return Digest{Type: DigestTypeSHA256, Sum: sum}, nil
}

// String returns the digest as "sha256-" followed by the lowercase hex
// encoding of Sum, or the empty string when the type is DigestTypeInvalid.
func (d Digest) String() string {
	if d.Type == DigestTypeInvalid {
		return ""
	}
	return fmt.Sprintf("sha256-%x", d.Sum)
}

// IsValid reports whether the digest's type is anything other than
// DigestTypeInvalid.
func (d Digest) IsValid() bool {
	return d.Type != DigestTypeInvalid
}

View File

@@ -284,6 +284,40 @@ func TestFilepathAllocs(t *testing.T) {
}
}
// Digest fixtures: the same sum written with the "-" separator and with the
// ":" separator.
const (
	validSha256    = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
	validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
)

// TestParseDigest feeds ParseDigest a table of inputs. An empty want means
// the input is expected to be rejected; otherwise want is the expected
// String() form of the parsed digest.
func TestParseDigest(t *testing.T) {
	tests := []struct {
		in   string
		want string
	}{
		{in: "", want: ""},           // empty
		{in: "sha123-12", want: ""},  // invalid type
		{in: "sha256-", want: ""},    // invalid sum
		{in: "sha256-123", want: ""}, // invalid odd length sum
		{in: validSha256, want: validSha256},
		{in: validSha256Old, want: validSha256},
	}

	for _, tc := range tests {
		t.Run(tc.in, func(t *testing.T) {
			d, err := ParseDigest(tc.in)
			switch {
			case err != nil && tc.want != "":
				t.Errorf("parseDigest(%q) = %v; want %v", tc.in, err, tc.want)
			case err != nil:
				// Rejection was expected; nothing further to check.
			case d.String() != tc.want:
				t.Errorf("parseDigest(%q).String() = %q; want %q", tc.in, d, tc.want)
			}
		})
	}
}
func TestParseNameFromFilepath(t *testing.T) {
cases := map[string]Name{
filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},

View File

@@ -1,34 +0,0 @@
package bufioutil
import (
"bufio"
"io"
)
// BufferedSeeker pairs an io.ReadSeeker with a bufio.Reader so that reads are
// buffered while seeking remains available.
type BufferedSeeker struct {
	rs io.ReadSeeker
	br *bufio.Reader
}

// NewBufferedSeeker wraps rs with a buffered reader created by
// bufio.NewReaderSize(rs, size).
func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
	buffered := bufio.NewReaderSize(rs, size)
	return &BufferedSeeker{rs: rs, br: buffered}
}

// Read reads into p through the internal buffer.
func (b *BufferedSeeker) Read(p []byte) (int, error) {
	return b.br.Read(p)
}

// Seek repositions the underlying ReadSeeker and, on success, resets the
// buffer and returns the new position. On failure it returns 0 and the error,
// leaving the buffer untouched.
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
	target := offset
	if whence == io.SeekCurrent {
		// The underlying reader is ahead of the caller by however many bytes
		// are still sitting unread in the buffer; compensate for them.
		target = offset - int64(b.br.Buffered())
	}

	pos, err := b.rs.Seek(target, whence)
	if err == nil {
		b.br.Reset(b.rs)
		return pos, nil
	}
	return 0, err
}

View File

@@ -1,64 +0,0 @@
package bufioutil
import (
"bytes"
"io"
"strings"
"testing"
)
// TestBufferedSeeker walks a BufferedSeeker over the alphabet, mixing reads
// with absolute, relative and end-relative seeks, and checks the bytes
// returned after each move.
func TestBufferedSeeker(t *testing.T) {
	const alphabet = "abcdefghijklmnopqrstuvwxyz"

	bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16

	// expectRead reads into dst and fails the test unless dst then holds want.
	expectRead := func(dst []byte, want string) {
		t.Helper()
		if _, err := bs.Read(dst); err != nil {
			t.Fatal(err)
		}
		if !bytes.Equal(dst, []byte(want)) {
			t.Fatalf("expected %s, got %s", want, dst)
		}
	}

	// seekTo performs a seek and fails the test on error.
	seekTo := func(offset int64, whence int) {
		t.Helper()
		if _, err := bs.Seek(offset, whence); err != nil {
			t.Fatal(err)
		}
	}

	scratch := make([]byte, 5)

	// Read the first 5 bytes.
	expectRead(scratch, "abcde")

	// Rewind and read just 'a'.
	seekTo(0, io.SeekStart)
	expectRead(scratch[:1], "a")

	// The relative seek below is only meaningful if data is still buffered.
	if bs.br.Buffered() == 0 {
		t.Fatalf("totally unexpected sanity check failed")
	}

	// Skip over 'b' relative to the current logical position.
	seekTo(1, io.SeekCurrent)
	expectRead(scratch, "cdefg")

	// Rewind once more.
	seekTo(0, io.SeekStart)
	expectRead(scratch, "abcde")

	// Jump to the last 5 bytes.
	seekTo(-5, io.SeekEnd)
	expectRead(scratch, "vwxyz")
}