Compare commits


32 Commits

Author SHA1 Message Date
Roy Han
c494aea5c8 Strip stop strings 2024-06-20 09:06:08 -07:00
royjhan
fedf71635e Extend api/show and ollama show to return more model info (#4881)
* API Show Extended

* Initial Draft of Information

Co-Authored-By: Patrick Devine <pdevine@sonic.net>

* Clean Up

* Descriptive arg error messages and other fixes

* Second Draft of Show with Projectors Included

* Remove Chat Template

* Touches

* Prevent wrapping from files

* Verbose functionality

* Docs

* Address Feedback

* Lint

* Resolve Conflicts

* Function Name

* Tests for api/show model info

* Show Test File

* Add Projector Test

* Clean routes

* Projector Check

* Move Show Test

* Touches

* Doc update

---------

Co-authored-by: Patrick Devine <pdevine@sonic.net>
2024-06-19 14:19:02 -07:00
Daniel Hiltgen
97c59be653 Merge pull request #5074 from dhiltgen/app_log_rotation
Implement log rotation for tray app
2024-06-19 13:02:24 -07:00
Daniel Hiltgen
9d8a4988e8 Implement log rotation for tray app 2024-06-19 12:53:34 -07:00
Michael Yang
1ae0750a21 Merge pull request #5147 from ollama/mxyng/cleanup
remove confusing log message
2024-06-19 12:50:31 -07:00
Michael Yang
9d91e5e587 remove confusing log message 2024-06-19 11:14:11 -07:00
Daniel Hiltgen
96624aa412 Merge pull request #5072 from dhiltgen/windows_path
Move libraries out of users path
2024-06-19 09:13:39 -07:00
Daniel Hiltgen
10f33b8537 Merge pull request #5146 from dhiltgen/backout
Put back temporary intel GPU env var
2024-06-19 09:12:45 -07:00
Daniel Hiltgen
4a633cc295 Merge pull request #5145 from dhiltgen/bad_loads
Fix bad symbol load detection
2024-06-19 09:12:33 -07:00
Daniel Hiltgen
d34d88e417 Revert "Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)""
This reverts commit 755b4e4fc2.
2024-06-19 08:57:41 -07:00
Daniel Hiltgen
52ce350b7a Fix bad symbol load detection
pointer derefs weren't correct in a few libraries, which explains
some crashes on older systems or with miswired symlinks for discovery libraries.
2024-06-19 08:39:07 -07:00
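The fix above switches the check from the slot pointer to the value the loader stored in it. A minimal Go analogue of that mistake (illustration only, not code from this change set):

```go
package main

import "fmt"

func main() {
	var symbol uintptr // would hold the address returned by the dynamic loader; 0 means "not found"
	slot := &symbol    // analogous to l[i].p in the C symbol table

	// Buggy check: slot is the address of the variable and is never nil,
	// so a failed symbol load is silently ignored and later calls crash.
	if slot == nil {
		fmt.Println("load failed (never reached)")
	}

	// Fixed check: inspect the value the loader actually wrote.
	if *slot == 0 {
		fmt.Println("load failed (correctly detected)")
	}
}
```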
Daniel Hiltgen
2abebb2cbe Merge pull request #5128 from zhewang1-intc/fix_levelzero_empty_symbol_detect
Fix levelzero empty symbol detect
2024-06-19 08:33:16 -07:00
Blake Mizerany
380e06e5be types/model: remove Digest
The Digest type in its current form is awkward to work with and presents
challenges with regard to how it serializes via String using the '-'
prefix.

We currently only use this in ollama.com, so we'll move our specific
needs around digest parsing and validation there.
2024-06-18 20:28:11 -07:00
Wang,Zhe
badf975e45 get real func ptr. 2024-06-19 09:00:51 +08:00
Wang,Zhe
755b4e4fc2 Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)"
This reverts commit 163cd3e77c.
2024-06-19 08:59:58 +08:00
Michael Yang
21adf8b6d2 Merge pull request #5121 from ollama/mxyng/deepseekv2
deepseek v2 graph
2024-06-18 16:30:58 -07:00
Michael Yang
e873841cbb deepseek v2 graph 2024-06-18 15:35:12 -07:00
Daniel Hiltgen
26d0bf9236 Merge pull request #5117 from dhiltgen/fix_prediction
Handle models with divergent layer sizes
2024-06-18 11:36:51 -07:00
Daniel Hiltgen
359b15a597 Handle models with divergent layer sizes
The recent refactoring of the memory prediction assumed all layers
are the same size, but for some models (like deepseek-coder-v2) this
is not the case, so our predictions were significantly off.
2024-06-18 11:05:34 -07:00
Daniel Hiltgen
b55958a587 Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
2024-06-18 09:24:38 -07:00
Daniel Hiltgen
7784ca33ce Tighten up memory prediction logging
Prior to this change, we logged the memory prediction multiple times
as the scheduler iterates to find a suitable configuration, which can be
confusing since only the last log before the server starts is actually valid.
This now logs once just before starting the server on the final configuration.
It also reports which library is being used instead of always saying "offloading to gpu",
even when running on the CPU.
2024-06-18 09:15:35 -07:00
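A small, self-contained sketch of the pattern this commit moves to (names and numbers here are invented, not the real llm package API): accumulate the figures in a struct while the scheduler iterates, then emit one structured log for the final configuration only.

```go
package main

import "log/slog"

type estimate struct {
	library   string
	layers    int
	vramBytes uint64
}

func (e estimate) log() {
	slog.Info("offload to "+e.library,
		slog.Group("layers", "offload", e.layers),
		slog.Group("memory", "vram", e.vramBytes),
	)
}

func main() {
	var final estimate
	for layers := 1; layers <= 33; layers++ { // scheduler trying configurations silently
		final = estimate{library: "cuda", layers: layers, vramBytes: uint64(layers) * (400 << 20)}
	}
	final.log() // logged once, just before the server would start
}
```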
Daniel Hiltgen
c9c8c98bf6 Merge pull request #5105 from dhiltgen/cuda_mmap
Adjust mmap logic for cuda windows for faster model load
2024-06-17 17:07:30 -07:00
Daniel Hiltgen
171796791f Adjust mmap logic for cuda windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most
cases, so default to off.  This also implements a tri-state for
use_mmap so we can detect the difference between a user-provided
value of true/false and unspecified.
2024-06-17 16:54:30 -07:00
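A minimal sketch of how a loader could act on the tri-state, assuming the TriState type added later in this change set; the Windows/CUDA rule mirrors the commit message, but the exact condition used by the server is an assumption, not quoted code.

```go
package main

import (
	"fmt"
	"runtime"

	"github.com/ollama/ollama/api"
)

// extraRunnerArgs returns llama.cpp server flags derived from the tri-state:
// an explicit false always disables mmap, while "undefined" falls back to the
// platform default, which this change turns off for CUDA on Windows.
func extraRunnerArgs(library string, useMMap api.TriState) []string {
	windowsCudaDefault := runtime.GOOS == "windows" && library == "cuda" && useMMap == api.TriStateUndefined
	if useMMap == api.TriStateFalse || windowsCudaDefault {
		return []string{"--no-mmap"}
	}
	return nil
}

func main() {
	fmt.Println(extraRunnerArgs("cuda", api.TriStateUndefined))
	fmt.Println(extraRunnerArgs("cpu", api.TriStateTrue))
}
```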
Jeffrey Morgan
176d0f7075 Update import.md 2024-06-17 19:44:14 -04:00
Daniel Hiltgen
8ed51cac37 Merge pull request #5103 from dhiltgen/faster_win_build
Revert powershell jobs, but keep nvcc and cmake parallelism
2024-06-17 14:23:18 -07:00
Daniel Hiltgen
c9e6f0542d Merge pull request #5069 from dhiltgen/ci_release
Implement custom github release action
2024-06-17 13:59:37 -07:00
Daniel Hiltgen
b0930626c5 Add back lower level parallel flags
nvcc supports parallelism (threads) and cmake + make can use -j,
while msbuild requires /p:CL_MPcount=8
2024-06-17 13:44:46 -07:00
Daniel Hiltgen
e890be4814 Revert "More parallelism on windows generate"
This reverts commit 0577af98f4.
2024-06-17 13:32:46 -07:00
Daniel Hiltgen
b2799f111b Move libraries out of users path
We update the PATH on Windows to get the CLI mapped, but this has
an unintended side effect: other apps that may be using our bundled
DLLs can get terminated when we upgrade.
2024-06-17 13:12:18 -07:00
Jeffrey Morgan
152fc202f5 llm: update llama.cpp commit to 7c26775 (#4896)
* llm: update llama.cpp submodule to `7c26775`

* disable `LLAMA_BLAS` for now

* `-DLLAMA_OPENMP=off`
2024-06-17 15:56:16 -04:00
Lei Jitang
4ad0d4d6d3 Fix a build warning (#5096)
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-17 14:47:48 -04:00
Daniel Hiltgen
a12283e2ff Implement custom github release action
This implements the release logic we want via the gh CLI so we can
update releases with RC tags in place while retaining release notes
and other community reactions.
2024-06-15 11:36:56 -07:00
33 changed files with 629 additions and 324 deletions

View File

@@ -437,6 +437,7 @@ jobs:
       env:
         OLLAMA_SKIP_IMAGE_BUILD: '1'
         PUSH: '1'
+        GH_TOKEN: ${{ github.token }}
       steps:
         - uses: actions/checkout@v4
         - name: Set Version
@@ -460,15 +461,20 @@ jobs:
           ls -lh dist/
           (cd dist; sha256sum * > sha256sum.txt)
           cat dist/sha256sum.txt
-      - uses: ncipollo/release-action@v1
-        with:
-          name: ${{ env.RELEASE_VERSION }}
-          allowUpdates: true
-          artifacts: 'dist/*'
-          draft: true
-          prerelease: true
-          omitBodyDuringUpdate: true
-          generateReleaseNotes: true
-          omitDraftDuringUpdate: true
-          omitPrereleaseDuringUpdate: true
-          replacesArtifacts: true
+      - name: Create or update Release
+        run: |
+          echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
+          OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
+          if [ -n "$OLD_TAG" ]; then
+            echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
+            gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
+          else
+            echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
+            gh release create ${GITHUB_REF_NAME} \
+              --title ${{ env.RELEASE_VERSION }} \
+              --draft \
+              --generate-notes \
+              --prerelease
+          fi
+          echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
+          gh release upload ${GITHUB_REF_NAME} dist/* --clobber

View File

@@ -159,18 +159,49 @@ type Options struct {
 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
 	UseNUMA   bool     `json:"numa,omitempty"`
 	NumCtx    int      `json:"num_ctx,omitempty"`
 	NumBatch  int      `json:"num_batch,omitempty"`
 	NumGPU    int      `json:"num_gpu,omitempty"`
 	MainGPU   int      `json:"main_gpu,omitempty"`
 	LowVRAM   bool     `json:"low_vram,omitempty"`
 	F16KV     bool     `json:"f16_kv,omitempty"`
 	LogitsAll bool     `json:"logits_all,omitempty"`
 	VocabOnly bool     `json:"vocab_only,omitempty"`
-	UseMMap   bool     `json:"use_mmap,omitempty"`
+	UseMMap   TriState `json:"use_mmap,omitempty"`
 	UseMLock  bool     `json:"use_mlock,omitempty"`
 	NumThread int      `json:"num_thread,omitempty"`
 }
+
+type TriState int
+
+const (
+	TriStateUndefined TriState = -1
+	TriStateFalse     TriState = 0
+	TriStateTrue      TriState = 1
+)
+
+func (b *TriState) UnmarshalJSON(data []byte) error {
+	var v bool
+	if err := json.Unmarshal(data, &v); err != nil {
+		return err
+	}
+	if v {
+		*b = TriStateTrue
+	}
+	*b = TriStateFalse
+	return nil
+}
+
+func (b *TriState) MarshalJSON() ([]byte, error) {
+	if *b == TriStateUndefined {
+		return nil, nil
+	}
+	var v bool
+	if *b == TriStateTrue {
+		v = true
+	}
+	return json.Marshal(v)
+}
 
 // EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -222,6 +253,7 @@ type ShowRequest struct {
 	Model    string `json:"model"`
 	System   string `json:"system"`
 	Template string `json:"template"`
+	Verbose  bool   `json:"verbose"`
 
 	Options map[string]interface{} `json:"options"`
@@ -231,14 +263,16 @@ type ShowRequest struct {
 // ShowResponse is the response returned from [Client.Show].
 type ShowResponse struct {
 	License       string         `json:"license,omitempty"`
 	Modelfile     string         `json:"modelfile,omitempty"`
 	Parameters    string         `json:"parameters,omitempty"`
 	Template      string         `json:"template,omitempty"`
 	System        string         `json:"system,omitempty"`
 	Details       ModelDetails   `json:"details,omitempty"`
 	Messages      []Message      `json:"messages,omitempty"`
-	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
+	ModelInfo     map[string]any `json:"model_info,omitempty"`
+	ProjectorInfo map[string]any `json:"projector_info,omitempty"`
+	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
 }
 
 // CopyRequest is the request passed to [Client.Copy].
@@ -403,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 			continue
 		}
 
+		if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+			val, ok := val.(bool)
+			if !ok {
+				return fmt.Errorf("option %q must be of type boolean", key)
+			}
+			if val {
+				field.SetInt(int64(TriStateTrue))
+			} else {
+				field.SetInt(int64(TriStateFalse))
+			}
+			continue
+		}
+
 		switch field.Kind() {
 		case reflect.Int:
 			switch t := val.(type) {
@@ -491,7 +538,7 @@ func DefaultOptions() Options {
 			LowVRAM:  false,
 			F16KV:    true,
 			UseMLock: false,
-			UseMMap:  true,
+			UseMMap:  TriStateUndefined,
 			UseNUMA:  false,
 		},
 	}

View File

@@ -105,3 +105,39 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
 		})
 	}
 }
+
+func TestUseMmapParsingFromJSON(t *testing.T) {
+	tests := []struct {
+		name string
+		req  string
+		exp  TriState
+	}{
+		{
+			name: "Undefined",
+			req:  `{ }`,
+			exp:  TriStateUndefined,
+		},
+		{
+			name: "True",
+			req:  `{ "use_mmap": true }`,
+			exp:  TriStateTrue,
+		},
+		{
+			name: "False",
+			req:  `{ "use_mmap": false }`,
+			exp:  TriStateFalse,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var oMap map[string]interface{}
+			err := json.Unmarshal([]byte(test.req), &oMap)
+			require.NoError(t, err)
+			opts := DefaultOptions()
+			err = opts.FromMap(oMap)
+			require.NoError(t, err)
+			assert.Equal(t, test.exp, opts.UseMMap)
+		})
+	}
+}

View File

@@ -5,6 +5,8 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/envconfig"
 )
@@ -24,6 +26,7 @@ func InitLogging() {
 		logFile = os.Stderr
 		// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
 	} else {
+		rotateLogs(AppLogFile)
 		logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 		if err != nil {
 			slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
 	slog.Info("ollama app started")
 }
+
+func rotateLogs(logFile string) {
+	if _, err := os.Stat(logFile); os.IsNotExist(err) {
+		return
+	}
+	index := strings.LastIndex(logFile, ".")
+	pre := logFile[:index]
+	post := "." + logFile[index+1:]
+	for i := LogRotationCount; i > 0; i-- {
+		older := pre + "-" + strconv.Itoa(i) + post
+		newer := pre + "-" + strconv.Itoa(i-1) + post
+		if i == 1 {
+			newer = pre + post
+		}
+		if _, err := os.Stat(newer); err == nil {
+			if _, err := os.Stat(older); err == nil {
+				err := os.Remove(older)
+				if err != nil {
+					slog.Warn("Failed to remove older log", "older", older, "error", err)
+					continue
+				}
+			}
+			err := os.Rename(newer, older)
+			if err != nil {
+				slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
+			}
+		}
+	}
+}

View File

@@ -0,0 +1,44 @@
+package lifecycle
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestRotateLogs(t *testing.T) {
+	logDir := t.TempDir()
+	logFile := filepath.Join(logDir, "testlog.log")
+
+	// No log exists
+	rotateLogs(logFile)
+
+	require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
+	assert.FileExists(t, logFile)
+	// First rotation
+	rotateLogs(logFile)
+	assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+	assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+	assert.NoFileExists(t, logFile)
+
+	// Should be a no-op without a new log
+	rotateLogs(logFile)
+	assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+	assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+	assert.NoFileExists(t, logFile)
+
+	for i := 2; i <= LogRotationCount+1; i++ {
+		require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
+		assert.FileExists(t, logFile)
+		rotateLogs(logFile)
+		assert.NoFileExists(t, logFile)
+		for j := 1; j < i; j++ {
+			assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
+		}
+		assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
+	}
+}

View File

@@ -16,11 +16,12 @@ var (
 	AppDir         = "/opt/Ollama"
 	AppDataDir     = "/opt/Ollama"
 	// TODO - should there be a distinct log dir?
 	UpdateStageDir = "/tmp"
 	AppLogFile     = "/tmp/ollama_app.log"
 	ServerLogFile  = "/tmp/ollama.log"
 	UpgradeLogFile = "/tmp/ollama_update.log"
 	Installer      = "OllamaSetup.exe"
+	LogRotationCount = 5
 )
 
 func init() {

View File

@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
 		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
 	}
 
-	// TODO - rotation
+	rotateLogs(ServerLogFile)
 	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create server log: %w", err)

View File

@@ -88,10 +88,15 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
+#if DirExists("..\dist\windows-amd64\cuda")
+  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
 #if DirExists("..\dist\windows-amd64\rocm")
   Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
 #endif

View File

@@ -579,10 +579,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	if len(args) != 1 {
-		return errors.New("missing model name")
-	}
-
 	license, errLicense := cmd.Flags().GetBool("license")
 	modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
 	parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +621,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	if flagsSet > 1 {
 		return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
-	} else if flagsSet == 0 {
-		return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
+	}
+
+	if flagsSet == 1 {
+		req := api.ShowRequest{Name: args[0]}
+		resp, err := client.Show(cmd.Context(), &req)
+		if err != nil {
+			return err
+		}
+
+		switch showType {
+		case "license":
+			fmt.Println(resp.License)
+		case "modelfile":
+			fmt.Println(resp.Modelfile)
+		case "parameters":
+			fmt.Println(formatParams(resp.Parameters, false))
+		case "system":
+			fmt.Println(resp.System)
+		case "template":
+			fmt.Println(resp.Template)
+		}
+
+		return nil
 	}
 
 	req := api.ShowRequest{Name: args[0]}
@@ -635,22 +652,120 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	switch showType {
-	case "license":
-		fmt.Println(resp.License)
-	case "modelfile":
-		fmt.Println(resp.Modelfile)
-	case "parameters":
-		fmt.Println(resp.Parameters)
-	case "system":
-		fmt.Println(resp.System)
-	case "template":
-		fmt.Println(resp.Template)
-	}
+	arch := resp.ModelInfo["general.architecture"].(string)
+
+	modelData := [][]string{
+		{"arch", arch},
+		{"parameters", resp.Details.ParameterSize},
+		{"quantization", resp.Details.QuantizationLevel},
+		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
+	}
+
+	mainTableData := [][]string{
+		{"Model"},
+		{renderSubTable(modelData, false, true)},
+	}
+
+	if resp.ProjectorInfo != nil {
+		projectorData := [][]string{
+			{"arch", "clip"},
+			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
+			{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
+			{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+			{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+		}
+
+		mainTableData = append(mainTableData,
+			[]string{"Projector"},
+			[]string{renderSubTable(projectorData, false, true)},
+		)
+	}
+
+	if resp.Parameters != "" {
+		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters, true)})
+	}
+
+	if resp.System != "" {
+		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true, true)})
+	}
+
+	if resp.License != "" {
+		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true, true)})
+	}
+
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetAutoWrapText(false)
+	table.SetBorder(false)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range mainTableData {
+		table.Append(v)
+	}
+
+	table.Render()
 
 	return nil
 }
+
+func renderSubTable(data [][]string, file bool, tab bool) string {
+	var buf bytes.Buffer
+	table := tablewriter.NewWriter(&buf)
+	table.SetAutoWrapText(!file)
+	table.SetBorder(false)
+	table.SetNoWhiteSpace(true)
+	table.SetTablePadding("\t")
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range data {
+		table.Append(v)
+	}
+
+	table.Render()
+
+	if !tab {
+		return buf.String()
+	}
+
+	renderedTable := buf.String()
+	lines := strings.Split(renderedTable, "\n")
+	for i, line := range lines {
+		lines[i] = "\t" + line
+	}
+
+	return strings.Join(lines, "\n")
+}
+
+func twoLines(s string) [][]string {
+	lines := strings.Split(s, "\n")
+	res := [][]string{}
+
+	count := 0
+	for _, line := range lines {
+		line = strings.TrimSpace(line)
+		if line != "" {
+			count++
+			res = append(res, []string{line})
+			if count == 2 {
+				return res
+			}
+		}
+	}
+	return res
+}
+
+func formatParams(s string, tab bool) string {
+	lines := strings.Split(s, "\n")
+	table := [][]string{}
+
+	for _, line := range lines {
+		fields := strings.Fields(line)
+		fields[1] = strings.TrimPrefix(strings.TrimSuffix(fields[1], `"`), `"`)
+		table = append(table, fields)
+	}
+	return renderSubTable(table, false, tab)
+}
 
 func CopyHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {

View File

@@ -777,11 +777,12 @@ A single JSON object will be returned.
 POST /api/show
 ```
 
-Show information about a model including details, modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, system prompt.
 
 ### Parameters
 
 - `name`: name of the model to show
+- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
 
 ### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
 ```json
 {
   "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
-  "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
-  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
+  "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
+  "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
   "details": {
+    "parent_model": "",
     "format": "gguf",
     "family": "llama",
-    "families": ["llama", "clip"],
-    "parameter_size": "7B",
+    "families": [
+      "llama"
+    ],
+    "parameter_size": "8.0B",
     "quantization_level": "Q4_0"
-  }
+  },
+  "model_info": {
+    "general.architecture": "llama",
+    "general.file_type": 2,
+    "general.parameter_count": 8030261248,
+    "general.quantization_version": 2,
+    "llama.attention.head_count": 32,
+    "llama.attention.head_count_kv": 8,
+    "llama.attention.layer_norm_rms_epsilon": 0.00001,
+    "llama.block_count": 32,
+    "llama.context_length": 8192,
+    "llama.embedding_length": 4096,
+    "llama.feed_forward_length": 14336,
+    "llama.rope.dimension_count": 128,
+    "llama.rope.freq_base": 500000,
+    "llama.vocab_size": 128256,
+    "tokenizer.ggml.bos_token_id": 128000,
+    "tokenizer.ggml.eos_token_id": 128009,
+    "tokenizer.ggml.merges": [],            // populates if `verbose=true`
+    "tokenizer.ggml.model": "gpt2",
+    "tokenizer.ggml.pre": "llama-bpe",
+    "tokenizer.ggml.token_type": [],        // populates if `verbose=true`
+    "tokenizer.ggml.tokens": []             // populates if `verbose=true`
+  }
 }
 ```
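A minimal sketch of requesting the new verbose fields through the Go client, assuming the ShowRequest/ShowResponse shapes introduced above; the model name "llama3" is just a placeholder for any local model.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Verbose asks the server to fill the large tokenizer fields in model_info.
	resp, err := client.Show(context.Background(), &api.ShowRequest{Model: "llama3", Verbose: true})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.ModelInfo["general.architecture"])
}
```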

View File

@@ -47,19 +47,13 @@ success
 ### Supported Quantizations
 
-<details>
-<summary>Legacy Quantization</summary>
-
 - `Q4_0`
 - `Q4_1`
 - `Q5_0`
 - `Q5_1`
 - `Q8_0`
 
-</details>
+#### K-means Quantizations
 
-<details>
-<summary>K-means Quantization</summary>`
-
 - `Q3_K_S`
 - `Q3_K_M`
@@ -70,11 +64,6 @@ success
 - `Q5_K_M`
 - `Q6_K`
 
-</details>
-
-> [!NOTE]
-> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
-
 ## Template Detection
 
 > [!NOTE]

View File

@@ -22,7 +22,7 @@ docker logs <container-name>
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.
 
 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs
+- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
 - `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories

View File

@@ -39,8 +39,8 @@ server.
 Ollama on Windows stores files in a few different locations. You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
-    - *app.log* contains logs from the GUI application
-    - *server.log* contains the server logs
+    - *app.log* contains most resent logs from the GUI application
+    - *server.log* contains the most recent server logs
     - *upgrade.log* contains log output for upgrades
 - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` contains models and configuration

View File

@@ -231,7 +231,7 @@ func GetGPUInfo() GpuInfoList {
 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
 		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
-			depPath = filepath.Dir(envconfig.RunnersDir)
+			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
 		}
 
 		// Load ALL libraries
@@ -282,6 +282,12 @@ func GetGPUInfo() GpuInfoList {
 		// Intel
 		if envconfig.IntelGpu {
 			oHandles = initOneAPIHandles()
+			// On windows we bundle the oneapi library one level above the runner dir
+			depPath = ""
+			if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
+			}
+
 			for d := range oHandles.oneapi.num_drivers {
 				if oHandles.oneapi == nil {
 					// shouldn't happen
@@ -306,7 +312,7 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				// TODO dependency path?
+				gpuInfo.DependencyPath = depPath
 				oneapiGPUs = append(oneapiGPUs, gpuInfo)
 			}
 		}
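Sketch only: one way the per-GPU DependencyPath set above could be consumed when the runner subprocess is launched, so the bundled cuda\ and oneapi\ DLLs resolve without living on the user's global PATH. The helper name and the exact environment handling are assumptions, not code from this change set.

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// withDependencyPath prepends depPath to the PATH entry of an environment list.
func withDependencyPath(env []string, depPath string) []string {
	if depPath == "" {
		return env
	}
	out := make([]string, 0, len(env))
	for _, kv := range env {
		if strings.HasPrefix(strings.ToUpper(kv), "PATH=") {
			kv = "PATH=" + depPath + string(os.PathListSeparator) + kv[len("PATH="):]
		}
		out = append(out, kv)
	}
	return out
}

func main() {
	env := []string{"PATH=C:\\Windows\\system32"}
	fmt.Println(withDependencyPath(env, `C:\Program Files\Ollama\oneapi`))
}
```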

View File

@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
   for (i = 0; l[i].s != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);
       UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   for (i = 0; l[i].s != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!*l[i].p) {
+    if (!*(l[i].p)) {
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);
       UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
     // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
       resp->ch.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);

View File

@@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
     LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
     *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
       resp->oh.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -98,7 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
   }
 
   for (d = 0; d < resp->oh.num_drivers; d++) {
-    LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]);
+    LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
     ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
                                    &resp->oh.num_devices[d], NULL);
     if (ret != ZE_RESULT_SUCCESS) {

View File

@@ -56,7 +56,6 @@ struct server_params {
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
         return true;
     }
 
-    void validate_model_chat_template(server_params & sparams) {
-        llama_chat_message chat[] = {{"user", "test"}};
-        std::vector<char> buf(1);
-        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
-        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-            sparams.chat_template = "chatml";
-        }
-    }
-
     void initialize() {
         // create slots
         all_slots_are_idle = true;
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            sparams.chat_template = argv[i];
         }
         else if (arg == "--override-kv")
         {
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
     }
 
     const auto model_meta = llama.model_meta();
 
-    if (sparams.chat_template.empty()) { // custom chat template is not supplied
-        // check if the template comes with the model is supported by us
-        llama.validate_model_chat_template(sparams);
-    }
-
     // Middleware for API key validation
     auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
         // If API key is not set, skip validation

View File

@@ -18,7 +18,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
 
 case "${GOARCH}" in
     "amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
         # Static build for linking into the Go binary
         init_vars
         CMAKE_TARGETS="--target llama --target ggml"
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}_static"
         echo "Building static library"
         build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            init_vars
-           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="../build/darwin/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
            # Approximately 400% faster than LCD on same CPU
            #
            init_vars
-           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
            echo "Building AVX CPU"
            build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
            # Approximately 10% faster than AVX on same CPU
            #
            init_vars
-           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
            BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
            echo "Building AVX2 CPU"
            EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
        # Static build for linking into the Go binary
        init_vars
        CMAKE_TARGETS="--target llama --target ggml"
-       CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+       CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
        BUILD_DIR="../build/darwin/${ARCH}_static"
        echo "Building static library"
        build

View File

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
         # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
 
-        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"

View File

@@ -1,5 +1,7 @@
 #!powershell
 
+$ErrorActionPreference = "Stop"
+
 function amdGPUs {
     if ($env:AMDGPU_TARGETS) {
         return $env:AMDGPU_TARGETS
@@ -37,7 +39,8 @@ function init_vars {
     }
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
-        "-DLLAMA_NATIVE=off"
+        "-DLLAMA_NATIVE=off",
+        "-DLLAMA_OPENMP=off"
         )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -83,9 +86,9 @@ function init_vars {
 function git_module_setup {
     # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
     & git submodule init
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     & git submodule update --force "${script:llamacppDir}"
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }
 
 function apply_patches {
@@ -119,7 +122,7 @@ function build {
     write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
     & cmake --version
     & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ($cmakeDefs -contains "-G") {
         $extra=@("-j8")
     } else {
@@ -127,7 +130,7 @@ function build {
     }
     write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
-    if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     # Rearrange output to be consistent between different generators
     if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
         mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
@@ -141,7 +144,7 @@ function sign {
         foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
             & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                 /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
-            if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         }
     }
 }
@@ -206,7 +209,8 @@ function build_static() {
             "-DLLAMA_AVX2=off",
             "-DLLAMA_AVX512=off",
            "-DLLAMA_F16C=off",
-            "-DLLAMA_FMA=off")
+            "-DLLAMA_FMA=off",
+            "-DLLAMA_OPENMP=off")
         $script:buildDir="../build/windows/${script:ARCH}_static"
         write-host "Building static library"
         build
@@ -216,13 +220,7 @@ function build_static() {
     }
 }
 
-function build_cpu() {
-    if ($script:ARCH -eq "arm64") {
-        $gen_arch = "ARM64"
-    } else { # amd64
-        $gen_arch = "x64"
-    }
-
+function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC
         init_vars
@@ -285,7 +283,7 @@ function build_cuda() {
            "-DLLAMA_AVX=on",
            "-DLLAMA_AVX2=off",
            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-            "-DCMAKE_CUDA_FLAGS=-t8"
+            "-DCMAKE_CUDA_FLAGS=-t8",
            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
         )
         if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
@@ -297,10 +295,12 @@ function build_cuda() {
         sign
         install
 
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
     } else {
         write-host "Skipping CUDA generation step"
     }
@@ -334,16 +334,18 @@ function build_oneapi() {
     sign
     install
 
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
     } else {
         Write-Host "Skipping oneAPI generation step"
     }
@@ -408,29 +410,16 @@ init_vars
 if ($($args.count) -eq 0) {
     git_module_setup
     apply_patches
-
-    $tasks = @("build_static", "build_cpu")
-    $jobs = @()
-    if ($script:ARCH -ne "arm64") {
-        $tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm")
-    }
-    foreach ($t in $tasks) {
-        $jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t)
-    }
-    get-job
-    foreach ($job in $jobs) {
-        write-host "----" $job.Name output follows
-        receive-job -wait -job $job
-        write-host "----" $job.Name $job.State
-        write-host ""
-        if ($job.State -contains 'Failed') {
-            cleanup
-            write-host "Terminating remaining jobs (this takes a while, you can ^C)"
-            # TODO find some way to kill the spawned cmake processes faster
-            remove-job -force -job $jobs
-            exit(-1)
-        }
-    }
-    get-job
+    build_static
+    if ($script:ARCH -eq "arm64") {
+        build_cpu("ARM64")
+    } else { # amd64
+        build_cpu("x64")
+        build_cpu_avx
+        build_cpu_avx2
+        build_cuda
+        build_oneapi
+        build_rocm
+    }
 
     cleanup

View File

@@ -367,6 +367,17 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(vocab+2*embedding),
 			fullOffload,
 		)
+	case "deepseek2":
+		keys := uint64(llm.KV()["deepseek2.attention.key_length"].(uint32))
+		fullOffload = max(
+			4*batch*(3*embedding+vocab),
+			4*batch*(3*embedding+2+context*(1+headsKV)+2*keys*headsKV),
+		)
+
+		partialOffload = max(
+			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(2*embedding+1+2*keys*headsKV+context+context*headsKV)+4*keys*context*headsKV+embedding*keys*headsKV*9/16,
+		)
 	}
 
 	return

View File

@@ -1,6 +1,7 @@
package llm package llm
import ( import (
"fmt"
"log/slog" "log/slog"
"strconv" "strconv"
"strings" "strings"
@@ -49,6 +50,18 @@ type MemoryEstimate struct {
// For multi-GPU scenarios, this is the size in bytes per GPU // For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64 GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
} }
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -167,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// For all the layers, find where they can fit on the GPU(s) // For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) { for i := range int(ggml.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
}
memoryWeights += layerSize memoryWeights += layerSize
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
@@ -252,78 +270,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
allocationsList = append(allocationsList, format.HumanBytes2(a)) allocationsList = append(allocationsList, format.HumanBytes2(a))
} }
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
}
if gpus[0].Library == "cpu" {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) log() {
slog.Info(
-"offload to gpu",
+"offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
-"requested", opts.NumGPU,
+"requested", m.layersRequested,
// The number of layers the model has (including output)
-"model", int(ggml.KV().BlockCount())+1,
+"model", m.layersModel,
// estimated number of layers that can be offloaded
-"offload", layerCount,
+"offload", m.Layers,
-// multi-gpu split for tesnors
+// multi-gpu split for tensors
-"split", tensorSplit,
+"split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available by GPU for offloading
-"available", availableList,
+"available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
-"full", format.HumanBytes2(memoryRequiredTotal),
+"full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
-"partial", format.HumanBytes2(memoryRequiredPartial),
+"partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
-"kv", format.HumanBytes2(kv),
+"kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
-"allocations", allocationsList,
+"allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
-"total", format.HumanBytes2(memoryWeights),
+"total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
-"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
-"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
-"full", format.HumanBytes2(graphFullOffload),
+"full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
-"partial", format.HumanBytes2(graphPartialOffload),
+"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
if gpus[0].Library == "cpu" {
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
return MemoryEstimate{
Layers: layerCount,
Graph: graphOffload,
VRAMSize: memoryRequiredPartial,
TotalSize: memoryRequiredTotal,
TensorSplit: tensorSplit,
GPUSizes: gpuAllocations,
}
}
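Net effect of the refactor above: EstimateGPULayers now returns a MemoryEstimate that carries everything the old inline slog.Info call needed, and the caller decides when to log it (NewLlamaServer calls estimate.log(), as a later hunk shows). A stripped-down sketch of that pattern, with field names reduced for brevity and values invented for illustration:

package main

import "log/slog"

// memoryEstimate is a cut-down stand-in for llm.MemoryEstimate: exported
// fields are the real result, unexported fields exist only so log() can
// report how the estimate was reached.
type memoryEstimate struct {
	Layers    int
	TotalSize uint64
	VRAMSize  uint64

	inferenceLibrary string
	layersRequested  int
}

func (m memoryEstimate) log() {
	slog.Info("offload to "+m.inferenceLibrary,
		slog.Group("layers",
			"requested", m.layersRequested,
			"offload", m.Layers,
		),
		slog.Group("memory",
			"full", m.TotalSize,
			"partial", m.VRAMSize,
		),
	)
}

func main() {
	// Values are illustrative; the real ones come from EstimateGPULayers.
	e := memoryEstimate{Layers: 28, TotalSize: 6 << 30, VRAMSize: 5 << 30, inferenceLibrary: "cuda", layersRequested: -1}
	e.log()
}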

View File

@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
--- a/common/common.cpp
+++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
--- a/common/common.h
+++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
+@@ -180,6 +180,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
}; +
// server params
-void gpt_params_handle_model_default(gpt_params & params);
+int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds

View File

@@ -1,8 +1,8 @@
diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
--- a/llama.cpp
+++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-- } else if (
-+ if (
-tokenizer_pre == "default") {
+- } else if (tokenizer_pre == "default") {
++ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
-tokenizer_pre == "smaug-bpe") {
-vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+tokenizer_pre == "llama3" ||
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+tokenizer_pre == "poro-chat") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

View File

@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
-pattern := filepath.Join(payloadsDir, "*")
+pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
-servers[filepath.Base(file)] = file
+servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
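With runners now unpacked into per-runner subdirectories, the map is keyed by the directory name rather than the file name. A quick sketch of what the two filepath calls above produce for a made-up payload path (shown Unix-style; filepath uses the OS-specific separator):

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Hypothetical payload layout: <payloadsDir>/<runner>/ollama_llama_server
	file := filepath.Join("payloads", "cuda_v11", "ollama_llama_server")

	dir := filepath.Dir(file)  // payloads/cuda_v11
	name := filepath.Base(dir) // cuda_v11
	fmt.Println(name, "->", dir)
}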

View File

@@ -116,6 +116,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
estimate.log()
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -200,7 +202,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-opts.UseMMap = false
+opts.UseMMap = api.TriStateFalse
}
}
@@ -208,7 +210,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
-if !opts.UseMMap {
+// Windows CUDA should not use mmap for best performance
+if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
@@ -271,8 +274,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
-// prepend the server directory to LD_LIBRARY_PATH/PATH
+// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
-libraryPaths := []string{dir}
+libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
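The mmap change above replaces the old boolean with a tri-state so "unset" can be told apart from an explicit "false". A sketch of the resulting decision, using a local triState type as a stand-in for the api package's values (the constant names and ordering here are assumptions):

package main

import (
	"fmt"
	"runtime"
)

// triState is a stand-in for the api tri-state option used above.
type triState int

const (
	triStateUndefined triState = iota
	triStateTrue
	triStateFalse
)

// disableMMap reflects the condition in the diff: Windows + CUDA always
// passes --no-mmap, and so does an explicit user opt-out.
func disableMMap(library string, useMMap triState) bool {
	return (runtime.GOOS == "windows" && library == "cuda") || useMMap == triStateFalse
}

func main() {
	fmt.Println(disableMMap("cuda", triStateUndefined))
	fmt.Println(disableMMap("metal", triStateFalse))
}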

View File

@@ -103,19 +103,19 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
-md "${script:DEPS_DIR}" -ea 0 > $null
+md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
-cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
+cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
-cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
+cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
-cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
+cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
-foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file

View File

@@ -734,9 +734,44 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
kvData, err := getKVData(m.ModelPath, req.Verbose)
if err != nil {
return nil, err
}
delete(kvData, "general.name")
delete(kvData, "tokenizer.chat_template")
resp.ModelInfo = kvData
if len(m.ProjectorPaths) > 0 {
projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
if err != nil {
return nil, err
}
resp.ProjectorInfo = projectorData
}
return resp, nil
}
func getKVData(digest string, verbose bool) (llm.KV, error) {
kvData, err := llm.LoadModel(digest)
if err != nil {
return nil, err
}
kv := kvData.KV()
if !verbose {
for k := range kv {
if t, ok := kv[k].([]any); len(t) > 5 && ok {
kv[k] = []any{}
}
}
}
return kv, nil
}
func (s *Server) ListModelsHandler(c *gin.Context) {
ms, err := Manifests()
if err != nil {
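The new getKVData above is what lets api/show return model metadata without flooding the response: unless verbose output is requested, any array-valued key with more than five elements is emptied. A small sketch of that filtering rule on a plain map (a stand-in for llm.KV):

package main

import "fmt"

// truncateKV applies the same rule as getKVData: keep scalar values, but
// blank out long arrays unless verbose output was requested.
func truncateKV(kv map[string]any, verbose bool) map[string]any {
	if verbose {
		return kv
	}
	for k := range kv {
		if t, ok := kv[k].([]any); ok && len(t) > 5 {
			kv[k] = []any{}
		}
	}
	return kv
}

func main() {
	kv := map[string]any{
		"general.architecture":  "llama",
		"tokenizer.ggml.tokens": []any{"a", "b", "c", "d", "e", "f"},
	}
	fmt.Println(truncateKV(kv, false)) // the token list is emptied, scalars stay
}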

View File

@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser" "github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model" "github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version" "github.com/ollama/ollama/version"
@@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) {
"top_p 0.9", "top_p 0.9",
} }
assert.Equal(t, expectedParams, params) assert.Equal(t, expectedParams, params)
assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
},
},
}
@@ -325,3 +327,40 @@ func TestCase(t *testing.T) {
})
}
}
func TestShow(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
var s Server
createRequest(t, s.CreateModelHandler, api.CreateRequest{
Name: "show-model",
Modelfile: fmt.Sprintf(
"FROM %s\nFROM %s",
createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
),
})
w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
Name: "show-model",
})
if w.Code != http.StatusOK {
t.Fatalf("expected status code 200, actual %d", w.Code)
}
var resp api.ShowResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.ModelInfo["general.architecture"] != "test" {
t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"])
}
if resp.ProjectorInfo["general.architecture"] != "clip" {
t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
}
}

View File

@@ -4,7 +4,6 @@ package model
import (
"cmp"
"encoding/hex"
"errors"
"fmt"
"log/slog"
@@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
}
return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
}
type DigestType byte
const (
DigestTypeInvalid DigestType = iota
DigestTypeSHA256
)
func (t DigestType) String() string {
switch t {
case DigestTypeSHA256:
return "sha256"
default:
return "invalid"
}
}
type Digest struct {
Type DigestType
Sum [32]byte
}
func ParseDigest(s string) (Digest, error) {
i := strings.IndexAny(s, "-:")
if i < 0 {
return Digest{}, fmt.Errorf("invalid digest %q", s)
}
typ, encSum := s[:i], s[i+1:]
if typ != "sha256" {
return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
}
d := Digest{
Type: DigestTypeSHA256,
}
n, err := hex.Decode(d.Sum[:], []byte(encSum))
if err != nil {
return Digest{}, err
}
if n != 32 {
return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n)
}
return d, nil
}
func (d Digest) String() string {
if d.Type == DigestTypeInvalid {
return ""
}
return fmt.Sprintf("sha256-%x", d.Sum)
}
func (d Digest) IsValid() bool {
return d.Type != DigestTypeInvalid
}

View File

@@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) {
}
}
const (
validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
)
func TestParseDigest(t *testing.T) {
cases := []struct {
in string
want string
}{
{"", ""}, // empty
{"sha123-12", ""}, // invalid type
{"sha256-", ""}, // invalid sum
{"sha256-123", ""}, // invalid odd length sum
{validSha256, validSha256},
{validSha256Old, validSha256},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
got, err := ParseDigest(tt.in)
if err != nil {
if tt.want != "" {
t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want)
}
return
}
if got.String() != tt.want {
t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want)
}
})
}
}
func TestParseNameFromFilepath(t *testing.T) {
cases := map[string]Name{
filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},