diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 40f9c41f0..61ca3c433 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -437,6 +437,7 @@ jobs:
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
+ GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -460,15 +461,20 @@ jobs:
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- - uses: ncipollo/release-action@v1
- with:
- name: ${{ env.RELEASE_VERSION }}
- allowUpdates: true
- artifacts: 'dist/*'
- draft: true
- prerelease: true
- omitBodyDuringUpdate: true
- generateReleaseNotes: true
- omitDraftDuringUpdate: true
- omitPrereleaseDuringUpdate: true
- replacesArtifacts: true
+ - name: Create or update Release
+ run: |
+ echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
+ OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
+ if [ -n "$OLD_TAG" ]; then
+ echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
+ gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
+ else
+ echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
+ gh release create ${GITHUB_REF_NAME} \
+ --title ${{ env.RELEASE_VERSION }} \
+ --draft \
+ --generate-notes \
+ --prerelease
+ fi
+ echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
+ gh release upload ${GITHUB_REF_NAME} dist/* --clobber
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index dbb6c2fdf..29adf56f3 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -124,7 +124,7 @@ jobs:
strategy:
matrix:
rocm-version:
- - '6.0.2'
+ - '6.1.1'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
diff --git a/Dockerfile b/Dockerfile
index 72edef2a9..98a3ddfd2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
-ARG ROCM_VERSION=6.0.2
+ARG ROCM_VERSION=6.1.1
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
diff --git a/README.md b/README.md
index 2fdc63cb3..72ed8fa5e 100644
--- a/README.md
+++ b/README.md
@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
-| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
-| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
+| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
+| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
+### Show model information
+
+```
+ollama show llama3
+```
+
### List models on your computer
```
diff --git a/api/types.go b/api/types.go
index d99cf3bcc..95ed5d37e 100644
--- a/api/types.go
+++ b/api/types.go
@@ -159,18 +159,49 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
- UseNUMA bool `json:"numa,omitempty"`
- NumCtx int `json:"num_ctx,omitempty"`
- NumBatch int `json:"num_batch,omitempty"`
- NumGPU int `json:"num_gpu,omitempty"`
- MainGPU int `json:"main_gpu,omitempty"`
- LowVRAM bool `json:"low_vram,omitempty"`
- F16KV bool `json:"f16_kv,omitempty"`
- LogitsAll bool `json:"logits_all,omitempty"`
- VocabOnly bool `json:"vocab_only,omitempty"`
- UseMMap bool `json:"use_mmap,omitempty"`
- UseMLock bool `json:"use_mlock,omitempty"`
- NumThread int `json:"num_thread,omitempty"`
+ UseNUMA bool `json:"numa,omitempty"`
+ NumCtx int `json:"num_ctx,omitempty"`
+ NumBatch int `json:"num_batch,omitempty"`
+ NumGPU int `json:"num_gpu,omitempty"`
+ MainGPU int `json:"main_gpu,omitempty"`
+ LowVRAM bool `json:"low_vram,omitempty"`
+ F16KV bool `json:"f16_kv,omitempty"`
+ LogitsAll bool `json:"logits_all,omitempty"`
+ VocabOnly bool `json:"vocab_only,omitempty"`
+ UseMMap TriState `json:"use_mmap,omitempty"`
+ UseMLock bool `json:"use_mlock,omitempty"`
+ NumThread int `json:"num_thread,omitempty"`
+}
+
+type TriState int
+
+const (
+ TriStateUndefined TriState = -1
+ TriStateFalse TriState = 0
+ TriStateTrue TriState = 1
+)
+
+func (b *TriState) UnmarshalJSON(data []byte) error {
+ var v bool
+ if err := json.Unmarshal(data, &v); err != nil {
+ return err
+ }
+	if v {
+		*b = TriStateTrue
+		return nil
+	}
+	*b = TriStateFalse
+	return nil
+}
+
+func (b *TriState) MarshalJSON() ([]byte, error) {
+ if *b == TriStateUndefined {
+ return nil, nil
+ }
+ var v bool
+ if *b == TriStateTrue {
+ v = true
+ }
+ return json.Marshal(v)
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -222,6 +253,7 @@ type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
+ Verbose bool `json:"verbose"`
Options map[string]interface{} `json:"options"`
@@ -231,13 +263,16 @@ type ShowRequest struct {
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
- License string `json:"license,omitempty"`
- Modelfile string `json:"modelfile,omitempty"`
- Parameters string `json:"parameters,omitempty"`
- Template string `json:"template,omitempty"`
- System string `json:"system,omitempty"`
- Details ModelDetails `json:"details,omitempty"`
- Messages []Message `json:"messages,omitempty"`
+ License string `json:"license,omitempty"`
+ Modelfile string `json:"modelfile,omitempty"`
+ Parameters string `json:"parameters,omitempty"`
+ Template string `json:"template,omitempty"`
+ System string `json:"system,omitempty"`
+ Details ModelDetails `json:"details,omitempty"`
+ Messages []Message `json:"messages,omitempty"`
+ ModelInfo map[string]any `json:"model_info,omitempty"`
+ ProjectorInfo map[string]any `json:"projector_info,omitempty"`
+ ModifiedAt time.Time `json:"modified_at,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
@@ -402,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
continue
}
+ if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+ val, ok := val.(bool)
+ if !ok {
+ return fmt.Errorf("option %q must be of type boolean", key)
+ }
+ if val {
+ field.SetInt(int64(TriStateTrue))
+ } else {
+ field.SetInt(int64(TriStateFalse))
+ }
+ continue
+ }
+
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
@@ -490,7 +538,7 @@ func DefaultOptions() Options {
LowVRAM: false,
F16KV: true,
UseMLock: false,
- UseMMap: true,
+ UseMMap: TriStateUndefined,
UseNUMA: false,
},
}
@@ -560,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
+ if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+ boolVal, err := strconv.ParseBool(vals[0])
+ if err != nil {
+ return nil, fmt.Errorf("invalid bool value %s", vals)
+ }
+ if boolVal {
+ out[key] = TriStateTrue
+ } else {
+ out[key] = TriStateFalse
+ }
+ continue
+ }
+
switch field.Kind() {
case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32)
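For reference, a minimal sketch of how the new `TriState` plumbing behaves from a caller's point of view, using only the `api` package as changed in this diff: an absent `use_mmap` stays `TriStateUndefined` after `DefaultOptions`/`FromMap`, while an explicit JSON boolean maps to `TriStateTrue`/`TriStateFalse`.

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// A request body where use_mmap is an ordinary JSON boolean.
	var raw map[string]interface{}
	if err := json.Unmarshal([]byte(`{"use_mmap": false}`), &raw); err != nil {
		panic(err)
	}

	opts := api.DefaultOptions() // UseMMap starts out as TriStateUndefined (-1)
	if err := opts.FromMap(raw); err != nil {
		panic(err)
	}

	fmt.Println(opts.UseMMap) // 0 (TriStateFalse); omitting use_mmap would leave -1
}
```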
diff --git a/api/types_test.go b/api/types_test.go
index 211385c70..8b6c60c62 100644
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
+ "fmt"
"math"
"testing"
"time"
@@ -105,3 +106,101 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
})
}
}
+
+func TestUseMmapParsingFromJSON(t *testing.T) {
+ tests := []struct {
+ name string
+ req string
+ exp TriState
+ }{
+ {
+ name: "Undefined",
+ req: `{ }`,
+ exp: TriStateUndefined,
+ },
+ {
+ name: "True",
+ req: `{ "use_mmap": true }`,
+ exp: TriStateTrue,
+ },
+ {
+ name: "False",
+ req: `{ "use_mmap": false }`,
+ exp: TriStateFalse,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ var oMap map[string]interface{}
+ err := json.Unmarshal([]byte(test.req), &oMap)
+ require.NoError(t, err)
+ opts := DefaultOptions()
+ err = opts.FromMap(oMap)
+ require.NoError(t, err)
+ assert.Equal(t, test.exp, opts.UseMMap)
+ })
+ }
+}
+
+func TestUseMmapFormatParams(t *testing.T) {
+ tests := []struct {
+ name string
+ req map[string][]string
+ exp TriState
+ err error
+ }{
+ {
+ name: "True",
+ req: map[string][]string{
+ "use_mmap": []string{"true"},
+ },
+ exp: TriStateTrue,
+ err: nil,
+ },
+ {
+ name: "False",
+ req: map[string][]string{
+ "use_mmap": []string{"false"},
+ },
+ exp: TriStateFalse,
+ err: nil,
+ },
+ {
+ name: "Numeric True",
+ req: map[string][]string{
+ "use_mmap": []string{"1"},
+ },
+ exp: TriStateTrue,
+ err: nil,
+ },
+ {
+ name: "Numeric False",
+ req: map[string][]string{
+ "use_mmap": []string{"0"},
+ },
+ exp: TriStateFalse,
+ err: nil,
+ },
+ {
+ name: "invalid string",
+ req: map[string][]string{
+ "use_mmap": []string{"foo"},
+ },
+ exp: TriStateUndefined,
+ err: fmt.Errorf("invalid bool value [foo]"),
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ resp, err := FormatParams(test.req)
+ require.Equal(t, err, test.err)
+ respVal, ok := resp["use_mmap"]
+ if test.exp != TriStateUndefined {
+ assert.True(t, ok, "resp: %v", resp)
+ assert.Equal(t, test.exp, respVal)
+ }
+ })
+ }
+}
diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go
index df2597a83..a8f1f7cdf 100644
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -5,6 +5,8 @@ import (
"log/slog"
"os"
"path/filepath"
+ "strconv"
+ "strings"
"github.com/ollama/ollama/envconfig"
)
@@ -24,6 +26,7 @@ func InitLogging() {
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
+ rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
slog.Info("ollama app started")
}
+
+func rotateLogs(logFile string) {
+ if _, err := os.Stat(logFile); os.IsNotExist(err) {
+ return
+ }
+ index := strings.LastIndex(logFile, ".")
+ pre := logFile[:index]
+ post := "." + logFile[index+1:]
+ for i := LogRotationCount; i > 0; i-- {
+ older := pre + "-" + strconv.Itoa(i) + post
+ newer := pre + "-" + strconv.Itoa(i-1) + post
+ if i == 1 {
+ newer = pre + post
+ }
+ if _, err := os.Stat(newer); err == nil {
+ if _, err := os.Stat(older); err == nil {
+ err := os.Remove(older)
+ if err != nil {
+ slog.Warn("Failed to remove older log", "older", older, "error", err)
+ continue
+ }
+ }
+ err := os.Rename(newer, older)
+ if err != nil {
+ slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
+ }
+ }
+ }
+}
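For reference, with `LogRotationCount = 5` the rotation above shifts `app.log` into `app-1.log`, bumps each existing `app-N.log` to `app-(N+1).log`, and discards anything past the fifth slot. A minimal sketch of the rename sequence it performs (illustrative only, not part of the change):

```go
package main

import "fmt"

func main() {
	const logRotationCount = 5
	// Rename order mirrors rotateLogs: oldest slot first, current log last.
	for i := logRotationCount; i > 0; i-- {
		newer := fmt.Sprintf("app-%d.log", i-1)
		if i == 1 {
			newer = "app.log"
		}
		fmt.Printf("%s -> app-%d.log\n", newer, i)
	}
}
```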
diff --git a/app/lifecycle/logging_test.go b/app/lifecycle/logging_test.go
new file mode 100644
index 000000000..a2157ca2c
--- /dev/null
+++ b/app/lifecycle/logging_test.go
@@ -0,0 +1,44 @@
+package lifecycle
+
+import (
+ "os"
+ "path/filepath"
+ "strconv"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestRotateLogs(t *testing.T) {
+ logDir := t.TempDir()
+ logFile := filepath.Join(logDir, "testlog.log")
+
+ // No log exists
+ rotateLogs(logFile)
+
+ require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
+ assert.FileExists(t, logFile)
+ // First rotation
+ rotateLogs(logFile)
+ assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+ assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+ assert.NoFileExists(t, logFile)
+
+ // Should be a no-op without a new log
+ rotateLogs(logFile)
+ assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+ assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+ assert.NoFileExists(t, logFile)
+
+ for i := 2; i <= LogRotationCount+1; i++ {
+ require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
+ assert.FileExists(t, logFile)
+ rotateLogs(logFile)
+ assert.NoFileExists(t, logFile)
+ for j := 1; j < i; j++ {
+ assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
+ }
+ assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
+ }
+}
diff --git a/app/lifecycle/paths.go b/app/lifecycle/paths.go
index fe07bce10..4d9f4c5a1 100644
--- a/app/lifecycle/paths.go
+++ b/app/lifecycle/paths.go
@@ -16,11 +16,12 @@ var (
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
- UpdateStageDir = "/tmp"
- AppLogFile = "/tmp/ollama_app.log"
- ServerLogFile = "/tmp/ollama.log"
- UpgradeLogFile = "/tmp/ollama_update.log"
- Installer = "OllamaSetup.exe"
+ UpdateStageDir = "/tmp"
+ AppLogFile = "/tmp/ollama_app.log"
+ ServerLogFile = "/tmp/ollama.log"
+ UpgradeLogFile = "/tmp/ollama_update.log"
+ Installer = "OllamaSetup.exe"
+ LogRotationCount = 5
)
func init() {
diff --git a/app/lifecycle/server.go b/app/lifecycle/server.go
index 0152ccd11..c178a1abf 100644
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
}
- // TODO - rotation
+ rotateLogs(ServerLogFile)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)
diff --git a/app/ollama.iss b/app/ollama.iss
index 9dc61abbf..e6502abd3 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -88,10 +88,15 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
+#if DirExists("..\dist\windows-amd64\cuda")
+ Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+ Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif
diff --git a/cmd/cmd.go b/cmd/cmd.go
index ae7c8da8f..909e8e4b2 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
}
defer tempfile.Close()
- zipfile := zip.NewWriter(tempfile)
- defer zipfile.Close()
-
detectContentType := func(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...)
}
+ zipfile := zip.NewWriter(tempfile)
+ defer zipfile.Close()
+
for _, file := range files {
f, err := os.Open(file)
if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
}
func RunHandler(cmd *cobra.Command, args []string) error {
- client, err := api.ClientFromEnvironment()
- if err != nil {
- return err
- }
-
- name := args[0]
-
- // check if the model exists on the server
- show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
- var statusError api.StatusError
- switch {
- case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
- if err := PullHandler(cmd, []string{name}); err != nil {
- return err
- }
-
- show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
- if err != nil {
- return err
- }
- case err != nil:
- return err
- }
-
interactive := true
opts := runOptions{
- Model: args[0],
- WordWrap: os.Getenv("TERM") == "xterm-256color",
- Options: map[string]interface{}{},
- MultiModal: slices.Contains(show.Details.Families, "clip"),
- ParentModel: show.Details.ParentModel,
+ Model: args[0],
+ WordWrap: os.Getenv("TERM") == "xterm-256color",
+ Options: map[string]interface{}{},
}
format, err := cmd.Flags().GetString("format")
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
opts.WordWrap = !nowrap
- if !interactive {
- return generate(cmd, opts)
+ // Fill out the rest of the options based on information about the
+ // model.
+ client, err := api.ClientFromEnvironment()
+ if err != nil {
+ return err
}
- return generateInteractive(cmd, opts)
+ name := args[0]
+ info, err := func() (*api.ShowResponse, error) {
+ showReq := &api.ShowRequest{Name: name}
+ info, err := client.Show(cmd.Context(), showReq)
+ var se api.StatusError
+ if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+ if err := PullHandler(cmd, []string{name}); err != nil {
+ return nil, err
+ }
+ return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+ }
+ return info, err
+ }()
+ if err != nil {
+ return err
+ }
+
+ opts.MultiModal = slices.Contains(info.Details.Families, "clip")
+ opts.ParentModel = info.Details.ParentModel
+ opts.Messages = append(opts.Messages, info.Messages...)
+
+ if interactive {
+ return generateInteractive(cmd, opts)
+ }
+ return generate(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
@@ -579,10 +580,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
- if len(args) != 1 {
- return errors.New("missing model name")
- }
-
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +622,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
- } else if flagsSet == 0 {
- return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
+ }
+
+ if flagsSet == 1 {
+ req := api.ShowRequest{Name: args[0]}
+ resp, err := client.Show(cmd.Context(), &req)
+ if err != nil {
+ return err
+ }
+
+ switch showType {
+ case "license":
+ fmt.Println(resp.License)
+ case "modelfile":
+ fmt.Println(resp.Modelfile)
+ case "parameters":
+ fmt.Println(resp.Parameters)
+ case "system":
+ fmt.Println(resp.System)
+ case "template":
+ fmt.Println(resp.Template)
+ }
+
+ return nil
}
req := api.ShowRequest{Name: args[0]}
@@ -635,22 +653,114 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
- switch showType {
- case "license":
- fmt.Println(resp.License)
- case "modelfile":
- fmt.Println(resp.Modelfile)
- case "parameters":
- fmt.Println(resp.Parameters)
- case "system":
- fmt.Println(resp.System)
- case "template":
- fmt.Println(resp.Template)
+ arch := resp.ModelInfo["general.architecture"].(string)
+
+ modelData := [][]string{
+ {"arch", arch},
+ {"parameters", resp.Details.ParameterSize},
+ {"quantization", resp.Details.QuantizationLevel},
+ {"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+ {"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
}
+ mainTableData := [][]string{
+ {"Model"},
+ {renderSubTable(modelData, false)},
+ }
+
+ if resp.ProjectorInfo != nil {
+ projectorData := [][]string{
+ {"arch", "clip"},
+ {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
+ {"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
+ {"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+ {"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+ }
+
+ mainTableData = append(mainTableData,
+ []string{"Projector"},
+ []string{renderSubTable(projectorData, false)},
+ )
+ }
+
+ if resp.Parameters != "" {
+ mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
+ }
+
+ if resp.System != "" {
+ mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
+ }
+
+ if resp.License != "" {
+ mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
+ }
+
+ table := tablewriter.NewWriter(os.Stdout)
+ table.SetAutoWrapText(false)
+ table.SetBorder(false)
+ table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+ for _, v := range mainTableData {
+ table.Append(v)
+ }
+
+ table.Render()
+
return nil
}
+func renderSubTable(data [][]string, file bool) string {
+ var buf bytes.Buffer
+ table := tablewriter.NewWriter(&buf)
+ table.SetAutoWrapText(!file)
+ table.SetBorder(false)
+ table.SetNoWhiteSpace(true)
+ table.SetTablePadding("\t")
+ table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+ for _, v := range data {
+ table.Append(v)
+ }
+
+ table.Render()
+
+ renderedTable := buf.String()
+ lines := strings.Split(renderedTable, "\n")
+ for i, line := range lines {
+ lines[i] = "\t" + line
+ }
+
+ return strings.Join(lines, "\n")
+}
+
+func twoLines(s string) [][]string {
+ lines := strings.Split(s, "\n")
+ res := [][]string{}
+
+ count := 0
+ for _, line := range lines {
+ line = strings.TrimSpace(line)
+ if line != "" {
+ count++
+ res = append(res, []string{line})
+ if count == 2 {
+ return res
+ }
+ }
+ }
+ return res
+}
+
+func formatParams(s string) string {
+ lines := strings.Split(s, "\n")
+ table := [][]string{}
+
+ for _, line := range lines {
+ table = append(table, strings.Fields(line))
+ }
+ return renderSubTable(table, false)
+}
+
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
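The `ollama show` output above is built from nested tablewriter tables with borders and padding stripped. A small standalone sketch of that sub-table technique, assuming the `github.com/olekukonko/tablewriter` package this change relies on:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/olekukonko/tablewriter"
)

func main() {
	var buf bytes.Buffer
	table := tablewriter.NewWriter(&buf)
	table.SetAutoWrapText(false)
	table.SetBorder(false)
	table.SetNoWhiteSpace(true)
	table.SetTablePadding("\t")
	table.SetAlignment(tablewriter.ALIGN_LEFT)

	// Two-column key/value rows, as the "Model" sub-table uses.
	for _, row := range [][]string{
		{"arch", "llama"},
		{"parameters", "8.0B"},
		{"quantization", "Q4_0"},
	} {
		table.Append(row)
	}
	table.Render()
	fmt.Print(buf.String())
}
```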
diff --git a/cmd/interactive.go b/cmd/interactive.go
index 80a915474..0a2f429b6 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -31,65 +31,40 @@ const (
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
- client, err := api.ClientFromEnvironment()
- if err != nil {
- return err
- }
-
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
- showReq := api.ShowRequest{Name: opts.Model}
- showResp, err := client.Show(cmd.Context(), &showReq)
+ client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
- opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
- opts.ParentModel = showResp.Details.ParentModel
-
- if len(showResp.Messages) > 0 {
- opts.Messages = append(opts.Messages, showResp.Messages...)
- }
chatReq := &api.ChatRequest{
- Model: opts.Model,
- Messages: []api.Message{},
+ Model: opts.Model,
+ KeepAlive: opts.KeepAlive,
}
- if opts.KeepAlive != nil {
- chatReq.KeepAlive = opts.KeepAlive
- }
-
- err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
+ return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
- if len(opts.Messages) > 0 {
- for _, msg := range opts.Messages {
- switch msg.Role {
- case "user":
- fmt.Printf(">>> %s\n", msg.Content)
- case "assistant":
- state := &displayResponseState{}
- displayResponse(msg.Content, opts.WordWrap, state)
- fmt.Println()
- fmt.Println()
- }
+ for _, msg := range opts.Messages {
+ switch msg.Role {
+ case "user":
+ fmt.Printf(">>> %s\n", msg.Content)
+ case "assistant":
+ state := &displayResponseState{}
+ displayResponse(msg.Content, opts.WordWrap, state)
+ fmt.Println()
+ fmt.Println()
}
}
return nil
})
- if err != nil {
- return err
- }
-
- return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
- opts.Messages = make([]api.Message, 0)
-
err := loadModel(cmd, &opts)
if err != nil {
return err
diff --git a/docs/api.md b/docs/api.md
index 35f1def33..107b5211f 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -777,11 +777,12 @@ A single JSON object will be returned.
POST /api/show
```
-Show information about a model including details, modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, and system prompt.
### Parameters
- `name`: name of the model to show
+- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
- "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
- "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
+ "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
+ "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"details": {
+ "parent_model": "",
"format": "gguf",
"family": "llama",
- "families": ["llama", "clip"],
- "parameter_size": "7B",
+ "families": [
+ "llama"
+ ],
+ "parameter_size": "8.0B",
"quantization_level": "Q4_0"
+ },
+ "model_info": {
+ "general.architecture": "llama",
+ "general.file_type": 2,
+ "general.parameter_count": 8030261248,
+ "general.quantization_version": 2,
+ "llama.attention.head_count": 32,
+ "llama.attention.head_count_kv": 8,
+ "llama.attention.layer_norm_rms_epsilon": 0.00001,
+ "llama.block_count": 32,
+ "llama.context_length": 8192,
+ "llama.embedding_length": 4096,
+ "llama.feed_forward_length": 14336,
+ "llama.rope.dimension_count": 128,
+ "llama.rope.freq_base": 500000,
+ "llama.vocab_size": 128256,
+ "tokenizer.ggml.bos_token_id": 128000,
+ "tokenizer.ggml.eos_token_id": 128009,
+ "tokenizer.ggml.merges": [], // populates if `verbose=true`
+ "tokenizer.ggml.model": "gpt2",
+ "tokenizer.ggml.pre": "llama-bpe",
+ "tokenizer.ggml.token_type": [], // populates if `verbose=true`
+ "tokenizer.ggml.tokens": [] // populates if `verbose=true`
}
}
```
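For Go callers, the same request can be issued through this repository's `api` package; a minimal sketch using the fields introduced in this diff (`Verbose`, `ModelInfo`), with error handling kept short:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Verbose: true asks the server to also populate the large tokenizer arrays.
	resp, err := client.Show(context.Background(), &api.ShowRequest{
		Model:   "llama3",
		Verbose: true,
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(resp.Details.Family)
	fmt.Println(resp.ModelInfo["general.architecture"])
}
```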
diff --git a/docs/development.md b/docs/development.md
index 8c035a518..2a6886a43 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build
### Windows
-Note: The windows build for Ollama is still under development.
+Note: The Windows build for Ollama is still under development.
-Install required tools:
+First, install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
+- The `ThreadJob` PowerShell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
+
+Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"
diff --git a/docs/import.md b/docs/import.md
index 7abe39b2a..f34f09ace 100644
--- a/docs/import.md
+++ b/docs/import.md
@@ -47,19 +47,13 @@ success
### Supported Quantizations
-
-Legacy Quantization
-
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
-
-
-
-K-means Quantization
+#### K-means Quantizations
- `Q3_K_S`
- `Q3_K_M`
@@ -70,11 +64,6 @@ success
- `Q5_K_M`
- `Q6_K`
-
-
-> [!NOTE]
-> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
-
## Template Detection
> [!NOTE]
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 60d63c7d9..de29b344c 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -22,7 +22,7 @@ docker logs
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs
+- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
diff --git a/docs/windows.md b/docs/windows.md
index 832b3d431..abc0eb300 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -39,8 +39,8 @@ server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- - *app.log* contains logs from the GUI application
- - *server.log* contains the server logs
+  - *app.log* contains the most recent logs from the GUI application
+ - *server.log* contains the most recent server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
diff --git a/envconfig/config.go b/envconfig/config.go
index 4d2150b72..e86f72e6a 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -57,6 +57,19 @@ var (
SchedSpread bool
// Set via OLLAMA_TMPDIR in the environment
TmpDir string
+ // Set via OLLAMA_INTEL_GPU in the environment
+ IntelGpu bool
+
+ // Set via CUDA_VISIBLE_DEVICES in the environment
+ CudaVisibleDevices string
+ // Set via HIP_VISIBLE_DEVICES in the environment
+ HipVisibleDevices string
+ // Set via ROCR_VISIBLE_DEVICES in the environment
+ RocrVisibleDevices string
+ // Set via GPU_DEVICE_ORDINAL in the environment
+ GpuDeviceOrdinal string
+ // Set via HSA_OVERRIDE_GFX_VERSION in the environment
+ HsaOverrideGfxVersion string
)
type EnvVar struct {
@@ -66,7 +79,7 @@ type EnvVar struct {
}
func AsMap() map[string]EnvVar {
- return map[string]EnvVar{
+ ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
@@ -84,6 +97,15 @@ func AsMap() map[string]EnvVar {
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
}
+ if runtime.GOOS != "darwin" {
+ ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
+ ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
+ ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
+ ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
+ ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
+ ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+ }
+ return ret
}
func Values() map[string]string {
@@ -256,6 +278,16 @@ func LoadConfig() {
if err != nil {
slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
}
+
+ if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
+ IntelGpu = set
+ }
+
+ CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
+ HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
+ RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
+ GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
+ HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
}
func getModelsDir() (string, error) {
diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 61e6a0598..15b6fc61f 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -13,6 +13,7 @@ import (
"strconv"
"strings"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -59,9 +60,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string
- hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only
- rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
- gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index
+ hipVD := envconfig.HipVisibleDevices // zero based index only
+ rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
+ gpuDO := envconfig.GpuDeviceOrdinal // zero based index
switch {
// TODO is this priorty order right?
case hipVD != "":
@@ -74,7 +75,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
visibleDevices = strings.Split(gpuDO, ",")
}
- gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+ gfxOverride := envconfig.HsaOverrideGfxVersion
var supported []string
libDir := ""
@@ -332,6 +333,11 @@ func AMDGetGPUInfo() []RocmGPUInfo {
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
}
+ // Check for env var workarounds
+		if name == "1002:687f" { // Radeon RX Vega 56
+ gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
+ }
+
// The GPU has passed all the verification steps and is supported
resp = append(resp, gpuInfo)
}
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index cad45f6c5..21585277a 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -10,6 +10,7 @@ import (
"strconv"
"strings"
+ "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -53,7 +54,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
}
var supported []string
- gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+ gfxOverride := envconfig.HsaOverrideGfxVersion
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {
diff --git a/gpu/assets.go b/gpu/assets.go
index f2adcf3e3..073d2e813 100644
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
- if err == nil {
- pid, err := strconv.Atoi(string(raw))
- if err == nil {
- if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
- // Another running ollama, ignore this tmpdir
- continue
- }
- }
- } else {
- slog.Debug("failed to open ollama.pid", "path", d, "error", err)
- }
- err = os.RemoveAll(d)
if err != nil {
- slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
+ slog.Warn("failed to read ollama.pid", "path", d, "error", err)
+ // No pid, ignore this tmpdir
+ continue
+ }
+
+ pid, err := strconv.Atoi(string(raw))
+ if err != nil {
+ slog.Warn("failed to parse pid", "path", d, "error", err)
+ continue
+ }
+
+ proc, err := os.FindProcess(pid)
+ if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+ slog.Warn("found running ollama", "pid", pid, "path", d)
+ // Another running ollama, ignore this tmpdir
+ continue
+ }
+
+		// The stale tmpdir still contains extracted payloads, so remove the whole tree
+		if err := os.RemoveAll(d); err != nil {
+			slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
}
}
}
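The liveness probe above uses the signal-0 idiom: on Unix, `os.FindProcess` always succeeds, and sending signal 0 merely checks whether the pid still exists. A standalone sketch of that check (illustrative, not code from this change):

```go
package main

import (
	"errors"
	"fmt"
	"os"
	"syscall"
)

// alive reports whether a process with the given pid still exists.
func alive(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}
	// Signal 0 performs the existence/permission check without delivering a signal.
	return !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone)
}

func main() {
	fmt.Println(alive(os.Getpid())) // true: this process is clearly running
}
```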
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 6cebbd2b9..0120d4271 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -265,7 +265,7 @@ func GetGPUInfo() GpuInfoList {
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
- depPath = filepath.Dir(envconfig.RunnersDir)
+ depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
}
// Load ALL libraries
@@ -314,33 +314,41 @@ func GetGPUInfo() GpuInfoList {
}
// Intel
- oHandles = initOneAPIHandles()
- for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ {
- if oHandles.oneapi == nil {
- // shouldn't happen
- slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
- continue
+ if envconfig.IntelGpu {
+ oHandles = initOneAPIHandles()
+ // On windows we bundle the oneapi library one level above the runner dir
+ depPath = ""
+ if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+ depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
}
- devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
- for i := range devCount {
- gpuInfo := OneapiGPUInfo{
- GpuInfo: GpuInfo{
- Library: "oneapi",
- },
- driverIndex: d,
- gpuIndex: int(i),
+
+		// Guard against a nil handle before dereferencing num_drivers
+		for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ {
+ devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
+ for i := range devCount {
+ gpuInfo := OneapiGPUInfo{
+ GpuInfo: GpuInfo{
+ Library: "oneapi",
+ },
+ driverIndex: int(d),
+ gpuIndex: int(i),
+ }
+ // TODO - split bootstrapping from updating free memory
+ C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
+ // TODO - convert this to MinimumMemory based on testing...
+ var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+ memInfo.free = C.uint64_t(totalFreeMem)
+ gpuInfo.TotalMemory = uint64(memInfo.total)
+ gpuInfo.FreeMemory = uint64(memInfo.free)
+ gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+ gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+ gpuInfo.DependencyPath = depPath
+ oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
- // TODO - split bootstrapping from updating free memory
- C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
- // TODO - convert this to MinimumMemory based on testing...
- var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
- memInfo.free = C.uint64_t(totalFreeMem)
- gpuInfo.TotalMemory = uint64(memInfo.total)
- gpuInfo.FreeMemory = uint64(memInfo.free)
- gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
- gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
- // TODO dependency path?
- oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}
diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c
index 9db89529a..03f15a2c3 100644
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
- if (!l[i].p) {
+ if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c
index 675ce5cc4..abe140844 100644
--- a/gpu/gpu_info_nvcuda.c
+++ b/gpu/gpu_info_nvcuda.c
@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
- if (!*l[i].p) {
+ if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c
index ef0a97df2..11293e448 100644
--- a/gpu/gpu_info_nvml.c
+++ b/gpu/gpu_info_nvml.c
@@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
- if (!l[i].p) {
+ if (!*(l[i].p)) {
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c
index e90c694a1..3ff708ea2 100644
--- a/gpu/gpu_info_oneapi.c
+++ b/gpu/gpu_info_oneapi.c
@@ -13,7 +13,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
- int i, d, count;
+ int i, d;
struct lookup {
char *s;
void **p;
@@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
- if (!l[i].p) {
+ if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -62,6 +62,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
}
}
+ LOG(resp->oh.verbose, "calling zesInit\n");
+
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
@@ -71,7 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
return;
}
- count = 0;
+ LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
@@ -96,6 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
}
for (d = 0; d < resp->oh.num_drivers; d++) {
+ LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
@@ -116,7 +119,6 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
oneapi_release(resp->oh);
return;
}
- count += resp->oh.num_devices[d];
}
return;
@@ -160,7 +162,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
return;
}
- snprintf(&resp->gpu_name[0], GPU_NAME_LEN, props.modelName);
+ snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
diff --git a/gpu/types.go b/gpu/types.go
index b451c0f38..693ca4668 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -26,6 +26,9 @@ type GpuInfo struct {
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
+ // Extra environment variables specific to the GPU as list of [key,value]
+ EnvWorkarounds [][2]string `json:"envs,omitempty"`
+
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
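`EnvWorkarounds` is declared here as a list of `[key, value]` pairs; the expectation (an assumption, not shown in this hunk) is that the server applies them to the runner subprocess environment for the affected GPU. A hypothetical sketch of that application step:

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
)

// applyEnvWorkarounds appends GPU-specific [key, value] pairs to a runner command's
// environment, e.g. {"HSA_ENABLE_SDMA", "0"} for the Radeon RX Vega 56 case above.
func applyEnvWorkarounds(cmd *exec.Cmd, workarounds [][2]string) {
	if len(workarounds) == 0 {
		return
	}
	if cmd.Env == nil {
		cmd.Env = os.Environ()
	}
	for _, kv := range workarounds {
		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", kv[0], kv[1]))
	}
}

func main() {
	cmd := exec.Command("echo", "runner placeholder")
	applyEnvWorkarounds(cmd, [][2]string{{"HSA_ENABLE_SDMA", "0"}})
	fmt.Println(cmd.Env[len(cmd.Env)-1]) // HSA_ENABLE_SDMA=0
}
```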
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 18b3fa18d..492126a4f 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -56,7 +56,6 @@ struct server_params {
std::string hostname = "127.0.0.1";
std::vector api_keys;
std::string public_path = "examples/server/public";
- std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
return true;
}
- void validate_model_chat_template(server_params & sparams) {
- llama_chat_message chat[] = {{"user", "test"}};
- std::vector buf(1);
- int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
- if (res < 0) {
- LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "chatml";
- }
- }
-
void initialize() {
// create slots
all_slots_are_idle = true;
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
- sparams.chat_template = argv[i];
}
else if (arg == "--override-kv")
{
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
}
const auto model_meta = llama.model_meta();
- if (sparams.chat_template.empty()) { // custom chat template is not supplied
- // check if the template comes with the model is supported by us
- llama.validate_model_chat_template(sparams);
- }
-
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 0baf86ffc..721a9ae80 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,7 +18,7 @@ sign() {
fi
}
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
case "${GOARCH}" in
"amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
# Approximately 400% faster than LCD on same CPU
#
init_vars
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
# Approximately 10% faster than AVX on same CPU
#
init_vars
- CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+ CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
- CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 2190fb93e..0e98e1635 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
- CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+ CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
- COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+ COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
- CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+ CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index c9d860977..e217a0382 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -39,7 +39,8 @@ function init_vars {
}
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
- "-DLLAMA_NATIVE=off"
+ "-DLLAMA_NATIVE=off",
+ "-DLLAMA_OPENMP=off"
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -122,8 +123,13 @@ function build {
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
- & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
+ if ($cmakeDefs -contains "-G") {
+ $extra=@("-j8")
+ } else {
+ $extra= @("--", "/p:CL_MPcount=8")
+ }
+ write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
+ & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
@@ -203,7 +209,8 @@ function build_static() {
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
- "-DLLAMA_FMA=off")
+ "-DLLAMA_FMA=off",
+ "-DLLAMA_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
@@ -270,7 +277,15 @@ function build_cuda() {
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
- $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+ $script:cmakeDefs += @(
+ "-A", "x64",
+ "-DLLAMA_CUDA=ON",
+ "-DLLAMA_AVX=on",
+ "-DLLAMA_AVX2=off",
+ "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
+ "-DCMAKE_CUDA_FLAGS=-t8",
+ "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+ )
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
@@ -280,10 +295,12 @@ function build_cuda() {
sign
install
- write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
- cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
- cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
- cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
+ rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+ write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+ cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
} else {
write-host "Skipping CUDA generation step"
}
@@ -317,16 +334,18 @@ function build_oneapi() {
sign
install
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+ rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+ cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
} else {
Write-Host "Skipping oneAPI generation step"
}
diff --git a/llm/ggla.go b/llm/ggla.go
index a5d90b6cb..34c4f6ca3 100644
--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors
}
-func (llm *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
for {
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
+ if errors.Is(err, io.EOF) {
+ return nil
+ }
return err
}
+ defer func() {
+ if errors.Is(retErr, io.EOF) {
+ retErr = io.ErrUnexpectedEOF
+ }
+ }()
+
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
return err
}
- if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
+ if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
return err
}
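
The decode changes above use a named return plus a deferred check: an io.EOF hit before a tensor record starts ends decoding cleanly, while an io.EOF after the record has started is reported as io.ErrUnexpectedEOF. A minimal self-contained sketch of the same pattern (illustrative only, not the ggla decoder itself):

    package main

    import (
        "bytes"
        "encoding/binary"
        "errors"
        "fmt"
        "io"
    )

    // readPairs decodes consecutive {a, b} uint32 records. io.EOF before a record
    // starts is a clean end of stream; io.EOF after "a" has been read means the
    // record is truncated, which the deferred check reports as io.ErrUnexpectedEOF.
    func readPairs(r io.Reader) (retErr error) {
        defer func() {
            if errors.Is(retErr, io.EOF) {
                retErr = io.ErrUnexpectedEOF
            }
        }()

        for {
            var a, b uint32
            if err := binary.Read(r, binary.LittleEndian, &a); err != nil {
                if errors.Is(err, io.EOF) {
                    return nil // clean end of stream
                }
                return err
            }
            if err := binary.Read(r, binary.LittleEndian, &b); err != nil {
                return err // a bare io.EOF here becomes io.ErrUnexpectedEOF
            }
            fmt.Println(a, b)
        }
    }

    func main() {
        // Four bytes: a complete "a" but no "b", so the record is truncated.
        fmt.Println(readPairs(bytes.NewReader([]byte{1, 0, 0, 0}))) // unexpected EOF
    }
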
diff --git a/llm/ggml.go b/llm/ggml.go
index 35b89d16e..cfead450d 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -6,6 +6,8 @@ import (
"fmt"
"io"
"strings"
+
+ "github.com/ollama/ollama/util/bufioutil"
)
type GGML struct {
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
return 1
}
+func (kv KV) EmbeddingHeadCount() uint64 {
+ if heads := kv.HeadCount(); heads > 0 {
+ return kv.EmbeddingLength() / kv.HeadCount()
+ }
+
+ return 0
+}
+
+func (kv KV) EmbeddingHeadCountK() uint64 {
+ if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
+ return k
+ }
+
+ return kv.EmbeddingHeadCount()
+}
+
+func (kv KV) EmbeddingHeadCountV() uint64 {
+ if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
+ return v
+ }
+
+ return kv.EmbeddingHeadCount()
+}
+
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
@@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string {
}
}
-func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
+// DecodeGGML decodes a GGML model from the given reader.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+ if maxArraySize == 0 {
+ maxArraySize = 1024
+ }
+
+ rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
+
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
@@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE:
- c = &containerGGUF{ByteOrder: binary.LittleEndian}
+ c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE:
- c = &containerGGUF{ByteOrder: binary.BigEndian}
+ c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
- if errors.Is(err, io.EOF) {
- // noop
- } else if err != nil {
+ if err != nil {
return nil, 0, err
}
@@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
- vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
+ vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+
+ embeddingHeads := llm.KV().EmbeddingHeadCount()
+ embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
layers := llm.Tensors().Layers()
@@ -308,7 +346,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding
partialOffload += max(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
- 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+ 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
@@ -316,21 +354,30 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
- 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
- 4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
+ 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
+ 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
- 4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
+ 4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
- case "gemma":
- fullOffload = 4 * batch * (embedding + vocab)
- partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+ case "gemma", "gemma2":
+ fullOffload = max(
+ 4*batch*(embedding+vocab),
+ 4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
+ )
+
+ partialOffload = max(
+ 4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
+ 4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
+ 4*embeddingHeadsK*context*8+
+ embedding*embeddingHeadsK*heads*9/16,
+ )
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),
@@ -367,6 +414,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(vocab+2*embedding),
fullOffload,
)
+ case "deepseek2":
+ fullOffload = max(
+ 4*batch*(3*embedding+vocab),
+ 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
+ )
+
+ partialOffload = max(
+ 4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
+ 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
+ )
}
return
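
DecodeGGML now takes a maxArraySize argument with the semantics spelled out in its doc comment above. A hedged usage sketch; the model path is hypothetical, 0 keeps the default 1024-entry cap, and -1 would collect every array:

    package main

    import (
        "fmt"
        "log"
        "os"

        "github.com/ollama/ollama/llm"
    )

    func main() {
        f, err := os.Open("model.gguf") // hypothetical path
        if err != nil {
            log.Fatal(err)
        }
        defer f.Close()

        // 0 means "use the default cap of 1024 entries"; pass -1 to collect
        // every array, e.g. when the full tokenizer vocabulary is needed.
        ggml, n, err := llm.DecodeGGML(f, 0)
        if err != nil {
            log.Fatal(err)
        }

        fmt.Println("decoded bytes:", n)
        fmt.Println("architecture:", ggml.KV().Architecture())
        fmt.Println("block count:", ggml.KV().BlockCount())
    }
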
diff --git a/llm/ggml_test.go b/llm/ggml_test.go
new file mode 100644
index 000000000..006c3ded8
--- /dev/null
+++ b/llm/ggml_test.go
@@ -0,0 +1 @@
+package llm
diff --git a/llm/gguf.go b/llm/gguf.go
index 234efe574..4d343a1bd 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -3,11 +3,10 @@ package llm
import (
"bytes"
"encoding/binary"
+ "encoding/json"
"fmt"
"io"
"strings"
-
- "log/slog"
)
type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
NumTensor uint64
NumKV uint64
}
+
+ maxArraySize int
+}
+
+func (c *containerGGUF) canCollectArray(size int) bool {
+ return c.maxArraySize < 0 || size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
}
model := newGGUF(c)
- slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil {
return nil, err
}
@@ -85,6 +89,8 @@ type gguf struct {
tensors []*Tensor
parameters uint64
+
+ scratch [16 << 10]byte
}
func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
// decode tensors
- for i := 0; uint64(i) < llm.numTensor(); i++ {
+ for range llm.numTensor() {
name, err := readGGUFString(llm, rs)
if err != nil {
- return err
+ return fmt.Errorf("failed to read tensor name: %w", err)
}
// dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs)
if err != nil {
- return err
+ return fmt.Errorf("failed to read tensor dimensions: %w", err)
}
shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs)
if err != nil {
- return err
+ return fmt.Errorf("failed to read tensor shape: %w", err)
}
}
kind, err := readGGUF[uint32](llm, rs)
if err != nil {
- return err
+ return fmt.Errorf("failed to read tensor kind: %w", err)
}
offset, err := readGGUF[uint64](llm, rs)
if err != nil {
- return err
+ return fmt.Errorf("failed to read tensor offset: %w", err)
}
tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32
}
- offset, err := rs.Seek(0, io.SeekCurrent)
- if err != nil {
- return err
- }
-
- padding := llm.padding(offset, int64(alignment))
- if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
- return err
- }
-
for _, tensor := range llm.tensors {
- if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
- return err
+ offset, err := rs.Seek(0, io.SeekCurrent)
+ if err != nil {
+ return fmt.Errorf("failed to get current offset: %w", err)
}
- padding := llm.padding(int64(tensor.Size()), int64(alignment))
+ padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
- return err
+ return fmt.Errorf("failed to seek to init padding: %w", err)
+ }
+
+ if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
+ return fmt.Errorf("failed to seek to tensor: %w", err)
}
}
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil
}
+func discardGGUFString(llm *gguf, r io.Reader) error {
+ buf := llm.scratch[:8]
+ _, err := io.ReadFull(r, buf)
+ if err != nil {
+ return err
+ }
+
+ size := int(llm.ByteOrder.Uint64(buf))
+ for size > 0 {
+ n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
+ if err != nil {
+ return err
+ }
+ size -= n
+ }
+ return nil
+}
+
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 {
return readGGUFV1String(llm, r)
}
- var length uint64
- if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
+ buf := llm.scratch[:8]
+ _, err := io.ReadFull(r, buf)
+ if err != nil {
return "", err
}
- var b bytes.Buffer
- if _, err := io.CopyN(&b, r, int64(length)); err != nil {
+ length := int(llm.ByteOrder.Uint64(buf))
+ if length > len(llm.scratch) {
+ buf = make([]byte, length)
+ } else {
+ buf = llm.scratch[:length]
+ }
+ clear(buf)
+
+ _, err = io.ReadFull(r, buf)
+ if err != nil {
return "", err
}
-
- return b.String(), nil
+ return string(buf), nil
}
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err
}
-func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
+type array struct {
+ size int
+ values []any
+}
+
+func (a *array) MarshalJSON() ([]byte, error) {
+ return json.Marshal(a.values)
+}
+
+func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
- for i := 0; uint32(i) < n; i++ {
+ a := &array{size: int(n)}
+ if llm.canCollectArray(int(n)) {
+ a.values = make([]any, int(n))
+ }
+
+ for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
- a = append(a, e)
+ if a.values != nil {
+ a.values[i] = e
+ }
}
- return
+ return a, nil
}
-func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
+func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
- for i := 0; uint64(i) < n; i++ {
+ a := &array{size: int(n)}
+ if llm.canCollectArray(int(n)) {
+ a.values = make([]any, int(n))
+ }
+
+ for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
- e, err = readGGUFString(llm, r)
+ if a.values != nil {
+ e, err = readGGUFString(llm, r)
+ } else {
+ err = discardGGUFString(llm, r)
+ }
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
- a = append(a, e)
+ if a.values != nil {
+ a.values[i] = e
+ }
}
- return
+ return a, nil
}
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
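
discardGGUFString above skips a length-prefixed string by draining it through the decoder's fixed scratch buffer instead of allocating the value. A standalone sketch of that drain loop (sizes are arbitrary and for illustration):

    package main

    import (
        "bytes"
        "encoding/binary"
        "fmt"
        "io"
    )

    // drain consumes exactly size bytes from r through a reusable scratch buffer,
    // so skipping a large value never allocates more than len(scratch).
    func drain(r io.Reader, size int, scratch []byte) error {
        for size > 0 {
            n, err := r.Read(scratch[:min(size, len(scratch))])
            if err != nil {
                return err
            }
            size -= n
        }
        return nil
    }

    func main() {
        var buf bytes.Buffer
        payload := bytes.Repeat([]byte("x"), 1<<20) // a 1 MiB string body
        binary.Write(&buf, binary.LittleEndian, uint64(len(payload)))
        buf.Write(payload)
        binary.Write(&buf, binary.LittleEndian, uint32(42)) // the next field after the string

        var length uint64
        binary.Read(&buf, binary.LittleEndian, &length)

        scratch := make([]byte, 16<<10)
        if err := drain(&buf, int(length), scratch); err != nil {
            panic(err)
        }

        var next uint32
        binary.Read(&buf, binary.LittleEndian, &next)
        fmt.Println(next) // 42: the reader is positioned just past the skipped string
    }
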
diff --git a/llm/llama.cpp b/llm/llama.cpp
index 5921b8f08..7c26775ad 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit 5921b8f089d3b7bda86aac5a66825df6a6c10603
+Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
diff --git a/llm/memory.go b/llm/memory.go
index 5afb1c2e9..19b12cbfc 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,6 +1,7 @@
package llm
import (
+ "fmt"
"log/slog"
"strconv"
"strings"
@@ -49,6 +50,18 @@ type MemoryEstimate struct {
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
+
+ // internal fields for logging purposes
+ inferenceLibrary string
+ layersRequested int
+ layersModel int
+ availableList []string
+ kv uint64
+ allocationsList []string
+ memoryWeights uint64
+ memoryLayerOutput uint64
+ graphFullOffload uint64
+ graphPartialOffload uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -102,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
slog.Warn("model missing blk.0 layer size")
}
- // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
- var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
+ // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
+ var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
@@ -167,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
+ // Some models have inconsistent layer sizes
+ if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+ layerSize = blk.size()
+ layerSize += kv / ggml.KV().BlockCount()
+ }
memoryWeights += layerSize
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
@@ -252,78 +270,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
allocationsList = append(allocationsList, format.HumanBytes2(a))
}
+ estimate := MemoryEstimate{
+ TotalSize: memoryRequiredTotal,
+ Layers: 0,
+ Graph: 0,
+ VRAMSize: 0,
+ GPUSizes: []uint64{},
+
+ inferenceLibrary: gpus[0].Library,
+ layersRequested: opts.NumGPU,
+ layersModel: int(ggml.KV().BlockCount()) + 1,
+ availableList: availableList,
+ kv: kv,
+ allocationsList: allocationsList,
+ memoryWeights: memoryWeights,
+ memoryLayerOutput: memoryLayerOutput,
+ graphFullOffload: graphFullOffload,
+ graphPartialOffload: graphPartialOffload,
+ }
+
+ if gpus[0].Library == "cpu" {
+ return estimate
+ }
+ if layerCount == 0 {
+ slog.Debug("insufficient VRAM to load any model layers")
+ return estimate
+ }
+ estimate.Layers = layerCount
+ estimate.Graph = graphOffload
+ estimate.VRAMSize = memoryRequiredPartial
+ estimate.TotalSize = memoryRequiredTotal
+ estimate.TensorSplit = tensorSplit
+ estimate.GPUSizes = gpuAllocations
+ return estimate
+}
+
+func (m MemoryEstimate) log() {
slog.Info(
- "offload to gpu",
+ "offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
- "requested", opts.NumGPU,
+ "requested", m.layersRequested,
// The number of layers the model has (including output)
- "model", int(ggml.KV().BlockCount())+1,
+ "model", m.layersModel,
// estimated number of layers that can be offloaded
- "offload", layerCount,
- // multi-gpu split for tesnors
- "split", tensorSplit,
+ "offload", m.Layers,
+ // multi-gpu split for tensors
+ "split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available by GPU for offloading
- "available", availableList,
+ "available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
- "full", format.HumanBytes2(memoryRequiredTotal),
+ "full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
- "partial", format.HumanBytes2(memoryRequiredPartial),
+ "partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
- "kv", format.HumanBytes2(kv),
+ "kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
- "allocations", allocationsList,
+ "allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
- "total", format.HumanBytes2(memoryWeights),
+ "total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
- "repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+ "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
- "nonrepeating", format.HumanBytes2(memoryLayerOutput),
+ "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
- "full", format.HumanBytes2(graphFullOffload),
+ "full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
- "partial", format.HumanBytes2(graphPartialOffload),
+ "partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
- if gpus[0].Library == "cpu" {
- return MemoryEstimate{
- Layers: 0,
- Graph: 0,
- VRAMSize: 0,
- TotalSize: memoryRequiredTotal,
- GPUSizes: []uint64{},
- }
- }
- if layerCount == 0 {
- slog.Debug("insufficient VRAM to load any model layers")
- return MemoryEstimate{
- Layers: 0,
- Graph: 0,
- VRAMSize: 0,
- TotalSize: memoryRequiredTotal,
- GPUSizes: []uint64{},
- }
- }
-
- return MemoryEstimate{
- Layers: layerCount,
- Graph: graphOffload,
- VRAMSize: memoryRequiredPartial,
- TotalSize: memoryRequiredTotal,
- TensorSplit: tensorSplit,
- GPUSizes: gpuAllocations,
- }
}
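
The updated KV-cache comment computes the fp16 cache from the per-head key and value dimensions rather than from n_embd/n_head. A worked example with illustrative llama-style hyperparameters (assumed numbers, not taken from any model in this change):

    package main

    import "fmt"

    func main() {
        // Illustrative llama-style hyperparameters; these are assumptions for the
        // arithmetic, not values read from any model touched by this change.
        const (
            numCtx          = 4096 // n_ctx
            blockCount      = 32   // n_layer
            headCountKV     = 8    // n_head_kv
            embeddingHeadK  = 128  // n_embd_head_k
            embeddingHeadV  = 128  // n_embd_head_v
            bytesPerElement = 2    // sizeof(float16)
        )

        kv := uint64(bytesPerElement) * numCtx * blockCount * (embeddingHeadK + embeddingHeadV) * headCountKV
        fmt.Printf("fp16 KV cache: %d bytes (%d MiB)\n", kv, kv/(1<<20))
        // fp16 KV cache: 536870912 bytes (512 MiB)
    }
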
diff --git a/llm/memory_test.go b/llm/memory_test.go
index 8eaa07715..f972f9275 100644
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5
+
tensors := []Tensor{
- {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
- {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
- {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
- {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
- {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
- {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+ {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+ {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+ {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+ {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+ {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}
assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
}, tensors)
require.NoError(t, err)
- ggml, err := LoadModel(f.Name())
- require.NoError(t, err)
+ ggml, err := LoadModel(f.Name(), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
// Simple CPU scenario
gpus := []gpu.GpuInfo{
diff --git a/llm/patches/01-load-progress.diff b/llm/patches/01-load-progress.diff
index acd44d207..be5286091 100644
--- a/llm/patches/01-load-progress.diff
+++ b/llm/patches/01-load-progress.diff
@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
--- a/common/common.cpp
+++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
--- a/common/common.h
+++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
- // multimodal models (see examples/llava)
+@@ -180,6 +180,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
 std::vector<std::string> image; // path to image file(s)
-+
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
- };
-
- void gpt_params_handle_model_default(gpt_params & params);
++
+ // server params
+ int32_t port = 8080; // server listens on this network port
+ int32_t timeout_read = 600; // http read timeout in seconds
diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff
index 27c8aabc2..2a2e7306e 100644
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
--- a/llama.cpp
+++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-- } else if (
-+ if (
- tokenizer_pre == "default") {
+- } else if (tokenizer_pre == "default") {
++ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
- tokenizer_pre == "smaug-bpe") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ tokenizer_pre == "llama3" ||
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+ tokenizer_pre == "poro-chat") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
diff --git a/llm/patches/07-gemma.diff b/llm/patches/07-gemma.diff
new file mode 100644
index 000000000..86eac3d17
--- /dev/null
+++ b/llm/patches/07-gemma.diff
@@ -0,0 +1,305 @@
+From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
+From: Ollama maintainers
+Date: Wed, 26 Jun 2024 16:18:09 -0700
+Subject: [PATCH] Architecture support
+
+---
+ llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 193 insertions(+), 1 deletion(-)
+
+diff --git a/llama.cpp b/llama.cpp
+index 61948751..3b4196f5 100644
+--- a/llama.cpp
++++ b/llama.cpp
+@@ -217,6 +217,7 @@ enum llm_arch {
+ LLM_ARCH_INTERNLM2,
+ LLM_ARCH_MINICPM,
+ LLM_ARCH_GEMMA,
++ LLM_ARCH_GEMMA2,
+ LLM_ARCH_STARCODER2,
+ LLM_ARCH_MAMBA,
+ LLM_ARCH_XVERSE,
+@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_GEMMA, "gemma" },
++ { LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+@@ -464,10 +466,12 @@ enum llm_tensor {
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
++ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_NORM,
++ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
++ {
++ LLM_ARCH_GEMMA2,
++ {
++ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
++ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
++ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
++ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
++ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
++ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
++ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
++ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
++ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
++ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
++ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
++ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
++ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
++ },
++ },
+ {
+ LLM_ARCH_STARCODER2,
+ {
+@@ -1941,6 +1963,8 @@ enum e_model {
+ MODEL_8x22B,
+ MODEL_16x12B,
+ MODEL_10B_128x3_66B,
++ MODEL_9B,
++ MODEL_27B,
+ };
+
+ static const size_t kiB = 1024;
+@@ -2114,6 +2138,7 @@ struct llama_layer {
+ struct ggml_tensor * attn_out_norm_b;
+ struct ggml_tensor * attn_q_a_norm;
+ struct ggml_tensor * attn_kv_a_norm;
++ struct ggml_tensor * attn_post_norm;
+
+ // attention
+ struct ggml_tensor * wq;
+@@ -2136,6 +2161,7 @@ struct llama_layer {
+ // normalization
+ struct ggml_tensor * ffn_norm;
+ struct ggml_tensor * ffn_norm_b;
++ struct ggml_tensor * ffn_post_norm;
+ struct ggml_tensor * layer_out_norm;
+ struct ggml_tensor * layer_out_norm_b;
+ struct ggml_tensor * ffn_norm_exps;
+@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
+ }
+ } break;
+ case LLM_ARCH_GEMMA:
++ {
++ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
++
++ switch (hparams.n_layer) {
++ case 18: model.type = e_model::MODEL_9B; break;
++ case 28: model.type = e_model::MODEL_27B; break;
++ default: model.type = e_model::MODEL_UNKNOWN;
++ }
++ } break;
++ case LLM_ARCH_GEMMA2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ }
+ } break;
++ case LLM_ARCH_GEMMA2:
++ {
++ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
++
++ // output
++ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
++ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
++
++ const int64_t n_ff = hparams.n_ff;
++ const int64_t n_embd_head_k = hparams.n_embd_head_k;
++ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
++ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
++
++ for (uint32_t i = 0; i < n_layer; ++i) {
++ ggml_context * ctx_layer = ctx_for_layer(i);
++ ggml_context * ctx_split = ctx_for_layer_split(i);
++
++ auto & layer = model.layers[i];
++
++ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
++
++ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
++ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
++ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
++ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
++ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
++
++ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
++ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
++ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
++ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
++ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
++ }
++ } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+@@ -10614,6 +10684,123 @@ struct llm_build_context {
+ return gf;
+ }
+
++ struct ggml_cgraph * build_gemma2() {
++ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
++
++ const int64_t n_embd_head_k = hparams.n_embd_head_k;
++
++ struct ggml_tensor * cur;
++ struct ggml_tensor * inpL;
++
++ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
++
++ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
++ cb(inpL, "inp_scaled", -1);
++
++ // inp_pos - contains the positions
++ struct ggml_tensor * inp_pos = build_inp_pos();
++
++ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
++ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
++
++ for (int il = 0; il < n_layer; ++il) {
++ // norm
++ cur = llm_build_norm(ctx0, inpL, hparams,
++ model.layers[il].attn_norm, NULL,
++ LLM_NORM_RMS, cb, il);
++ cb(cur, "attn_norm", il);
++
++ // self-attention
++ {
++ // compute Q and K and RoPE them
++ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
++ cb(Qcur, "Qcur", il);
++
++ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
++ cb(Kcur, "Kcur", il);
++
++ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
++ cb(Vcur, "Vcur", il);
++
++ Qcur = ggml_rope_ext(
++ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
++ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
++ ext_factor, attn_factor, beta_fast, beta_slow);
++ cb(Qcur, "Qcur", il);
++
++ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
++ cb(Qcur, "Qcur_scaled", il);
++
++ Kcur = ggml_rope_ext(
++ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
++ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
++ ext_factor, attn_factor, beta_fast, beta_slow);
++ cb(Kcur, "Kcur", il);
++
++ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
++ model.layers[il].wo, NULL,
++ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
++ }
++
++ if (il == n_layer - 1) {
++ // skip computing output for unused tokens
++ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
++ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
++ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
++ }
++
++ cur = llm_build_norm(ctx0, cur, hparams,
++ model.layers[il].attn_post_norm, NULL,
++ LLM_NORM_RMS, cb, il);
++ cb(cur, "attn_post_norm", il);
++
++ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
++ cb(sa_out, "sa_out", il);
++
++ cur = llm_build_norm(ctx0, sa_out, hparams,
++ model.layers[il].ffn_norm, NULL,
++ LLM_NORM_RMS, cb, il);
++ cb(cur, "ffn_norm", il);
++
++ // feed-forward network
++ {
++ cur = llm_build_ffn(ctx0, cur,
++ model.layers[il].ffn_up, NULL,
++ model.layers[il].ffn_gate, NULL,
++ model.layers[il].ffn_down, NULL,
++ NULL,
++ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
++ cb(cur, "ffn_out", il);
++ }
++
++ cur = llm_build_norm(ctx0, cur, hparams,
++ model.layers[il].ffn_post_norm, NULL,
++ LLM_NORM_RMS, cb, -1);
++ cb(cur, "ffn_post_norm", -1);
++
++ cur = ggml_add(ctx0, cur, sa_out);
++ cb(cur, "l_out", il);
++
++ // input for next layer
++ inpL = cur;
++ }
++
++ cur = inpL;
++
++ cur = llm_build_norm(ctx0, cur, hparams,
++ model.output_norm, NULL,
++ LLM_NORM_RMS, cb, -1);
++ cb(cur, "result_norm", -1);
++
++ // lm_head
++ cur = ggml_mul_mat(ctx0, model.output, cur);
++ cb(cur, "result_output", -1);
++
++ ggml_build_forward_expand(gf, cur);
++
++ return gf;
++ }
++
+ struct ggml_cgraph * build_starcoder2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
+ {
+ result = llm.build_gemma();
+ } break;
++ case LLM_ARCH_GEMMA2:
++ {
++ result = llm.build_gemma2();
++ } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ result = llm.build_starcoder2();
+@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+ case LLM_ARCH_PHI2:
+ case LLM_ARCH_PHI3:
+ case LLM_ARCH_GEMMA:
++ case LLM_ARCH_GEMMA2:
+ case LLM_ARCH_STARCODER2:
+ case LLM_ARCH_GPTNEOX:
+ return LLAMA_ROPE_TYPE_NEOX;
+@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
+ if (add_ass) {
+ ss << "<s>assistant\n";
+ }
+- } else if (tmpl == "gemma" || tmpl.find("") != std::string::npos) {
++ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) {
+ // google/gemma-7b-it
+ std::string system_prompt = "";
+ for (auto message : chat) {
+--
+2.45.2
+
diff --git a/llm/payload.go b/llm/payload.go
index 20dcee7b5..9296db336 100644
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
- pattern := filepath.Join(payloadsDir, "*")
+ pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
- servers[filepath.Base(file)] = file
+ servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
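
With the new glob each runner lives in its own subdirectory, and the map is keyed by that directory's name. A small sketch of the resulting mapping (the paths are hypothetical):

    package main

    import (
        "fmt"
        "path/filepath"
    )

    func main() {
        // Hypothetical layout matched by the new pattern payloadsDir/*/ollama_*:
        //   /tmp/payloads/cpu_avx2/ollama_llama_server
        //   /tmp/payloads/cuda_v11/ollama_llama_server
        files := []string{
            "/tmp/payloads/cpu_avx2/ollama_llama_server",
            "/tmp/payloads/cuda_v11/ollama_llama_server",
        }

        servers := make(map[string]string)
        for _, file := range files {
            // key by the runner's directory name; the value is the directory itself
            servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
        }

        fmt.Println(servers) // map[cpu_avx2:/tmp/payloads/cpu_avx2 cuda_v11:/tmp/payloads/cuda_v11]
    }
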
diff --git a/llm/server.go b/llm/server.go
index 6313fc327..ad67138b5 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -60,7 +60,12 @@ type llmServer struct {
sem *semaphore.Weighted
}
-func LoadModel(model string) (*GGML, error) {
+// LoadModel will load a model from disk. The model must be in the GGML format.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
}
defer f.Close()
- ggml, _, err := DecodeGGML(f)
+ ggml, _, err := DecodeGGML(f, maxArraySize)
return ggml, err
}
@@ -81,7 +86,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var err error
var cpuRunner string
var estimate MemoryEstimate
- var systemMemory uint64
+ var systemTotalMemory uint64
+ var systemFreeMemory uint64
+
+ systemMemInfo, err := gpu.GetCPUMem()
+ if err != nil {
+ slog.Error("failed to lookup system memory", "error", err)
+ } else {
+ systemTotalMemory = systemMemInfo.TotalMemory
+ systemFreeMemory = systemMemInfo.FreeMemory
+ slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
+ }
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
@@ -91,19 +106,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
- if gpus[0].Library == "metal" {
- memInfo, err := gpu.GetCPUMem()
- if err != nil {
- slog.Error("failed to lookup system memory", "error", err)
- } else {
- systemMemory = memInfo.TotalMemory
- slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
- }
- }
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
- case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
+ case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
@@ -116,6 +122,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
+ estimate.log()
+
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -200,7 +208,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
- opts.UseMMap = false
+ opts.UseMMap = api.TriStateFalse
}
}
@@ -208,7 +216,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
- if !opts.UseMMap {
+ // Windows CUDA should not use mmap for best performance
+ // On Linux, when the model is larger than free memory, mmap leads to thrashing
+ if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
+ (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+ opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
@@ -271,8 +283,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
- // prepend the server directory to LD_LIBRARY_PATH/PATH
- libraryPaths := []string{dir}
+ // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
+ libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
@@ -320,6 +332,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
+ envWorkarounds := [][2]string{}
+ for _, gpu := range gpus {
+ envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
+ }
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
@@ -334,6 +350,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
+ } else if len(envWorkarounds) != 0 {
+ for _, kv := range envWorkarounds {
+ if strings.EqualFold(cmp[0], kv[0]) {
+ s.cmd.Env[i] = kv[0] + "=" + kv[1]
+ }
+ }
}
}
if pathNeeded {
@@ -395,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 {
}
defer file.Close()
- ggml, _, err := DecodeGGML(file)
+ ggml, _, err := DecodeGGML(file, 0)
if err != nil {
return 0
}
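
The mmap decision above combines the platform, the GPU library, the tri-state user setting, and whether the model fits in free memory. A sketch of that predicate with illustrative names (the triState type here stands in for the api.TriState values and is not the actual API):

    package main

    import "fmt"

    type triState int

    const (
        triStateUndefined triState = iota
        triStateTrue
        triStateFalse
    )

    // noMMap mirrors the decision logic above as a sketch; the real code uses
    // api.TriState values and runtime.GOOS rather than these illustrative names.
    func noMMap(goos, gpuLibrary string, useMMap triState, freeMemory, modelSize uint64) bool {
        return (goos == "windows" && gpuLibrary == "cuda" && useMMap == triStateUndefined) ||
            (goos == "linux" && freeMemory < modelSize && useMMap == triStateUndefined) ||
            useMMap == triStateFalse
    }

    func main() {
        fmt.Println(noMMap("windows", "cuda", triStateUndefined, 32<<30, 4<<30)) // true: Windows+CUDA defaults to --no-mmap
        fmt.Println(noMMap("linux", "cuda", triStateUndefined, 8<<30, 16<<30))   // true: model larger than free memory
        fmt.Println(noMMap("linux", "cuda", triStateTrue, 8<<30, 16<<30))        // false: an explicit user setting wins
    }
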
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 60de03073..b3991ce1f 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -103,19 +103,19 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
- md "${script:DEPS_DIR}" -ea 0 > $null
+ md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
- cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
- cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
- cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
+ cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
+ cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
+ cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
- foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+ foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
diff --git a/scripts/install.sh b/scripts/install.sh
index 0f12d7e09..2a06c350a 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
- fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
+ fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
diff --git a/server/images.go b/server/images.go
index 53a957715..e949fb18a 100644
--- a/server/images.go
+++ b/server/images.go
@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
- layers, err := parseFromFile(ctx, temp, "", fn)
+ layer, err := NewLayer(temp, baseLayer.MediaType)
if err != nil {
return err
}
- if len(layers) != 1 {
- return errors.New("quantization failed")
+ if _, err := temp.Seek(0, io.SeekStart); err != nil {
+ return err
}
- baseLayer.Layer = layers[0].Layer
- baseLayer.GGML = layers[0].GGML
+ ggml, _, err := llm.DecodeGGML(temp, 0)
+ if err != nil {
+ return err
+ }
+
+ baseLayer.Layer = layer
+ baseLayer.GGML = ggml
}
}
diff --git a/server/model.go b/server/model.go
index b262ea385..d56e641ba 100644
--- a/server/model.go
+++ b/server/model.go
@@ -11,6 +11,7 @@ import (
"net/http"
"os"
"path/filepath"
+ "strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
- ggml, _, err := llm.DecodeGGML(blob)
+ ggml, _, err := llm.DecodeGGML(blob, 0)
if err != nil {
return nil, err
}
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil
}
-func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
stat, err := file.Stat()
if err != nil {
- return nil, err
+ return err
}
r, err := zip.NewReader(file, stat.Size())
if err != nil {
- return nil, err
+ return err
}
- tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
- if err != nil {
- return nil, err
- }
- defer os.RemoveAll(tempdir)
-
fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File {
+ n := filepath.Join(p, f.Name)
+ if !strings.HasPrefix(n, p) {
+ slog.Warn("skipped extracting file outside of context", "name", f.Name)
+ continue
+ }
+
+ if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
+ return err
+ }
+
// TODO(mxyng): this should not write out all files to disk
- outfile, err := os.Create(filepath.Join(tempdir, f.Name))
+ outfile, err := os.Create(n)
if err != nil {
- return nil, err
+ return err
}
defer outfile.Close()
infile, err := f.Open()
if err != nil {
- return nil, err
+ return err
}
defer infile.Close()
if _, err = io.Copy(outfile, infile); err != nil {
- return nil, err
+ return err
}
if err := outfile.Close(); err != nil {
- return nil, err
+ return err
}
if err := infile.Close(); err != nil {
- return nil, err
+ return err
}
}
- mf, err := convert.GetModelFormat(tempdir)
+ return nil
+}
+
+func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+ tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
+ if err != nil {
+ return nil, err
+ }
+ defer os.RemoveAll(tempDir)
+
+ if err := extractFromZipFile(tempDir, file, fn); err != nil {
+ return nil, err
+ }
+
+ mf, err := convert.GetModelFormat(tempDir)
if err != nil {
return nil, err
}
- params, err := mf.GetParams(tempdir)
+ params, err := mf.GetParams(tempDir)
if err != nil {
return nil, err
}
- mArch, err := mf.GetModelArch("", tempdir, params)
+ mArch, err := mf.GetModelArch("", tempDir, params)
if err != nil {
return nil, err
}
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
// TODO(mxyng): this should write directly into a layer
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
- temp, err := os.CreateTemp(tempdir, "fp16")
+ temp, err := os.CreateTemp(tempDir, "fp16")
if err != nil {
return nil, err
}
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
}
defer bin.Close()
- ggml, _, err := llm.DecodeGGML(bin)
+ ggml, _, err := llm.DecodeGGML(bin, 0)
if err != nil {
return nil, err
}
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
var offset int64
for offset < stat.Size() {
- ggml, n, err := llm.DecodeGGML(file)
+ ggml, n, err := llm.DecodeGGML(file, 0)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
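
extractFromZipFile guards against zip path traversal by joining each archive name onto the destination and rejecting results that escape it, which the new test below also exercises. A minimal sketch of the check (Unix-style, illustrative paths):

    package main

    import (
        "fmt"
        "path/filepath"
        "strings"
    )

    func main() {
        dest := "/tmp/extract" // illustrative destination directory
        for _, name := range []string{"config.json", "../../etc/passwd"} {
            n := filepath.Join(dest, name) // Join also cleans ".." components
            if !strings.HasPrefix(n, dest) {
                fmt.Println("skipped, escapes destination:", name)
                continue
            }
            fmt.Println("would extract to:", n)
        }
    }
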
diff --git a/server/model_test.go b/server/model_test.go
new file mode 100644
index 000000000..c3023eb2b
--- /dev/null
+++ b/server/model_test.go
@@ -0,0 +1,92 @@
+package server
+
+import (
+ "archive/zip"
+ "bytes"
+ "io"
+ "os"
+ "path/filepath"
+ "slices"
+ "testing"
+
+ "github.com/ollama/ollama/api"
+)
+
+func createZipFile(t *testing.T, name string) *os.File {
+ t.Helper()
+
+ f, err := os.CreateTemp(t.TempDir(), "")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ zf := zip.NewWriter(f)
+ defer zf.Close()
+
+ zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
+ t.Fatal(err)
+ }
+
+ return f
+}
+
+func TestExtractFromZipFile(t *testing.T) {
+ cases := []struct {
+ name string
+ expect []string
+ }{
+ {
+ name: "good",
+ expect: []string{"good"},
+ },
+ {
+ name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
+ },
+ }
+
+ for _, tt := range cases {
+ t.Run(tt.name, func(t *testing.T) {
+ f := createZipFile(t, tt.name)
+ defer f.Close()
+
+ tempDir := t.TempDir()
+ if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
+ t.Fatal(err)
+ }
+
+ var matches []string
+ if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
+ if err != nil {
+ return err
+ }
+
+ if !fi.IsDir() {
+ matches = append(matches, p)
+ }
+
+ return nil
+ }); err != nil {
+ t.Fatal(err)
+ }
+
+ var actual []string
+ for _, match := range matches {
+ rel, err := filepath.Rel(tempDir, match)
+ if err != nil {
+ t.Error(err)
+ }
+
+ actual = append(actual, rel)
+ }
+
+ if !slices.Equal(actual, tt.expect) {
+ t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
+ }
+ })
+ }
+}
diff --git a/server/routes.go b/server/routes.go
index 188fe9748..ff66663c0 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -646,9 +646,12 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
resp, err := GetModelInfo(req)
if err != nil {
- if os.IsNotExist(err) {
+ switch {
+ case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
- } else {
+ case err.Error() == "invalid model name":
+ c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+ default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
@@ -658,44 +661,55 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
}
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
- model, err := GetModel(req.Model)
+ m, err := GetModel(req.Model)
if err != nil {
return nil, err
}
modelDetails := api.ModelDetails{
- ParentModel: model.ParentModel,
- Format: model.Config.ModelFormat,
- Family: model.Config.ModelFamily,
- Families: model.Config.ModelFamilies,
- ParameterSize: model.Config.ModelType,
- QuantizationLevel: model.Config.FileType,
+ ParentModel: m.ParentModel,
+ Format: m.Config.ModelFormat,
+ Family: m.Config.ModelFamily,
+ Families: m.Config.ModelFamilies,
+ ParameterSize: m.Config.ModelType,
+ QuantizationLevel: m.Config.FileType,
}
if req.System != "" {
- model.System = req.System
+ m.System = req.System
}
if req.Template != "" {
- model.Template = req.Template
+ m.Template = req.Template
}
msgs := make([]api.Message, 0)
- for _, msg := range model.Messages {
+ for _, msg := range m.Messages {
msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content})
}
+ n := model.ParseName(req.Model)
+ if !n.IsValid() {
+ return nil, fmt.Errorf("invalid model name")
+ }
+
+ manifest, err := ParseNamedManifest(n)
+ if err != nil {
+ return nil, err
+ }
+
resp := &api.ShowResponse{
- License: strings.Join(model.License, "\n"),
- System: model.System,
- Template: model.Template,
- Details: modelDetails,
- Messages: msgs,
+ License: strings.Join(m.License, "\n"),
+ System: m.System,
+ Template: m.Template,
+ Details: modelDetails,
+ Messages: msgs,
+ ModifiedAt: manifest.fi.ModTime(),
}
var params []string
cs := 30
- for k, v := range model.Options {
+ for k, v := range m.Options {
switch val := v.(type) {
case []interface{}:
for _, nv := range val {
@@ -709,20 +723,59 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
for k, v := range req.Options {
if _, ok := req.Options[k]; ok {
- model.Options[k] = v
+ m.Options[k] = v
}
}
var sb strings.Builder
fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
- fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
- fmt.Fprint(&sb, model.String())
+ fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName)
+ fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
+ kvData, err := getKVData(m.ModelPath, req.Verbose)
+ if err != nil {
+ return nil, err
+ }
+ delete(kvData, "general.name")
+ delete(kvData, "tokenizer.chat_template")
+ resp.ModelInfo = kvData
+
+ if len(m.ProjectorPaths) > 0 {
+ projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
+ if err != nil {
+ return nil, err
+ }
+ resp.ProjectorInfo = projectorData
+ }
+
return resp, nil
}
+func getKVData(digest string, verbose bool) (llm.KV, error) {
+ maxArraySize := 0
+ if verbose {
+ maxArraySize = -1
+ }
+ kvData, err := llm.LoadModel(digest, maxArraySize)
+ if err != nil {
+ return nil, err
+ }
+
+ kv := kvData.KV()
+
+ if !verbose {
+ for k := range kv {
+ if t, ok := kv[k].([]any); len(t) > 5 && ok {
+ kv[k] = []any{}
+ }
+ }
+ }
+
+ return kv, nil
+}
+
func (s *Server) ListModelsHandler(c *gin.Context) {
ms, err := Manifests()
if err != nil {
@@ -1052,11 +1105,20 @@ func Serve(ln net.Listener) error {
schedCtx, schedDone := context.WithCancel(ctx)
sched := InitScheduler(schedCtx)
s := &Server{addr: ln.Addr(), sched: sched}
- r := s.GenerateRoutes()
+
+ http.Handle("/", s.GenerateRoutes())
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
srvr := &http.Server{
- Handler: r,
+ // Use http.DefaultServeMux so we get net/http/pprof for
+ // free.
+ //
+ // TODO(bmizerany): Decide if we want to make this
+ // configurable so it is not exposed by default, or allow
+ // users to bind it to a different port. This was a quick
+ // and easy way to get pprof, but it may not be the best
+ // way.
+ Handler: nil,
}
// listen for a ctrl+c and stop any loaded llm
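
As the TODO above notes, passing Handler: nil makes the server fall back to http.DefaultServeMux, which is what exposes the net/http/pprof handlers. A generic sketch of that mechanism (standalone, not the server's actual wiring; the address is arbitrary):

    package main

    import (
        "log"
        "net/http"
        _ "net/http/pprof" // registers /debug/pprof/* on http.DefaultServeMux
    )

    func main() {
        // Handlers registered on the default mux...
        http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
            w.Write([]byte("ok"))
        })

        // ...are served because Handler: nil falls back to http.DefaultServeMux,
        // so the pprof routes from the blank import come along for free.
        srv := &http.Server{Addr: "127.0.0.1:6060", Handler: nil}
        log.Fatal(srv.ListenAndServe())
    }
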
diff --git a/server/routes_test.go b/server/routes_test.go
index 5e16cfeff..5a5c0fbba 100644
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
+ "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
@@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) {
"top_p 0.9",
}
assert.Equal(t, expectedParams, params)
+ assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
},
},
}
@@ -325,3 +327,40 @@ func TestCase(t *testing.T) {
})
}
}
+
+func TestShow(t *testing.T) {
+ t.Setenv("OLLAMA_MODELS", t.TempDir())
+ envconfig.LoadConfig()
+
+ var s Server
+
+ createRequest(t, s.CreateModelHandler, api.CreateRequest{
+ Name: "show-model",
+ Modelfile: fmt.Sprintf(
+ "FROM %s\nFROM %s",
+ createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
+ createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
+ ),
+ })
+
+ w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
+ Name: "show-model",
+ })
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected status code 200, actual %d", w.Code)
+ }
+
+ var resp api.ShowResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatal(err)
+ }
+
+ if resp.ModelInfo["general.architecture"] != "test" {
+ t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"])
+ }
+
+ if resp.ProjectorInfo["general.architecture"] != "clip" {
+ t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
+ }
+}
diff --git a/server/sched.go b/server/sched.go
index 424395544..0084b533b 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
// Load model for fitting
- ggml, err := llm.LoadModel(pending.model.ModelPath)
+ ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
if err != nil {
pending.errCh <- err
break
diff --git a/server/sched_test.go b/server/sched_test.go
index 953288347..4a1cf72a0 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
- {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
- {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+ {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
})
require.NoError(t, err)
fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname}
- scenario.ggml, err = llm.LoadModel(model.ModelPath)
+ scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
require.NoError(t, err)
scenario.req = &LlmRequest{
diff --git a/types/model/name.go b/types/model/name.go
index d85fd0c6c..e645a844c 100644
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -4,7 +4,6 @@ package model
import (
"cmp"
- "encoding/hex"
"errors"
"fmt"
"log/slog"
@@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
}
return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
}
-
-type DigestType byte
-
-const (
- DigestTypeInvalid DigestType = iota
- DigestTypeSHA256
-)
-
-func (t DigestType) String() string {
- switch t {
- case DigestTypeSHA256:
- return "sha256"
- default:
- return "invalid"
- }
-}
-
-type Digest struct {
- Type DigestType
- Sum [32]byte
-}
-
-func ParseDigest(s string) (Digest, error) {
- i := strings.IndexAny(s, "-:")
- if i < 0 {
- return Digest{}, fmt.Errorf("invalid digest %q", s)
- }
- typ, encSum := s[:i], s[i+1:]
- if typ != "sha256" {
- return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
- }
- d := Digest{
- Type: DigestTypeSHA256,
- }
- n, err := hex.Decode(d.Sum[:], []byte(encSum))
- if err != nil {
- return Digest{}, err
- }
- if n != 32 {
- return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n)
- }
- return d, nil
-}
-
-func (d Digest) String() string {
- if d.Type == DigestTypeInvalid {
- return ""
- }
- return fmt.Sprintf("sha256-%x", d.Sum)
-}
-
-func (d Digest) IsValid() bool {
- return d.Type != DigestTypeInvalid
-}
diff --git a/types/model/name_test.go b/types/model/name_test.go
index 66ce4c339..008dd586c 100644
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) {
}
}
-const (
- validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
- validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
-)
-
-func TestParseDigest(t *testing.T) {
- cases := []struct {
- in string
- want string
- }{
- {"", ""}, // empty
- {"sha123-12", ""}, // invalid type
- {"sha256-", ""}, // invalid sum
- {"sha256-123", ""}, // invalid odd length sum
-
- {validSha256, validSha256},
- {validSha256Old, validSha256},
- }
- for _, tt := range cases {
- t.Run(tt.in, func(t *testing.T) {
- got, err := ParseDigest(tt.in)
- if err != nil {
- if tt.want != "" {
- t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want)
- }
- return
- }
- if got.String() != tt.want {
- t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want)
- }
- })
- }
-}
-
func TestParseNameFromFilepath(t *testing.T) {
cases := map[string]Name{
filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},
diff --git a/util/bufioutil/buffer_seeker.go b/util/bufioutil/buffer_seeker.go
new file mode 100644
index 000000000..8775fdb83
--- /dev/null
+++ b/util/bufioutil/buffer_seeker.go
@@ -0,0 +1,34 @@
+package bufioutil
+
+import (
+ "bufio"
+ "io"
+)
+
+type BufferedSeeker struct {
+ rs io.ReadSeeker
+ br *bufio.Reader
+}
+
+func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
+ return &BufferedSeeker{
+ rs: rs,
+ br: bufio.NewReaderSize(rs, size),
+ }
+}
+
+func (b *BufferedSeeker) Read(p []byte) (int, error) {
+ return b.br.Read(p)
+}
+
+func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
+ if whence == io.SeekCurrent {
+ offset -= int64(b.br.Buffered())
+ }
+ n, err := b.rs.Seek(offset, whence)
+ if err != nil {
+ return 0, err
+ }
+ b.br.Reset(b.rs)
+ return n, nil
+}
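
The new BufferedSeeker wraps an io.ReadSeeker in a bufio.Reader and keeps the two views consistent: a SeekCurrent offset is corrected by the number of bytes already buffered but not yet returned to the caller, and every seek resets the buffer so the next Read refills from the new position. A short usage sketch follows, assuming an os.File as the underlying reader; the file name is hypothetical and not part of this diff.

```go
package main

import (
	"fmt"
	"io"
	"log"
	"os"

	"github.com/ollama/ollama/util/bufioutil"
)

func main() {
	// The file name is hypothetical; any io.ReadSeeker works.
	f, err := os.Open("model.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Buffer reads in 1 MiB chunks while still allowing random access.
	bs := bufioutil.NewBufferedSeeker(f, 1<<20)

	header := make([]byte, 4)
	if _, err := io.ReadFull(bs, header); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("first 4 bytes: %x\n", header)

	// SeekCurrent is relative to what the caller has consumed, not to how far
	// the bufio.Reader has read ahead in the underlying file.
	if _, err := bs.Seek(128, io.SeekCurrent); err != nil {
		log.Fatal(err)
	}
}
```

The trade-off is that any seek discards the read-ahead buffer, so the type suits workloads that read long sequential runs punctuated by occasional jumps.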
diff --git a/util/bufioutil/buffer_seeker_test.go b/util/bufioutil/buffer_seeker_test.go
new file mode 100644
index 000000000..87145f6b6
--- /dev/null
+++ b/util/bufioutil/buffer_seeker_test.go
@@ -0,0 +1,64 @@
+package bufioutil
+
+import (
+ "bytes"
+ "io"
+ "strings"
+ "testing"
+)
+
+func TestBufferedSeeker(t *testing.T) {
+ const alphabet = "abcdefghijklmnopqrstuvwxyz"
+
+ bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
+
+ checkRead := func(buf []byte, expected string) {
+ t.Helper()
+ _, err := bs.Read(buf)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !bytes.Equal(buf, []byte(expected)) {
+ t.Fatalf("expected %s, got %s", expected, buf)
+ }
+ }
+
+ // Read the first 5 bytes
+ buf := make([]byte, 5)
+
+ checkRead(buf, "abcde")
+
+ // Seek back to the beginning
+ _, err := bs.Seek(0, io.SeekStart)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // read 'a'
+ checkRead(buf[:1], "a")
+
+ if bs.br.Buffered() == 0 {
+		t.Fatalf("sanity check failed: expected read-ahead data in the buffer after the short read")
+ }
+
+ // Seek past 'b'
+ _, err = bs.Seek(1, io.SeekCurrent)
+ if err != nil {
+ t.Fatal(err)
+ }
+ checkRead(buf, "cdefg")
+
+ // Seek back to the beginning
+ _, err = bs.Seek(0, io.SeekStart)
+ if err != nil {
+ t.Fatal(err)
+ }
+ checkRead(buf, "abcde")
+
+ // Seek to the end
+ _, err = bs.Seek(-5, io.SeekEnd)
+ if err != nil {
+ t.Fatal(err)
+ }
+ checkRead(buf, "vwxyz")
+}