diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 40f9c41f0..61ca3c433 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -437,6 +437,7 @@ jobs: env: OLLAMA_SKIP_IMAGE_BUILD: '1' PUSH: '1' + GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v4 - name: Set Version @@ -460,15 +461,20 @@ jobs: ls -lh dist/ (cd dist; sha256sum * > sha256sum.txt) cat dist/sha256sum.txt - - uses: ncipollo/release-action@v1 - with: - name: ${{ env.RELEASE_VERSION }} - allowUpdates: true - artifacts: 'dist/*' - draft: true - prerelease: true - omitBodyDuringUpdate: true - generateReleaseNotes: true - omitDraftDuringUpdate: true - omitPrereleaseDuringUpdate: true - replacesArtifacts: true + - name: Create or update Release + run: | + echo "Looking for existing release for ${{ env.RELEASE_VERSION }}" + OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName") + if [ -n "$OLD_TAG" ]; then + echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}" + gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME} + else + echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}" + gh release create ${GITHUB_REF_NAME} \ + --title ${{ env.RELEASE_VERSION }} \ + --draft \ + --generate-notes \ + --prerelease + fi + echo "Uploading artifacts for tag ${GITHUB_REF_NAME}" + gh release upload ${GITHUB_REF_NAME} dist/* --clobber diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dbb6c2fdf..29adf56f3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -124,7 +124,7 @@ jobs: strategy: matrix: rocm-version: - - '6.0.2' + - '6.1.1' runs-on: linux container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} steps: diff --git a/Dockerfile b/Dockerfile index 72edef2a9..98a3ddfd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 -ARG ROCM_VERSION=6.0.2 +ARG ROCM_VERSION=6.1.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code diff --git a/README.md b/README.md index 2fdc63cb3..72ed8fa5e 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,8 @@ Here are some example models that can be downloaded: | Llama 3 | 70B | 40GB | `ollama run llama3:70b` | | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | -| Gemma | 2B | 1.4GB | `ollama run gemma:2b` | -| Gemma | 7B | 4.8GB | `ollama run gemma:7b` | +| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | +| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` | | Mistral | 7B | 4.1GB | `ollama run mistral` | | Moondream 2 | 1.4B | 829MB | `ollama run moondream` | | Neural Chat | 7B | 4.1GB | `ollama run neural-chat` | @@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)" Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications. 
``` +### Show model information + +``` +ollama show llama3 +``` + ### List models on your computer ``` diff --git a/api/types.go b/api/types.go index d99cf3bcc..95ed5d37e 100644 --- a/api/types.go +++ b/api/types.go @@ -159,18 +159,49 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` - NumCtx int `json:"num_ctx,omitempty"` - NumBatch int `json:"num_batch,omitempty"` - NumGPU int `json:"num_gpu,omitempty"` - MainGPU int `json:"main_gpu,omitempty"` - LowVRAM bool `json:"low_vram,omitempty"` - F16KV bool `json:"f16_kv,omitempty"` - LogitsAll bool `json:"logits_all,omitempty"` - VocabOnly bool `json:"vocab_only,omitempty"` - UseMMap bool `json:"use_mmap,omitempty"` - UseMLock bool `json:"use_mlock,omitempty"` - NumThread int `json:"num_thread,omitempty"` + UseNUMA bool `json:"numa,omitempty"` + NumCtx int `json:"num_ctx,omitempty"` + NumBatch int `json:"num_batch,omitempty"` + NumGPU int `json:"num_gpu,omitempty"` + MainGPU int `json:"main_gpu,omitempty"` + LowVRAM bool `json:"low_vram,omitempty"` + F16KV bool `json:"f16_kv,omitempty"` + LogitsAll bool `json:"logits_all,omitempty"` + VocabOnly bool `json:"vocab_only,omitempty"` + UseMMap TriState `json:"use_mmap,omitempty"` + UseMLock bool `json:"use_mlock,omitempty"` + NumThread int `json:"num_thread,omitempty"` +} + +type TriState int + +const ( + TriStateUndefined TriState = -1 + TriStateFalse TriState = 0 + TriStateTrue TriState = 1 +) + +func (b *TriState) UnmarshalJSON(data []byte) error { + var v bool + if err := json.Unmarshal(data, &v); err != nil { + return err + } + if v { + *b = TriStateTrue + } + *b = TriStateFalse + return nil +} + +func (b *TriState) MarshalJSON() ([]byte, error) { + if *b == TriStateUndefined { + return nil, nil + } + var v bool + if *b == TriStateTrue { + v = true + } + return json.Marshal(v) } // EmbeddingRequest is the request passed to [Client.Embeddings]. @@ -222,6 +253,7 @@ type ShowRequest struct { Model string `json:"model"` System string `json:"system"` Template string `json:"template"` + Verbose bool `json:"verbose"` Options map[string]interface{} `json:"options"` @@ -231,13 +263,16 @@ type ShowRequest struct { // ShowResponse is the response returned from [Client.Show]. type ShowResponse struct { - License string `json:"license,omitempty"` - Modelfile string `json:"modelfile,omitempty"` - Parameters string `json:"parameters,omitempty"` - Template string `json:"template,omitempty"` - System string `json:"system,omitempty"` - Details ModelDetails `json:"details,omitempty"` - Messages []Message `json:"messages,omitempty"` + License string `json:"license,omitempty"` + Modelfile string `json:"modelfile,omitempty"` + Parameters string `json:"parameters,omitempty"` + Template string `json:"template,omitempty"` + System string `json:"system,omitempty"` + Details ModelDetails `json:"details,omitempty"` + Messages []Message `json:"messages,omitempty"` + ModelInfo map[string]any `json:"model_info,omitempty"` + ProjectorInfo map[string]any `json:"projector_info,omitempty"` + ModifiedAt time.Time `json:"modified_at,omitempty"` } // CopyRequest is the request passed to [Client.Copy]. 
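The `api/types.go` hunk above replaces the plain boolean `UseMMap` with a `TriState` so that a request that never mentions `use_mmap` can be told apart from one that explicitly sets it to `false`. A minimal, self-contained sketch of that mapping (the `fromBool` helper is hypothetical and not part of this change):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// TriState mirrors the three-valued flag above: -1 unset, 0 false, 1 true.
type TriState int

const (
	TriStateUndefined TriState = -1
	TriStateFalse     TriState = 0
	TriStateTrue      TriState = 1
)

// fromBool is a hypothetical helper showing the intended bool -> TriState mapping.
func fromBool(v bool) TriState {
	if v {
		return TriStateTrue
	}
	return TriStateFalse
}

func main() {
	// An omitted use_mmap stays undefined so the server can choose a default;
	// an explicit false is preserved instead of being confused with "unset".
	for _, body := range []string{`{}`, `{"use_mmap":true}`, `{"use_mmap":false}`} {
		var opts map[string]any
		if err := json.Unmarshal([]byte(body), &opts); err != nil {
			panic(err)
		}

		state := TriStateUndefined
		if v, ok := opts["use_mmap"].(bool); ok {
			state = fromBool(v)
		}
		fmt.Printf("%-20s -> %d\n", body, state)
	}
}
```

With this shape, the `DefaultOptions` change below can leave the field at `TriStateUndefined` and let the runner pick a platform default.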
@@ -402,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error { continue } + if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) { + val, ok := val.(bool) + if !ok { + return fmt.Errorf("option %q must be of type boolean", key) + } + if val { + field.SetInt(int64(TriStateTrue)) + } else { + field.SetInt(int64(TriStateFalse)) + } + continue + } + switch field.Kind() { case reflect.Int: switch t := val.(type) { @@ -490,7 +538,7 @@ func DefaultOptions() Options { LowVRAM: false, F16KV: true, UseMLock: false, - UseMMap: true, + UseMMap: TriStateUndefined, UseNUMA: false, }, } @@ -560,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) { } else { field := valueOpts.FieldByName(opt.Name) if field.IsValid() && field.CanSet() { + if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) { + boolVal, err := strconv.ParseBool(vals[0]) + if err != nil { + return nil, fmt.Errorf("invalid bool value %s", vals) + } + if boolVal { + out[key] = TriStateTrue + } else { + out[key] = TriStateFalse + } + continue + } + switch field.Kind() { case reflect.Float32: floatVal, err := strconv.ParseFloat(vals[0], 32) diff --git a/api/types_test.go b/api/types_test.go index 211385c70..8b6c60c62 100644 --- a/api/types_test.go +++ b/api/types_test.go @@ -2,6 +2,7 @@ package api import ( "encoding/json" + "fmt" "math" "testing" "time" @@ -105,3 +106,101 @@ func TestDurationMarshalUnmarshal(t *testing.T) { }) } } + +func TestUseMmapParsingFromJSON(t *testing.T) { + tests := []struct { + name string + req string + exp TriState + }{ + { + name: "Undefined", + req: `{ }`, + exp: TriStateUndefined, + }, + { + name: "True", + req: `{ "use_mmap": true }`, + exp: TriStateTrue, + }, + { + name: "False", + req: `{ "use_mmap": false }`, + exp: TriStateFalse, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var oMap map[string]interface{} + err := json.Unmarshal([]byte(test.req), &oMap) + require.NoError(t, err) + opts := DefaultOptions() + err = opts.FromMap(oMap) + require.NoError(t, err) + assert.Equal(t, test.exp, opts.UseMMap) + }) + } +} + +func TestUseMmapFormatParams(t *testing.T) { + tests := []struct { + name string + req map[string][]string + exp TriState + err error + }{ + { + name: "True", + req: map[string][]string{ + "use_mmap": []string{"true"}, + }, + exp: TriStateTrue, + err: nil, + }, + { + name: "False", + req: map[string][]string{ + "use_mmap": []string{"false"}, + }, + exp: TriStateFalse, + err: nil, + }, + { + name: "Numeric True", + req: map[string][]string{ + "use_mmap": []string{"1"}, + }, + exp: TriStateTrue, + err: nil, + }, + { + name: "Numeric False", + req: map[string][]string{ + "use_mmap": []string{"0"}, + }, + exp: TriStateFalse, + err: nil, + }, + { + name: "invalid string", + req: map[string][]string{ + "use_mmap": []string{"foo"}, + }, + exp: TriStateUndefined, + err: fmt.Errorf("invalid bool value [foo]"), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + resp, err := FormatParams(test.req) + require.Equal(t, err, test.err) + respVal, ok := resp["use_mmap"] + if test.exp != TriStateUndefined { + assert.True(t, ok, "resp: %v", resp) + assert.Equal(t, test.exp, respVal) + } + }) + } +} diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go index df2597a83..a8f1f7cdf 100644 --- a/app/lifecycle/logging.go +++ b/app/lifecycle/logging.go @@ -5,6 +5,8 @@ import ( "log/slog" "os" "path/filepath" + "strconv" + "strings" 
"github.com/ollama/ollama/envconfig" ) @@ -24,6 +26,7 @@ func InitLogging() { logFile = os.Stderr // TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion } else { + rotateLogs(AppLogFile) logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755) if err != nil { slog.Error(fmt.Sprintf("failed to create server log %v", err)) @@ -46,3 +49,32 @@ func InitLogging() { slog.Info("ollama app started") } + +func rotateLogs(logFile string) { + if _, err := os.Stat(logFile); os.IsNotExist(err) { + return + } + index := strings.LastIndex(logFile, ".") + pre := logFile[:index] + post := "." + logFile[index+1:] + for i := LogRotationCount; i > 0; i-- { + older := pre + "-" + strconv.Itoa(i) + post + newer := pre + "-" + strconv.Itoa(i-1) + post + if i == 1 { + newer = pre + post + } + if _, err := os.Stat(newer); err == nil { + if _, err := os.Stat(older); err == nil { + err := os.Remove(older) + if err != nil { + slog.Warn("Failed to remove older log", "older", older, "error", err) + continue + } + } + err := os.Rename(newer, older) + if err != nil { + slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err) + } + } + } +} diff --git a/app/lifecycle/logging_test.go b/app/lifecycle/logging_test.go new file mode 100644 index 000000000..a2157ca2c --- /dev/null +++ b/app/lifecycle/logging_test.go @@ -0,0 +1,44 @@ +package lifecycle + +import ( + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRotateLogs(t *testing.T) { + logDir := t.TempDir() + logFile := filepath.Join(logDir, "testlog.log") + + // No log exists + rotateLogs(logFile) + + require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644)) + assert.FileExists(t, logFile) + // First rotation + rotateLogs(logFile) + assert.FileExists(t, filepath.Join(logDir, "testlog-1.log")) + assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log")) + assert.NoFileExists(t, logFile) + + // Should be a no-op without a new log + rotateLogs(logFile) + assert.FileExists(t, filepath.Join(logDir, "testlog-1.log")) + assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log")) + assert.NoFileExists(t, logFile) + + for i := 2; i <= LogRotationCount+1; i++ { + require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644)) + assert.FileExists(t, logFile) + rotateLogs(logFile) + assert.NoFileExists(t, logFile) + for j := 1; j < i; j++ { + assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log")) + } + assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log")) + } +} diff --git a/app/lifecycle/paths.go b/app/lifecycle/paths.go index fe07bce10..4d9f4c5a1 100644 --- a/app/lifecycle/paths.go +++ b/app/lifecycle/paths.go @@ -16,11 +16,12 @@ var ( AppDir = "/opt/Ollama" AppDataDir = "/opt/Ollama" // TODO - should there be a distinct log dir? 
- UpdateStageDir = "/tmp" - AppLogFile = "/tmp/ollama_app.log" - ServerLogFile = "/tmp/ollama.log" - UpgradeLogFile = "/tmp/ollama_update.log" - Installer = "OllamaSetup.exe" + UpdateStageDir = "/tmp" + AppLogFile = "/tmp/ollama_app.log" + ServerLogFile = "/tmp/ollama.log" + UpgradeLogFile = "/tmp/ollama_update.log" + Installer = "OllamaSetup.exe" + LogRotationCount = 5 ) func init() { diff --git a/app/lifecycle/server.go b/app/lifecycle/server.go index 0152ccd11..c178a1abf 100644 --- a/app/lifecycle/server.go +++ b/app/lifecycle/server.go @@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) { return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err) } - // TODO - rotation + rotateLogs(ServerLogFile) logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755) if err != nil { return nil, fmt.Errorf("failed to create server log: %w", err) diff --git a/app/ollama.iss b/app/ollama.iss index 9dc61abbf..e6502abd3 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -88,10 +88,15 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion +#if DirExists("..\dist\windows-amd64\cuda") + Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs +#endif +#if DirExists("..\dist\windows-amd64\oneapi") + Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs +#endif #if DirExists("..\dist\windows-amd64\rocm") Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs #endif diff --git a/cmd/cmd.go b/cmd/cmd.go index ae7c8da8f..909e8e4b2 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) { } defer tempfile.Close() - zipfile := zip.NewWriter(tempfile) - defer zipfile.Close() - detectContentType := func(path string) (string, error) { f, err := os.Open(path) if err != nil { @@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) { files = append(files, tks...) 
} + zipfile := zip.NewWriter(tempfile) + defer zipfile.Close() + for _, file := range files { f, err := os.Open(file) if err != nil { @@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er } func RunHandler(cmd *cobra.Command, args []string) error { - client, err := api.ClientFromEnvironment() - if err != nil { - return err - } - - name := args[0] - - // check if the model exists on the server - show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name}) - var statusError api.StatusError - switch { - case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound: - if err := PullHandler(cmd, []string{name}); err != nil { - return err - } - - show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name}) - if err != nil { - return err - } - case err != nil: - return err - } - interactive := true opts := runOptions{ - Model: args[0], - WordWrap: os.Getenv("TERM") == "xterm-256color", - Options: map[string]interface{}{}, - MultiModal: slices.Contains(show.Details.Families, "clip"), - ParentModel: show.Details.ParentModel, + Model: args[0], + WordWrap: os.Getenv("TERM") == "xterm-256color", + Options: map[string]interface{}{}, } format, err := cmd.Flags().GetString("format") @@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error { } opts.WordWrap = !nowrap - if !interactive { - return generate(cmd, opts) + // Fill out the rest of the options based on information about the + // model. + client, err := api.ClientFromEnvironment() + if err != nil { + return err } - return generateInteractive(cmd, opts) + name := args[0] + info, err := func() (*api.ShowResponse, error) { + showReq := &api.ShowRequest{Name: name} + info, err := client.Show(cmd.Context(), showReq) + var se api.StatusError + if errors.As(err, &se) && se.StatusCode == http.StatusNotFound { + if err := PullHandler(cmd, []string{name}); err != nil { + return nil, err + } + return client.Show(cmd.Context(), &api.ShowRequest{Name: name}) + } + return info, err + }() + if err != nil { + return err + } + + opts.MultiModal = slices.Contains(info.Details.Families, "clip") + opts.ParentModel = info.Details.ParentModel + opts.Messages = append(opts.Messages, info.Messages...) 
+ + if interactive { + return generateInteractive(cmd, opts) + } + return generate(cmd, opts) } func errFromUnknownKey(unknownKeyErr error) error { @@ -579,10 +580,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return err } - if len(args) != 1 { - return errors.New("missing model name") - } - license, errLicense := cmd.Flags().GetBool("license") modelfile, errModelfile := cmd.Flags().GetBool("modelfile") parameters, errParams := cmd.Flags().GetBool("parameters") @@ -625,8 +622,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error { if flagsSet > 1 { return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified") - } else if flagsSet == 0 { - return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified") + } + + if flagsSet == 1 { + req := api.ShowRequest{Name: args[0]} + resp, err := client.Show(cmd.Context(), &req) + if err != nil { + return err + } + + switch showType { + case "license": + fmt.Println(resp.License) + case "modelfile": + fmt.Println(resp.Modelfile) + case "parameters": + fmt.Println(resp.Parameters) + case "system": + fmt.Println(resp.System) + case "template": + fmt.Println(resp.Template) + } + + return nil } req := api.ShowRequest{Name: args[0]} @@ -635,22 +653,114 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return err } - switch showType { - case "license": - fmt.Println(resp.License) - case "modelfile": - fmt.Println(resp.Modelfile) - case "parameters": - fmt.Println(resp.Parameters) - case "system": - fmt.Println(resp.System) - case "template": - fmt.Println(resp.Template) + arch := resp.ModelInfo["general.architecture"].(string) + + modelData := [][]string{ + {"arch", arch}, + {"parameters", resp.Details.ParameterSize}, + {"quantization", resp.Details.QuantizationLevel}, + {"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))}, + {"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))}, } + mainTableData := [][]string{ + {"Model"}, + {renderSubTable(modelData, false)}, + } + + if resp.ProjectorInfo != nil { + projectorData := [][]string{ + {"arch", "clip"}, + {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}, + {"projector type", resp.ProjectorInfo["clip.projector_type"].(string)}, + {"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))}, + {"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))}, + } + + mainTableData = append(mainTableData, + []string{"Projector"}, + []string{renderSubTable(projectorData, false)}, + ) + } + + if resp.Parameters != "" { + mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)}) + } + + if resp.System != "" { + mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)}) + } + + if resp.License != "" { + mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)}) + } + + table := tablewriter.NewWriter(os.Stdout) + table.SetAutoWrapText(false) + table.SetBorder(false) + table.SetAlignment(tablewriter.ALIGN_LEFT) + + for _, v := range mainTableData { + table.Append(v) + } + + table.Render() + return nil } +func renderSubTable(data [][]string, file bool) string { + var buf 
bytes.Buffer + table := tablewriter.NewWriter(&buf) + table.SetAutoWrapText(!file) + table.SetBorder(false) + table.SetNoWhiteSpace(true) + table.SetTablePadding("\t") + table.SetAlignment(tablewriter.ALIGN_LEFT) + + for _, v := range data { + table.Append(v) + } + + table.Render() + + renderedTable := buf.String() + lines := strings.Split(renderedTable, "\n") + for i, line := range lines { + lines[i] = "\t" + line + } + + return strings.Join(lines, "\n") +} + +func twoLines(s string) [][]string { + lines := strings.Split(s, "\n") + res := [][]string{} + + count := 0 + for _, line := range lines { + line = strings.TrimSpace(line) + if line != "" { + count++ + res = append(res, []string{line}) + if count == 2 { + return res + } + } + } + return res +} + +func formatParams(s string) string { + lines := strings.Split(s, "\n") + table := [][]string{} + + for _, line := range lines { + table = append(table, strings.Fields(line)) + } + return renderSubTable(table, false) +} + func CopyHandler(cmd *cobra.Command, args []string) error { client, err := api.ClientFromEnvironment() if err != nil { diff --git a/cmd/interactive.go b/cmd/interactive.go index 80a915474..0a2f429b6 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -31,65 +31,40 @@ const ( ) func loadModel(cmd *cobra.Command, opts *runOptions) error { - client, err := api.ClientFromEnvironment() - if err != nil { - return err - } - p := progress.NewProgress(os.Stderr) defer p.StopAndClear() spinner := progress.NewSpinner("") p.Add("", spinner) - showReq := api.ShowRequest{Name: opts.Model} - showResp, err := client.Show(cmd.Context(), &showReq) + client, err := api.ClientFromEnvironment() if err != nil { return err } - opts.MultiModal = slices.Contains(showResp.Details.Families, "clip") - opts.ParentModel = showResp.Details.ParentModel - - if len(showResp.Messages) > 0 { - opts.Messages = append(opts.Messages, showResp.Messages...) - } chatReq := &api.ChatRequest{ - Model: opts.Model, - Messages: []api.Message{}, + Model: opts.Model, + KeepAlive: opts.KeepAlive, } - if opts.KeepAlive != nil { - chatReq.KeepAlive = opts.KeepAlive - } - - err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error { + return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error { p.StopAndClear() - if len(opts.Messages) > 0 { - for _, msg := range opts.Messages { - switch msg.Role { - case "user": - fmt.Printf(">>> %s\n", msg.Content) - case "assistant": - state := &displayResponseState{} - displayResponse(msg.Content, opts.WordWrap, state) - fmt.Println() - fmt.Println() - } + for _, msg := range opts.Messages { + switch msg.Role { + case "user": + fmt.Printf(">>> %s\n", msg.Content) + case "assistant": + state := &displayResponseState{} + displayResponse(msg.Content, opts.WordWrap, state) + fmt.Println() + fmt.Println() } } return nil }) - if err != nil { - return err - } - - return nil } func generateInteractive(cmd *cobra.Command, opts runOptions) error { - opts.Messages = make([]api.Message, 0) - err := loadModel(cmd, &opts) if err != nil { return err diff --git a/docs/api.md b/docs/api.md index 35f1def33..107b5211f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -777,11 +777,12 @@ A single JSON object will be returned. POST /api/show ``` -Show information about a model including details, modelfile, template, parameters, license, and system prompt. +Show information about a model including details, modelfile, template, parameters, license, system prompt. 
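For reference, a minimal Go client sketch for this endpoint (it assumes a local server on the default port `11434`; the `name` and `verbose` fields follow the parameters documented below, and the exact keys inside `model_info` depend on the model):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Request body for /api/show; "verbose" is the new optional flag.
	body, err := json.Marshal(map[string]any{"name": "llama3", "verbose": true})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/show", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var show map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&show); err != nil {
		panic(err)
	}

	// model_info holds the GGUF metadata; large arrays such as
	// tokenizer.ggml.tokens are filled in only for verbose requests.
	if mi, ok := show["model_info"].(map[string]any); ok {
		fmt.Println("architecture:", mi["general.architecture"])
		fmt.Println("context length:", mi[fmt.Sprintf("%v.context_length", mi["general.architecture"])])
	}
}
```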
### Parameters - `name`: name of the model to show +- `verbose`: (optional) if set to `true`, returns full data for verbose response fields ### Examples @@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{ ```json { "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"", - "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:", - "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ", + "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"", + "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>", "details": { + "parent_model": "", "format": "gguf", "family": "llama", - "families": ["llama", "clip"], - "parameter_size": "7B", + "families": [ + "llama" + ], + "parameter_size": "8.0B", "quantization_level": "Q4_0" + }, + "model_info": { + "general.architecture": "llama", + "general.file_type": 2, + "general.parameter_count": 8030261248, + "general.quantization_version": 2, + "llama.attention.head_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.layer_norm_rms_epsilon": 0.00001, + "llama.block_count": 32, + "llama.context_length": 8192, + "llama.embedding_length": 4096, + "llama.feed_forward_length": 14336, + "llama.rope.dimension_count": 128, + "llama.rope.freq_base": 500000, + "llama.vocab_size": 128256, + "tokenizer.ggml.bos_token_id": 128000, + "tokenizer.ggml.eos_token_id": 128009, + "tokenizer.ggml.merges": [], // populates if `verbose=true` + "tokenizer.ggml.model": "gpt2", + "tokenizer.ggml.pre": "llama-bpe", + "tokenizer.ggml.token_type": [], // populates if `verbose=true` + "tokenizer.ggml.tokens": [] // populates if `verbose=true` } } ``` diff --git a/docs/development.md b/docs/development.md index 8c035a518..2a6886a43 100644 --- a/docs/development.md +++ b/docs/development.md @@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build ### Windows -Note: The windows build for Ollama is still under development. +Note: The Windows build for Ollama is still under development. -Install required tools: +First, install required tools: - MSVC toolchain - C/C++ and cmake as minimal requirements - Go version 1.22 or higher - MinGW (pick one variant) with GCC. - [MinGW-w64](https://www.mingw-w64.org/) - [MSYS2](https://www.msys2.org/) +- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser` + +Then, build the `ollama` binary: ```powershell $env:CGO_ENABLED="1" diff --git a/docs/import.md b/docs/import.md index 7abe39b2a..f34f09ace 100644 --- a/docs/import.md +++ b/docs/import.md @@ -47,19 +47,13 @@ success ### Supported Quantizations -
-<details>
-<summary>Legacy Quantization</summary>
-
 - `Q4_0`
 - `Q4_1`
 - `Q5_0`
 - `Q5_1`
 - `Q8_0`
 
-</details>
-
-<details>
-<summary>K-means Quantization`</summary>
+#### K-means Quantizations
 
 - `Q3_K_S`
 - `Q3_K_M`
@@ -70,11 +64,6 @@ success
 - `Q5_K_M`
 - `Q6_K`
 
-</details>
- -> [!NOTE] -> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf). - ## Template Detection > [!NOTE] diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 60d63c7d9..de29b344c 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -22,7 +22,7 @@ docker logs If manually running `ollama serve` in a terminal, the logs will be on that terminal. When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in: -- `explorer %LOCALAPPDATA%\Ollama` to view logs +- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log` - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored - `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories diff --git a/docs/windows.md b/docs/windows.md index 832b3d431..abc0eb300 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -39,8 +39,8 @@ server. Ollama on Windows stores files in a few different locations. You can view them in the explorer window by hitting `+R` and type in: - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates - - *app.log* contains logs from the GUI application - - *server.log* contains the server logs + - *app.log* contains most resent logs from the GUI application + - *server.log* contains the most recent server logs - *upgrade.log* contains log output for upgrades - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` contains models and configuration diff --git a/envconfig/config.go b/envconfig/config.go index 4d2150b72..e86f72e6a 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -57,6 +57,19 @@ var ( SchedSpread bool // Set via OLLAMA_TMPDIR in the environment TmpDir string + // Set via OLLAMA_INTEL_GPU in the environment + IntelGpu bool + + // Set via CUDA_VISIBLE_DEVICES in the environment + CudaVisibleDevices string + // Set via HIP_VISIBLE_DEVICES in the environment + HipVisibleDevices string + // Set via ROCR_VISIBLE_DEVICES in the environment + RocrVisibleDevices string + // Set via GPU_DEVICE_ORDINAL in the environment + GpuDeviceOrdinal string + // Set via HSA_OVERRIDE_GFX_VERSION in the environment + HsaOverrideGfxVersion string ) type EnvVar struct { @@ -66,7 +79,7 @@ type EnvVar struct { } func AsMap() map[string]EnvVar { - return map[string]EnvVar{ + ret := map[string]EnvVar{ "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. 
OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, @@ -84,6 +97,15 @@ func AsMap() map[string]EnvVar { "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, } + if runtime.GOOS != "darwin" { + ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"} + ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"} + ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} + ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} + ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} + ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"} + } + return ret } func Values() map[string]string { @@ -256,6 +278,16 @@ func LoadConfig() { if err != nil { slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port) } + + if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { + IntelGpu = set + } + + CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES") + HipVisibleDevices = clean("HIP_VISIBLE_DEVICES") + RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES") + GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL") + HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION") } func getModelsDir() (string, error) { diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 61e6a0598..15b6fc61f 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -13,6 +13,7 @@ import ( "strconv" "strings" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -59,9 +60,9 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Determine if the user has already pre-selected which GPUs to look at, then ignore the others var visibleDevices []string - hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only - rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID - gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index + hipVD := envconfig.HipVisibleDevices // zero based index only + rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID + gpuDO := envconfig.GpuDeviceOrdinal // zero based index switch { // TODO is this priorty order right? 
case hipVD != "": @@ -74,7 +75,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { visibleDevices = strings.Split(gpuDO, ",") } - gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") + gfxOverride := envconfig.HsaOverrideGfxVersion var supported []string libDir := "" @@ -332,6 +333,11 @@ func AMDGetGPUInfo() []RocmGPUInfo { slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) } + // Check for env var workarounds + if name == "1002:687f" { // Vega RX 56 + gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"}) + } + // The GPU has passed all the verification steps and is supported resp = append(resp, gpuInfo) } diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index cad45f6c5..21585277a 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -10,6 +10,7 @@ import ( "strconv" "strings" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -53,7 +54,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { } var supported []string - gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") + gfxOverride := envconfig.HsaOverrideGfxVersion if gfxOverride == "" { supported, err = GetSupportedGFX(libDir) if err != nil { diff --git a/gpu/assets.go b/gpu/assets.go index f2adcf3e3..073d2e813 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -77,20 +77,27 @@ func cleanupTmpDirs() { continue } raw, err := os.ReadFile(filepath.Join(d, "ollama.pid")) - if err == nil { - pid, err := strconv.Atoi(string(raw)) - if err == nil { - if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { - // Another running ollama, ignore this tmpdir - continue - } - } - } else { - slog.Debug("failed to open ollama.pid", "path", d, "error", err) - } - err = os.RemoveAll(d) if err != nil { - slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err) + slog.Warn("failed to read ollama.pid", "path", d, "error", err) + // No pid, ignore this tmpdir + continue + } + + pid, err := strconv.Atoi(string(raw)) + if err != nil { + slog.Warn("failed to parse pid", "path", d, "error", err) + continue + } + + proc, err := os.FindProcess(pid) + if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("found running ollama", "pid", pid, "path", d) + // Another running ollama, ignore this tmpdir + continue + } + + if err := os.Remove(d); err != nil { + slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err) } } } diff --git a/gpu/gpu.go b/gpu/gpu.go index 6cebbd2b9..0120d4271 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -265,7 +265,7 @@ func GetGPUInfo() GpuInfoList { // On windows we bundle the nvidia library one level above the runner dir depPath := "" if runtime.GOOS == "windows" && envconfig.RunnersDir != "" { - depPath = filepath.Dir(envconfig.RunnersDir) + depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda") } // Load ALL libraries @@ -314,33 +314,41 @@ func GetGPUInfo() GpuInfoList { } // Intel - oHandles = initOneAPIHandles() - for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue + if envconfig.IntelGpu { + oHandles = initOneAPIHandles() + // On windows we bundle the oneapi library one level above the runner dir + depPath = "" + if runtime.GOOS == "windows" && envconfig.RunnersDir != "" { + depPath = 
filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi") } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: "oneapi", - }, - driverIndex: d, - gpuIndex: int(i), + + for d := range oHandles.oneapi.num_drivers { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: int(d), + gpuIndex: int(i), + } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DependencyPath = depPath + oneapiGPUs = append(oneapiGPUs, gpuInfo) } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. - memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - // TODO dependency path? 
- oneapiGPUs = append(oneapiGPUs, gpuInfo) } } diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c index 9db89529a..03f15a2c3 100644 --- a/gpu/gpu_info_cudart.c +++ b/gpu/gpu_info_cudart.c @@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { for (i = 0; l[i].s != NULL; i++) { *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); - if (!l[i].p) { + if (!*(l[i].p)) { char *msg = LOAD_ERR(); LOG(resp->ch.verbose, "dlerr: %s\n", msg); UNLOAD_LIBRARY(resp->ch.handle); diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index 675ce5cc4..abe140844 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { for (i = 0; l[i].s != NULL; i++) { *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); - if (!*l[i].p) { + if (!*(l[i].p)) { char *msg = LOAD_ERR(); LOG(resp->ch.verbose, "dlerr: %s\n", msg); UNLOAD_LIBRARY(resp->ch.handle); diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c index ef0a97df2..11293e448 100644 --- a/gpu/gpu_info_nvml.c +++ b/gpu/gpu_info_nvml.c @@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); - if (!l[i].p) { + if (!*(l[i].p)) { resp->ch.handle = NULL; char *msg = LOAD_ERR(); LOG(resp->ch.verbose, "dlerr: %s\n", msg); diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c index e90c694a1..3ff708ea2 100644 --- a/gpu/gpu_info_oneapi.c +++ b/gpu/gpu_info_oneapi.c @@ -13,7 +13,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { resp->oh.num_drivers = 0; const int buflen = 256; char buf[buflen + 1]; - int i, d, count; + int i, d; struct lookup { char *s; void **p; @@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s); *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s); - if (!l[i].p) { + if (!*(l[i].p)) { resp->oh.handle = NULL; char *msg = LOAD_ERR(); LOG(resp->oh.verbose, "dlerr: %s\n", msg); @@ -62,6 +62,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { } } + LOG(resp->oh.verbose, "calling zesInit\n"); + ret = (*resp->oh.zesInit)(0); if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesInit err: %x\n", ret); @@ -71,7 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { return; } - count = 0; + LOG(resp->oh.verbose, "calling zesDriverGet\n"); ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL); if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret); @@ -96,6 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { } for (d = 0; d < resp->oh.num_drivers; d++) { + LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]); ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL); if (ret != ZE_RESULT_SUCCESS) { @@ -116,7 +119,6 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { oneapi_release(resp->oh); return; } - count += resp->oh.num_devices[d]; } return; @@ -160,7 +162,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, return; } - snprintf(&resp->gpu_name[0], GPU_NAME_LEN, props.modelName); + snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName); // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax // (this is probably wrong...) 
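The `gpu/types.go` hunk below adds an `EnvWorkarounds` field holding extra `[key, value]` environment pairs for a specific GPU, and the `gpu/amd_linux.go` change above fills it with `HSA_ENABLE_SDMA=0` for the Vega RX 56. A minimal sketch, with a hypothetical `applyEnvWorkarounds` helper, of how such pairs could be layered onto a runner subprocess environment (the real server-side wiring is not part of this diff):

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
)

// applyEnvWorkarounds layers GPU-specific [key,value] pairs on top of the
// inherited environment before a runner subprocess is started.
func applyEnvWorkarounds(cmd *exec.Cmd, workarounds [][2]string) {
	cmd.Env = os.Environ()
	for _, kv := range workarounds {
		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", kv[0], kv[1]))
	}
}

func main() {
	// The Vega RX 56 entry appended in gpu/amd_linux.go above.
	workarounds := [][2]string{{"HSA_ENABLE_SDMA", "0"}}

	cmd := exec.Command("env") // stand-in for the llama runner binary
	applyEnvWorkarounds(cmd, workarounds)

	out, err := cmd.Output()
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
}
```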
diff --git a/gpu/types.go b/gpu/types.go index b451c0f38..693ca4668 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -26,6 +26,9 @@ type GpuInfo struct { // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly DependencyPath string `json:"lib_path,omitempty"` + // Extra environment variables specific to the GPU as list of [key,value] + EnvWorkarounds [][2]string `json:"envs,omitempty"` + // GPU information ID string `json:"gpu_id"` // string to use for selection of this specific GPU Name string `json:"name"` // user friendly name if available diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 18b3fa18d..492126a4f 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -56,7 +56,6 @@ struct server_params { std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; - std::string chat_template = ""; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -427,16 +426,6 @@ struct llama_server_context return true; } - void validate_model_chat_template(server_params & sparams) { - llama_chat_message chat[] = {{"user", "test"}}; - std::vector buf(1); - int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); - if (res < 0) { - LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - sparams.chat_template = "chatml"; - } - } - void initialize() { // create slots all_slots_are_idle = true; @@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - sparams.chat_template = argv[i]; } else if (arg == "--override-kv") { @@ -3008,11 +2996,6 @@ int main(int argc, char **argv) { } const auto model_meta = llama.model_meta(); - if (sparams.chat_template.empty()) { // custom chat template is not supplied - // check if the template comes with the model is supported by us - llama.validate_model_chat_template(sparams); - } - // Middleware for API key validation auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { // If API key is not set, skip validation diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 0baf86ffc..721a9ae80 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -18,7 +18,7 @@ sign() { fi } -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on" +COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off" case "${GOARCH}" in "amd64") @@ -27,7 +27,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build @@ -37,7 +37,7 @@ case "${GOARCH}" in # CPU first for the default library, set up as lowest common denominator for maximum 
compatibility (including Rosetta) # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu" echo "Building LCD CPU" build @@ -49,7 +49,7 @@ case "${GOARCH}" in # Approximately 400% faster than LCD on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" echo "Building AVX CPU" build @@ -61,7 +61,7 @@ case "${GOARCH}" in # Approximately 10% faster than AVX on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" @@ -75,7 +75,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 2190fb93e..0e98e1635 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" +COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}" 
BUILD_DIR="../build/linux/${ARCH}_static" echo "Building static library" build @@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off" + COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) @@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" fi CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index c9d860977..e217a0382 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -39,7 +39,8 @@ function init_vars { } $script:cmakeDefs = @( "-DBUILD_SHARED_LIBS=on", - "-DLLAMA_NATIVE=off" + "-DLLAMA_NATIVE=off", + "-DLLAMA_OPENMP=off" ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() @@ -122,8 +123,13 @@ function build { & cmake --version & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })" - & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) + if ($cmakeDefs -contains "-G") { + $extra=@("-j8") + } else { + $extra= @("--", "/p:CL_MPcount=8") + } + write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" + & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} # Rearrange output to be consistent between different generators if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) { @@ -203,7 +209,8 @@ function build_static() { "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_F16C=off", - "-DLLAMA_FMA=off") + "-DLLAMA_FMA=off", + "-DLLAMA_OPENMP=off") $script:buildDir="../build/windows/${script:ARCH}_static" write-host "Building static library" build @@ -270,7 +277,15 @@ function build_cuda() { init_vars $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") + $script:cmakeDefs += @( + "-A", "x64", + "-DLLAMA_CUDA=ON", + "-DLLAMA_AVX=on", + "-DLLAMA_AVX2=off", + 
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", + "-DCMAKE_CUDA_FLAGS=-t8", + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}") @@ -280,10 +295,12 @@ function build_cuda() { sign install - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\" + rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" } else { write-host "Skipping CUDA generation step" } @@ -317,16 +334,18 @@ function build_oneapi() { sign install - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}" + rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" } else { Write-Host "Skipping oneAPI generation step" } diff --git a/llm/ggla.go b/llm/ggla.go index a5d90b6cb..34c4f6ca3 100644 --- a/llm/ggla.go +++ b/llm/ggla.go @@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors { return llm.tensors } -func (llm *ggla) decode(rs io.ReadSeeker) error { +func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) { var r uint32 if err := binary.Read(rs, binary.LittleEndian, &r); err != nil { return err @@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error { for { var dims uint32 if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil { + if errors.Is(err, io.EOF) { + return nil + } return err } + defer func() { + if errors.Is(retErr, io.EOF) { + retErr = io.ErrUnexpectedEOF + } + }() + var namesize uint32 if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil { return err @@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error { return err } - if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil { + if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil { return err } diff --git a/llm/ggml.go b/llm/ggml.go index 35b89d16e..cfead450d 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -6,6 +6,8 @@ import ( "fmt" "io" "strings" + + "github.com/ollama/ollama/util/bufioutil" ) type GGML struct { @@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 { return 1 } +func (kv KV) EmbeddingHeadCount() uint64 { + if heads := kv.HeadCount(); heads > 0 { + return kv.EmbeddingLength() / kv.HeadCount() + } + + return 0 +} + +func (kv KV) EmbeddingHeadCountK() uint64 { + if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 { + return k + } + + return kv.EmbeddingHeadCount() +} + +func (kv KV) EmbeddingHeadCountV() uint64 { + if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 { + return v + } + + return kv.EmbeddingHeadCount() +} + func (kv KV) GQA() uint64 { return kv.HeadCount() / kv.HeadCountKV() } @@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string { } } -func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { +// DecodeGGML decodes a GGML model from the given reader. +// +// It collects array values for arrays with a size less than or equal to +// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If +// the maxArraySize is negative, all arrays are collected. 
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { + if maxArraySize == 0 { + maxArraySize = 1024 + } + + rs = bufioutil.NewBufferedSeeker(rs, 32<<10) + var magic uint32 if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { return nil, 0, err @@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { case FILE_MAGIC_GGLA: c = &containerGGLA{} case FILE_MAGIC_GGUF_LE: - c = &containerGGUF{ByteOrder: binary.LittleEndian} + c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize} case FILE_MAGIC_GGUF_BE: - c = &containerGGUF{ByteOrder: binary.BigEndian} + c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize} default: return nil, 0, errors.New("invalid file magic") } model, err := c.Decode(rs) - if errors.Is(err, io.EOF) { - // noop - } else if err != nil { + if err != nil { return nil, 0, err } @@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui embedding := llm.KV().EmbeddingLength() heads := llm.KV().HeadCount() headsKV := llm.KV().HeadCountKV() - vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) + vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size) + + embeddingHeads := llm.KV().EmbeddingHeadCount() + embeddingHeadsK := llm.KV().EmbeddingHeadCountK() layers := llm.Tensors().Layers() @@ -308,7 +346,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui partialOffload = 4 * batch * embedding partialOffload += max( // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()), - 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV), + 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV), 4*batch*(embedding+vocab)+embedding*vocab*105/128, ) @@ -316,21 +354,30 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui // mixtral 8x22b ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32)) partialOffload = max( - 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV), - 4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch), + 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV), + 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch), ) } else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok { // mixtral 8x7b ffnGateWeight1 := ffnGateWeight.Shape[1] fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1) partialOffload = max( - 4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, + 4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), ) } - case "gemma": - fullOffload = 4 * batch * (embedding + vocab) - partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128 + case "gemma", "gemma2": + fullOffload = max( + 4*batch*(embedding+vocab), + 4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads), + ) + + partialOffload = max( + 4*embedding*batch+embedding*vocab*105/128+4*vocab*batch, + 
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+ + 4*embeddingHeadsK*context*8+ + embedding*embeddingHeadsK*heads*9/16, + ) case "command-r": fullOffload = max( 4*batch*(embedding+vocab), @@ -367,6 +414,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui 4*batch*(vocab+2*embedding), fullOffload, ) + case "deepseek2": + fullOffload = max( + 4*batch*(3*embedding+vocab), + 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV), + ) + + partialOffload = max( + 4*batch*(3*embedding+vocab)+embedding*vocab*105/128, + 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16, + ) } return diff --git a/llm/ggml_test.go b/llm/ggml_test.go new file mode 100644 index 000000000..006c3ded8 --- /dev/null +++ b/llm/ggml_test.go @@ -0,0 +1 @@ +package llm diff --git a/llm/gguf.go b/llm/gguf.go index 234efe574..4d343a1bd 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -3,11 +3,10 @@ package llm import ( "bytes" "encoding/binary" + "encoding/json" "fmt" "io" "strings" - - "log/slog" ) type containerGGUF struct { @@ -29,6 +28,12 @@ type containerGGUF struct { NumTensor uint64 NumKV uint64 } + + maxArraySize int +} + +func (c *containerGGUF) canCollectArray(size int) bool { + return c.maxArraySize < 0 || size <= c.maxArraySize } func (c *containerGGUF) Name() string { @@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) { } model := newGGUF(c) - slog.Debug(fmt.Sprintf("model = %#v", model)) if err := model.Decode(rs); err != nil { return nil, err } @@ -85,6 +89,8 @@ type gguf struct { tensors []*Tensor parameters uint64 + + scratch [16 << 10]byte } func newGGUF(container *containerGGUF) *gguf { @@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { } // decode tensors - for i := 0; uint64(i) < llm.numTensor(); i++ { + for range llm.numTensor() { name, err := readGGUFString(llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor name: %w", err) } // dims is the number of dimensions in the tensor dims, err := readGGUF[uint32](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor dimensions: %w", err) } shape := [4]uint64{1, 1, 1, 1} for i := 0; uint32(i) < dims; i++ { shape[i], err = readGGUF[uint64](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor shape: %w", err) } } kind, err := readGGUF[uint32](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor kind: %w", err) } offset, err := readGGUF[uint64](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor offset: %w", err) } tensor := Tensor{ @@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { alignment = 32 } - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - padding := llm.padding(offset, int64(alignment)) - if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { - return err - } - for _, tensor := range llm.tensors { - if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { - return err + offset, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return fmt.Errorf("failed to get current offset: %w", err) } - padding := llm.padding(int64(tensor.Size()), int64(alignment)) + padding := llm.padding(offset, int64(alignment)) if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { - return err + return fmt.Errorf("failed to seek to init padding: %w", 
err) + } + + if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { + return fmt.Errorf("failed to seek to tensor: %w", err) } } @@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) { return b.String(), nil } +func discardGGUFString(llm *gguf, r io.Reader) error { + buf := llm.scratch[:8] + _, err := io.ReadFull(r, buf) + if err != nil { + return err + } + + size := int(llm.ByteOrder.Uint64(buf)) + for size > 0 { + n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))]) + if err != nil { + return err + } + size -= n + } + return nil +} + func readGGUFString(llm *gguf, r io.Reader) (string, error) { if llm.Version == 1 { return readGGUFV1String(llm, r) } - var length uint64 - if err := binary.Read(r, llm.ByteOrder, &length); err != nil { + buf := llm.scratch[:8] + _, err := io.ReadFull(r, buf) + if err != nil { return "", err } - var b bytes.Buffer - if _, err := io.CopyN(&b, r, int64(length)); err != nil { + length := int(llm.ByteOrder.Uint64(buf)) + if length > len(llm.scratch) { + buf = make([]byte, length) + } else { + buf = llm.scratch[:length] + } + clear(buf) + + _, err = io.ReadFull(r, buf) + if err != nil { return "", err } - - return b.String(), nil + return string(buf), nil } func writeGGUFString(llm *gguf, w io.Writer, s string) error { @@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error { return err } -func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { +type array struct { + size int + values []any +} + +func (a *array) MarshalJSON() ([]byte, error) { + return json.Marshal(a.values) +} + +func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) { t, err := readGGUF[uint32](llm, r) if err != nil { return nil, err @@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - for i := 0; uint32(i) < n; i++ { + a := &array{size: int(n)} + if llm.canCollectArray(int(n)) { + a.values = make([]any, 0, int(n)) + } + + for i := range n { var e any switch t { case ggufTypeUint8: @@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - a = append(a, e) + if a.values != nil { + a.values[i] = e + } } - return + return a, nil } -func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { +func readGGUFArray(llm *gguf, r io.Reader) (*array, error) { if llm.Version == 1 { return readGGUFV1Array(llm, r) } @@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - for i := 0; uint64(i) < n; i++ { + a := &array{size: int(n)} + if llm.canCollectArray(int(n)) { + a.values = make([]any, int(n)) + } + + for i := range n { var e any switch t { case ggufTypeUint8: @@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { case ggufTypeBool: e, err = readGGUF[bool](llm, r) case ggufTypeString: - e, err = readGGUFString(llm, r) + if a.values != nil { + e, err = readGGUFString(llm, r) + } else { + err = discardGGUFString(llm, r) + } default: return nil, fmt.Errorf("invalid array type: %d", t) } @@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - a = append(a, e) + if a.values != nil { + a.values[i] = e + } } - return + return a, nil } func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error { diff --git a/llm/llama.cpp b/llm/llama.cpp index 5921b8f08..7c26775ad 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 
5921b8f089d3b7bda86aac5a66825df6a6c10603 +Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c diff --git a/llm/memory.go b/llm/memory.go index 5afb1c2e9..19b12cbfc 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,6 +1,7 @@ package llm import ( + "fmt" "log/slog" "strconv" "strings" @@ -49,6 +50,18 @@ type MemoryEstimate struct { // For multi-GPU scenarios, this is the size in bytes per GPU GPUSizes []uint64 + + // internal fields for logging purposes + inferenceLibrary string + layersRequested int + layersModel int + availableList []string + kv uint64 + allocationsList []string + memoryWeights uint64 + memoryLayerOutput uint64 + graphFullOffload uint64 + graphPartialOffload uint64 } // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size @@ -102,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts slog.Warn("model missing blk.0 layer size") } - // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv - var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() + // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv + var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV() // KV is proportional to the number of layers layerSize += kv / ggml.KV().BlockCount() @@ -167,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts // For all the layers, find where they can fit on the GPU(s) for i := range int(ggml.KV().BlockCount()) { + // Some models have inconsistent layer sizes + if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { + layerSize = blk.size() + layerSize += kv / ggml.KV().BlockCount() + } memoryWeights += layerSize if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { @@ -252,78 +270,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts allocationsList = append(allocationsList, format.HumanBytes2(a)) } + estimate := MemoryEstimate{ + TotalSize: memoryRequiredTotal, + Layers: 0, + Graph: 0, + VRAMSize: 0, + GPUSizes: []uint64{}, + + inferenceLibrary: gpus[0].Library, + layersRequested: opts.NumGPU, + layersModel: int(ggml.KV().BlockCount()) + 1, + availableList: availableList, + kv: kv, + allocationsList: allocationsList, + memoryWeights: memoryWeights, + memoryLayerOutput: memoryLayerOutput, + graphFullOffload: graphFullOffload, + graphPartialOffload: graphPartialOffload, + } + + if gpus[0].Library == "cpu" { + return estimate + } + if layerCount == 0 { + slog.Debug("insufficient VRAM to load any model layers") + return estimate + } + estimate.Layers = layerCount + estimate.Graph = graphOffload + estimate.VRAMSize = memoryRequiredPartial + estimate.TotalSize = memoryRequiredTotal + estimate.TensorSplit = tensorSplit + estimate.GPUSizes = gpuAllocations + return estimate +} + +func (m MemoryEstimate) log() { slog.Info( - "offload to gpu", + "offload to "+m.inferenceLibrary, slog.Group( "layers", // requested number of layers to offload - "requested", opts.NumGPU, + "requested", m.layersRequested, // The number of layers the model has (including output) - "model", int(ggml.KV().BlockCount())+1, + "model", m.layersModel, // estimated number of layers that can be offloaded - "offload", layerCount, - // multi-gpu split for tesnors - "split", 
tensorSplit, + "offload", m.Layers, + // multi-gpu split for tensors + "split", m.TensorSplit, ), slog.Group( "memory", // memory available by GPU for offloading - "available", availableList, + "available", m.availableList, slog.Group( "required", // memory required for full offloading - "full", format.HumanBytes2(memoryRequiredTotal), + "full", format.HumanBytes2(m.TotalSize), // memory required to offload layers.estimate layers - "partial", format.HumanBytes2(memoryRequiredPartial), + "partial", format.HumanBytes2(m.VRAMSize), // memory of KV cache - "kv", format.HumanBytes2(kv), + "kv", format.HumanBytes2(m.kv), // Allocations across the GPUs - "allocations", allocationsList, + "allocations", m.allocationsList, ), slog.Group( "weights", // memory of the weights - "total", format.HumanBytes2(memoryWeights), + "total", format.HumanBytes2(m.memoryWeights), // memory of repeating layers - "repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput), + "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput), // memory of non-repeating layers - "nonrepeating", format.HumanBytes2(memoryLayerOutput), + "nonrepeating", format.HumanBytes2(m.memoryLayerOutput), ), slog.Group( "graph", // memory of graph when fully offloaded - "full", format.HumanBytes2(graphFullOffload), + "full", format.HumanBytes2(m.graphFullOffload), // memory of graph when not fully offloaded - "partial", format.HumanBytes2(graphPartialOffload), + "partial", format.HumanBytes2(m.graphPartialOffload), ), ), ) - if gpus[0].Library == "cpu" { - return MemoryEstimate{ - Layers: 0, - Graph: 0, - VRAMSize: 0, - TotalSize: memoryRequiredTotal, - GPUSizes: []uint64{}, - } - } - if layerCount == 0 { - slog.Debug("insufficient VRAM to load any model layers") - return MemoryEstimate{ - Layers: 0, - Graph: 0, - VRAMSize: 0, - TotalSize: memoryRequiredTotal, - GPUSizes: []uint64{}, - } - } - - return MemoryEstimate{ - Layers: layerCount, - Graph: graphOffload, - VRAMSize: memoryRequiredPartial, - TotalSize: memoryRequiredTotal, - TensorSplit: tensorSplit, - GPUSizes: gpuAllocations, - } } diff --git a/llm/memory_test.go b/llm/memory_test.go index 8eaa07715..f972f9275 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() gguf := NewGGUFV3(binary.LittleEndian) inputLayerCount := 5 + tensors := []Tensor{ - {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, + {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + 
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, } assert.Len(t, tensors, inputLayerCount+1) err = gguf.Encode(f, KV{ @@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) { }, tensors) require.NoError(t, err) - ggml, err := LoadModel(f.Name()) - require.NoError(t, err) + ggml, err := LoadModel(f.Name(), 0) + if err != nil { + t.Fatal(err) + } // Simple CPU scenario gpus := []gpu.GpuInfo{ diff --git a/llm/patches/01-load-progress.diff b/llm/patches/01-load-progress.diff index acd44d207..be5286091 100644 --- a/llm/patches/01-load-progress.diff +++ b/llm/patches/01-load-progress.diff @@ -1,8 +1,8 @@ diff --git a/common/common.cpp b/common/common.cpp -index ba1ecf0e..cead57cc 100644 +index 73ff0e85..6adb1a92 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & +@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; @@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644 mparams.kv_overrides = NULL; } else { diff --git a/common/common.h b/common/common.h -index d80344f2..71e84834 100644 +index 58ed72f4..0bb2605e 100644 --- a/common/common.h +++ b/common/common.h -@@ -174,6 +174,13 @@ struct gpt_params { - // multimodal models (see examples/llava) +@@ -180,6 +180,13 @@ struct gpt_params { std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) -+ + + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. 
+ llama_progress_callback progress_callback = NULL; + // context pointer passed to the progress callback + void * progress_callback_user_data; - }; - - void gpt_params_handle_model_default(gpt_params & params); ++ + // server params + int32_t port = 8080; // server listens on this network port + int32_t timeout_read = 600; // http read timeout in seconds diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 27c8aabc2..2a2e7306e 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/llama.cpp b/llama.cpp -index 40d2ec2c..74f3ee9c 100644 +index 61948751..4b72a293 100644 --- a/llama.cpp +++ b/llama.cpp -@@ -4642,16 +4642,7 @@ static void llm_load_vocab( +@@ -4824,16 +4824,7 @@ static void llm_load_vocab( // for now, only BPE models have pre-tokenizers if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { @@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644 - LLAMA_LOG_WARN("%s: ************************************ \n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -- } else if ( -+ if ( - tokenizer_pre == "default") { +- } else if (tokenizer_pre == "default") { ++ if (tokenizer_pre == "default") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( -@@ -4703,7 +4694,8 @@ static void llm_load_vocab( - tokenizer_pre == "smaug-bpe") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG; + tokenizer_pre == "llama3" || +@@ -4888,7 +4879,8 @@ static void llm_load_vocab( + tokenizer_pre == "poro-chat") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/07-gemma.diff b/llm/patches/07-gemma.diff new file mode 100644 index 000000000..86eac3d17 --- /dev/null +++ b/llm/patches/07-gemma.diff @@ -0,0 +1,305 @@ +From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001 +From: Ollama maintainers +Date: Wed, 26 Jun 2024 16:18:09 -0700 +Subject: [PATCH] Architecture support + +--- + llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 193 insertions(+), 1 deletion(-) + +diff --git a/llama.cpp b/llama.cpp +index 61948751..3b4196f5 100644 +--- a/llama.cpp ++++ b/llama.cpp +@@ -217,6 +217,7 @@ enum llm_arch { + LLM_ARCH_INTERNLM2, + LLM_ARCH_MINICPM, + LLM_ARCH_GEMMA, ++ LLM_ARCH_GEMMA2, + LLM_ARCH_STARCODER2, + LLM_ARCH_MAMBA, + LLM_ARCH_XVERSE, +@@ -255,6 +256,7 @@ static const std::map LLM_ARCH_NAMES = { + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, ++ { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, +@@ -464,10 +466,12 @@ enum llm_tensor { + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_OUT_NORM, ++ LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_NORM, ++ LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, +@@ -960,6 +964,24 @@ static const std::map> LLM_TENSOR_NA + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, ++ { ++ LLM_ARCH_GEMMA2, ++ { ++ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, ++ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, ++ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, ++ { 
LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, ++ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, ++ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, ++ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, ++ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, ++ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, ++ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, ++ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, ++ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, ++ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, ++ }, ++ }, + { + LLM_ARCH_STARCODER2, + { +@@ -1941,6 +1963,8 @@ enum e_model { + MODEL_8x22B, + MODEL_16x12B, + MODEL_10B_128x3_66B, ++ MODEL_9B, ++ MODEL_27B, + }; + + static const size_t kiB = 1024; +@@ -2114,6 +2138,7 @@ struct llama_layer { + struct ggml_tensor * attn_out_norm_b; + struct ggml_tensor * attn_q_a_norm; + struct ggml_tensor * attn_kv_a_norm; ++ struct ggml_tensor * attn_post_norm; + + // attention + struct ggml_tensor * wq; +@@ -2136,6 +2161,7 @@ struct llama_layer { + // normalization + struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; ++ struct ggml_tensor * ffn_post_norm; + struct ggml_tensor * layer_out_norm; + struct ggml_tensor * layer_out_norm_b; + struct ggml_tensor * ffn_norm_exps; +@@ -4529,6 +4555,16 @@ static void llm_load_hparams( + } + } break; + case LLM_ARCH_GEMMA: ++ { ++ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ++ ++ switch (hparams.n_layer) { ++ case 18: model.type = e_model::MODEL_9B; break; ++ case 28: model.type = e_model::MODEL_27B; break; ++ default: model.type = e_model::MODEL_UNKNOWN; ++ } ++ } break; ++ case LLM_ARCH_GEMMA2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + +@@ -6305,6 +6341,40 @@ static bool llm_load_tensors( + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + } + } break; ++ case LLM_ARCH_GEMMA2: ++ { ++ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); ++ ++ // output ++ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); ++ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading ++ ++ const int64_t n_ff = hparams.n_ff; ++ const int64_t n_embd_head_k = hparams.n_embd_head_k; ++ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); ++ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); ++ ++ for (uint32_t i = 0; i < n_layer; ++i) { ++ ggml_context * ctx_layer = ctx_for_layer(i); ++ ggml_context * ctx_split = ctx_for_layer_split(i); ++ ++ auto & layer = model.layers[i]; ++ ++ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); ++ ++ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); ++ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); ++ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); ++ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); ++ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}); ++ ++ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); ++ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", 
i), {n_embd, n_ff}); ++ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); ++ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); ++ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}); ++ } ++ } break; + case LLM_ARCH_STARCODER2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); +@@ -10614,6 +10684,123 @@ struct llm_build_context { + return gf; + } + ++ struct ggml_cgraph * build_gemma2() { ++ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); ++ ++ const int64_t n_embd_head_k = hparams.n_embd_head_k; ++ ++ struct ggml_tensor * cur; ++ struct ggml_tensor * inpL; ++ ++ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); ++ ++ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); ++ cb(inpL, "inp_scaled", -1); ++ ++ // inp_pos - contains the positions ++ struct ggml_tensor * inp_pos = build_inp_pos(); ++ ++ // KQ_mask (mask for 1 head, it will be broadcasted to all heads) ++ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); ++ ++ for (int il = 0; il < n_layer; ++il) { ++ // norm ++ cur = llm_build_norm(ctx0, inpL, hparams, ++ model.layers[il].attn_norm, NULL, ++ LLM_NORM_RMS, cb, il); ++ cb(cur, "attn_norm", il); ++ ++ // self-attention ++ { ++ // compute Q and K and RoPE them ++ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); ++ cb(Qcur, "Qcur", il); ++ ++ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); ++ cb(Kcur, "Kcur", il); ++ ++ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); ++ cb(Vcur, "Vcur", il); ++ ++ Qcur = ggml_rope_ext( ++ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, ++ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, ++ ext_factor, attn_factor, beta_fast, beta_slow); ++ cb(Qcur, "Qcur", il); ++ ++ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); ++ cb(Qcur, "Qcur_scaled", il); ++ ++ Kcur = ggml_rope_ext( ++ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, ++ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, ++ ext_factor, attn_factor, beta_fast, beta_slow); ++ cb(Kcur, "Kcur", il); ++ ++ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, ++ model.layers[il].wo, NULL, ++ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); ++ } ++ ++ if (il == n_layer - 1) { ++ // skip computing output for unused tokens ++ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); ++ cur = ggml_get_rows(ctx0, cur, inp_out_ids); ++ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); ++ } ++ ++ cur = llm_build_norm(ctx0, cur, hparams, ++ model.layers[il].attn_post_norm, NULL, ++ LLM_NORM_RMS, cb, il); ++ cb(cur, "attn_post_norm", il); ++ ++ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); ++ cb(sa_out, "sa_out", il); ++ ++ cur = llm_build_norm(ctx0, sa_out, hparams, ++ model.layers[il].ffn_norm, NULL, ++ LLM_NORM_RMS, cb, il); ++ cb(cur, "ffn_norm", il); ++ ++ // feed-forward network ++ { ++ cur = llm_build_ffn(ctx0, cur, ++ model.layers[il].ffn_up, NULL, ++ model.layers[il].ffn_gate, NULL, ++ model.layers[il].ffn_down, NULL, ++ NULL, ++ LLM_FFN_GELU, LLM_FFN_PAR, cb, il); ++ cb(cur, "ffn_out", il); ++ } ++ ++ cur = llm_build_norm(ctx0, cur, hparams, ++ model.layers[il].ffn_post_norm, NULL, ++ LLM_NORM_RMS, cb, -1); ++ cb(cur, 
"ffn_post_norm", -1); ++ ++ cur = ggml_add(ctx0, cur, sa_out); ++ cb(cur, "l_out", il); ++ ++ // input for next layer ++ inpL = cur; ++ } ++ ++ cur = inpL; ++ ++ cur = llm_build_norm(ctx0, cur, hparams, ++ model.output_norm, NULL, ++ LLM_NORM_RMS, cb, -1); ++ cb(cur, "result_norm", -1); ++ ++ // lm_head ++ cur = ggml_mul_mat(ctx0, model.output, cur); ++ cb(cur, "result_output", -1); ++ ++ ggml_build_forward_expand(gf, cur); ++ ++ return gf; ++ } ++ + struct ggml_cgraph * build_starcoder2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + +@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph( + { + result = llm.build_gemma(); + } break; ++ case LLM_ARCH_GEMMA2: ++ { ++ result = llm.build_gemma2(); ++ } break; + case LLM_ARCH_STARCODER2: + { + result = llm.build_starcoder2(); +@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { + case LLM_ARCH_PHI2: + case LLM_ARCH_PHI3: + case LLM_ARCH_GEMMA: ++ case LLM_ARCH_GEMMA2: + case LLM_ARCH_STARCODER2: + case LLM_ARCH_GPTNEOX: + return LLAMA_ROPE_TYPE_NEOX; +@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal( + if (add_ass) { + ss << "assistant\n"; + } +- } else if (tmpl == "gemma" || tmpl.find("") != std::string::npos) { ++ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) { + // google/gemma-7b-it + std::string system_prompt = ""; + for (auto message : chat) { +-- +2.45.2 + diff --git a/llm/payload.go b/llm/payload.go index 20dcee7b5..9296db336 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -58,7 +58,7 @@ func availableServers() map[string]string { } // glob payloadsDir for files that start with ollama_ - pattern := filepath.Join(payloadsDir, "*") + pattern := filepath.Join(payloadsDir, "*", "ollama_*") files, err := filepath.Glob(pattern) if err != nil { @@ -69,7 +69,7 @@ func availableServers() map[string]string { servers := make(map[string]string) for _, file := range files { slog.Debug("availableServers : found", "file", file) - servers[filepath.Base(file)] = file + servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) } return servers diff --git a/llm/server.go b/llm/server.go index 6313fc327..ad67138b5 100644 --- a/llm/server.go +++ b/llm/server.go @@ -60,7 +60,12 @@ type llmServer struct { sem *semaphore.Weighted } -func LoadModel(model string) (*GGML, error) { +// LoadModel will load a model from disk. The model must be in the GGML format. +// +// It collects array values for arrays with a size less than or equal to +// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If +// the maxArraySize is negative, all arrays are collected. 
+func LoadModel(model string, maxArraySize int) (*GGML, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) { } defer f.Close() - ggml, _, err := DecodeGGML(f) + ggml, _, err := DecodeGGML(f, maxArraySize) return ggml, err } @@ -81,7 +86,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr var err error var cpuRunner string var estimate MemoryEstimate - var systemMemory uint64 + var systemTotalMemory uint64 + var systemFreeMemory uint64 + + systemMemInfo, err := gpu.GetCPUMem() + if err != nil { + slog.Error("failed to lookup system memory", "error", err) + } else { + systemTotalMemory = systemMemInfo.TotalMemory + systemFreeMemory = systemMemInfo.FreeMemory + slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory) + } // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info if opts.NumGPU == 0 { @@ -91,19 +106,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr cpuRunner = serverForCpu() estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { - if gpus[0].Library == "metal" { - memInfo, err := gpu.GetCPUMem() - if err != nil { - slog.Error("failed to lookup system memory", "error", err) - } else { - systemMemory = memInfo.TotalMemory - slog.Debug("system memory", "total", format.HumanBytes2(systemMemory)) - } - } estimate = EstimateGPULayers(gpus, ggml, projectors, opts) switch { - case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory: + case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory: // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system opts.NumGPU = 0 @@ -116,6 +122,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } + estimate.log() + // Loop through potential servers finalErr := errors.New("no suitable llama servers found") @@ -200,7 +208,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if g.Library == "metal" && uint64(opts.NumGPU) > 0 && uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { - opts.UseMMap = false + opts.UseMMap = api.TriStateFalse } } @@ -208,7 +216,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--flash-attn") } - if !opts.UseMMap { + // Windows CUDA should not use mmap for best performance + // Linux with a model larger than free space, mmap leads to thrashing + if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) || + (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) || + opts.UseMMap == api.TriStateFalse { params = append(params, "--no-mmap") } @@ -271,8 +283,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH - libraryPaths := []string{dir} + // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies + libraryPaths := []string{dir, filepath.Dir(dir)} if libraryPath, ok := os.LookupEnv(pathEnv); ok { // Append our runner directory to the path @@ -320,6 +332,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr s.cmd.Stdout = os.Stdout s.cmd.Stderr = s.status + envWorkarounds := 
[][2]string{} + for _, gpu := range gpus { + envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...) + } visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv() pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) @@ -334,6 +350,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) { s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal devicesNeeded = false + } else if len(envWorkarounds) != 0 { + for _, kv := range envWorkarounds { + if strings.EqualFold(cmp[0], kv[0]) { + s.cmd.Env[i] = kv[0] + "=" + kv[1] + } + } } } if pathNeeded { @@ -395,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 { } defer file.Close() - ggml, _, err := DecodeGGML(file) + ggml, _, err := DecodeGGML(file, 0) if err != nil { return 0 } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 60de03073..b3991ce1f 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -103,19 +103,19 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}" -ea 0 > $null + md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index 0f12d7e09..2a06c350a 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\ case $OS_NAME in centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' 
-f 1) ;; rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;; - fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';; + fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';; amzn) install_cuda_driver_yum 'fedora' '37' ;; debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;; ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;; diff --git a/server/images.go b/server/images.go index 53a957715..e949fb18a 100644 --- a/server/images.go +++ b/server/images.go @@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - layers, err := parseFromFile(ctx, temp, "", fn) + layer, err := NewLayer(temp, baseLayer.MediaType) if err != nil { return err } - if len(layers) != 1 { - return errors.New("quantization failed") + if _, err := temp.Seek(0, io.SeekStart); err != nil { + return err } - baseLayer.Layer = layers[0].Layer - baseLayer.GGML = layers[0].GGML + ggml, _, err := llm.DecodeGGML(temp, 0) + if err != nil { + return err + } + + baseLayer.Layer = layer + baseLayer.GGML = ggml } } diff --git a/server/model.go b/server/model.go index b262ea385..d56e641ba 100644 --- a/server/model.go +++ b/server/model.go @@ -11,6 +11,7 @@ import ( "net/http" "os" "path/filepath" + "strings" "github.com/ollama/ollama/api" "github.com/ollama/ollama/convert" @@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - ggml, _, err := llm.DecodeGGML(blob) + ggml, _, err := llm.DecodeGGML(blob, 0) if err != nil { return nil, err } @@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe return layers, nil } -func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error { stat, err := file.Stat() if err != nil { - return nil, err + return err } r, err := zip.NewReader(file, stat.Size()) if err != nil { - return nil, err + return err } - tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "") - if err != nil { - return nil, err - } - defer os.RemoveAll(tempdir) - fn(api.ProgressResponse{Status: "unpacking model metadata"}) for _, f := range r.File { + n := filepath.Join(p, f.Name) + if !strings.HasPrefix(n, p) { + slog.Warn("skipped extracting file outside of context", "name", f.Name) + continue + } + + if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil { + return err + } + // TODO(mxyng): this should not write out all files to disk - outfile, err := os.Create(filepath.Join(tempdir, f.Name)) + outfile, err := os.Create(n) if err != nil { - return nil, err + return err } defer outfile.Close() infile, err := f.Open() if err != nil { - return nil, err + return err } defer infile.Close() if _, err = io.Copy(outfile, infile); err != nil { - return nil, err + return err } if err := outfile.Close(); err != nil { - return nil, err + return err } if err := infile.Close(); err != nil { - return nil, err + return err } } - mf, err := convert.GetModelFormat(tempdir) + return nil +} + +func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { + tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "") + if err != nil { + return nil, err + } + 
defer os.RemoveAll(tempDir) + + if err := extractFromZipFile(tempDir, file, fn); err != nil { + return nil, err + } + + mf, err := convert.GetModelFormat(tempDir) if err != nil { return nil, err } - params, err := mf.GetParams(tempdir) + params, err := mf.GetParams(tempDir) if err != nil { return nil, err } - mArch, err := mf.GetModelArch("", tempdir, params) + mArch, err := mf.GetModelArch("", tempDir, params) if err != nil { return nil, err } @@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a // TODO(mxyng): this should write directly into a layer // e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model") - temp, err := os.CreateTemp(tempdir, "fp16") + temp, err := os.CreateTemp(tempDir, "fp16") if err != nil { return nil, err } @@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a } defer bin.Close() - ggml, _, err := llm.DecodeGGML(bin) + ggml, _, err := llm.DecodeGGML(bin, 0) if err != nil { return nil, err } @@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap var offset int64 for offset < stat.Size() { - ggml, n, err := llm.DecodeGGML(file) + ggml, n, err := llm.DecodeGGML(file, 0) if errors.Is(err, io.EOF) { break } else if err != nil { diff --git a/server/model_test.go b/server/model_test.go new file mode 100644 index 000000000..c3023eb2b --- /dev/null +++ b/server/model_test.go @@ -0,0 +1,92 @@ +package server + +import ( + "archive/zip" + "bytes" + "io" + "os" + "path/filepath" + "slices" + "testing" + + "github.com/ollama/ollama/api" +) + +func createZipFile(t *testing.T, name string) *os.File { + t.Helper() + + f, err := os.CreateTemp(t.TempDir(), "") + if err != nil { + t.Fatal(err) + } + + zf := zip.NewWriter(f) + defer zf.Close() + + zh, err := zf.CreateHeader(&zip.FileHeader{Name: name}) + if err != nil { + t.Fatal(err) + } + + if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil { + t.Fatal(err) + } + + return f +} + +func TestExtractFromZipFile(t *testing.T) { + cases := []struct { + name string + expect []string + }{ + { + name: "good", + expect: []string{"good"}, + }, + { + name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"), + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + f := createZipFile(t, tt.name) + defer f.Close() + + tempDir := t.TempDir() + if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil { + t.Fatal(err) + } + + var matches []string + if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error { + if err != nil { + return err + } + + if !fi.IsDir() { + matches = append(matches, p) + } + + return nil + }); err != nil { + t.Fatal(err) + } + + var actual []string + for _, match := range matches { + rel, err := filepath.Rel(tempDir, match) + if err != nil { + t.Error(err) + } + + actual = append(actual, rel) + } + + if !slices.Equal(actual, tt.expect) { + t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches)) + } + }) + } +} diff --git a/server/routes.go b/server/routes.go index 188fe9748..ff66663c0 100644 --- a/server/routes.go +++ b/server/routes.go @@ -646,9 +646,12 @@ func (s *Server) ShowModelHandler(c *gin.Context) { resp, err := GetModelInfo(req) if err != nil { - if os.IsNotExist(err) { + switch { + case os.IsNotExist(err): c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)}) - } 
else { + case err.Error() == "invalid model name": + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + default: c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) } return @@ -658,44 +661,55 @@ func (s *Server) ShowModelHandler(c *gin.Context) { } func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { - model, err := GetModel(req.Model) + m, err := GetModel(req.Model) if err != nil { return nil, err } modelDetails := api.ModelDetails{ - ParentModel: model.ParentModel, - Format: model.Config.ModelFormat, - Family: model.Config.ModelFamily, - Families: model.Config.ModelFamilies, - ParameterSize: model.Config.ModelType, - QuantizationLevel: model.Config.FileType, + ParentModel: m.ParentModel, + Format: m.Config.ModelFormat, + Family: m.Config.ModelFamily, + Families: m.Config.ModelFamilies, + ParameterSize: m.Config.ModelType, + QuantizationLevel: m.Config.FileType, } if req.System != "" { - model.System = req.System + m.System = req.System } if req.Template != "" { - model.Template = req.Template + m.Template = req.Template } msgs := make([]api.Message, 0) - for _, msg := range model.Messages { + for _, msg := range m.Messages { msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content}) } + n := model.ParseName(req.Model) + if !n.IsValid() { + return nil, fmt.Errorf("invalid model name") + } + + manifest, err := ParseNamedManifest(n) + if err != nil { + return nil, err + } + resp := &api.ShowResponse{ - License: strings.Join(model.License, "\n"), - System: model.System, - Template: model.Template, - Details: modelDetails, - Messages: msgs, + License: strings.Join(m.License, "\n"), + System: m.System, + Template: m.Template, + Details: modelDetails, + Messages: msgs, + ModifiedAt: manifest.fi.ModTime(), } var params []string cs := 30 - for k, v := range model.Options { + for k, v := range m.Options { switch val := v.(type) { case []interface{}: for _, nv := range val { @@ -709,20 +723,59 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { for k, v := range req.Options { if _, ok := req.Options[k]; ok { - model.Options[k] = v + m.Options[k] = v } } var sb strings.Builder fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"") fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:") - fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName) - fmt.Fprint(&sb, model.String()) + fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName) + fmt.Fprint(&sb, m.String()) resp.Modelfile = sb.String() + kvData, err := getKVData(m.ModelPath, req.Verbose) + if err != nil { + return nil, err + } + delete(kvData, "general.name") + delete(kvData, "tokenizer.chat_template") + resp.ModelInfo = kvData + + if len(m.ProjectorPaths) > 0 { + projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose) + if err != nil { + return nil, err + } + resp.ProjectorInfo = projectorData + } + return resp, nil } +func getKVData(digest string, verbose bool) (llm.KV, error) { + maxArraySize := 0 + if verbose { + maxArraySize = -1 + } + kvData, err := llm.LoadModel(digest, maxArraySize) + if err != nil { + return nil, err + } + + kv := kvData.KV() + + if !verbose { + for k := range kv { + if t, ok := kv[k].([]any); len(t) > 5 && ok { + kv[k] = []any{} + } + } + } + + return kv, nil +} + func (s *Server) ListModelsHandler(c *gin.Context) { ms, err := Manifests() if err != nil { @@ -1052,11 +1105,20 @@ func Serve(ln net.Listener) error { schedCtx, schedDone := context.WithCancel(ctx) sched := InitScheduler(schedCtx) s := &Server{addr: 
ln.Addr(), sched: sched} - r := s.GenerateRoutes() + + http.Handle("/", s.GenerateRoutes()) slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version)) srvr := &http.Server{ - Handler: r, + // Use http.DefaultServeMux so we get net/http/pprof for + // free. + // + // TODO(bmizerany): Decide if we want to make this + // configurable so it is not exposed by default, or allow + // users to bind it to a different port. This was a quick + // and easy way to get pprof, but it may not be the best + // way. + Handler: nil, } // listen for a ctrl+c and stop any loaded llm diff --git a/server/routes_test.go b/server/routes_test.go index 5e16cfeff..5a5c0fbba 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -19,6 +19,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" @@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) { "top_p 0.9", } assert.Equal(t, expectedParams, params) + assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0") }, }, } @@ -325,3 +327,40 @@ func TestCase(t *testing.T) { }) } } + +func TestShow(t *testing.T) { + t.Setenv("OLLAMA_MODELS", t.TempDir()) + envconfig.LoadConfig() + + var s Server + + createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Name: "show-model", + Modelfile: fmt.Sprintf( + "FROM %s\nFROM %s", + createBinFile(t, llm.KV{"general.architecture": "test"}, nil), + createBinFile(t, llm.KV{"general.architecture": "clip"}, nil), + ), + }) + + w := createRequest(t, s.ShowModelHandler, api.ShowRequest{ + Name: "show-model", + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status code 200, actual %d", w.Code) + } + + var resp api.ShowResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatal(err) + } + + if resp.ModelInfo["general.architecture"] != "test" { + t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"]) + } + + if resp.ProjectorInfo["general.architecture"] != "clip" { + t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"]) + } +} diff --git a/server/sched.go b/server/sched.go index 424395544..0084b533b 100644 --- a/server/sched.go +++ b/server/sched.go @@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Load model for fitting - ggml, err := llm.LoadModel(pending.model.ModelPath) + ggml, err := llm.LoadModel(pending.model.ModelPath, 0) if err != nil { pending.errCh <- err break diff --git a/server/sched_test.go b/server/sched_test.go index 953288347..4a1cf72a0 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, }, []llm.Tensor{ - {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, + {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, }) 
require.NoError(t, err) fname := f.Name() model := &Model{Name: modelName, ModelPath: fname} - scenario.ggml, err = llm.LoadModel(model.ModelPath) + scenario.ggml, err = llm.LoadModel(model.ModelPath, 0) require.NoError(t, err) scenario.req = &LlmRequest{ diff --git a/types/model/name.go b/types/model/name.go index d85fd0c6c..e645a844c 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -4,7 +4,6 @@ package model import ( "cmp" - "encoding/hex" "errors" "fmt" "log/slog" @@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) { } return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true } - -type DigestType byte - -const ( - DigestTypeInvalid DigestType = iota - DigestTypeSHA256 -) - -func (t DigestType) String() string { - switch t { - case DigestTypeSHA256: - return "sha256" - default: - return "invalid" - } -} - -type Digest struct { - Type DigestType - Sum [32]byte -} - -func ParseDigest(s string) (Digest, error) { - i := strings.IndexAny(s, "-:") - if i < 0 { - return Digest{}, fmt.Errorf("invalid digest %q", s) - } - typ, encSum := s[:i], s[i+1:] - if typ != "sha256" { - return Digest{}, fmt.Errorf("unsupported digest type %q", typ) - } - d := Digest{ - Type: DigestTypeSHA256, - } - n, err := hex.Decode(d.Sum[:], []byte(encSum)) - if err != nil { - return Digest{}, err - } - if n != 32 { - return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n) - } - return d, nil -} - -func (d Digest) String() string { - if d.Type == DigestTypeInvalid { - return "" - } - return fmt.Sprintf("sha256-%x", d.Sum) -} - -func (d Digest) IsValid() bool { - return d.Type != DigestTypeInvalid -} diff --git a/types/model/name_test.go b/types/model/name_test.go index 66ce4c339..008dd586c 100644 --- a/types/model/name_test.go +++ b/types/model/name_test.go @@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) { } } -const ( - validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000" - validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000" -) - -func TestParseDigest(t *testing.T) { - cases := []struct { - in string - want string - }{ - {"", ""}, // empty - {"sha123-12", ""}, // invalid type - {"sha256-", ""}, // invalid sum - {"sha256-123", ""}, // invalid odd length sum - - {validSha256, validSha256}, - {validSha256Old, validSha256}, - } - for _, tt := range cases { - t.Run(tt.in, func(t *testing.T) { - got, err := ParseDigest(tt.in) - if err != nil { - if tt.want != "" { - t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want) - } - return - } - if got.String() != tt.want { - t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want) - } - }) - } -} - func TestParseNameFromFilepath(t *testing.T) { cases := map[string]Name{ filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"}, diff --git a/util/bufioutil/buffer_seeker.go b/util/bufioutil/buffer_seeker.go new file mode 100644 index 000000000..8775fdb83 --- /dev/null +++ b/util/bufioutil/buffer_seeker.go @@ -0,0 +1,34 @@ +package bufioutil + +import ( + "bufio" + "io" +) + +type BufferedSeeker struct { + rs io.ReadSeeker + br *bufio.Reader +} + +func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker { + return &BufferedSeeker{ + rs: rs, + br: bufio.NewReaderSize(rs, size), + } +} + +func (b *BufferedSeeker) Read(p []byte) (int, error) { + return b.br.Read(p) +} + +func (b *BufferedSeeker) Seek(offset int64, whence int) 
(int64, error) { + if whence == io.SeekCurrent { + offset -= int64(b.br.Buffered()) + } + n, err := b.rs.Seek(offset, whence) + if err != nil { + return 0, err + } + b.br.Reset(b.rs) + return n, nil +} diff --git a/util/bufioutil/buffer_seeker_test.go b/util/bufioutil/buffer_seeker_test.go new file mode 100644 index 000000000..87145f6b6 --- /dev/null +++ b/util/bufioutil/buffer_seeker_test.go @@ -0,0 +1,64 @@ +package bufioutil + +import ( + "bytes" + "io" + "strings" + "testing" +) + +func TestBufferedSeeker(t *testing.T) { + const alphabet = "abcdefghijklmnopqrstuvwxyz" + + bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16 + + checkRead := func(buf []byte, expected string) { + t.Helper() + _, err := bs.Read(buf) + if err != nil { + t.Fatal(err) + } + if !bytes.Equal(buf, []byte(expected)) { + t.Fatalf("expected %s, got %s", expected, buf) + } + } + + // Read the first 5 bytes + buf := make([]byte, 5) + + checkRead(buf, "abcde") + + // Seek back to the beginning + _, err := bs.Seek(0, io.SeekStart) + if err != nil { + t.Fatal(err) + } + + // read 'a' + checkRead(buf[:1], "a") + + if bs.br.Buffered() == 0 { + t.Fatalf("totally unexpected sanity check failed") + } + + // Seek past 'b' + _, err = bs.Seek(1, io.SeekCurrent) + if err != nil { + t.Fatal(err) + } + checkRead(buf, "cdefg") + + // Seek back to the beginning + _, err = bs.Seek(0, io.SeekStart) + if err != nil { + t.Fatal(err) + } + checkRead(buf, "abcde") + + // Seek to the end + _, err = bs.Seek(-5, io.SeekEnd) + if err != nil { + t.Fatal(err) + } + checkRead(buf, "vwxyz") +}
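The patch changes `DecodeGGML` and `LoadModel` to take a `maxArraySize` argument: 0 keeps the default cap of 1024 entries per array, a negative value collects every element, and any other value is an upper bound above which only the array's size is kept. A minimal caller sketch, assuming only the signatures and behavior shown in this patch; the model path is hypothetical:

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/llm"
)

func main() {
	const modelPath = "/tmp/model.gguf" // hypothetical GGUF file on disk

	// maxArraySize = 0 applies the default cap of 1024, so very large arrays
	// such as tokenizer.ggml.tokens keep their reported size but not their values.
	ggml, err := llm.LoadModel(modelPath, 0)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("architecture:", ggml.KV().Architecture())

	// A negative maxArraySize collects every array value, which is what
	// getKVData requests when ShowRequest.Verbose is set.
	if _, err := llm.LoadModel(modelPath, -1); err != nil {
		log.Fatal(err)
	}
}
```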
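memory.go now sizes the fp16 KV cache from the per-head key and value dimensions (`EmbeddingHeadCountK`/`EmbeddingHeadCountV`, which fall back to `n_embd / n_head` when the GGUF carries no explicit `attention.key_length`/`attention.value_length` keys) instead of using `n_embd / n_head` directly. A small arithmetic sketch with made-up llama-style shapes, showing the old and new expressions agree in that fallback case:

```go
package main

import "fmt"

func main() {
	// Hypothetical shapes, not read from any model file.
	nCtx, nLayer := uint64(2048), uint64(32)
	nEmbd, nHead, nHeadKV := uint64(4096), uint64(32), uint64(32)

	// No attention.key_length/value_length keys: both head sizes fall back
	// to n_embd / n_head.
	headK, headV := nEmbd/nHead, nEmbd/nHead

	oldKV := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV // previous formula
	newKV := 2 * nCtx * nLayer * (headK + headV) * nHeadKV   // formula in this patch
	fmt.Println(oldKV, newKV, oldKV == newKV)                // 1073741824 1073741824 true
}
```

For architectures that do set distinct key/value lengths (deepseek2, for example, which this patch also adds to GraphSize), the two expressions diverge, and the new form tracks the actual per-head cache size.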