Compare commits


32 Commits

Author SHA1 Message Date
Roy Han
c494aea5c8 Strip stop strings 2024-06-20 09:06:08 -07:00
royjhan
fedf71635e Extend api/show and ollama show to return more model info (#4881)
* API Show Extended

* Initial Draft of Information

Co-Authored-By: Patrick Devine <pdevine@sonic.net>

* Clean Up

* Descriptive arg error messages and other fixes

* Second Draft of Show with Projectors Included

* Remove Chat Template

* Touches

* Prevent wrapping from files

* Verbose functionality

* Docs

* Address Feedback

* Lint

* Resolve Conflicts

* Function Name

* Tests for api/show model info

* Show Test File

* Add Projector Test

* Clean routes

* Projector Check

* Move Show Test

* Touches

* Doc update

---------

Co-authored-by: Patrick Devine <pdevine@sonic.net>
2024-06-19 14:19:02 -07:00
Daniel Hiltgen
97c59be653 Merge pull request #5074 from dhiltgen/app_log_rotation
Implement log rotation for tray app
2024-06-19 13:02:24 -07:00
Daniel Hiltgen
9d8a4988e8 Implement log rotation for tray app 2024-06-19 12:53:34 -07:00
Michael Yang
1ae0750a21 Merge pull request #5147 from ollama/mxyng/cleanup
remove confusing log message
2024-06-19 12:50:31 -07:00
Michael Yang
9d91e5e587 remove confusing log message 2024-06-19 11:14:11 -07:00
Daniel Hiltgen
96624aa412 Merge pull request #5072 from dhiltgen/windows_path
Move libraries out of users path
2024-06-19 09:13:39 -07:00
Daniel Hiltgen
10f33b8537 Merge pull request #5146 from dhiltgen/backout
Put back temporary intel GPU env var
2024-06-19 09:12:45 -07:00
Daniel Hiltgen
4a633cc295 Merge pull request #5145 from dhiltgen/bad_loads
Fix bad symbol load detection
2024-06-19 09:12:33 -07:00
Daniel Hiltgen
d34d88e417 Revert "Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)""
This reverts commit 755b4e4fc2.
2024-06-19 08:57:41 -07:00
Daniel Hiltgen
52ce350b7a Fix bad symbol load detection
pointer derefs weren't correct in a few libraries, which explains
some crashes on older systems or with miswired symlinks for discovery libraries.
2024-06-19 08:39:07 -07:00
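The fix above switches the check from the slot pointer to the value the loader stored in it. A minimal Go analogue of that mistake (illustration only, not code from this change set):

```go
package main

import "fmt"

func main() {
	var symbol uintptr // would hold the address returned by the dynamic loader; 0 means "not found"
	slot := &symbol    // analogous to l[i].p in the C symbol table

	// Buggy check: slot is the address of the variable and is never nil,
	// so a failed symbol load is silently ignored and later calls crash.
	if slot == nil {
		fmt.Println("load failed (never reached)")
	}

	// Fixed check: inspect the value the loader actually wrote.
	if *slot == 0 {
		fmt.Println("load failed (correctly detected)")
	}
}
```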
Daniel Hiltgen
2abebb2cbe Merge pull request #5128 from zhewang1-intc/fix_levelzero_empty_symbol_detect
Fix levelzero empty symbol detect
2024-06-19 08:33:16 -07:00
Blake Mizerany
380e06e5be types/model: remove Digest
The Digest type in its current form is awkward to work with and presents
challenges with regard to how it serializes via String using the '-'
prefix.

We currently only use this in ollama.com, so we'll move our specific
needs around digest parsing and validation there.
2024-06-18 20:28:11 -07:00
Wang,Zhe
badf975e45 get real func ptr. 2024-06-19 09:00:51 +08:00
Wang,Zhe
755b4e4fc2 Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)"
This reverts commit 163cd3e77c.
2024-06-19 08:59:58 +08:00
Michael Yang
21adf8b6d2 Merge pull request #5121 from ollama/mxyng/deepseekv2
deepseek v2 graph
2024-06-18 16:30:58 -07:00
Michael Yang
e873841cbb deepseek v2 graph 2024-06-18 15:35:12 -07:00
Daniel Hiltgen
26d0bf9236 Merge pull request #5117 from dhiltgen/fix_prediction
Handle models with divergent layer sizes
2024-06-18 11:36:51 -07:00
Daniel Hiltgen
359b15a597 Handle models with divergent layer sizes
The recent refactoring of the memory prediction assumed all layers
are the same size, but for some models (like deepseek-coder-v2) this
is not the case, so our predictions were significantly off.
2024-06-18 11:05:34 -07:00
Daniel Hiltgen
b55958a587 Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
2024-06-18 09:24:38 -07:00
Daniel Hiltgen
7784ca33ce Tighten up memory prediction logging
Prior to this change, we logged the memory prediction multiple times
as the scheduler iterates to find a suitable configuration, which can be
confusing since only the last log before the server starts is actually valid.
This now logs once just before starting the server on the final configuration.
It also reports which library is being used instead of always saying "offloading to gpu",
even when running on the CPU.
2024-06-18 09:15:35 -07:00
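A small, self-contained sketch of the pattern this commit moves to (names and numbers here are invented, not the real llm package API): accumulate the figures in a struct while the scheduler iterates, then emit one structured log for the final configuration only.

```go
package main

import "log/slog"

type estimate struct {
	library   string
	layers    int
	vramBytes uint64
}

func (e estimate) log() {
	slog.Info("offload to "+e.library,
		slog.Group("layers", "offload", e.layers),
		slog.Group("memory", "vram", e.vramBytes),
	)
}

func main() {
	var final estimate
	for layers := 1; layers <= 33; layers++ { // scheduler trying configurations silently
		final = estimate{library: "cuda", layers: layers, vramBytes: uint64(layers) * (400 << 20)}
	}
	final.log() // logged once, just before the server would start
}
```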
Daniel Hiltgen
c9c8c98bf6 Merge pull request #5105 from dhiltgen/cuda_mmap
Adjust mmap logic for cuda windows for faster model load
2024-06-17 17:07:30 -07:00
Daniel Hiltgen
171796791f Adjust mmap logic for cuda windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most
cases, so default to off.  This also implements a tri-state for
use_mmap so we can detect the difference between a user-provided
value of true/false and unspecified.
2024-06-17 16:54:30 -07:00
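A minimal sketch of how a loader could act on the tri-state, assuming the TriState type added later in this change set; the Windows/CUDA rule mirrors the commit message, but the exact condition used by the server is an assumption, not quoted code.

```go
package main

import (
	"fmt"
	"runtime"

	"github.com/ollama/ollama/api"
)

// extraRunnerArgs returns llama.cpp server flags derived from the tri-state:
// an explicit false always disables mmap, while "undefined" falls back to the
// platform default, which this change turns off for CUDA on Windows.
func extraRunnerArgs(library string, useMMap api.TriState) []string {
	windowsCudaDefault := runtime.GOOS == "windows" && library == "cuda" && useMMap == api.TriStateUndefined
	if useMMap == api.TriStateFalse || windowsCudaDefault {
		return []string{"--no-mmap"}
	}
	return nil
}

func main() {
	fmt.Println(extraRunnerArgs("cuda", api.TriStateUndefined))
	fmt.Println(extraRunnerArgs("cpu", api.TriStateTrue))
}
```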
Jeffrey Morgan
176d0f7075 Update import.md 2024-06-17 19:44:14 -04:00
Daniel Hiltgen
8ed51cac37 Merge pull request #5103 from dhiltgen/faster_win_build
Revert powershell jobs, but keep nvcc and cmake parallelism
2024-06-17 14:23:18 -07:00
Daniel Hiltgen
c9e6f0542d Merge pull request #5069 from dhiltgen/ci_release
Implement custom github release action
2024-06-17 13:59:37 -07:00
Daniel Hiltgen
b0930626c5 Add back lower level parallel flags
nvcc supports parallelism (threads) and cmake + make can use -j,
while msbuild requires /p:CL_MPcount=8
2024-06-17 13:44:46 -07:00
Daniel Hiltgen
e890be4814 Revert "More parallelism on windows generate"
This reverts commit 0577af98f4.
2024-06-17 13:32:46 -07:00
Daniel Hiltgen
b2799f111b Move libraries out of users path
We update the PATH on Windows to get the CLI mapped, but this has
an unintended side effect: other apps that may be using our bundled
DLLs can get terminated when we upgrade.
2024-06-17 13:12:18 -07:00
Jeffrey Morgan
152fc202f5 llm: update llama.cpp commit to 7c26775 (#4896)
* llm: update llama.cpp submodule to `7c26775`

* disable `LLAMA_BLAS` for now

* `-DLLAMA_OPENMP=off`
2024-06-17 15:56:16 -04:00
Lei Jitang
4ad0d4d6d3 Fix a build warning (#5096)
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-17 14:47:48 -04:00
Daniel Hiltgen
a12283e2ff Implement custom github release action
This implements the release logic we want via the gh CLI so we can
update releases with RC tags in place while retaining release notes
and other community reactions.
2024-06-15 11:36:56 -07:00
33 changed files with 629 additions and 324 deletions

View File

@@ -437,6 +437,7 @@ jobs:
       env:
         OLLAMA_SKIP_IMAGE_BUILD: '1'
         PUSH: '1'
+        GH_TOKEN: ${{ github.token }}
       steps:
         - uses: actions/checkout@v4
         - name: Set Version
@@ -460,15 +461,20 @@ jobs:
           ls -lh dist/
           (cd dist; sha256sum * > sha256sum.txt)
           cat dist/sha256sum.txt
-      - uses: ncipollo/release-action@v1
-        with:
-          name: ${{ env.RELEASE_VERSION }}
-          allowUpdates: true
-          artifacts: 'dist/*'
-          draft: true
-          prerelease: true
-          omitBodyDuringUpdate: true
-          generateReleaseNotes: true
-          omitDraftDuringUpdate: true
-          omitPrereleaseDuringUpdate: true
-          replacesArtifacts: true
+      - name: Create or update Release
+        run: |
+          echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
+          OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
+          if [ -n "$OLD_TAG" ]; then
+            echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
+            gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
+          else
+            echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
+            gh release create ${GITHUB_REF_NAME} \
+              --title ${{ env.RELEASE_VERSION }} \
+              --draft \
+              --generate-notes \
+              --prerelease
+          fi
+          echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
+          gh release upload ${GITHUB_REF_NAME} dist/* --clobber

View File

@@ -159,18 +159,49 @@ type Options struct {
 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
 	UseNUMA   bool     `json:"numa,omitempty"`
 	NumCtx    int      `json:"num_ctx,omitempty"`
 	NumBatch  int      `json:"num_batch,omitempty"`
 	NumGPU    int      `json:"num_gpu,omitempty"`
 	MainGPU   int      `json:"main_gpu,omitempty"`
 	LowVRAM   bool     `json:"low_vram,omitempty"`
 	F16KV     bool     `json:"f16_kv,omitempty"`
 	LogitsAll bool     `json:"logits_all,omitempty"`
 	VocabOnly bool     `json:"vocab_only,omitempty"`
-	UseMMap   bool     `json:"use_mmap,omitempty"`
+	UseMMap   TriState `json:"use_mmap,omitempty"`
 	UseMLock  bool     `json:"use_mlock,omitempty"`
 	NumThread int      `json:"num_thread,omitempty"`
 }
+
+type TriState int
+
+const (
+	TriStateUndefined TriState = -1
+	TriStateFalse     TriState = 0
+	TriStateTrue      TriState = 1
+)
+
+func (b *TriState) UnmarshalJSON(data []byte) error {
+	var v bool
+	if err := json.Unmarshal(data, &v); err != nil {
+		return err
+	}
+	if v {
+		*b = TriStateTrue
+	}
+	*b = TriStateFalse
+	return nil
+}
+
+func (b *TriState) MarshalJSON() ([]byte, error) {
+	if *b == TriStateUndefined {
+		return nil, nil
+	}
+	var v bool
+	if *b == TriStateTrue {
+		v = true
+	}
+	return json.Marshal(v)
+}
 
 // EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -222,6 +253,7 @@ type ShowRequest struct {
 	Model    string `json:"model"`
 	System   string `json:"system"`
 	Template string `json:"template"`
+	Verbose  bool   `json:"verbose"`
 
 	Options map[string]interface{} `json:"options"`
@@ -231,14 +263,16 @@ type ShowRequest struct {
 // ShowResponse is the response returned from [Client.Show].
 type ShowResponse struct {
 	License       string         `json:"license,omitempty"`
 	Modelfile     string         `json:"modelfile,omitempty"`
 	Parameters    string         `json:"parameters,omitempty"`
 	Template      string         `json:"template,omitempty"`
 	System        string         `json:"system,omitempty"`
 	Details       ModelDetails   `json:"details,omitempty"`
 	Messages      []Message      `json:"messages,omitempty"`
-	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
+	ModelInfo     map[string]any `json:"model_info,omitempty"`
+	ProjectorInfo map[string]any `json:"projector_info,omitempty"`
+	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
 }
 
 // CopyRequest is the request passed to [Client.Copy].
@@ -403,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 			continue
 		}
 
+		if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+			val, ok := val.(bool)
+			if !ok {
+				return fmt.Errorf("option %q must be of type boolean", key)
+			}
+			if val {
+				field.SetInt(int64(TriStateTrue))
+			} else {
+				field.SetInt(int64(TriStateFalse))
+			}
+			continue
+		}
+
 		switch field.Kind() {
 		case reflect.Int:
 			switch t := val.(type) {
@@ -491,7 +538,7 @@ func DefaultOptions() Options {
 			LowVRAM:  false,
 			F16KV:    true,
 			UseMLock: false,
-			UseMMap:  true,
+			UseMMap:  TriStateUndefined,
 			UseNUMA:  false,
 		},
 	}

View File

@@ -105,3 +105,39 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
 		})
 	}
 }
+
+func TestUseMmapParsingFromJSON(t *testing.T) {
+	tests := []struct {
+		name string
+		req  string
+		exp  TriState
+	}{
+		{
+			name: "Undefined",
+			req:  `{ }`,
+			exp:  TriStateUndefined,
+		},
+		{
+			name: "True",
+			req:  `{ "use_mmap": true }`,
+			exp:  TriStateTrue,
+		},
+		{
+			name: "False",
+			req:  `{ "use_mmap": false }`,
+			exp:  TriStateFalse,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var oMap map[string]interface{}
+			err := json.Unmarshal([]byte(test.req), &oMap)
+			require.NoError(t, err)
+			opts := DefaultOptions()
+			err = opts.FromMap(oMap)
+			require.NoError(t, err)
+			assert.Equal(t, test.exp, opts.UseMMap)
+		})
+	}
+}

View File

@@ -5,6 +5,8 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/envconfig"
 )
@@ -24,6 +26,7 @@ func InitLogging() {
 		logFile = os.Stderr
 		// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
 	} else {
+		rotateLogs(AppLogFile)
 		logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 		if err != nil {
 			slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
 	slog.Info("ollama app started")
 }
+
+func rotateLogs(logFile string) {
+	if _, err := os.Stat(logFile); os.IsNotExist(err) {
+		return
+	}
+	index := strings.LastIndex(logFile, ".")
+	pre := logFile[:index]
+	post := "." + logFile[index+1:]
+	for i := LogRotationCount; i > 0; i-- {
+		older := pre + "-" + strconv.Itoa(i) + post
+		newer := pre + "-" + strconv.Itoa(i-1) + post
+		if i == 1 {
+			newer = pre + post
+		}
+		if _, err := os.Stat(newer); err == nil {
+			if _, err := os.Stat(older); err == nil {
+				err := os.Remove(older)
+				if err != nil {
+					slog.Warn("Failed to remove older log", "older", older, "error", err)
+					continue
+				}
+			}
+			err := os.Rename(newer, older)
+			if err != nil {
+				slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
+			}
+		}
+	}
+}

View File

@@ -0,0 +1,44 @@
+package lifecycle
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestRotateLogs(t *testing.T) {
+	logDir := t.TempDir()
+	logFile := filepath.Join(logDir, "testlog.log")
+
+	// No log exists
+	rotateLogs(logFile)
+
+	require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
+	assert.FileExists(t, logFile)
+	// First rotation
+	rotateLogs(logFile)
+	assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+	assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+	assert.NoFileExists(t, logFile)
+
+	// Should be a no-op without a new log
+	rotateLogs(logFile)
+	assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+	assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+	assert.NoFileExists(t, logFile)
+
+	for i := 2; i <= LogRotationCount+1; i++ {
+		require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
+		assert.FileExists(t, logFile)
+		rotateLogs(logFile)
+		assert.NoFileExists(t, logFile)
+		for j := 1; j < i; j++ {
+			assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
+		}
+		assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
+	}
+}

View File

@@ -16,11 +16,12 @@ var (
 	AppDir         = "/opt/Ollama"
 	AppDataDir     = "/opt/Ollama"
 	// TODO - should there be a distinct log dir?
 	UpdateStageDir = "/tmp"
 	AppLogFile     = "/tmp/ollama_app.log"
 	ServerLogFile  = "/tmp/ollama.log"
 	UpgradeLogFile = "/tmp/ollama_update.log"
 	Installer      = "OllamaSetup.exe"
+	LogRotationCount = 5
 )
 
 func init() {

View File

@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
 		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
 	}
 
-	// TODO - rotation
+	rotateLogs(ServerLogFile)
 	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create server log: %w", err)

View File

@@ -88,10 +88,15 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
+#if DirExists("..\dist\windows-amd64\cuda")
+  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
 #if DirExists("..\dist\windows-amd64\rocm")
   Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
 #endif

View File

@@ -579,10 +579,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	if len(args) != 1 {
-		return errors.New("missing model name")
-	}
-
 	license, errLicense := cmd.Flags().GetBool("license")
 	modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
 	parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +621,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	if flagsSet > 1 {
 		return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
-	} else if flagsSet == 0 {
-		return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
+	}
+
+	if flagsSet == 1 {
+		req := api.ShowRequest{Name: args[0]}
+		resp, err := client.Show(cmd.Context(), &req)
+		if err != nil {
+			return err
+		}
+
+		switch showType {
+		case "license":
+			fmt.Println(resp.License)
+		case "modelfile":
+			fmt.Println(resp.Modelfile)
+		case "parameters":
+			fmt.Println(formatParams(resp.Parameters, false))
+		case "system":
+			fmt.Println(resp.System)
+		case "template":
+			fmt.Println(resp.Template)
+		}
+
+		return nil
 	}
 
 	req := api.ShowRequest{Name: args[0]}
@@ -635,22 +652,120 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	switch showType {
-	case "license":
-		fmt.Println(resp.License)
-	case "modelfile":
-		fmt.Println(resp.Modelfile)
-	case "parameters":
-		fmt.Println(resp.Parameters)
-	case "system":
-		fmt.Println(resp.System)
-	case "template":
-		fmt.Println(resp.Template)
-	}
+	arch := resp.ModelInfo["general.architecture"].(string)
+
+	modelData := [][]string{
+		{"arch", arch},
+		{"parameters", resp.Details.ParameterSize},
+		{"quantization", resp.Details.QuantizationLevel},
+		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
+	}
+
+	mainTableData := [][]string{
+		{"Model"},
+		{renderSubTable(modelData, false, true)},
+	}
+
+	if resp.ProjectorInfo != nil {
+		projectorData := [][]string{
+			{"arch", "clip"},
+			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
+			{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
+			{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+			{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+		}
+
+		mainTableData = append(mainTableData,
+			[]string{"Projector"},
+			[]string{renderSubTable(projectorData, false, true)},
+		)
+	}
+
+	if resp.Parameters != "" {
+		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters, true)})
+	}
+
+	if resp.System != "" {
+		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true, true)})
+	}
+
+	if resp.License != "" {
+		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true, true)})
+	}
+
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetAutoWrapText(false)
+	table.SetBorder(false)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range mainTableData {
+		table.Append(v)
+	}
+
+	table.Render()
 
 	return nil
 }
+
+func renderSubTable(data [][]string, file bool, tab bool) string {
+	var buf bytes.Buffer
+	table := tablewriter.NewWriter(&buf)
+	table.SetAutoWrapText(!file)
+	table.SetBorder(false)
+	table.SetNoWhiteSpace(true)
+	table.SetTablePadding("\t")
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range data {
+		table.Append(v)
+	}
+
+	table.Render()
+
+	if !tab {
+		return buf.String()
+	}
+
+	renderedTable := buf.String()
+	lines := strings.Split(renderedTable, "\n")
+	for i, line := range lines {
+		lines[i] = "\t" + line
+	}
+
+	return strings.Join(lines, "\n")
+}
+
+func twoLines(s string) [][]string {
+	lines := strings.Split(s, "\n")
+	res := [][]string{}
+
+	count := 0
+	for _, line := range lines {
+		line = strings.TrimSpace(line)
+		if line != "" {
+			count++
+			res = append(res, []string{line})
+			if count == 2 {
+				return res
+			}
+		}
+	}
+	return res
+}
+
+func formatParams(s string, tab bool) string {
+	lines := strings.Split(s, "\n")
+	table := [][]string{}
+
+	for _, line := range lines {
+		fields := strings.Fields(line)
+		fields[1] = strings.TrimPrefix(strings.TrimSuffix(fields[1], `"`), `"`)
+		table = append(table, fields)
+	}
+	return renderSubTable(table, false, tab)
+}
 
 func CopyHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {

View File

@@ -777,11 +777,12 @@ A single JSON object will be returned.
 POST /api/show
 ```
 
-Show information about a model including details, modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, system prompt.
 
 ### Parameters
 
 - `name`: name of the model to show
+- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
 
 ### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
 ```json
 {
   "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
-  "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
-  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
+  "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
+  "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
   "details": {
+    "parent_model": "",
     "format": "gguf",
     "family": "llama",
-    "families": ["llama", "clip"],
-    "parameter_size": "7B",
+    "families": [
+      "llama"
+    ],
+    "parameter_size": "8.0B",
     "quantization_level": "Q4_0"
-  }
+  },
+  "model_info": {
+    "general.architecture": "llama",
+    "general.file_type": 2,
+    "general.parameter_count": 8030261248,
+    "general.quantization_version": 2,
+    "llama.attention.head_count": 32,
+    "llama.attention.head_count_kv": 8,
+    "llama.attention.layer_norm_rms_epsilon": 0.00001,
+    "llama.block_count": 32,
+    "llama.context_length": 8192,
+    "llama.embedding_length": 4096,
+    "llama.feed_forward_length": 14336,
+    "llama.rope.dimension_count": 128,
+    "llama.rope.freq_base": 500000,
+    "llama.vocab_size": 128256,
+    "tokenizer.ggml.bos_token_id": 128000,
+    "tokenizer.ggml.eos_token_id": 128009,
+    "tokenizer.ggml.merges": [],            // populates if `verbose=true`
+    "tokenizer.ggml.model": "gpt2",
+    "tokenizer.ggml.pre": "llama-bpe",
+    "tokenizer.ggml.token_type": [],        // populates if `verbose=true`
+    "tokenizer.ggml.tokens": []             // populates if `verbose=true`
+  }
 }
 ```
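A minimal sketch of requesting the new verbose fields through the Go client, assuming the ShowRequest/ShowResponse shapes introduced above; the model name "llama3" is just a placeholder for any local model.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Verbose asks the server to fill the large tokenizer fields in model_info.
	resp, err := client.Show(context.Background(), &api.ShowRequest{Model: "llama3", Verbose: true})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.ModelInfo["general.architecture"])
}
```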

View File

@@ -47,19 +47,13 @@ success
 ### Supported Quantizations
 
-<details>
-<summary>Legacy Quantization</summary>
-
 - `Q4_0`
 - `Q4_1`
 - `Q5_0`
 - `Q5_1`
 - `Q8_0`
 
-</details>
+#### K-means Quantizations
 
-<details>
-<summary>K-means Quantization</summary>`
-
 - `Q3_K_S`
 - `Q3_K_M`
@@ -70,11 +64,6 @@ success
 - `Q5_K_M`
 - `Q6_K`
 
-</details>
-
-> [!NOTE]
-> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
-
 ## Template Detection
 
 > [!NOTE]

View File

@@ -22,7 +22,7 @@ docker logs <container-name>
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.
 
 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs
+- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
 - `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories

View File

@@ -39,8 +39,8 @@ server.
 Ollama on Windows stores files in a few different locations. You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
-    - *app.log* contains logs from the GUI application
-    - *server.log* contains the server logs
+    - *app.log* contains most resent logs from the GUI application
+    - *server.log* contains the most recent server logs
     - *upgrade.log* contains log output for upgrades
 - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` contains models and configuration

View File

@@ -231,7 +231,7 @@ func GetGPUInfo() GpuInfoList {
 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
 		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
-			depPath = filepath.Dir(envconfig.RunnersDir)
+			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
 		}
 
 		// Load ALL libraries
@@ -282,6 +282,12 @@ func GetGPUInfo() GpuInfoList {
 		// Intel
 		if envconfig.IntelGpu {
 			oHandles = initOneAPIHandles()
+			// On windows we bundle the oneapi library one level above the runner dir
+			depPath = ""
+			if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
+			}
+
 			for d := range oHandles.oneapi.num_drivers {
 				if oHandles.oneapi == nil {
 					// shouldn't happen
@@ -306,7 +312,7 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				// TODO dependency path?
+				gpuInfo.DependencyPath = depPath
 				oneapiGPUs = append(oneapiGPUs, gpuInfo)
 			}
 		}
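Sketch only: one way the per-GPU DependencyPath set above could be consumed when the runner subprocess is launched, so the bundled cuda\ and oneapi\ DLLs resolve without living on the user's global PATH. The helper name and the exact environment handling are assumptions, not code from this change set.

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// withDependencyPath prepends depPath to the PATH entry of an environment list.
func withDependencyPath(env []string, depPath string) []string {
	if depPath == "" {
		return env
	}
	out := make([]string, 0, len(env))
	for _, kv := range env {
		if strings.HasPrefix(strings.ToUpper(kv), "PATH=") {
			kv = "PATH=" + depPath + string(os.PathListSeparator) + kv[len("PATH="):]
		}
		out = append(out, kv)
	}
	return out
}

func main() {
	env := []string{"PATH=C:\\Windows\\system32"}
	fmt.Println(withDependencyPath(env, `C:\Program Files\Ollama\oneapi`))
}
```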

View File

@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
   for (i = 0; l[i].s != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);
       UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   for (i = 0; l[i].s != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!*l[i].p) {
+    if (!*(l[i].p)) {
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);
       UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
     // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
       resp->ch.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);

View File

@@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
     LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
     *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
       resp->oh.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -98,7 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
   }
 
   for (d = 0; d < resp->oh.num_drivers; d++) {
-    LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]);
+    LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
     ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
                                    &resp->oh.num_devices[d], NULL);
     if (ret != ZE_RESULT_SUCCESS) {

View File

@@ -56,7 +56,6 @@ struct server_params {
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
         return true;
     }
 
-    void validate_model_chat_template(server_params & sparams) {
-        llama_chat_message chat[] = {{"user", "test"}};
-        std::vector<char> buf(1);
-        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
-        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-            sparams.chat_template = "chatml";
-        }
-    }
-
     void initialize() {
         // create slots
         all_slots_are_idle = true;
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            sparams.chat_template = argv[i];
         }
         else if (arg == "--override-kv")
         {
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
     }
 
     const auto model_meta = llama.model_meta();
 
-    if (sparams.chat_template.empty()) { // custom chat template is not supplied
-        // check if the template comes with the model is supported by us
-        llama.validate_model_chat_template(sparams);
-    }
-
     // Middleware for API key validation
     auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
         // If API key is not set, skip validation

View File

@@ -18,7 +18,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
 
 case "${GOARCH}" in
     "amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
         # Static build for linking into the Go binary
         init_vars
         CMAKE_TARGETS="--target llama --target ggml"
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}_static"
         echo "Building static library"
         build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            init_vars
-           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="../build/darwin/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
            # Approximately 400% faster than LCD on same CPU
            #
            init_vars
-           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
            echo "Building AVX CPU"
            build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
            # Approximately 10% faster than AVX on same CPU
            #
            init_vars
-           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+           CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
            BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
            echo "Building AVX2 CPU"
            EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
        # Static build for linking into the Go binary
        init_vars
        CMAKE_TARGETS="--target llama --target ggml"
-       CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+       CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
        BUILD_DIR="../build/darwin/${ARCH}_static"
        echo "Building static library"
        build

View File

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
         # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
 
-        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"

View File

@@ -1,5 +1,7 @@
 #!powershell
 
+$ErrorActionPreference = "Stop"
+
 function amdGPUs {
     if ($env:AMDGPU_TARGETS) {
         return $env:AMDGPU_TARGETS
@@ -37,7 +39,8 @@ function init_vars {
     }
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
-        "-DLLAMA_NATIVE=off"
+        "-DLLAMA_NATIVE=off",
+        "-DLLAMA_OPENMP=off"
         )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -83,9 +86,9 @@ function init_vars {
 function git_module_setup {
     # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
     & git submodule init
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     & git submodule update --force "${script:llamacppDir}"
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }
 
 function apply_patches {
@@ -119,7 +122,7 @@ function build {
     write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
     & cmake --version
     & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ($cmakeDefs -contains "-G") {
         $extra=@("-j8")
     } else {
@@ -127,7 +130,7 @@ function build {
     }
     write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
-    if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     # Rearrange output to be consistent between different generators
     if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
         mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
@@ -141,7 +144,7 @@ function sign {
         foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
             & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                 /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
-            if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         }
     }
 }
@@ -206,7 +209,8 @@ function build_static() {
             "-DLLAMA_AVX2=off",
             "-DLLAMA_AVX512=off",
            "-DLLAMA_F16C=off",
-            "-DLLAMA_FMA=off")
+            "-DLLAMA_FMA=off",
+            "-DLLAMA_OPENMP=off")
         $script:buildDir="../build/windows/${script:ARCH}_static"
         write-host "Building static library"
         build
@@ -216,13 +220,7 @@ function build_static() {
     }
 }
 
-function build_cpu() {
-    if ($script:ARCH -eq "arm64") {
-        $gen_arch = "ARM64"
-    } else { # amd64
-        $gen_arch = "x64"
-    }
-
+function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC
         init_vars
@@ -285,7 +283,7 @@ function build_cuda() {
            "-DLLAMA_AVX=on",
            "-DLLAMA_AVX2=off",
            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-            "-DCMAKE_CUDA_FLAGS=-t8"
+            "-DCMAKE_CUDA_FLAGS=-t8",
            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
         )
         if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
@@ -297,10 +295,12 @@ function build_cuda() {
         sign
         install
 
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
     } else {
         write-host "Skipping CUDA generation step"
     }
@@ -334,16 +334,18 @@ function build_oneapi() {
     sign
     install
 
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
     } else {
         Write-Host "Skipping oneAPI generation step"
     }
@@ -408,29 +410,16 @@ init_vars
 if ($($args.count) -eq 0) {
     git_module_setup
     apply_patches
-
-    $tasks = @("build_static", "build_cpu")
-    $jobs = @()
-    if ($script:ARCH -ne "arm64") {
-        $tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm")
-    }
-    foreach ($t in $tasks) {
-        $jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t)
-    }
-    get-job
-    foreach ($job in $jobs) {
-        write-host "----" $job.Name output follows
-        receive-job -wait -job $job
-        write-host "----" $job.Name $job.State
-        write-host ""
-        if ($job.State -contains 'Failed') {
-            cleanup
-            write-host "Terminating remaining jobs (this takes a while, you can ^C)"
-            # TODO find some way to kill the spawned cmake processes faster
-            remove-job -force -job $jobs
-            exit(-1)
-        }
-    }
-    get-job
+    build_static
+    if ($script:ARCH -eq "arm64") {
+        build_cpu("ARM64")
+    } else { # amd64
+        build_cpu("x64")
+        build_cpu_avx
+        build_cpu_avx2
+        build_cuda
+        build_oneapi
+        build_rocm
+    }
 
     cleanup

View File

@@ -367,6 +367,17 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(vocab+2*embedding),
 			fullOffload,
 		)
+	case "deepseek2":
+		keys := uint64(llm.KV()["deepseek2.attention.key_length"].(uint32))
+		fullOffload = max(
+			4*batch*(3*embedding+vocab),
+			4*batch*(3*embedding+2+context*(1+headsKV)+2*keys*headsKV),
+		)
+
+		partialOffload = max(
+			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(2*embedding+1+2*keys*headsKV+context+context*headsKV)+4*keys*context*headsKV+embedding*keys*headsKV*9/16,
+		)
 	}
 
 	return

View File

@@ -1,6 +1,7 @@
package llm package llm
import ( import (
"fmt"
"log/slog" "log/slog"
"strconv" "strconv"
"strings" "strings"
@@ -49,6 +50,18 @@ type MemoryEstimate struct {
// For multi-GPU scenarios, this is the size in bytes per GPU // For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64 GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
} }
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -167,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// For all the layers, find where they can fit on the GPU(s) // For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) { for i := range int(ggml.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
}
memoryWeights += layerSize memoryWeights += layerSize
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
@@ -252,78 +270,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
allocationsList = append(allocationsList, format.HumanBytes2(a)) allocationsList = append(allocationsList, format.HumanBytes2(a))
} }
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
}
if gpus[0].Library == "cpu" {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) log() {
slog.Info(
-"offload to gpu",
+"offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
-"requested", opts.NumGPU,
+"requested", m.layersRequested,
// The number of layers the model has (including output)
-"model", int(ggml.KV().BlockCount())+1,
+"model", m.layersModel,
// estimated number of layers that can be offloaded
-"offload", layerCount,
+"offload", m.Layers,
-// multi-gpu split for tesnors
+// multi-gpu split for tensors
-"split", tensorSplit,
+"split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available by GPU for offloading
-"available", availableList,
+"available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
-"full", format.HumanBytes2(memoryRequiredTotal),
+"full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
-"partial", format.HumanBytes2(memoryRequiredPartial),
+"partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
-"kv", format.HumanBytes2(kv),
+"kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
-"allocations", allocationsList,
+"allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
-"total", format.HumanBytes2(memoryWeights),
+"total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
-"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
-"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
-"full", format.HumanBytes2(graphFullOffload),
+"full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
-"partial", format.HumanBytes2(graphPartialOffload),
+"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
if gpus[0].Library == "cpu" {
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
return MemoryEstimate{
Layers: layerCount,
Graph: graphOffload,
VRAMSize: memoryRequiredPartial,
TotalSize: memoryRequiredTotal,
TensorSplit: tensorSplit,
GPUSizes: gpuAllocations,
}
}
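Net effect of the refactor above: EstimateGPULayers now returns a MemoryEstimate that carries everything the old inline slog.Info call needed, and the caller decides when to log it (NewLlamaServer calls estimate.log(), as a later hunk shows). A stripped-down sketch of that pattern, with field names reduced for brevity and values invented for illustration:

package main

import "log/slog"

// memoryEstimate is a cut-down stand-in for llm.MemoryEstimate: exported
// fields are the real result, unexported fields exist only so log() can
// report how the estimate was reached.
type memoryEstimate struct {
	Layers    int
	TotalSize uint64
	VRAMSize  uint64

	inferenceLibrary string
	layersRequested  int
}

func (m memoryEstimate) log() {
	slog.Info("offload to "+m.inferenceLibrary,
		slog.Group("layers",
			"requested", m.layersRequested,
			"offload", m.Layers,
		),
		slog.Group("memory",
			"full", m.TotalSize,
			"partial", m.VRAMSize,
		),
	)
}

func main() {
	// Values are illustrative; the real ones come from EstimateGPULayers.
	e := memoryEstimate{Layers: 28, TotalSize: 6 << 30, VRAMSize: 5 << 30, inferenceLibrary: "cuda", layersRequested: -1}
	e.log()
}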

View File

@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
--- a/common/common.cpp
+++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
--- a/common/common.h
+++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
+@@ -180,6 +180,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
}; +
// server params
-void gpt_params_handle_model_default(gpt_params & params);
+int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds

View File

@@ -1,8 +1,8 @@
diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
--- a/llama.cpp
+++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-- } else if (
-+ if (
-tokenizer_pre == "default") {
+- } else if (tokenizer_pre == "default") {
++ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
-tokenizer_pre == "smaug-bpe") {
-vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+tokenizer_pre == "llama3" ||
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+tokenizer_pre == "poro-chat") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

View File

@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
-pattern := filepath.Join(payloadsDir, "*")
+pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
-servers[filepath.Base(file)] = file
+servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
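With runners now unpacked into per-runner subdirectories, the map is keyed by the directory name rather than the file name. A quick sketch of what the two filepath calls above produce for a made-up payload path (shown Unix-style; filepath uses the OS-specific separator):

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Hypothetical payload layout: <payloadsDir>/<runner>/ollama_llama_server
	file := filepath.Join("payloads", "cuda_v11", "ollama_llama_server")

	dir := filepath.Dir(file)  // payloads/cuda_v11
	name := filepath.Base(dir) // cuda_v11
	fmt.Println(name, "->", dir)
}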

View File

@@ -116,6 +116,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
estimate.log()
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -200,7 +202,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-opts.UseMMap = false
+opts.UseMMap = api.TriStateFalse
}
}
@@ -208,7 +210,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
-if !opts.UseMMap {
+// Windows CUDA should not use mmap for best performance
+if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
@@ -271,8 +274,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
-// prepend the server directory to LD_LIBRARY_PATH/PATH
+// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
-libraryPaths := []string{dir}
+libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
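The mmap change above replaces the old boolean with a tri-state so "unset" can be told apart from an explicit "false". A sketch of the resulting decision, using a local triState type as a stand-in for the api package's values (the constant names and ordering here are assumptions):

package main

import (
	"fmt"
	"runtime"
)

// triState is a stand-in for the api tri-state option used above.
type triState int

const (
	triStateUndefined triState = iota
	triStateTrue
	triStateFalse
)

// disableMMap reflects the condition in the diff: Windows + CUDA always
// passes --no-mmap, and so does an explicit user opt-out.
func disableMMap(library string, useMMap triState) bool {
	return (runtime.GOOS == "windows" && library == "cuda") || useMMap == triStateFalse
}

func main() {
	fmt.Println(disableMMap("cuda", triStateUndefined))
	fmt.Println(disableMMap("metal", triStateFalse))
}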

View File

@@ -103,19 +103,19 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
-md "${script:DEPS_DIR}" -ea 0 > $null
+md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
-cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
+cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
-cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
+cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
-cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
+cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
-foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file

View File

@@ -734,9 +734,44 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
kvData, err := getKVData(m.ModelPath, req.Verbose)
if err != nil {
return nil, err
}
delete(kvData, "general.name")
delete(kvData, "tokenizer.chat_template")
resp.ModelInfo = kvData
if len(m.ProjectorPaths) > 0 {
projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
if err != nil {
return nil, err
}
resp.ProjectorInfo = projectorData
}
return resp, nil
}
func getKVData(digest string, verbose bool) (llm.KV, error) {
kvData, err := llm.LoadModel(digest)
if err != nil {
return nil, err
}
kv := kvData.KV()
if !verbose {
for k := range kv {
if t, ok := kv[k].([]any); len(t) > 5 && ok {
kv[k] = []any{}
}
}
}
return kv, nil
}
func (s *Server) ListModelsHandler(c *gin.Context) {
ms, err := Manifests()
if err != nil {
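The new getKVData above is what lets api/show return model metadata without flooding the response: unless verbose output is requested, any array-valued key with more than five elements is emptied. A small sketch of that filtering rule on a plain map (a stand-in for llm.KV):

package main

import "fmt"

// truncateKV applies the same rule as getKVData: keep scalar values, but
// blank out long arrays unless verbose output was requested.
func truncateKV(kv map[string]any, verbose bool) map[string]any {
	if verbose {
		return kv
	}
	for k := range kv {
		if t, ok := kv[k].([]any); ok && len(t) > 5 {
			kv[k] = []any{}
		}
	}
	return kv
}

func main() {
	kv := map[string]any{
		"general.architecture":  "llama",
		"tokenizer.ggml.tokens": []any{"a", "b", "c", "d", "e", "f"},
	}
	fmt.Println(truncateKV(kv, false)) // the token list is emptied, scalars stay
}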

View File

@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser" "github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model" "github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version" "github.com/ollama/ollama/version"
@@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) {
"top_p 0.9", "top_p 0.9",
} }
assert.Equal(t, expectedParams, params) assert.Equal(t, expectedParams, params)
assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
},
},
}
@@ -325,3 +327,40 @@ func TestCase(t *testing.T) {
})
}
}
func TestShow(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
var s Server
createRequest(t, s.CreateModelHandler, api.CreateRequest{
Name: "show-model",
Modelfile: fmt.Sprintf(
"FROM %s\nFROM %s",
createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
),
})
w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
Name: "show-model",
})
if w.Code != http.StatusOK {
t.Fatalf("expected status code 200, actual %d", w.Code)
}
var resp api.ShowResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.ModelInfo["general.architecture"] != "test" {
t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"])
}
if resp.ProjectorInfo["general.architecture"] != "clip" {
t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
}
}

View File

@@ -4,7 +4,6 @@ package model
import (
"cmp"
"encoding/hex"
"errors"
"fmt"
"log/slog"
@@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
}
return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
}
type DigestType byte
const (
DigestTypeInvalid DigestType = iota
DigestTypeSHA256
)
func (t DigestType) String() string {
switch t {
case DigestTypeSHA256:
return "sha256"
default:
return "invalid"
}
}
type Digest struct {
Type DigestType
Sum [32]byte
}
func ParseDigest(s string) (Digest, error) {
i := strings.IndexAny(s, "-:")
if i < 0 {
return Digest{}, fmt.Errorf("invalid digest %q", s)
}
typ, encSum := s[:i], s[i+1:]
if typ != "sha256" {
return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
}
d := Digest{
Type: DigestTypeSHA256,
}
n, err := hex.Decode(d.Sum[:], []byte(encSum))
if err != nil {
return Digest{}, err
}
if n != 32 {
return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n)
}
return d, nil
}
func (d Digest) String() string {
if d.Type == DigestTypeInvalid {
return ""
}
return fmt.Sprintf("sha256-%x", d.Sum)
}
func (d Digest) IsValid() bool {
return d.Type != DigestTypeInvalid
}

View File

@@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) {
}
}
const (
validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
)
func TestParseDigest(t *testing.T) {
cases := []struct {
in string
want string
}{
{"", ""}, // empty
{"sha123-12", ""}, // invalid type
{"sha256-", ""}, // invalid sum
{"sha256-123", ""}, // invalid odd length sum
{validSha256, validSha256},
{validSha256Old, validSha256},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
got, err := ParseDigest(tt.in)
if err != nil {
if tt.want != "" {
t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want)
}
return
}
if got.String() != tt.want {
t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want)
}
})
}
}
func TestParseNameFromFilepath(t *testing.T) {
cases := map[string]Name{
filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},