From 26ab67732b4e9e4a559f876ccf45e7f5a15fa3b6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 6 Jun 2024 10:43:55 -0700 Subject: [PATCH 01/43] Bump ROCm linux to 6.1.1 --- .github/workflows/test.yaml | 2 +- Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dbb6c2fdf..29adf56f3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -124,7 +124,7 @@ jobs: strategy: matrix: rocm-version: - - '6.0.2' + - '6.1.1' runs-on: linux container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} steps: diff --git a/Dockerfile b/Dockerfile index 72edef2a9..98a3ddfd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 -ARG ROCM_VERSION=6.0.2 +ARG ROCM_VERSION=6.1.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code From da3bf2335483fac7cbfff72a1b80e40988f4f35a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 31 May 2024 16:15:21 -0700 Subject: [PATCH 02/43] Workaround gfx900 SDMA bugs Implement support for GPU env var workarounds, and leverage this for the Vega RX 56 which needs HSA_ENABLE_SDMA=0 set to work properly --- gpu/amd_linux.go | 5 +++++ gpu/types.go | 3 +++ llm/server.go | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 61e6a0598..7637c7768 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -332,6 +332,11 @@ func AMDGetGPUInfo() []RocmGPUInfo { slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) } + // Check for env var workarounds + if name == "1002:687f" { // Vega RX 56 + gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"}) + } + // The GPU has passed all the verification steps and is supported resp = append(resp, gpuInfo) } diff --git a/gpu/types.go b/gpu/types.go index 47355959c..9920db5ff 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -26,6 +26,9 @@ type GpuInfo struct { // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly DependencyPath string `json:"lib_path,omitempty"` + // Extra environment variables specific to the GPU as list of [key,value] + EnvWorkarounds [][2]string `json:"envs,omitempty"` + // GPU information ID string `json:"gpu_id"` // string to use for selection of this specific GPU Name string `json:"name"` // user friendly name if available diff --git a/llm/server.go b/llm/server.go index 6313fc327..117565ba5 100644 --- a/llm/server.go +++ b/llm/server.go @@ -320,6 +320,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr s.cmd.Stdout = os.Stdout s.cmd.Stderr = s.status + envWorkarounds := [][2]string{} + for _, gpu := range gpus { + envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...) 
+ } visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv() pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) @@ -334,6 +338,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) { s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal devicesNeeded = false + } else if len(envWorkarounds) != 0 { + for _, kv := range envWorkarounds { + if strings.EqualFold(cmp[0], kv[0]) { + s.cmd.Env[i] = kv[0] + "=" + kv[1] + } + } } } if pathNeeded { From 6be309e1bd8a7fde936853c8981779df5b918e01 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 8 May 2024 11:11:50 -0700 Subject: [PATCH 03/43] Centralize GPU configuration vars This should aid in troubleshooting by capturing and reporting the GPU settings at startup in the logs along with all the other server settings. --- envconfig/config.go | 27 ++++++++++++++++++++++++++- gpu/amd_linux.go | 9 +++++---- gpu/amd_windows.go | 3 ++- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index 4d2150b72..bcf2e18ae 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -57,6 +57,17 @@ var ( SchedSpread bool // Set via OLLAMA_TMPDIR in the environment TmpDir string + + // Set via CUDA_VISIBLE_DEVICES in the environment + CudaVisibleDevices string + // Set via HIP_VISIBLE_DEVICES in the environment + HipVisibleDevices string + // Set via ROCR_VISIBLE_DEVICES in the environment + RocrVisibleDevices string + // Set via GPU_DEVICE_ORDINAL in the environment + GpuDeviceOrdinal string + // Set via HSA_OVERRIDE_GFX_VERSION in the environment + HsaOverrideGfxVersion string ) type EnvVar struct { @@ -66,7 +77,7 @@ type EnvVar struct { } func AsMap() map[string]EnvVar { - return map[string]EnvVar{ + ret := map[string]EnvVar{ "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. 
OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, @@ -84,6 +95,14 @@ func AsMap() map[string]EnvVar { "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, } + if runtime.GOOS != "darwin" { + ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"} + ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"} + ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} + ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} + ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} + } + return ret } func Values() map[string]string { @@ -256,6 +275,12 @@ func LoadConfig() { if err != nil { slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port) } + + CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES") + HipVisibleDevices = clean("HIP_VISIBLE_DEVICES") + RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES") + GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL") + HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION") } func getModelsDir() (string, error) { diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 61e6a0598..b88832787 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -13,6 +13,7 @@ import ( "strconv" "strings" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -59,9 +60,9 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Determine if the user has already pre-selected which GPUs to look at, then ignore the others var visibleDevices []string - hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only - rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID - gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index + hipVD := envconfig.HipVisibleDevices // zero based index only + rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID + gpuDO := envconfig.GpuDeviceOrdinal // zero based index switch { // TODO is this priorty order right? 
case hipVD != "": @@ -74,7 +75,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { visibleDevices = strings.Split(gpuDO, ",") } - gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") + gfxOverride := envconfig.HsaOverrideGfxVersion var supported []string libDir := "" diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index cad45f6c5..21585277a 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -10,6 +10,7 @@ import ( "strconv" "strings" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -53,7 +54,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { } var supported []string - gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") + gfxOverride := envconfig.HsaOverrideGfxVersion if gfxOverride == "" { supported, err = GetSupportedGFX(libDir) if err != nil { From 225f0d121948f4d9ca8700d66c3514f233feaffd Mon Sep 17 00:00:00 2001 From: Lei Jitang Date: Sat, 15 Jun 2024 14:26:23 +0800 Subject: [PATCH 04/43] gpu: Fix build warning Signed-off-by: Lei Jitang --- gpu/gpu_info_oneapi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c index e90c694a1..c1d326623 100644 --- a/gpu/gpu_info_oneapi.c +++ b/gpu/gpu_info_oneapi.c @@ -160,7 +160,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, return; } - snprintf(&resp->gpu_name[0], GPU_NAME_LEN, props.modelName); + snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName); // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax // (this is probably wrong...) From 0577af98f4129fc6bf5cc47d6b4d82d394ee68a6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 13 Jun 2024 17:13:01 -0700 Subject: [PATCH 05/43] More parallelism on windows generate Make the build faster --- llm/generate/gen_windows.ps1 | 72 +++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index c9d860977..7b56aed37 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -1,7 +1,5 @@ #!powershell -$ErrorActionPreference = "Stop" - function amdGPUs { if ($env:AMDGPU_TARGETS) { return $env:AMDGPU_TARGETS @@ -85,9 +83,9 @@ function init_vars { function git_module_setup { # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo & git submodule init - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} & git submodule update --force "${script:llamacppDir}" - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} } function apply_patches { @@ -121,10 +119,15 @@ function build { write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs" & cmake --version & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })" - & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} + if ($cmakeDefs -contains "-G") { + $extra=@("-j8") + } else { + $extra= @("--", "/p:CL_MPcount=8") + } + write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" + 
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra + if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)} # Rearrange output to be consistent between different generators if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) { mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/" @@ -138,7 +141,7 @@ function sign { foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){ & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} } } } @@ -213,7 +216,13 @@ function build_static() { } } -function build_cpu($gen_arch) { +function build_cpu() { + if ($script:ARCH -eq "arm64") { + $gen_arch = "ARM64" + } else { # amd64 + $gen_arch = "x64" + } + if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) { # remaining llama.cpp builds use MSVC init_vars @@ -270,7 +279,15 @@ function build_cuda() { init_vars $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") + $script:cmakeDefs += @( + "-A", "x64", + "-DLLAMA_CUDA=ON", + "-DLLAMA_AVX=on", + "-DLLAMA_AVX2=off", + "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", + "-DCMAKE_CUDA_FLAGS=-t8" + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}") @@ -391,16 +408,29 @@ init_vars if ($($args.count) -eq 0) { git_module_setup apply_patches - build_static - if ($script:ARCH -eq "arm64") { - build_cpu("ARM64") - } else { # amd64 - build_cpu("x64") - build_cpu_avx - build_cpu_avx2 - build_cuda - build_oneapi - build_rocm + + $tasks = @("build_static", "build_cpu") + $jobs = @() + if ($script:ARCH -ne "arm64") { + $tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm") + } + foreach ($t in $tasks) { + $jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t) + } + get-job + foreach ($job in $jobs) { + write-host "----" $job.Name output follows + receive-job -wait -job $job + write-host "----" $job.Name $job.State + write-host "" + if ($job.State -contains 'Failed') { + cleanup + write-host "Terminating remaining jobs (this takes a while, you can ^C)" + # TODO find some way to kill the spawned cmake processes faster + remove-job -force -job $jobs + exit(-1) + } + get-job } cleanup From a12283e2ff381b18da3b10480f01b57a81766168 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 15 Jun 2024 08:26:54 -0700 Subject: [PATCH 06/43] Implement custom github release action This implements the release logic we want via gh cli to support updating releases with rc tags in place and retain release notes and other community reactions. 
--- .github/workflows/release.yaml | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 40f9c41f0..61ca3c433 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -437,6 +437,7 @@ jobs: env: OLLAMA_SKIP_IMAGE_BUILD: '1' PUSH: '1' + GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v4 - name: Set Version @@ -460,15 +461,20 @@ jobs: ls -lh dist/ (cd dist; sha256sum * > sha256sum.txt) cat dist/sha256sum.txt - - uses: ncipollo/release-action@v1 - with: - name: ${{ env.RELEASE_VERSION }} - allowUpdates: true - artifacts: 'dist/*' - draft: true - prerelease: true - omitBodyDuringUpdate: true - generateReleaseNotes: true - omitDraftDuringUpdate: true - omitPrereleaseDuringUpdate: true - replacesArtifacts: true + - name: Create or update Release + run: | + echo "Looking for existing release for ${{ env.RELEASE_VERSION }}" + OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName") + if [ -n "$OLD_TAG" ]; then + echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}" + gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME} + else + echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}" + gh release create ${GITHUB_REF_NAME} \ + --title ${{ env.RELEASE_VERSION }} \ + --draft \ + --generate-notes \ + --prerelease + fi + echo "Uploading artifacts for tag ${GITHUB_REF_NAME}" + gh release upload ${GITHUB_REF_NAME} dist/* --clobber From c7b77004e35ac86bec8163f63052a5f44122b7d1 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 15 Jun 2024 23:08:09 -0400 Subject: [PATCH 07/43] docs: add missing powershell package to windows development instructions (#5075) * docs: add missing instruction for powershell build The powershell script for building Ollama on Windows now requires the `ThreadJob` module. Add this to the instructions and dependency list. * Update development.md --- docs/development.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/development.md b/docs/development.md index 8c035a518..2a6886a43 100644 --- a/docs/development.md +++ b/docs/development.md @@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build ### Windows -Note: The windows build for Ollama is still under development. +Note: The Windows build for Ollama is still under development. -Install required tools: +First, install required tools: - MSVC toolchain - C/C++ and cmake as minimal requirements - Go version 1.22 or higher - MinGW (pick one variant) with GCC. 
- [MinGW-w64](https://www.mingw-w64.org/) - [MSYS2](https://www.msys2.org/) +- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser` + +Then, build the `ollama` binary: ```powershell $env:CGO_ENABLED="1" From 89c79bec8cf7a7b04f761fcc5306d2edf47a4164 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Sat, 15 Jun 2024 20:53:56 -0700 Subject: [PATCH 08/43] Add ModifiedAt Field to /api/show (#5033) * Add Mod Time to Show * Error Handling --- api/types.go | 1 + server/routes.go | 56 ++++++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/api/types.go b/api/types.go index d99cf3bcc..d62079aee 100644 --- a/api/types.go +++ b/api/types.go @@ -238,6 +238,7 @@ type ShowResponse struct { System string `json:"system,omitempty"` Details ModelDetails `json:"details,omitempty"` Messages []Message `json:"messages,omitempty"` + ModifiedAt time.Time `json:"modified_at,omitempty"` } // CopyRequest is the request passed to [Client.Copy]. diff --git a/server/routes.go b/server/routes.go index 188fe9748..f36fe1b08 100644 --- a/server/routes.go +++ b/server/routes.go @@ -646,9 +646,12 @@ func (s *Server) ShowModelHandler(c *gin.Context) { resp, err := GetModelInfo(req) if err != nil { - if os.IsNotExist(err) { + switch { + case os.IsNotExist(err): c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)}) - } else { + case err.Error() == "invalid model name": + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + default: c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) } return @@ -658,44 +661,55 @@ func (s *Server) ShowModelHandler(c *gin.Context) { } func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { - model, err := GetModel(req.Model) + m, err := GetModel(req.Model) if err != nil { return nil, err } modelDetails := api.ModelDetails{ - ParentModel: model.ParentModel, - Format: model.Config.ModelFormat, - Family: model.Config.ModelFamily, - Families: model.Config.ModelFamilies, - ParameterSize: model.Config.ModelType, - QuantizationLevel: model.Config.FileType, + ParentModel: m.ParentModel, + Format: m.Config.ModelFormat, + Family: m.Config.ModelFamily, + Families: m.Config.ModelFamilies, + ParameterSize: m.Config.ModelType, + QuantizationLevel: m.Config.FileType, } if req.System != "" { - model.System = req.System + m.System = req.System } if req.Template != "" { - model.Template = req.Template + m.Template = req.Template } msgs := make([]api.Message, 0) - for _, msg := range model.Messages { + for _, msg := range m.Messages { msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content}) } + n := model.ParseName(req.Model) + if !n.IsValid() { + return nil, fmt.Errorf("invalid model name") + } + + manifest, err := ParseNamedManifest(n) + if err != nil { + return nil, err + } + resp := &api.ShowResponse{ - License: strings.Join(model.License, "\n"), - System: model.System, - Template: model.Template, - Details: modelDetails, - Messages: msgs, + License: strings.Join(m.License, "\n"), + System: m.System, + Template: m.Template, + Details: modelDetails, + Messages: msgs, + ModifiedAt: manifest.fi.ModTime(), } var params []string cs := 30 - for k, v := range model.Options { + for k, v := range m.Options { switch val := v.(type) { case []interface{}: for _, nv := range val { @@ -709,15 +723,15 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { for k, v := range req.Options { if _, ok 
:= req.Options[k]; ok { - model.Options[k] = v + m.Options[k] = v } } var sb strings.Builder fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"") fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:") - fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName) - fmt.Fprint(&sb, model.String()) + fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName) + fmt.Fprint(&sb, m.String()) resp.Modelfile = sb.String() return resp, nil From fd1e6e0590894b795a5b041736a43bc1498adc22 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sun, 16 Jun 2024 07:42:52 -0700 Subject: [PATCH 09/43] Add some more debugging logs for intel discovery Also removes an unused overall count variable --- gpu/gpu_info_oneapi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c index c1d326623..80ed11648 100644 --- a/gpu/gpu_info_oneapi.c +++ b/gpu/gpu_info_oneapi.c @@ -13,7 +13,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { resp->oh.num_drivers = 0; const int buflen = 256; char buf[buflen + 1]; - int i, d, count; + int i, d; struct lookup { char *s; void **p; @@ -62,6 +62,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { } } + LOG(resp->oh.verbose, "calling zesInit\n"); + ret = (*resp->oh.zesInit)(0); if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesInit err: %x\n", ret); @@ -71,7 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { return; } - count = 0; + LOG(resp->oh.verbose, "calling zesDriverGet\n"); ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL); if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret); @@ -96,6 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { } for (d = 0; d < resp->oh.num_drivers; d++) { + LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]); ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL); if (ret != ZE_RESULT_SUCCESS) { @@ -116,7 +119,6 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { oneapi_release(resp->oh); return; } - count += resp->oh.num_devices[d]; } return; From 163cd3e77c42aafd003b9cb884b3a51cdbaea106 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 16 Jun 2024 20:09:05 -0400 Subject: [PATCH 10/43] gpu: add env var for detecting Intel oneapi gpus (#5076) * gpu: add env var for detecting intel oneapi gpus * fix build error --- envconfig/config.go | 7 ++++++ gpu/gpu.go | 54 +++++++++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index bcf2e18ae..e86f72e6a 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -57,6 +57,8 @@ var ( SchedSpread bool // Set via OLLAMA_TMPDIR in the environment TmpDir string + // Set via OLLAMA_INTEL_GPU in the environment + IntelGpu bool // Set via CUDA_VISIBLE_DEVICES in the environment CudaVisibleDevices string @@ -101,6 +103,7 @@ func AsMap() map[string]EnvVar { ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} + ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"} } 
return ret } @@ -276,6 +279,10 @@ func LoadConfig() { slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port) } + if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { + IntelGpu = set + } + CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES") HipVisibleDevices = clean("HIP_VISIBLE_DEVICES") RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES") diff --git a/gpu/gpu.go b/gpu/gpu.go index 56a4dbfac..ce0a10496 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -280,33 +280,35 @@ func GetGPUInfo() GpuInfoList { } // Intel - oHandles = initOneAPIHandles() - for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue - } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: "oneapi", - }, - driverIndex: d, - gpuIndex: int(i), + if envconfig.IntelGpu { + oHandles = initOneAPIHandles() + for d := range oHandles.oneapi.num_drivers { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: int(d), + gpuIndex: int(i), + } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + // TODO dependency path? + oneapiGPUs = append(oneapiGPUs, gpuInfo) } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. - memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - // TODO dependency path? 
- oneapiGPUs = append(oneapiGPUs, gpuInfo) } } From 4ad0d4d6d3bd9b3507644d872fb27113fd3e487b Mon Sep 17 00:00:00 2001 From: Lei Jitang Date: Tue, 18 Jun 2024 02:47:48 +0800 Subject: [PATCH 11/43] Fix a build warning (#5096) Signed-off-by: Lei Jitang --- gpu/gpu_info_oneapi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c index 80ed11648..004377456 100644 --- a/gpu/gpu_info_oneapi.c +++ b/gpu/gpu_info_oneapi.c @@ -98,7 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { } for (d = 0; d < resp->oh.num_drivers; d++) { - LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]); + LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]); ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL); if (ret != ZE_RESULT_SUCCESS) { From 152fc202f5cb18b17a6f713526a0c0d62f6cec94 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 17 Jun 2024 15:56:16 -0400 Subject: [PATCH 12/43] llm: update llama.cpp commit to `7c26775` (#4896) * llm: update llama.cpp submodule to `7c26775` * disable `LLAMA_BLAS` for now * `-DLLAMA_OPENMP=off` --- llm/generate/gen_darwin.sh | 12 ++++++------ llm/generate/gen_linux.sh | 6 +++--- llm/generate/gen_windows.ps1 | 6 ++++-- llm/llama.cpp | 2 +- llm/patches/01-load-progress.diff | 18 +++++++++--------- llm/patches/05-default-pretokenizer.diff | 16 ++++++++-------- 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 0baf86ffc..721a9ae80 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -18,7 +18,7 @@ sign() { fi } -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on" +COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off" case "${GOARCH}" in "amd64") @@ -27,7 +27,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build @@ -37,7 +37,7 @@ case "${GOARCH}" in # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu" echo "Building LCD CPU" build @@ -49,7 +49,7 @@ case "${GOARCH}" in # Approximately 400% faster than LCD on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off 
-DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" echo "Building AVX CPU" build @@ -61,7 +61,7 @@ case "${GOARCH}" in # Approximately 10% faster than AVX on same CPU # init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" @@ -75,7 +75,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index a9df6ff86..f91008753 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" +COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}_static" echo "Building static library" build @@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off" + COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 7b56aed37..0eb48ffac 100644 --- a/llm/generate/gen_windows.ps1 +++ 
b/llm/generate/gen_windows.ps1 @@ -37,7 +37,8 @@ function init_vars { } $script:cmakeDefs = @( "-DBUILD_SHARED_LIBS=on", - "-DLLAMA_NATIVE=off" + "-DLLAMA_NATIVE=off", + "-DLLAMA_OPENMP=off" ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() @@ -206,7 +207,8 @@ function build_static() { "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_F16C=off", - "-DLLAMA_FMA=off") + "-DLLAMA_FMA=off", + "-DLLAMA_OPENMP=off") $script:buildDir="../build/windows/${script:ARCH}_static" write-host "Building static library" build diff --git a/llm/llama.cpp b/llm/llama.cpp index 5921b8f08..7c26775ad 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 5921b8f089d3b7bda86aac5a66825df6a6c10603 +Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c diff --git a/llm/patches/01-load-progress.diff b/llm/patches/01-load-progress.diff index acd44d207..be5286091 100644 --- a/llm/patches/01-load-progress.diff +++ b/llm/patches/01-load-progress.diff @@ -1,8 +1,8 @@ diff --git a/common/common.cpp b/common/common.cpp -index ba1ecf0e..cead57cc 100644 +index 73ff0e85..6adb1a92 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & +@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; @@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644 mparams.kv_overrides = NULL; } else { diff --git a/common/common.h b/common/common.h -index d80344f2..71e84834 100644 +index 58ed72f4..0bb2605e 100644 --- a/common/common.h +++ b/common/common.h -@@ -174,6 +174,13 @@ struct gpt_params { - // multimodal models (see examples/llava) +@@ -180,6 +180,13 @@ struct gpt_params { std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) -+ + + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. 
+ llama_progress_callback progress_callback = NULL; + // context pointer passed to the progress callback + void * progress_callback_user_data; - }; - - void gpt_params_handle_model_default(gpt_params & params); ++ + // server params + int32_t port = 8080; // server listens on this network port + int32_t timeout_read = 600; // http read timeout in seconds diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 27c8aabc2..2a2e7306e 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/llama.cpp b/llama.cpp -index 40d2ec2c..74f3ee9c 100644 +index 61948751..4b72a293 100644 --- a/llama.cpp +++ b/llama.cpp -@@ -4642,16 +4642,7 @@ static void llm_load_vocab( +@@ -4824,16 +4824,7 @@ static void llm_load_vocab( // for now, only BPE models have pre-tokenizers if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { @@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644 - LLAMA_LOG_WARN("%s: ************************************ \n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -- } else if ( -+ if ( - tokenizer_pre == "default") { +- } else if (tokenizer_pre == "default") { ++ if (tokenizer_pre == "default") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( -@@ -4703,7 +4694,8 @@ static void llm_load_vocab( - tokenizer_pre == "smaug-bpe") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG; + tokenizer_pre == "llama3" || +@@ -4888,7 +4879,8 @@ static void llm_load_vocab( + tokenizer_pre == "poro-chat") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); From b2799f111b6b662690ec3f705f4ab95a7f5d7df4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 15 Jun 2024 13:17:20 -0700 Subject: [PATCH 13/43] Move libraries out of users path We update the PATH on windows to get the CLI mapped, but this has an unintended side effect of causing other apps that may use our bundled DLLs to get terminated when we upgrade. 
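
As a rough sketch of the idea behind this change (illustrative only — not the actual Ollama code; the helper names `dependencyDir` and `prependLibraryPath` below are invented for the example): each backend's bundled DLLs move into their own directory next to the runners directory, and that directory is prepended to the runner subprocess's `PATH`/`LD_LIBRARY_PATH` rather than being placed on the user's global PATH.

```go
package main

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
)

// dependencyDir returns the per-backend library directory that lives next to
// the runners directory, e.g. <install>/ollama_runners -> <install>/cuda.
func dependencyDir(runnersDir, backend string) string {
	return filepath.Join(filepath.Dir(runnersDir), backend)
}

// prependLibraryPath builds the PATH (Windows) or LD_LIBRARY_PATH (Linux)
// value for the runner subprocess so the bundled DLLs are found without
// touching the user's global environment.
func prependLibraryPath(existing string, dirs ...string) (key, value string) {
	key = "LD_LIBRARY_PATH"
	if runtime.GOOS == "windows" {
		key = "PATH"
	}
	parts := append(dirs, existing)
	return key, strings.Join(parts, string(filepath.ListSeparator))
}

func main() {
	deps := dependencyDir("/opt/ollama/ollama_runners", "cuda")
	k, v := prependLibraryPath("/usr/lib", "/opt/ollama/ollama_runners/cuda_v11", deps)
	fmt.Printf("%s=%s\n", k, v)
}
```

Scoping the library path to the subprocess is what avoids the upgrade problem described above: other applications never resolve our DLLs from a shared location, so replacing them no longer affects those processes.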
--- app/ollama.iss | 7 ++++++- gpu/gpu.go | 10 ++++++++-- llm/generate/gen_windows.ps1 | 32 ++++++++++++++++++-------------- llm/payload.go | 4 ++-- llm/server.go | 4 ++-- scripts/build_windows.ps1 | 10 +++++----- 6 files changed, 41 insertions(+), 26 deletions(-) diff --git a/app/ollama.iss b/app/ollama.iss index 9dc61abbf..e6502abd3 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -88,10 +88,15 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion +#if DirExists("..\dist\windows-amd64\cuda") + Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs +#endif +#if DirExists("..\dist\windows-amd64\oneapi") + Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs +#endif #if DirExists("..\dist\windows-amd64\rocm") Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs #endif diff --git a/gpu/gpu.go b/gpu/gpu.go index ce0a10496..583bb79c6 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -231,7 +231,7 @@ func GetGPUInfo() GpuInfoList { // On windows we bundle the nvidia library one level above the runner dir depPath := "" if runtime.GOOS == "windows" && envconfig.RunnersDir != "" { - depPath = filepath.Dir(envconfig.RunnersDir) + depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda") } // Load ALL libraries @@ -282,6 +282,12 @@ func GetGPUInfo() GpuInfoList { // Intel if envconfig.IntelGpu { oHandles = initOneAPIHandles() + // On windows we bundle the oneapi library one level above the runner dir + depPath = "" + if runtime.GOOS == "windows" && envconfig.RunnersDir != "" { + depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi") + } + for d := range oHandles.oneapi.num_drivers { if oHandles.oneapi == nil { // shouldn't happen @@ -306,7 +312,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - // TODO dependency path? 
+ gpuInfo.DependencyPath = depPath oneapiGPUs = append(oneapiGPUs, gpuInfo) } } diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 0eb48ffac..a3c53e63c 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -299,10 +299,12 @@ function build_cuda() { sign install - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\" + rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" } else { write-host "Skipping CUDA generation step" } @@ -336,16 +338,18 @@ function build_oneapi() { sign install - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}" + rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" } else { Write-Host "Skipping 
oneAPI generation step" } diff --git a/llm/payload.go b/llm/payload.go index 20dcee7b5..9296db336 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -58,7 +58,7 @@ func availableServers() map[string]string { } // glob payloadsDir for files that start with ollama_ - pattern := filepath.Join(payloadsDir, "*") + pattern := filepath.Join(payloadsDir, "*", "ollama_*") files, err := filepath.Glob(pattern) if err != nil { @@ -69,7 +69,7 @@ func availableServers() map[string]string { servers := make(map[string]string) for _, file := range files { slog.Debug("availableServers : found", "file", file) - servers[filepath.Base(file)] = file + servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) } return servers diff --git a/llm/server.go b/llm/server.go index 117565ba5..a94633ee6 100644 --- a/llm/server.go +++ b/llm/server.go @@ -271,8 +271,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH - libraryPaths := []string{dir} + // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies + libraryPaths := []string{dir, filepath.Dir(dir)} if libraryPath, ok := os.LookupEnv(pathEnv); ok { // Append our runner directory to the path diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 60de03073..b3991ce1f 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -103,19 +103,19 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}" -ea 0 > $null + md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file From e890be48149a739cd311f8ec60ca8c60a2abb4e4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 17 Jun 2024 13:32:46 -0700 Subject: [PATCH 14/43] Revert "More parallelism on windows generate" This reverts commit 0577af98f4129fc6bf5cc47d6b4d82d394ee68a6. 
--- llm/generate/gen_windows.ps1 | 72 +++++++++++------------------------- 1 file changed, 21 insertions(+), 51 deletions(-) diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 0eb48ffac..ed286c322 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -1,5 +1,7 @@ #!powershell +$ErrorActionPreference = "Stop" + function amdGPUs { if ($env:AMDGPU_TARGETS) { return $env:AMDGPU_TARGETS @@ -84,9 +86,9 @@ function init_vars { function git_module_setup { # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo & git submodule init - if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & git submodule update --force "${script:llamacppDir}" - if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } function apply_patches { @@ -120,15 +122,10 @@ function build { write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs" & cmake --version & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs - if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} - if ($cmakeDefs -contains "-G") { - $extra=@("-j8") - } else { - $extra= @("--", "/p:CL_MPcount=8") - } - write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" - & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra - if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })" + & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} # Rearrange output to be consistent between different generators if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) { mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/" @@ -142,7 +139,7 @@ function sign { foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){ & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file - if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)} + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } } @@ -218,13 +215,7 @@ function build_static() { } } -function build_cpu() { - if ($script:ARCH -eq "arm64") { - $gen_arch = "ARM64" - } else { # amd64 - $gen_arch = "x64" - } - +function build_cpu($gen_arch) { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) { # remaining llama.cpp builds use MSVC init_vars @@ -281,15 +272,7 @@ function build_cuda() { init_vars $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @( - "-A", "x64", - "-DLLAMA_CUDA=ON", - "-DLLAMA_AVX=on", - "-DLLAMA_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", - "-DCMAKE_CUDA_FLAGS=-t8" - 
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" - ) + $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}") @@ -410,29 +393,16 @@ init_vars if ($($args.count) -eq 0) { git_module_setup apply_patches - - $tasks = @("build_static", "build_cpu") - $jobs = @() - if ($script:ARCH -ne "arm64") { - $tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm") - } - foreach ($t in $tasks) { - $jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t) - } - get-job - foreach ($job in $jobs) { - write-host "----" $job.Name output follows - receive-job -wait -job $job - write-host "----" $job.Name $job.State - write-host "" - if ($job.State -contains 'Failed') { - cleanup - write-host "Terminating remaining jobs (this takes a while, you can ^C)" - # TODO find some way to kill the spawned cmake processes faster - remove-job -force -job $jobs - exit(-1) - } - get-job + build_static + if ($script:ARCH -eq "arm64") { + build_cpu("ARM64") + } else { # amd64 + build_cpu("x64") + build_cpu_avx + build_cpu_avx2 + build_cuda + build_oneapi + build_rocm } cleanup From b0930626c5b5e64c819f985313b805be488a1f5d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 17 Jun 2024 13:44:46 -0700 Subject: [PATCH 15/43] Add back lower level parallel flags nvcc supports parallelism (threads) and cmake + make can use -j, while msbuild requires /p:CL_MPcount=8 --- llm/generate/gen_linux.sh | 2 +- llm/generate/gen_windows.ps1 | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index f91008753..28ce1f21d 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" fi CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index ed286c322..87d1207fd 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -123,8 +123,13 @@ function build { & cmake --version & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })" - & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) + if ($cmakeDefs -contains "-G") { + $extra=@("-j8") + } else { + $extra= @("--", "/p:CL_MPcount=8") + } + write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | 
ForEach-Object { `"--target`", $_ }) $extra" + & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} # Rearrange output to be consistent between different generators if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) { @@ -272,7 +277,15 @@ function build_cuda() { init_vars $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") + $script:cmakeDefs += @( + "-A", "x64", + "-DLLAMA_CUDA=ON", + "-DLLAMA_AVX=on", + "-DLLAMA_AVX2=off", + "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", + "-DCMAKE_CUDA_FLAGS=-t8", + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}") From 176d0f7075b7de1097b2f88121a9a3c9434c2d39 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 17 Jun 2024 19:44:14 -0400 Subject: [PATCH 16/43] Update import.md --- docs/import.md | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/docs/import.md b/docs/import.md index 7abe39b2a..f34f09ace 100644 --- a/docs/import.md +++ b/docs/import.md @@ -47,19 +47,13 @@ success ### Supported Quantizations -
-<summary>Legacy Quantization</summary>
-
 - `Q4_0`
 - `Q4_1`
 - `Q5_0`
 - `Q5_1`
 - `Q8_0`
 
-</details>
-
-<details>
-<summary>K-means Quantization`</summary>
+#### K-means Quantizations
 
 - `Q3_K_S`
 - `Q3_K_M`
 - `Q3_K_L`
 - `Q4_K_S`
 - `Q4_K_M`
 - `Q5_K_S`
 - `Q5_K_M`
 - `Q6_K`
 
-</details>
- -> [!NOTE] -> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf). - ## Template Detection > [!NOTE] From 171796791f2bb09fc731fdfca4b6fa8926fd6f74 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 17 Jun 2024 12:14:42 -0700 Subject: [PATCH 17/43] Adjust mmap logic for cuda windows for faster model load On Windows, recent llama.cpp changes make mmap slower in most cases, so default to off. This also implements a tri-state for use_mmap so we can detect the difference between a user provided value of true/false, or unspecified. --- api/types.go | 70 ++++++++++++++++++++++++++++++++++++++--------- api/types_test.go | 36 ++++++++++++++++++++++++ llm/server.go | 5 ++-- 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/api/types.go b/api/types.go index d62079aee..7822a6034 100644 --- a/api/types.go +++ b/api/types.go @@ -159,18 +159,49 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` - NumCtx int `json:"num_ctx,omitempty"` - NumBatch int `json:"num_batch,omitempty"` - NumGPU int `json:"num_gpu,omitempty"` - MainGPU int `json:"main_gpu,omitempty"` - LowVRAM bool `json:"low_vram,omitempty"` - F16KV bool `json:"f16_kv,omitempty"` - LogitsAll bool `json:"logits_all,omitempty"` - VocabOnly bool `json:"vocab_only,omitempty"` - UseMMap bool `json:"use_mmap,omitempty"` - UseMLock bool `json:"use_mlock,omitempty"` - NumThread int `json:"num_thread,omitempty"` + UseNUMA bool `json:"numa,omitempty"` + NumCtx int `json:"num_ctx,omitempty"` + NumBatch int `json:"num_batch,omitempty"` + NumGPU int `json:"num_gpu,omitempty"` + MainGPU int `json:"main_gpu,omitempty"` + LowVRAM bool `json:"low_vram,omitempty"` + F16KV bool `json:"f16_kv,omitempty"` + LogitsAll bool `json:"logits_all,omitempty"` + VocabOnly bool `json:"vocab_only,omitempty"` + UseMMap TriState `json:"use_mmap,omitempty"` + UseMLock bool `json:"use_mlock,omitempty"` + NumThread int `json:"num_thread,omitempty"` +} + +type TriState int + +const ( + TriStateUndefined TriState = -1 + TriStateFalse TriState = 0 + TriStateTrue TriState = 1 +) + +func (b *TriState) UnmarshalJSON(data []byte) error { + var v bool + if err := json.Unmarshal(data, &v); err != nil { + return err + } + if v { + *b = TriStateTrue + } + *b = TriStateFalse + return nil +} + +func (b *TriState) MarshalJSON() ([]byte, error) { + if *b == TriStateUndefined { + return nil, nil + } + var v bool + if *b == TriStateTrue { + v = true + } + return json.Marshal(v) } // EmbeddingRequest is the request passed to [Client.Embeddings]. 
@@ -403,6 +434,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error { continue } + if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) { + val, ok := val.(bool) + if !ok { + return fmt.Errorf("option %q must be of type boolean", key) + } + if val { + field.SetInt(int64(TriStateTrue)) + } else { + field.SetInt(int64(TriStateFalse)) + } + continue + } + switch field.Kind() { case reflect.Int: switch t := val.(type) { @@ -491,7 +535,7 @@ func DefaultOptions() Options { LowVRAM: false, F16KV: true, UseMLock: false, - UseMMap: true, + UseMMap: TriStateUndefined, UseNUMA: false, }, } diff --git a/api/types_test.go b/api/types_test.go index 211385c70..7b4a0f83c 100644 --- a/api/types_test.go +++ b/api/types_test.go @@ -105,3 +105,39 @@ func TestDurationMarshalUnmarshal(t *testing.T) { }) } } + +func TestUseMmapParsingFromJSON(t *testing.T) { + tests := []struct { + name string + req string + exp TriState + }{ + { + name: "Undefined", + req: `{ }`, + exp: TriStateUndefined, + }, + { + name: "True", + req: `{ "use_mmap": true }`, + exp: TriStateTrue, + }, + { + name: "False", + req: `{ "use_mmap": false }`, + exp: TriStateFalse, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var oMap map[string]interface{} + err := json.Unmarshal([]byte(test.req), &oMap) + require.NoError(t, err) + opts := DefaultOptions() + err = opts.FromMap(oMap) + require.NoError(t, err) + assert.Equal(t, test.exp, opts.UseMMap) + }) + } +} diff --git a/llm/server.go b/llm/server.go index 117565ba5..dd9862924 100644 --- a/llm/server.go +++ b/llm/server.go @@ -200,7 +200,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if g.Library == "metal" && uint64(opts.NumGPU) > 0 && uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { - opts.UseMMap = false + opts.UseMMap = api.TriStateFalse } } @@ -208,7 +208,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--flash-attn") } - if !opts.UseMMap { + // Windows CUDA should not use mmap for best performance + if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse { params = append(params, "--no-mmap") } From 7784ca33ce4cc78629160719f4a84981437071b7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 17 Jun 2024 18:39:48 -0700 Subject: [PATCH 18/43] Tighten up memory prediction logging Prior to this change, we logged the memory prediction multiple times as the scheduler iterates to find a suitable configuration, which can be confusing since only the last log before the server starts is actually valid. This now logs once just before starting the server on the final configuration. It also reports what library instead of always saying "offloading to gpu" when using CPU. 
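For readers following the change, the pattern is: capture whatever the estimator knows while the scheduler iterates, and emit a single structured record only for the configuration that is actually used. The sketch below shows that idea in isolation; the `estimate` type and its fields are simplified stand-ins, not the real `MemoryEstimate` fields changed in llm/memory.go.

```go
// Minimal sketch of deferred, single-shot logging; names are illustrative.
package main

import "log/slog"

type estimate struct {
	library string // e.g. "cuda", "rocm", or "cpu"
	layers  int    // layers selected for offload
	vram    uint64 // bytes required on the GPU(s)
}

// log emits one structured record describing the chosen configuration.
func (e estimate) log() {
	slog.Info(
		"offload to "+e.library,
		slog.Group("layers", "offload", e.layers),
		slog.Group("memory", "required", e.vram),
	)
}

func main() {
	var final estimate
	// The scheduler may try several candidate configurations...
	for _, layers := range []int{33, 28, 20} {
		final = estimate{library: "cuda", layers: layers, vram: uint64(layers) << 28}
		// ...but nothing is logged inside the loop.
	}
	// One log line, for the configuration actually used.
	final.log()
}
```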
--- llm/memory.go | 108 ++++++++++++++++++++++++++++++-------------------- llm/server.go | 2 + 2 files changed, 66 insertions(+), 44 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index 5afb1c2e9..3bf18c3f2 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -49,6 +49,18 @@ type MemoryEstimate struct { // For multi-GPU scenarios, this is the size in bytes per GPU GPUSizes []uint64 + + // internal fields for logging purposes + inferenceLibrary string + layersRequested int + layersModel int + availableList []string + kv uint64 + allocationsList []string + memoryWeights uint64 + memoryLayerOutput uint64 + graphFullOffload uint64 + graphPartialOffload uint64 } // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size @@ -252,78 +264,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts allocationsList = append(allocationsList, format.HumanBytes2(a)) } + estimate := MemoryEstimate{ + TotalSize: memoryRequiredTotal, + Layers: 0, + Graph: 0, + VRAMSize: 0, + GPUSizes: []uint64{}, + + inferenceLibrary: gpus[0].Library, + layersRequested: opts.NumGPU, + layersModel: int(ggml.KV().BlockCount()) + 1, + availableList: availableList, + kv: kv, + allocationsList: allocationsList, + memoryWeights: memoryWeights, + memoryLayerOutput: memoryLayerOutput, + graphFullOffload: graphFullOffload, + graphPartialOffload: graphPartialOffload, + } + + if gpus[0].Library == "cpu" { + return estimate + } + if layerCount == 0 { + slog.Debug("insufficient VRAM to load any model layers") + return estimate + } + estimate.Layers = layerCount + estimate.Graph = graphOffload + estimate.VRAMSize = memoryRequiredPartial + estimate.TotalSize = memoryRequiredTotal + estimate.TensorSplit = tensorSplit + estimate.GPUSizes = gpuAllocations + return estimate +} + +func (m MemoryEstimate) log() { slog.Info( - "offload to gpu", + "offload to "+m.inferenceLibrary, slog.Group( "layers", // requested number of layers to offload - "requested", opts.NumGPU, + "requested", m.layersRequested, // The number of layers the model has (including output) - "model", int(ggml.KV().BlockCount())+1, + "model", m.layersModel, // estimated number of layers that can be offloaded - "offload", layerCount, - // multi-gpu split for tesnors - "split", tensorSplit, + "offload", m.Layers, + // multi-gpu split for tensors + "split", m.TensorSplit, ), slog.Group( "memory", // memory available by GPU for offloading - "available", availableList, + "available", m.availableList, slog.Group( "required", // memory required for full offloading - "full", format.HumanBytes2(memoryRequiredTotal), + "full", format.HumanBytes2(m.TotalSize), // memory required to offload layers.estimate layers - "partial", format.HumanBytes2(memoryRequiredPartial), + "partial", format.HumanBytes2(m.VRAMSize), // memory of KV cache - "kv", format.HumanBytes2(kv), + "kv", format.HumanBytes2(m.kv), // Allocations across the GPUs - "allocations", allocationsList, + "allocations", m.allocationsList, ), slog.Group( "weights", // memory of the weights - "total", format.HumanBytes2(memoryWeights), + "total", format.HumanBytes2(m.memoryWeights), // memory of repeating layers - "repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput), + "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput), // memory of non-repeating layers - "nonrepeating", format.HumanBytes2(memoryLayerOutput), + "nonrepeating", format.HumanBytes2(m.memoryLayerOutput), ), slog.Group( "graph", // memory of graph when 
fully offloaded - "full", format.HumanBytes2(graphFullOffload), + "full", format.HumanBytes2(m.graphFullOffload), // memory of graph when not fully offloaded - "partial", format.HumanBytes2(graphPartialOffload), + "partial", format.HumanBytes2(m.graphPartialOffload), ), ), ) - if gpus[0].Library == "cpu" { - return MemoryEstimate{ - Layers: 0, - Graph: 0, - VRAMSize: 0, - TotalSize: memoryRequiredTotal, - GPUSizes: []uint64{}, - } - } - if layerCount == 0 { - slog.Debug("insufficient VRAM to load any model layers") - return MemoryEstimate{ - Layers: 0, - Graph: 0, - VRAMSize: 0, - TotalSize: memoryRequiredTotal, - GPUSizes: []uint64{}, - } - } - - return MemoryEstimate{ - Layers: layerCount, - Graph: graphOffload, - VRAMSize: memoryRequiredPartial, - TotalSize: memoryRequiredTotal, - TensorSplit: tensorSplit, - GPUSizes: gpuAllocations, - } } diff --git a/llm/server.go b/llm/server.go index dd9862924..7de1ec1b1 100644 --- a/llm/server.go +++ b/llm/server.go @@ -116,6 +116,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } + estimate.log() + // Loop through potential servers finalErr := errors.New("no suitable llama servers found") From 359b15a59785809465ddffbaffd8be0ae3afcd5a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 18 Jun 2024 11:05:34 -0700 Subject: [PATCH 19/43] Handle models with divergent layer sizes The recent refactoring of the memory prediction assumed all layers are the same size, but for some models (like deepseek-coder-v2) this is not the case, so our predictions were significantly off. --- llm/memory.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llm/memory.go b/llm/memory.go index 3bf18c3f2..b8b862bd6 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,6 +1,7 @@ package llm import ( + "fmt" "log/slog" "strconv" "strings" @@ -179,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts // For all the layers, find where they can fit on the GPU(s) for i := range int(ggml.KV().BlockCount()) { + // Some models have inconsistent layer sizes + if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { + layerSize = blk.size() + layerSize += kv / ggml.KV().BlockCount() + } memoryWeights += layerSize if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { From e873841cbb38d9d8f1b058e1338d88eaffbf9afa Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 18 Jun 2024 12:42:37 -0700 Subject: [PATCH 20/43] deepseek v2 graph --- llm/ggml.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llm/ggml.go b/llm/ggml.go index 35b89d16e..4d9ba97a8 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -367,6 +367,17 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui 4*batch*(vocab+2*embedding), fullOffload, ) + case "deepseek2": + keys := uint64(llm.KV()["deepseek2.attention.key_length"].(uint32)) + fullOffload = max( + 4*batch*(3*embedding+vocab), + 4*batch*(3*embedding+2+context*(1+headsKV)+2*keys*headsKV), + ) + + partialOffload = max( + 4*batch*(3*embedding+vocab)+embedding*vocab*105/128, + 4*batch*(2*embedding+1+2*keys*headsKV+context+context*headsKV)+4*keys*context*headsKV+embedding*keys*headsKV*9/16, + ) } return From 1a1c99e3346da21bf2062fa266cf39da954c66a8 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 18 Jun 2024 17:13:54 -0700 Subject: [PATCH 21/43] Bump latest fedora cuda repo to 39 --- scripts/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install.sh b/scripts/install.sh index 0f12d7e09..2a06c350a 100644 
--- a/scripts/install.sh +++ b/scripts/install.sh @@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\ case $OS_NAME in centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;; rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;; - fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';; + fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';; amzn) install_cuda_driver_yum 'fedora' '37' ;; debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;; ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;; From 755b4e4fc291366595ed7bfb37c2a91ff5834df8 Mon Sep 17 00:00:00 2001 From: "Wang,Zhe" Date: Wed, 19 Jun 2024 08:59:58 +0800 Subject: [PATCH 22/43] Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)" This reverts commit 163cd3e77c42aafd003b9cb884b3a51cdbaea106. --- envconfig/config.go | 7 ------ gpu/gpu.go | 54 ++++++++++++++++++++++----------------------- 2 files changed, 26 insertions(+), 35 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index e86f72e6a..bcf2e18ae 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -57,8 +57,6 @@ var ( SchedSpread bool // Set via OLLAMA_TMPDIR in the environment TmpDir string - // Set via OLLAMA_INTEL_GPU in the environment - IntelGpu bool // Set via CUDA_VISIBLE_DEVICES in the environment CudaVisibleDevices string @@ -103,7 +101,6 @@ func AsMap() map[string]EnvVar { ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} - ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"} } return ret } @@ -279,10 +276,6 @@ func LoadConfig() { slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port) } - if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { - IntelGpu = set - } - CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES") HipVisibleDevices = clean("HIP_VISIBLE_DEVICES") RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES") diff --git a/gpu/gpu.go b/gpu/gpu.go index ce0a10496..56a4dbfac 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -280,35 +280,33 @@ func GetGPUInfo() GpuInfoList { } // Intel - if envconfig.IntelGpu { - oHandles = initOneAPIHandles() - for d := range oHandles.oneapi.num_drivers { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue - } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: "oneapi", - }, - driverIndex: int(d), - gpuIndex: int(i), - } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. 
- memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - // TODO dependency path? - oneapiGPUs = append(oneapiGPUs, gpuInfo) + oHandles = initOneAPIHandles() + for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: d, + gpuIndex: int(i), } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + // TODO dependency path? + oneapiGPUs = append(oneapiGPUs, gpuInfo) } } From badf975e45005c45cf5d7794a18c88f4e069f89c Mon Sep 17 00:00:00 2001 From: "Wang,Zhe" Date: Wed, 19 Jun 2024 09:00:51 +0800 Subject: [PATCH 23/43] get real func ptr. --- gpu/gpu_info_oneapi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c index 004377456..3ff708ea2 100644 --- a/gpu/gpu_info_oneapi.c +++ b/gpu/gpu_info_oneapi.c @@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s); *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s); - if (!l[i].p) { + if (!*(l[i].p)) { resp->oh.handle = NULL; char *msg = LOAD_ERR(); LOG(resp->oh.verbose, "dlerr: %s\n", msg); From 380e06e5bea06ae8ded37f47c37bd5d604194d3e Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Tue, 18 Jun 2024 13:29:38 -0700 Subject: [PATCH 24/43] types/model: remove Digest The Digest type in its current form is awkward to work with and presents challenges with regard to how it serializes via String using the '-' prefix. We currently only use this in ollama.com, so we'll move our specific needs around digest parsing and validation there. 
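To make the serialization quirk concrete: content digests are conventionally written with a colon (`sha256:<hex>`), while this type rendered them with a dash (`sha256-<hex>`) and therefore had to accept both separators on input. The sketch below is a hypothetical, self-contained version of that dual-separator handling, not the replacement code that now lives in ollama.com.

```go
// Hypothetical dash/colon digest handling; the real replacement parser is
// not part of this repository.
package main

import (
	"fmt"
	"strings"
)

// normalizeDigest accepts "sha256-<hex>" or "sha256:<hex>" and returns the
// dash form that the removed String method produced.
func normalizeDigest(s string) (string, error) {
	i := strings.IndexAny(s, "-:")
	if i < 0 {
		return "", fmt.Errorf("invalid digest %q", s)
	}
	typ, sum := s[:i], s[i+1:]
	if typ != "sha256" || len(sum) != 64 {
		return "", fmt.Errorf("unsupported digest %q", s)
	}
	return typ + "-" + sum, nil
}

func main() {
	for _, in := range []string{
		"sha256:" + strings.Repeat("0", 64),
		"sha256-" + strings.Repeat("0", 64),
	} {
		fmt.Println(normalizeDigest(in))
	}
}
```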
--- types/model/name.go | 55 ---------------------------------------- types/model/name_test.go | 34 ------------------------- 2 files changed, 89 deletions(-) diff --git a/types/model/name.go b/types/model/name.go index d85fd0c6c..e645a844c 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -4,7 +4,6 @@ package model import ( "cmp" - "encoding/hex" "errors" "fmt" "log/slog" @@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) { } return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true } - -type DigestType byte - -const ( - DigestTypeInvalid DigestType = iota - DigestTypeSHA256 -) - -func (t DigestType) String() string { - switch t { - case DigestTypeSHA256: - return "sha256" - default: - return "invalid" - } -} - -type Digest struct { - Type DigestType - Sum [32]byte -} - -func ParseDigest(s string) (Digest, error) { - i := strings.IndexAny(s, "-:") - if i < 0 { - return Digest{}, fmt.Errorf("invalid digest %q", s) - } - typ, encSum := s[:i], s[i+1:] - if typ != "sha256" { - return Digest{}, fmt.Errorf("unsupported digest type %q", typ) - } - d := Digest{ - Type: DigestTypeSHA256, - } - n, err := hex.Decode(d.Sum[:], []byte(encSum)) - if err != nil { - return Digest{}, err - } - if n != 32 { - return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n) - } - return d, nil -} - -func (d Digest) String() string { - if d.Type == DigestTypeInvalid { - return "" - } - return fmt.Sprintf("sha256-%x", d.Sum) -} - -func (d Digest) IsValid() bool { - return d.Type != DigestTypeInvalid -} diff --git a/types/model/name_test.go b/types/model/name_test.go index 66ce4c339..008dd586c 100644 --- a/types/model/name_test.go +++ b/types/model/name_test.go @@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) { } } -const ( - validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000" - validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000" -) - -func TestParseDigest(t *testing.T) { - cases := []struct { - in string - want string - }{ - {"", ""}, // empty - {"sha123-12", ""}, // invalid type - {"sha256-", ""}, // invalid sum - {"sha256-123", ""}, // invalid odd length sum - - {validSha256, validSha256}, - {validSha256Old, validSha256}, - } - for _, tt := range cases { - t.Run(tt.in, func(t *testing.T) { - got, err := ParseDigest(tt.in) - if err != nil { - if tt.want != "" { - t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want) - } - return - } - if got.String() != tt.want { - t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want) - } - }) - } -} - func TestParseNameFromFilepath(t *testing.T) { cases := map[string]Name{ filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"}, From 52ce350b7aecd4bce9c42fe4aac1d85e47a6d774 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 08:39:07 -0700 Subject: [PATCH 25/43] Fix bad symbol load detection pointer deref's weren't correct on a few libraries, which explains some crashes on older systems or miswired symlinks for discovery libraries. 
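In the C sources below, each table entry holds the address of a function-pointer slot, so testing the entry itself (`!l[i].p`) is always false and the missing-symbol branch never runs; only dereferencing it (`!*(l[i].p)`) inspects what the loader returned. A rough Go analogue of the same one-level-off check is sketched here, with `lookup` standing in for dlsym/GetProcAddress:

```go
// Illustration of the off-by-one-level pointer check fixed in these files;
// lookup is a stand-in for dlsym/GetProcAddress.
package main

import "fmt"

// lookup returns the symbol, or nil when it is missing.
func lookup(name string) func() {
	if name == "present" {
		return func() { fmt.Println("called", name) }
	}
	return nil
}

func main() {
	var slot func() // where the loaded symbol is stored
	entry := &slot  // the table keeps the address of the slot
	*entry = lookup("missing")

	if entry == nil { // wrong level: the slot's address is never nil
		fmt.Println("unreachable: the missing symbol is not detected here")
	}
	if *entry == nil { // correct level: checks what lookup returned
		fmt.Println("symbol failed to load")
	}
}
```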
--- gpu/gpu_info_cudart.c | 2 +- gpu/gpu_info_nvcuda.c | 2 +- gpu/gpu_info_nvml.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c index 9db89529a..03f15a2c3 100644 --- a/gpu/gpu_info_cudart.c +++ b/gpu/gpu_info_cudart.c @@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { for (i = 0; l[i].s != NULL; i++) { *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); - if (!l[i].p) { + if (!*(l[i].p)) { char *msg = LOAD_ERR(); LOG(resp->ch.verbose, "dlerr: %s\n", msg); UNLOAD_LIBRARY(resp->ch.handle); diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index 675ce5cc4..abe140844 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { for (i = 0; l[i].s != NULL; i++) { *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); - if (!*l[i].p) { + if (!*(l[i].p)) { char *msg = LOAD_ERR(); LOG(resp->ch.verbose, "dlerr: %s\n", msg); UNLOAD_LIBRARY(resp->ch.handle); diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c index ef0a97df2..11293e448 100644 --- a/gpu/gpu_info_nvml.c +++ b/gpu/gpu_info_nvml.c @@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); - if (!l[i].p) { + if (!*(l[i].p)) { resp->ch.handle = NULL; char *msg = LOAD_ERR(); LOG(resp->ch.verbose, "dlerr: %s\n", msg); From d34d88e41744bdcc36a425299af85ce762f3d30e Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 08:57:41 -0700 Subject: [PATCH 26/43] Revert "Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)"" This reverts commit 755b4e4fc291366595ed7bfb37c2a91ff5834df8. 
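The restored change gates the experimental oneAPI discovery path behind the `OLLAMA_INTEL_GPU` environment variable. A minimal sketch of that kind of opt-in flag is shown here; `intelGPUEnabled` is an illustrative helper, while the real parsing sits in `envconfig.LoadConfig` in the diff that follows.

```go
// Simplified opt-in flag; OLLAMA_INTEL_GPU is the real variable, the helper
// name is illustrative.
package main

import (
	"fmt"
	"os"
	"strconv"
)

// intelGPUEnabled reports whether experimental Intel GPU detection was
// explicitly enabled; unset or unparsable values default to off.
func intelGPUEnabled() bool {
	v, ok := os.LookupEnv("OLLAMA_INTEL_GPU")
	if !ok {
		return false
	}
	enabled, err := strconv.ParseBool(v)
	if err != nil {
		return false
	}
	return enabled
}

func main() {
	if intelGPUEnabled() {
		fmt.Println("running oneAPI GPU discovery")
	} else {
		fmt.Println("skipping oneAPI GPU discovery")
	}
}
```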
--- envconfig/config.go | 7 ++++++ gpu/gpu.go | 54 +++++++++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index bcf2e18ae..e86f72e6a 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -57,6 +57,8 @@ var ( SchedSpread bool // Set via OLLAMA_TMPDIR in the environment TmpDir string + // Set via OLLAMA_INTEL_GPU in the environment + IntelGpu bool // Set via CUDA_VISIBLE_DEVICES in the environment CudaVisibleDevices string @@ -101,6 +103,7 @@ func AsMap() map[string]EnvVar { ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"} ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"} + ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"} } return ret } @@ -276,6 +279,10 @@ func LoadConfig() { slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port) } + if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil { + IntelGpu = set + } + CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES") HipVisibleDevices = clean("HIP_VISIBLE_DEVICES") RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES") diff --git a/gpu/gpu.go b/gpu/gpu.go index 56a4dbfac..ce0a10496 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -280,33 +280,35 @@ func GetGPUInfo() GpuInfoList { } // Intel - oHandles = initOneAPIHandles() - for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue - } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: "oneapi", - }, - driverIndex: d, - gpuIndex: int(i), + if envconfig.IntelGpu { + oHandles = initOneAPIHandles() + for d := range oHandles.oneapi.num_drivers { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: int(d), + gpuIndex: int(i), + } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + // TODO dependency path? + oneapiGPUs = append(oneapiGPUs, gpuInfo) } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. 
- memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - // TODO dependency path? - oneapiGPUs = append(oneapiGPUs, gpuInfo) } } From 9d91e5e5875e2b2f8605ef15a7da9a616cb05171 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 19 Jun 2024 11:14:11 -0700 Subject: [PATCH 27/43] remove confusing log message --- llm/ext_server/server.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 18b3fa18d..492126a4f 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -56,7 +56,6 @@ struct server_params { std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; - std::string chat_template = ""; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -427,16 +426,6 @@ struct llama_server_context return true; } - void validate_model_chat_template(server_params & sparams) { - llama_chat_message chat[] = {{"user", "test"}}; - std::vector buf(1); - int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); - if (res < 0) { - LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - sparams.chat_template = "chatml"; - } - } - void initialize() { // create slots all_slots_are_idle = true; @@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - sparams.chat_template = argv[i]; } else if (arg == "--override-kv") { @@ -3008,11 +2996,6 @@ int main(int argc, char **argv) { } const auto model_meta = llama.model_meta(); - if (sparams.chat_template.empty()) { // custom chat template is not supplied - // check if the template comes with the model is supported by us - llama.validate_model_chat_template(sparams); - } - // Middleware for API key validation auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { // If API key is not set, skip validation From 9d8a4988e8eb743f985ebbb511a96e7496eac1c8 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 15 Jun 2024 16:30:37 -0700 Subject: [PATCH 28/43] Implement log rotation for tray app --- app/lifecycle/logging.go | 32 +++++++++++++++++++++++++ app/lifecycle/logging_test.go | 44 +++++++++++++++++++++++++++++++++++ app/lifecycle/paths.go | 11 +++++---- app/lifecycle/server.go | 2 +- docs/troubleshooting.md | 2 +- docs/windows.md | 4 ++-- 6 files changed, 86 insertions(+), 9 deletions(-) create mode 100644 app/lifecycle/logging_test.go diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go index df2597a83..a8f1f7cdf 100644 --- a/app/lifecycle/logging.go +++ b/app/lifecycle/logging.go @@ -5,6 +5,8 @@ import ( "log/slog" "os" "path/filepath" + "strconv" + "strings" "github.com/ollama/ollama/envconfig" ) @@ -24,6 +26,7 @@ func InitLogging() { logFile = os.Stderr // TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion } else { + rotateLogs(AppLogFile) logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755) if err != nil { slog.Error(fmt.Sprintf("failed to create server log %v", err)) @@ -46,3 +49,32 @@ func InitLogging() { slog.Info("ollama app started") } + +func rotateLogs(logFile string) 
{ + if _, err := os.Stat(logFile); os.IsNotExist(err) { + return + } + index := strings.LastIndex(logFile, ".") + pre := logFile[:index] + post := "." + logFile[index+1:] + for i := LogRotationCount; i > 0; i-- { + older := pre + "-" + strconv.Itoa(i) + post + newer := pre + "-" + strconv.Itoa(i-1) + post + if i == 1 { + newer = pre + post + } + if _, err := os.Stat(newer); err == nil { + if _, err := os.Stat(older); err == nil { + err := os.Remove(older) + if err != nil { + slog.Warn("Failed to remove older log", "older", older, "error", err) + continue + } + } + err := os.Rename(newer, older) + if err != nil { + slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err) + } + } + } +} diff --git a/app/lifecycle/logging_test.go b/app/lifecycle/logging_test.go new file mode 100644 index 000000000..a2157ca2c --- /dev/null +++ b/app/lifecycle/logging_test.go @@ -0,0 +1,44 @@ +package lifecycle + +import ( + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRotateLogs(t *testing.T) { + logDir := t.TempDir() + logFile := filepath.Join(logDir, "testlog.log") + + // No log exists + rotateLogs(logFile) + + require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644)) + assert.FileExists(t, logFile) + // First rotation + rotateLogs(logFile) + assert.FileExists(t, filepath.Join(logDir, "testlog-1.log")) + assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log")) + assert.NoFileExists(t, logFile) + + // Should be a no-op without a new log + rotateLogs(logFile) + assert.FileExists(t, filepath.Join(logDir, "testlog-1.log")) + assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log")) + assert.NoFileExists(t, logFile) + + for i := 2; i <= LogRotationCount+1; i++ { + require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644)) + assert.FileExists(t, logFile) + rotateLogs(logFile) + assert.NoFileExists(t, logFile) + for j := 1; j < i; j++ { + assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log")) + } + assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log")) + } +} diff --git a/app/lifecycle/paths.go b/app/lifecycle/paths.go index fe07bce10..4d9f4c5a1 100644 --- a/app/lifecycle/paths.go +++ b/app/lifecycle/paths.go @@ -16,11 +16,12 @@ var ( AppDir = "/opt/Ollama" AppDataDir = "/opt/Ollama" // TODO - should there be a distinct log dir? 
- UpdateStageDir = "/tmp" - AppLogFile = "/tmp/ollama_app.log" - ServerLogFile = "/tmp/ollama.log" - UpgradeLogFile = "/tmp/ollama_update.log" - Installer = "OllamaSetup.exe" + UpdateStageDir = "/tmp" + AppLogFile = "/tmp/ollama_app.log" + ServerLogFile = "/tmp/ollama.log" + UpgradeLogFile = "/tmp/ollama_update.log" + Installer = "OllamaSetup.exe" + LogRotationCount = 5 ) func init() { diff --git a/app/lifecycle/server.go b/app/lifecycle/server.go index 0152ccd11..c178a1abf 100644 --- a/app/lifecycle/server.go +++ b/app/lifecycle/server.go @@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) { return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err) } - // TODO - rotation + rotateLogs(ServerLogFile) logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755) if err != nil { return nil, fmt.Errorf("failed to create server log: %w", err) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 60d63c7d9..de29b344c 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -22,7 +22,7 @@ docker logs If manually running `ollama serve` in a terminal, the logs will be on that terminal. When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in: -- `explorer %LOCALAPPDATA%\Ollama` to view logs +- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log` - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored - `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories diff --git a/docs/windows.md b/docs/windows.md index 832b3d431..abc0eb300 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -39,8 +39,8 @@ server. Ollama on Windows stores files in a few different locations. 
You can view them in the explorer window by hitting `+R` and type in: - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates - - *app.log* contains logs from the GUI application - - *server.log* contains the server logs + - *app.log* contains most resent logs from the GUI application + - *server.log* contains the most recent server logs - *upgrade.log* contains log output for upgrades - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` contains models and configuration From fedf71635ec77644f8477a86c6155217d9213a11 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Wed, 19 Jun 2024 14:19:02 -0700 Subject: [PATCH 29/43] Extend api/show and ollama show to return more model info (#4881) * API Show Extended * Initial Draft of Information Co-Authored-By: Patrick Devine * Clean Up * Descriptive arg error messages and other fixes * Second Draft of Show with Projectors Included * Remove Chat Template * Touches * Prevent wrapping from files * Verbose functionality * Docs * Address Feedback * Lint * Resolve Conflicts * Function Name * Tests for api/show model info * Show Test File * Add Projector Test * Clean routes * Projector Check * Move Show Test * Touches * Doc update --------- Co-authored-by: Patrick Devine --- api/types.go | 19 +++--- cmd/cmd.go | 143 +++++++++++++++++++++++++++++++++++++----- docs/api.md | 37 +++++++++-- server/routes.go | 35 +++++++++++ server/routes_test.go | 39 ++++++++++++ 5 files changed, 243 insertions(+), 30 deletions(-) diff --git a/api/types.go b/api/types.go index 7822a6034..0a1189e70 100644 --- a/api/types.go +++ b/api/types.go @@ -253,6 +253,7 @@ type ShowRequest struct { Model string `json:"model"` System string `json:"system"` Template string `json:"template"` + Verbose bool `json:"verbose"` Options map[string]interface{} `json:"options"` @@ -262,14 +263,16 @@ type ShowRequest struct { // ShowResponse is the response returned from [Client.Show]. type ShowResponse struct { - License string `json:"license,omitempty"` - Modelfile string `json:"modelfile,omitempty"` - Parameters string `json:"parameters,omitempty"` - Template string `json:"template,omitempty"` - System string `json:"system,omitempty"` - Details ModelDetails `json:"details,omitempty"` - Messages []Message `json:"messages,omitempty"` - ModifiedAt time.Time `json:"modified_at,omitempty"` + License string `json:"license,omitempty"` + Modelfile string `json:"modelfile,omitempty"` + Parameters string `json:"parameters,omitempty"` + Template string `json:"template,omitempty"` + System string `json:"system,omitempty"` + Details ModelDetails `json:"details,omitempty"` + Messages []Message `json:"messages,omitempty"` + ModelInfo map[string]any `json:"model_info,omitempty"` + ProjectorInfo map[string]any `json:"projector_info,omitempty"` + ModifiedAt time.Time `json:"modified_at,omitempty"` } // CopyRequest is the request passed to [Client.Copy]. 
diff --git a/cmd/cmd.go b/cmd/cmd.go index ae7c8da8f..68197f72d 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -579,10 +579,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return err } - if len(args) != 1 { - return errors.New("missing model name") - } - license, errLicense := cmd.Flags().GetBool("license") modelfile, errModelfile := cmd.Flags().GetBool("modelfile") parameters, errParams := cmd.Flags().GetBool("parameters") @@ -625,8 +621,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error { if flagsSet > 1 { return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified") - } else if flagsSet == 0 { - return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified") + } + + if flagsSet == 1 { + req := api.ShowRequest{Name: args[0]} + resp, err := client.Show(cmd.Context(), &req) + if err != nil { + return err + } + + switch showType { + case "license": + fmt.Println(resp.License) + case "modelfile": + fmt.Println(resp.Modelfile) + case "parameters": + fmt.Println(resp.Parameters) + case "system": + fmt.Println(resp.System) + case "template": + fmt.Println(resp.Template) + } + + return nil } req := api.ShowRequest{Name: args[0]} @@ -635,22 +652,114 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return err } - switch showType { - case "license": - fmt.Println(resp.License) - case "modelfile": - fmt.Println(resp.Modelfile) - case "parameters": - fmt.Println(resp.Parameters) - case "system": - fmt.Println(resp.System) - case "template": - fmt.Println(resp.Template) + arch := resp.ModelInfo["general.architecture"].(string) + + modelData := [][]string{ + {"arch", arch}, + {"parameters", resp.Details.ParameterSize}, + {"quantization", resp.Details.QuantizationLevel}, + {"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))}, + {"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))}, } + mainTableData := [][]string{ + {"Model"}, + {renderSubTable(modelData, false)}, + } + + if resp.ProjectorInfo != nil { + projectorData := [][]string{ + {"arch", "clip"}, + {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}, + {"projector type", resp.ProjectorInfo["clip.projector_type"].(string)}, + {"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))}, + {"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))}, + } + + mainTableData = append(mainTableData, + []string{"Projector"}, + []string{renderSubTable(projectorData, false)}, + ) + } + + if resp.Parameters != "" { + mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)}) + } + + if resp.System != "" { + mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)}) + } + + if resp.License != "" { + mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)}) + } + + table := tablewriter.NewWriter(os.Stdout) + table.SetAutoWrapText(false) + table.SetBorder(false) + table.SetAlignment(tablewriter.ALIGN_LEFT) + + for _, v := range mainTableData { + table.Append(v) + } + + table.Render() + return nil } +func renderSubTable(data [][]string, file bool) string { + var buf bytes.Buffer + table := tablewriter.NewWriter(&buf) + 
table.SetAutoWrapText(!file) + table.SetBorder(false) + table.SetNoWhiteSpace(true) + table.SetTablePadding("\t") + table.SetAlignment(tablewriter.ALIGN_LEFT) + + for _, v := range data { + table.Append(v) + } + + table.Render() + + renderedTable := buf.String() + lines := strings.Split(renderedTable, "\n") + for i, line := range lines { + lines[i] = "\t" + line + } + + return strings.Join(lines, "\n") +} + +func twoLines(s string) [][]string { + lines := strings.Split(s, "\n") + res := [][]string{} + + count := 0 + for _, line := range lines { + line = strings.TrimSpace(line) + if line != "" { + count++ + res = append(res, []string{line}) + if count == 2 { + return res + } + } + } + return res +} + +func formatParams(s string) string { + lines := strings.Split(s, "\n") + table := [][]string{} + + for _, line := range lines { + table = append(table, strings.Fields(line)) + } + return renderSubTable(table, false) +} + func CopyHandler(cmd *cobra.Command, args []string) error { client, err := api.ClientFromEnvironment() if err != nil { diff --git a/docs/api.md b/docs/api.md index 35f1def33..107b5211f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -777,11 +777,12 @@ A single JSON object will be returned. POST /api/show ``` -Show information about a model including details, modelfile, template, parameters, license, and system prompt. +Show information about a model including details, modelfile, template, parameters, license, system prompt. ### Parameters - `name`: name of the model to show +- `verbose`: (optional) if set to `true`, returns full data for verbose response fields ### Examples @@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{ ```json { "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"", - "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:", - "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ", + "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"", + "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>", "details": { + "parent_model": "", "format": "gguf", "family": "llama", - "families": ["llama", "clip"], - "parameter_size": "7B", + "families": [ + "llama" + ], + "parameter_size": "8.0B", "quantization_level": "Q4_0" + }, + "model_info": { + "general.architecture": "llama", + "general.file_type": 2, + "general.parameter_count": 8030261248, + "general.quantization_version": 2, + "llama.attention.head_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.layer_norm_rms_epsilon": 0.00001, + "llama.block_count": 32, + "llama.context_length": 8192, + "llama.embedding_length": 4096, + "llama.feed_forward_length": 14336, + "llama.rope.dimension_count": 128, + "llama.rope.freq_base": 500000, + "llama.vocab_size": 128256, + "tokenizer.ggml.bos_token_id": 128000, + "tokenizer.ggml.eos_token_id": 128009, + "tokenizer.ggml.merges": [], // populates if 
`verbose=true` + "tokenizer.ggml.model": "gpt2", + "tokenizer.ggml.pre": "llama-bpe", + "tokenizer.ggml.token_type": [], // populates if `verbose=true` + "tokenizer.ggml.tokens": [] // populates if `verbose=true` } } ``` diff --git a/server/routes.go b/server/routes.go index f36fe1b08..3d112e9f1 100644 --- a/server/routes.go +++ b/server/routes.go @@ -734,9 +734,44 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { fmt.Fprint(&sb, m.String()) resp.Modelfile = sb.String() + kvData, err := getKVData(m.ModelPath, req.Verbose) + if err != nil { + return nil, err + } + delete(kvData, "general.name") + delete(kvData, "tokenizer.chat_template") + resp.ModelInfo = kvData + + if len(m.ProjectorPaths) > 0 { + projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose) + if err != nil { + return nil, err + } + resp.ProjectorInfo = projectorData + } + return resp, nil } +func getKVData(digest string, verbose bool) (llm.KV, error) { + kvData, err := llm.LoadModel(digest) + if err != nil { + return nil, err + } + + kv := kvData.KV() + + if !verbose { + for k := range kv { + if t, ok := kv[k].([]any); len(t) > 5 && ok { + kv[k] = []any{} + } + } + } + + return kv, nil +} + func (s *Server) ListModelsHandler(c *gin.Context) { ms, err := Manifests() if err != nil { diff --git a/server/routes_test.go b/server/routes_test.go index 5e16cfeff..5a5c0fbba 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -19,6 +19,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" @@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) { "top_p 0.9", } assert.Equal(t, expectedParams, params) + assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0") }, }, } @@ -325,3 +327,40 @@ func TestCase(t *testing.T) { }) } } + +func TestShow(t *testing.T) { + t.Setenv("OLLAMA_MODELS", t.TempDir()) + envconfig.LoadConfig() + + var s Server + + createRequest(t, s.CreateModelHandler, api.CreateRequest{ + Name: "show-model", + Modelfile: fmt.Sprintf( + "FROM %s\nFROM %s", + createBinFile(t, llm.KV{"general.architecture": "test"}, nil), + createBinFile(t, llm.KV{"general.architecture": "clip"}, nil), + ), + }) + + w := createRequest(t, s.ShowModelHandler, api.ShowRequest{ + Name: "show-model", + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected status code 200, actual %d", w.Code) + } + + var resp api.ShowResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatal(err) + } + + if resp.ModelInfo["general.architecture"] != "test" { + t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"]) + } + + if resp.ProjectorInfo["general.architecture"] != "clip" { + t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"]) + } +} From 23e899f32d9f7b3bbe0b902a95c23be5a1254409 Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 20 Jun 2024 08:51:35 -0700 Subject: [PATCH 30/43] skip os.removeAll() if PID does not exist --- gpu/assets.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gpu/assets.go b/gpu/assets.go index f2adcf3e3..fdb3dd81d 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -87,6 +87,8 @@ func cleanupTmpDirs() { } } else { slog.Debug("failed to open ollama.pid", "path", d, "error", err) + // No pid, ignore this tmpdir + continue } err = os.RemoveAll(d) if err != nil 
{ From 4ebb66c6623d85f4fb69db0406ddd05bdc2d893d Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 20 Jun 2024 09:23:43 -0700 Subject: [PATCH 31/43] reformat error check --- gpu/assets.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gpu/assets.go b/gpu/assets.go index fdb3dd81d..e2abfd58f 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -77,19 +77,20 @@ func cleanupTmpDirs() { continue } raw, err := os.ReadFile(filepath.Join(d, "ollama.pid")) - if err == nil { - pid, err := strconv.Atoi(string(raw)) - if err == nil { - if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { - // Another running ollama, ignore this tmpdir - continue - } - } - } else { - slog.Debug("failed to open ollama.pid", "path", d, "error", err) + if err != nil { + slog.Warn("failed to read ollama.pid", "path", d, "error", err) // No pid, ignore this tmpdir continue } + + pid, err := strconv.Atoi(string(raw)) + if err == nil { + if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { + // Another running ollama, ignore this tmpdir + continue + } + } + err = os.RemoveAll(d) if err != nil { slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err) From 662568d453debcf77d2e077ef98cfb2cfab8575e Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 20 Jun 2024 09:30:59 -0700 Subject: [PATCH 32/43] err!=nil check --- gpu/assets.go | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/gpu/assets.go b/gpu/assets.go index e2abfd58f..073d2e813 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -84,16 +84,20 @@ func cleanupTmpDirs() { } pid, err := strconv.Atoi(string(raw)) - if err == nil { - if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { - // Another running ollama, ignore this tmpdir - continue - } + if err != nil { + slog.Warn("failed to parse pid", "path", d, "error", err) + continue } - err = os.RemoveAll(d) - if err != nil { - slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err) + proc, err := os.FindProcess(pid) + if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("found running ollama", "pid", pid, "path", d) + // Another running ollama, ignore this tmpdir + continue + } + + if err := os.Remove(d); err != nil { + slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err) } } } From 8e0641a9bffd0dde96e94a34bb3a4929da66c772 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 20 Jun 2024 09:40:17 -0700 Subject: [PATCH 33/43] handle asymmetric embedding KVs --- llm/ggml.go | 40 +++++++++++++++++++++++++++++++++------- llm/memory.go | 4 ++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/llm/ggml.go b/llm/ggml.go index 4d9ba97a8..f02f0ff60 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -69,6 +69,30 @@ func (kv KV) HeadCountKV() uint64 { return 1 } +func (kv KV) EmbeddingHeadCount() uint64 { + if heads := kv.HeadCount(); heads > 0 { + return kv.EmbeddingLength() / kv.HeadCount() + } + + return 0 +} + +func (kv KV) EmbeddingHeadCountK() uint64 { + if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 { + return k + } + + return kv.EmbeddingHeadCount() +} + +func (kv KV) EmbeddingHeadCountV() uint64 { + if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 { + return v + } + + return kv.EmbeddingHeadCount() +} + func 
(kv KV) GQA() uint64 { return kv.HeadCount() / kv.HeadCountKV() } @@ -299,6 +323,9 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui headsKV := llm.KV().HeadCountKV() vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) + embeddingHeads := llm.KV().EmbeddingHeadCount() + embeddingHeadsK := llm.KV().EmbeddingHeadCountK() + layers := llm.Tensors().Layers() switch llm.KV().Architecture() { @@ -308,7 +335,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui partialOffload = 4 * batch * embedding partialOffload += max( // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()), - 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV), + 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV), 4*batch*(embedding+vocab)+embedding*vocab*105/128, ) @@ -316,15 +343,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui // mixtral 8x22b ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32)) partialOffload = max( - 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV), - 4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch), + 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV), + 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch), ) } else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok { // mixtral 8x7b ffnGateWeight1 := ffnGateWeight.Shape[1] fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1) partialOffload = max( - 4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, + 4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), ) } @@ -368,15 +395,14 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui fullOffload, ) case "deepseek2": - keys := uint64(llm.KV()["deepseek2.attention.key_length"].(uint32)) fullOffload = max( 4*batch*(3*embedding+vocab), - 4*batch*(3*embedding+2+context*(1+headsKV)+2*keys*headsKV), + 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV), ) partialOffload = max( 4*batch*(3*embedding+vocab)+embedding*vocab*105/128, - 4*batch*(2*embedding+1+2*keys*headsKV+context+context*headsKV)+4*keys*context*headsKV+embedding*keys*headsKV*9/16, + 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16, ) } diff --git a/llm/memory.go b/llm/memory.go index b8b862bd6..19b12cbfc 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -115,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts slog.Warn("model missing blk.0 layer size") } - // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv - var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() + // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv + var kv uint64 = 2 * 
uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV() // KV is proportional to the number of layers layerSize += kv / ggml.KV().BlockCount() From 5bf5aeec0140a70eeb94b65c61dbb3b75ff33e56 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 20 Jun 2024 11:07:04 -0700 Subject: [PATCH 34/43] Refine mmap default logic on linux If we try to use mmap when the model is larger than the system free space, loading is slower than the no-mmap approach. --- llm/server.go | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/llm/server.go b/llm/server.go index ed5f288f2..da83416ee 100644 --- a/llm/server.go +++ b/llm/server.go @@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr var err error var cpuRunner string var estimate MemoryEstimate - var systemMemory uint64 + var systemTotalMemory uint64 + var systemFreeMemory uint64 + + systemMemInfo, err := gpu.GetCPUMem() + if err != nil { + slog.Error("failed to lookup system memory", "error", err) + } else { + systemTotalMemory = systemMemInfo.TotalMemory + systemFreeMemory = systemMemInfo.FreeMemory + slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory) + } // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info if opts.NumGPU == 0 { @@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr cpuRunner = serverForCpu() estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { - if gpus[0].Library == "metal" { - memInfo, err := gpu.GetCPUMem() - if err != nil { - slog.Error("failed to lookup system memory", "error", err) - } else { - systemMemory = memInfo.TotalMemory - slog.Debug("system memory", "total", format.HumanBytes2(systemMemory)) - } - } estimate = EstimateGPULayers(gpus, ggml, projectors, opts) switch { - case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory: + case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory: // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system opts.NumGPU = 0 @@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } // Windows CUDA should not use mmap for best performance - if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse { + // Linux with a model larger than free space, mmap leads to thrashing + if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) || + (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) || + opts.UseMMap == api.TriStateFalse { params = append(params, "--no-mmap") } From 7e7749224c57ea4d7ae98e4d07dcb00e192a5c7c Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 21 Jun 2024 12:27:19 -0700 Subject: [PATCH 35/43] Fix use_mmap parsing for modelfiles Add the new tristate parsing logic for the code path for modelfiles, as well as a unit test. 
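As context for the diff below, the parsing added here has to distinguish three outcomes for a Modelfile parameter: unset, explicitly true, and explicitly false. The sketch below shows that mapping on its own; `parseUseMmap` is an illustrative helper, while the `TriState` constants mirror the ones introduced earlier in this series.

```go
// Sketch of mapping a Modelfile parameter onto a tri-state value; the
// helper name is illustrative, the constants mirror api.TriState.
package main

import (
	"fmt"
	"strconv"
)

type TriState int

const (
	TriStateUndefined TriState = -1
	TriStateFalse     TriState = 0
	TriStateTrue      TriState = 1
)

// parseUseMmap distinguishes "not specified" from an explicit true or false.
func parseUseMmap(vals []string) (TriState, error) {
	if len(vals) == 0 {
		return TriStateUndefined, nil // parameter was not set
	}
	b, err := strconv.ParseBool(vals[0])
	if err != nil {
		return TriStateUndefined, fmt.Errorf("invalid bool value %v", vals)
	}
	if b {
		return TriStateTrue, nil
	}
	return TriStateFalse, nil
}

func main() {
	for _, in := range [][]string{nil, {"true"}, {"0"}, {"foo"}} {
		v, err := parseUseMmap(in)
		fmt.Println(in, v, err)
	}
}
```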
--- api/types.go | 13 ++++++++++ api/types_test.go | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/api/types.go b/api/types.go index 0a1189e70..95ed5d37e 100644 --- a/api/types.go +++ b/api/types.go @@ -608,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) { } else { field := valueOpts.FieldByName(opt.Name) if field.IsValid() && field.CanSet() { + if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) { + boolVal, err := strconv.ParseBool(vals[0]) + if err != nil { + return nil, fmt.Errorf("invalid bool value %s", vals) + } + if boolVal { + out[key] = TriStateTrue + } else { + out[key] = TriStateFalse + } + continue + } + switch field.Kind() { case reflect.Float32: floatVal, err := strconv.ParseFloat(vals[0], 32) diff --git a/api/types_test.go b/api/types_test.go index 7b4a0f83c..8b6c60c62 100644 --- a/api/types_test.go +++ b/api/types_test.go @@ -2,6 +2,7 @@ package api import ( "encoding/json" + "fmt" "math" "testing" "time" @@ -141,3 +142,65 @@ func TestUseMmapParsingFromJSON(t *testing.T) { }) } } + +func TestUseMmapFormatParams(t *testing.T) { + tests := []struct { + name string + req map[string][]string + exp TriState + err error + }{ + { + name: "True", + req: map[string][]string{ + "use_mmap": []string{"true"}, + }, + exp: TriStateTrue, + err: nil, + }, + { + name: "False", + req: map[string][]string{ + "use_mmap": []string{"false"}, + }, + exp: TriStateFalse, + err: nil, + }, + { + name: "Numeric True", + req: map[string][]string{ + "use_mmap": []string{"1"}, + }, + exp: TriStateTrue, + err: nil, + }, + { + name: "Numeric False", + req: map[string][]string{ + "use_mmap": []string{"0"}, + }, + exp: TriStateFalse, + err: nil, + }, + { + name: "invalid string", + req: map[string][]string{ + "use_mmap": []string{"foo"}, + }, + exp: TriStateUndefined, + err: fmt.Errorf("invalid bool value [foo]"), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + resp, err := FormatParams(test.req) + require.Equal(t, err, test.err) + respVal, ok := resp["use_mmap"] + if test.exp != TriStateUndefined { + assert.True(t, ok, "resp: %v", resp) + assert.Equal(t, test.exp, respVal) + } + }) + } +} From e835ef183691db1cc7da30cfc61fb4b96b321e80 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 21 Jun 2024 13:30:43 -0700 Subject: [PATCH 36/43] fix: quantization with template --- server/images.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/server/images.go b/server/images.go index 53a957715..98794149e 100644 --- a/server/images.go +++ b/server/images.go @@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - layers, err := parseFromFile(ctx, temp, "", fn) + layer, err := NewLayer(temp, baseLayer.MediaType) if err != nil { return err } - if len(layers) != 1 { - return errors.New("quantization failed") + if _, err := temp.Seek(0, io.SeekStart); err != nil { + return err } - baseLayer.Layer = layers[0].Layer - baseLayer.GGML = layers[0].GGML + ggml, _, err := llm.DecodeGGML(temp) + if err != nil { + return err + } + + baseLayer.Layer = layer + baseLayer.GGML = ggml } } From 9a9e7d83c416374782a984d7036f3f2ae5ddb78d Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Fri, 21 Jun 2024 15:52:09 -0700 Subject: [PATCH 37/43] Docs (#5149) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 
2fdc63cb3..978625731 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)" Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications. ``` +### Show model information + +``` +ollama show llama3 +``` + ### List models on your computer ``` From 2aa91a937ba199ae5832c71ecc10221cc6420fa8 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Mon, 24 Jun 2024 20:14:03 -0700 Subject: [PATCH 38/43] cmd: defer stating model info until necessary (#5248) This commit changes the 'ollama run' command to defer fetching model information until it really needs it. That is, when in interactive mode. It also removes one such case where the model information is fetch in duplicate, just before calling generateInteractive and then again, first thing, in generateInteractive. This positively impacts the performance of the command: ; time ./before run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? ./before run llama3 'hi' 0.02s user 0.01s system 2% cpu 1.168 total ; time ./before run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? ./before run llama3 'hi' 0.02s user 0.01s system 2% cpu 1.220 total ; time ./before run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? ./before run llama3 'hi' 0.02s user 0.01s system 2% cpu 1.217 total ; time ./after run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? ./after run llama3 'hi' 0.02s user 0.01s system 4% cpu 0.652 total ; time ./after run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? ./after run llama3 'hi' 0.01s user 0.01s system 5% cpu 0.498 total ; time ./after run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with or would you like to chat? ./after run llama3 'hi' 0.01s user 0.01s system 3% cpu 0.479 total ; time ./after run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? ./after run llama3 'hi' 0.02s user 0.01s system 5% cpu 0.507 total ; time ./after run llama3 'hi' Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat? 
./after run llama3 'hi' 0.02s user 0.01s system 5% cpu 0.507 total --- cmd/cmd.go | 65 +++++++++++++++++++++++----------------------- cmd/interactive.go | 51 ++++++++++-------------------------- 2 files changed, 46 insertions(+), 70 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 68197f72d..89b551f40 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er } func RunHandler(cmd *cobra.Command, args []string) error { - client, err := api.ClientFromEnvironment() - if err != nil { - return err - } - - name := args[0] - - // check if the model exists on the server - show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name}) - var statusError api.StatusError - switch { - case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound: - if err := PullHandler(cmd, []string{name}); err != nil { - return err - } - - show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name}) - if err != nil { - return err - } - case err != nil: - return err - } - interactive := true opts := runOptions{ - Model: args[0], - WordWrap: os.Getenv("TERM") == "xterm-256color", - Options: map[string]interface{}{}, - MultiModal: slices.Contains(show.Details.Families, "clip"), - ParentModel: show.Details.ParentModel, + Model: args[0], + WordWrap: os.Getenv("TERM") == "xterm-256color", + Options: map[string]interface{}{}, } format, err := cmd.Flags().GetString("format") @@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error { } opts.WordWrap = !nowrap - if !interactive { - return generate(cmd, opts) + // Fill out the rest of the options based on information about the + // model. + client, err := api.ClientFromEnvironment() + if err != nil { + return err } - return generateInteractive(cmd, opts) + name := args[0] + info, err := func() (*api.ShowResponse, error) { + showReq := &api.ShowRequest{Name: name} + info, err := client.Show(cmd.Context(), showReq) + var se api.StatusError + if errors.As(err, &se) && se.StatusCode == http.StatusNotFound { + if err := PullHandler(cmd, []string{name}); err != nil { + return nil, err + } + return client.Show(cmd.Context(), &api.ShowRequest{Name: name}) + } + return info, err + }() + if err != nil { + return err + } + + opts.MultiModal = slices.Contains(info.Details.Families, "clip") + opts.ParentModel = info.Details.ParentModel + opts.Messages = append(opts.Messages, info.Messages...) + + if interactive { + return generateInteractive(cmd, opts) + } + return generate(cmd, opts) } func errFromUnknownKey(unknownKeyErr error) error { diff --git a/cmd/interactive.go b/cmd/interactive.go index 80a915474..0a2f429b6 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -31,65 +31,40 @@ const ( ) func loadModel(cmd *cobra.Command, opts *runOptions) error { - client, err := api.ClientFromEnvironment() - if err != nil { - return err - } - p := progress.NewProgress(os.Stderr) defer p.StopAndClear() spinner := progress.NewSpinner("") p.Add("", spinner) - showReq := api.ShowRequest{Name: opts.Model} - showResp, err := client.Show(cmd.Context(), &showReq) + client, err := api.ClientFromEnvironment() if err != nil { return err } - opts.MultiModal = slices.Contains(showResp.Details.Families, "clip") - opts.ParentModel = showResp.Details.ParentModel - - if len(showResp.Messages) > 0 { - opts.Messages = append(opts.Messages, showResp.Messages...) 
- } chatReq := &api.ChatRequest{ - Model: opts.Model, - Messages: []api.Message{}, + Model: opts.Model, + KeepAlive: opts.KeepAlive, } - if opts.KeepAlive != nil { - chatReq.KeepAlive = opts.KeepAlive - } - - err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error { + return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error { p.StopAndClear() - if len(opts.Messages) > 0 { - for _, msg := range opts.Messages { - switch msg.Role { - case "user": - fmt.Printf(">>> %s\n", msg.Content) - case "assistant": - state := &displayResponseState{} - displayResponse(msg.Content, opts.WordWrap, state) - fmt.Println() - fmt.Println() - } + for _, msg := range opts.Messages { + switch msg.Role { + case "user": + fmt.Printf(">>> %s\n", msg.Content) + case "assistant": + state := &displayResponseState{} + displayResponse(msg.Content, opts.WordWrap, state) + fmt.Println() + fmt.Println() } } return nil }) - if err != nil { - return err - } - - return nil } func generateInteractive(cmd *cobra.Command, opts runOptions) error { - opts.Messages = make([]api.Message, 0) - err := loadModel(cmd, &opts) if err != nil { return err From cb42e607c5cf4d439ad4d5a93ed13c7d6a09fc34 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Mon, 24 Jun 2024 21:47:52 -0700 Subject: [PATCH 39/43] llm: speed up gguf decoding by a lot (#5246) Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow: * Too many allocations when decoding strings * Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O. The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro m3. This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON. Also, this fixes a broken test that was not encoding valid GGUF. 
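For illustration only (not part of this change), a rough sketch of how callers are expected to use the maxArraySize knob introduced below. The file path is a placeholder and the program is an assumed usage example, not code in this patch:

    package main

    import (
        "fmt"

        "github.com/ollama/ollama/llm"
    )

    func main() {
        // Fast path: passing 0 falls back to the default cap (1024), so very
        // large arrays such as tokenizer.ggml.tokens are sized but not
        // collected, keeping metadata-only loads cheap.
        ggml, err := llm.LoadModel("/path/to/model.gguf", 0)
        if err != nil {
            panic(err)
        }
        fmt.Println(ggml.KV().Architecture())

        // Verbose path, mirroring getKVData's verbose branch: -1 collects
        // every array value at the cost of extra memory and time.
        if _, err := llm.LoadModel("/path/to/model.gguf", -1); err != nil {
            panic(err)
        }
    }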
--- llm/ggla.go | 13 ++- llm/ggml.go | 25 ++++-- llm/ggml_test.go | 1 + llm/gguf.go | 130 +++++++++++++++++++-------- llm/memory_test.go | 19 ++-- llm/server.go | 11 ++- server/images.go | 2 +- server/model.go | 6 +- server/routes.go | 19 +++- server/sched.go | 2 +- server/sched_test.go | 6 +- util/bufioutil/buffer_seeker.go | 34 +++++++ util/bufioutil/buffer_seeker_test.go | 64 +++++++++++++ 13 files changed, 263 insertions(+), 69 deletions(-) create mode 100644 llm/ggml_test.go create mode 100644 util/bufioutil/buffer_seeker.go create mode 100644 util/bufioutil/buffer_seeker_test.go diff --git a/llm/ggla.go b/llm/ggla.go index a5d90b6cb..34c4f6ca3 100644 --- a/llm/ggla.go +++ b/llm/ggla.go @@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors { return llm.tensors } -func (llm *ggla) decode(rs io.ReadSeeker) error { +func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) { var r uint32 if err := binary.Read(rs, binary.LittleEndian, &r); err != nil { return err @@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error { for { var dims uint32 if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil { + if errors.Is(err, io.EOF) { + return nil + } return err } + defer func() { + if errors.Is(retErr, io.EOF) { + retErr = io.ErrUnexpectedEOF + } + }() + var namesize uint32 if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil { return err @@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error { return err } - if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil { + if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil { return err } diff --git a/llm/ggml.go b/llm/ggml.go index f02f0ff60..d0d0b6ddc 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -6,6 +6,8 @@ import ( "fmt" "io" "strings" + + "github.com/ollama/ollama/util/bufioutil" ) type GGML struct { @@ -278,7 +280,18 @@ func DetectGGMLType(b []byte) string { } } -func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { +// DecodeGGML decodes a GGML model from the given reader. +// +// It collects array values for arrays with a size less than or equal to +// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If +// the maxArraySize is negative, all arrays are collected. 
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { + if maxArraySize == 0 { + maxArraySize = 1024 + } + + rs = bufioutil.NewBufferedSeeker(rs, 32<<10) + var magic uint32 if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { return nil, 0, err @@ -291,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { case FILE_MAGIC_GGLA: c = &containerGGLA{} case FILE_MAGIC_GGUF_LE: - c = &containerGGUF{ByteOrder: binary.LittleEndian} + c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize} case FILE_MAGIC_GGUF_BE: - c = &containerGGUF{ByteOrder: binary.BigEndian} + c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize} default: return nil, 0, errors.New("invalid file magic") } model, err := c.Decode(rs) - if errors.Is(err, io.EOF) { - // noop - } else if err != nil { + if err != nil { return nil, 0, err } @@ -321,7 +332,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui embedding := llm.KV().EmbeddingLength() heads := llm.KV().HeadCount() headsKV := llm.KV().HeadCountKV() - vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) + vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size) embeddingHeads := llm.KV().EmbeddingHeadCount() embeddingHeadsK := llm.KV().EmbeddingHeadCountK() diff --git a/llm/ggml_test.go b/llm/ggml_test.go new file mode 100644 index 000000000..006c3ded8 --- /dev/null +++ b/llm/ggml_test.go @@ -0,0 +1 @@ +package llm diff --git a/llm/gguf.go b/llm/gguf.go index 234efe574..4d343a1bd 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -3,11 +3,10 @@ package llm import ( "bytes" "encoding/binary" + "encoding/json" "fmt" "io" "strings" - - "log/slog" ) type containerGGUF struct { @@ -29,6 +28,12 @@ type containerGGUF struct { NumTensor uint64 NumKV uint64 } + + maxArraySize int +} + +func (c *containerGGUF) canCollectArray(size int) bool { + return c.maxArraySize < 0 || size <= c.maxArraySize } func (c *containerGGUF) Name() string { @@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) { } model := newGGUF(c) - slog.Debug(fmt.Sprintf("model = %#v", model)) if err := model.Decode(rs); err != nil { return nil, err } @@ -85,6 +89,8 @@ type gguf struct { tensors []*Tensor parameters uint64 + + scratch [16 << 10]byte } func newGGUF(container *containerGGUF) *gguf { @@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { } // decode tensors - for i := 0; uint64(i) < llm.numTensor(); i++ { + for range llm.numTensor() { name, err := readGGUFString(llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor name: %w", err) } // dims is the number of dimensions in the tensor dims, err := readGGUF[uint32](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor dimensions: %w", err) } shape := [4]uint64{1, 1, 1, 1} for i := 0; uint32(i) < dims; i++ { shape[i], err = readGGUF[uint64](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor shape: %w", err) } } kind, err := readGGUF[uint32](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor kind: %w", err) } offset, err := readGGUF[uint64](llm, rs) if err != nil { - return err + return fmt.Errorf("failed to read tensor offset: %w", err) } tensor := Tensor{ @@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { alignment = 32 } - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - padding := llm.padding(offset, 
int64(alignment)) - if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { - return err - } - for _, tensor := range llm.tensors { - if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { - return err + offset, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return fmt.Errorf("failed to get current offset: %w", err) } - padding := llm.padding(int64(tensor.Size()), int64(alignment)) + padding := llm.padding(offset, int64(alignment)) if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { - return err + return fmt.Errorf("failed to seek to init padding: %w", err) + } + + if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { + return fmt.Errorf("failed to seek to tensor: %w", err) } } @@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) { return b.String(), nil } +func discardGGUFString(llm *gguf, r io.Reader) error { + buf := llm.scratch[:8] + _, err := io.ReadFull(r, buf) + if err != nil { + return err + } + + size := int(llm.ByteOrder.Uint64(buf)) + for size > 0 { + n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))]) + if err != nil { + return err + } + size -= n + } + return nil +} + func readGGUFString(llm *gguf, r io.Reader) (string, error) { if llm.Version == 1 { return readGGUFV1String(llm, r) } - var length uint64 - if err := binary.Read(r, llm.ByteOrder, &length); err != nil { + buf := llm.scratch[:8] + _, err := io.ReadFull(r, buf) + if err != nil { return "", err } - var b bytes.Buffer - if _, err := io.CopyN(&b, r, int64(length)); err != nil { + length := int(llm.ByteOrder.Uint64(buf)) + if length > len(llm.scratch) { + buf = make([]byte, length) + } else { + buf = llm.scratch[:length] + } + clear(buf) + + _, err = io.ReadFull(r, buf) + if err != nil { return "", err } - - return b.String(), nil + return string(buf), nil } func writeGGUFString(llm *gguf, w io.Writer, s string) error { @@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error { return err } -func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { +type array struct { + size int + values []any +} + +func (a *array) MarshalJSON() ([]byte, error) { + return json.Marshal(a.values) +} + +func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) { t, err := readGGUF[uint32](llm, r) if err != nil { return nil, err @@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - for i := 0; uint32(i) < n; i++ { + a := &array{size: int(n)} + if llm.canCollectArray(int(n)) { + a.values = make([]any, 0, int(n)) + } + + for i := range n { var e any switch t { case ggufTypeUint8: @@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - a = append(a, e) + if a.values != nil { + a.values[i] = e + } } - return + return a, nil } -func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { +func readGGUFArray(llm *gguf, r io.Reader) (*array, error) { if llm.Version == 1 { return readGGUFV1Array(llm, r) } @@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - for i := 0; uint64(i) < n; i++ { + a := &array{size: int(n)} + if llm.canCollectArray(int(n)) { + a.values = make([]any, int(n)) + } + + for i := range n { var e any switch t { case ggufTypeUint8: @@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { case ggufTypeBool: e, err = readGGUF[bool](llm, r) case ggufTypeString: - e, err = readGGUFString(llm, r) + if 
a.values != nil { + e, err = readGGUFString(llm, r) + } else { + err = discardGGUFString(llm, r) + } default: return nil, fmt.Errorf("invalid array type: %d", t) } @@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - a = append(a, e) + if a.values != nil { + a.values[i] = e + } } - return + return a, nil } func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error { diff --git a/llm/memory_test.go b/llm/memory_test.go index 8eaa07715..f972f9275 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() gguf := NewGGUFV3(binary.LittleEndian) inputLayerCount := 5 + tensors := []Tensor{ - {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, + {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, } assert.Len(t, tensors, inputLayerCount+1) err = gguf.Encode(f, KV{ @@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) { }, tensors) require.NoError(t, err) - ggml, err := LoadModel(f.Name()) - require.NoError(t, err) + ggml, err := LoadModel(f.Name(), 0) + if err != nil { + t.Fatal(err) + } // Simple CPU scenario gpus := []gpu.GpuInfo{ diff --git a/llm/server.go b/llm/server.go index da83416ee..ad67138b5 100644 --- a/llm/server.go +++ b/llm/server.go @@ -60,7 +60,12 @@ type llmServer struct { sem *semaphore.Weighted } -func LoadModel(model string) (*GGML, error) { +// LoadModel will load a model from disk. The model must be in the GGML format. +// +// It collects array values for arrays with a size less than or equal to +// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If +// the maxArraySize is negative, all arrays are collected. 
+func LoadModel(model string, maxArraySize int) (*GGML, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) { } defer f.Close() - ggml, _, err := DecodeGGML(f) + ggml, _, err := DecodeGGML(f, maxArraySize) return ggml, err } @@ -412,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 { } defer file.Close() - ggml, _, err := DecodeGGML(file) + ggml, _, err := DecodeGGML(file, 0) if err != nil { return 0 } diff --git a/server/images.go b/server/images.go index 98794149e..e949fb18a 100644 --- a/server/images.go +++ b/server/images.go @@ -423,7 +423,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - ggml, _, err := llm.DecodeGGML(temp) + ggml, _, err := llm.DecodeGGML(temp, 0) if err != nil { return err } diff --git a/server/model.go b/server/model.go index b262ea385..055ffd63a 100644 --- a/server/model.go +++ b/server/model.go @@ -63,7 +63,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - ggml, _, err := llm.DecodeGGML(blob) + ggml, _, err := llm.DecodeGGML(blob, 0) if err != nil { return nil, err } @@ -176,7 +176,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a } defer bin.Close() - ggml, _, err := llm.DecodeGGML(bin) + ggml, _, err := llm.DecodeGGML(bin, 0) if err != nil { return nil, err } @@ -210,7 +210,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap var offset int64 for offset < stat.Size() { - ggml, n, err := llm.DecodeGGML(file) + ggml, n, err := llm.DecodeGGML(file, 0) if errors.Is(err, io.EOF) { break } else if err != nil { diff --git a/server/routes.go b/server/routes.go index 3d112e9f1..ff66663c0 100644 --- a/server/routes.go +++ b/server/routes.go @@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { } func getKVData(digest string, verbose bool) (llm.KV, error) { - kvData, err := llm.LoadModel(digest) + maxArraySize := 0 + if verbose { + maxArraySize = -1 + } + kvData, err := llm.LoadModel(digest, maxArraySize) if err != nil { return nil, err } @@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error { schedCtx, schedDone := context.WithCancel(ctx) sched := InitScheduler(schedCtx) s := &Server{addr: ln.Addr(), sched: sched} - r := s.GenerateRoutes() + + http.Handle("/", s.GenerateRoutes()) slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version)) srvr := &http.Server{ - Handler: r, + // Use http.DefaultServeMux so we get net/http/pprof for + // free. + // + // TODO(bmizerany): Decide if we want to make this + // configurable so it is not exposed by default, or allow + // users to bind it to a different port. This was a quick + // and easy way to get pprof, but it may not be the best + // way. 
+ Handler: nil, } // listen for a ctrl+c and stop any loaded llm diff --git a/server/sched.go b/server/sched.go index 424395544..0084b533b 100644 --- a/server/sched.go +++ b/server/sched.go @@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Load model for fitting - ggml, err := llm.LoadModel(pending.model.ModelPath) + ggml, err := llm.LoadModel(pending.model.ModelPath, 0) if err != nil { pending.errCh <- err break diff --git a/server/sched_test.go b/server/sched_test.go index 953288347..4a1cf72a0 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, }, []llm.Tensor{ - {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, - {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, + {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, + {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, }) require.NoError(t, err) fname := f.Name() model := &Model{Name: modelName, ModelPath: fname} - scenario.ggml, err = llm.LoadModel(model.ModelPath) + scenario.ggml, err = llm.LoadModel(model.ModelPath, 0) require.NoError(t, err) scenario.req = &LlmRequest{ diff --git a/util/bufioutil/buffer_seeker.go b/util/bufioutil/buffer_seeker.go new file mode 100644 index 000000000..8775fdb83 --- /dev/null +++ b/util/bufioutil/buffer_seeker.go @@ -0,0 +1,34 @@ +package bufioutil + +import ( + "bufio" + "io" +) + +type BufferedSeeker struct { + rs io.ReadSeeker + br *bufio.Reader +} + +func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker { + return &BufferedSeeker{ + rs: rs, + br: bufio.NewReaderSize(rs, size), + } +} + +func (b *BufferedSeeker) Read(p []byte) (int, error) { + return b.br.Read(p) +} + +func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) { + if whence == io.SeekCurrent { + offset -= int64(b.br.Buffered()) + } + n, err := b.rs.Seek(offset, whence) + if err != nil { + return 0, err + } + b.br.Reset(b.rs) + return n, nil +} diff --git a/util/bufioutil/buffer_seeker_test.go b/util/bufioutil/buffer_seeker_test.go new file mode 100644 index 000000000..87145f6b6 --- /dev/null +++ b/util/bufioutil/buffer_seeker_test.go @@ -0,0 +1,64 @@ +package bufioutil + +import ( + "bytes" + "io" + "strings" + "testing" +) + +func TestBufferedSeeker(t *testing.T) { + const alphabet = "abcdefghijklmnopqrstuvwxyz" + + bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16 + + checkRead := func(buf []byte, expected string) { + t.Helper() + _, err := bs.Read(buf) + if err != nil { + t.Fatal(err) + } + if !bytes.Equal(buf, []byte(expected)) { + t.Fatalf("expected %s, got %s", expected, buf) + } + } + + // Read the first 5 bytes + buf := make([]byte, 5) + + checkRead(buf, "abcde") + + // Seek back to the beginning + _, err := bs.Seek(0, io.SeekStart) + if err != nil { + t.Fatal(err) + } + + // read 'a' + checkRead(buf[:1], "a") + + if bs.br.Buffered() == 0 { + t.Fatalf("totally unexpected sanity check failed") + } + + // Seek past 'b' + _, err = bs.Seek(1, io.SeekCurrent) + if err != nil { + t.Fatal(err) + } + checkRead(buf, "cdefg") + + // Seek back to the 
beginning + _, err = bs.Seek(0, io.SeekStart) + if err != nil { + t.Fatal(err) + } + checkRead(buf, "abcde") + + // Seek to the end + _, err = bs.Seek(-5, io.SeekEnd) + if err != nil { + t.Fatal(err) + } + checkRead(buf, "vwxyz") +} From 4d311eb731bb59512bcd17f1f33d60f3d9022837 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 26 Jun 2024 21:38:12 -0700 Subject: [PATCH 40/43] llm: architecture patch (#5316) --- llm/patches/07-gemma.diff | 305 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 llm/patches/07-gemma.diff diff --git a/llm/patches/07-gemma.diff b/llm/patches/07-gemma.diff new file mode 100644 index 000000000..86eac3d17 --- /dev/null +++ b/llm/patches/07-gemma.diff @@ -0,0 +1,305 @@ +From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001 +From: Ollama maintainers +Date: Wed, 26 Jun 2024 16:18:09 -0700 +Subject: [PATCH] Architecture support + +--- + llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 193 insertions(+), 1 deletion(-) + +diff --git a/llama.cpp b/llama.cpp +index 61948751..3b4196f5 100644 +--- a/llama.cpp ++++ b/llama.cpp +@@ -217,6 +217,7 @@ enum llm_arch { + LLM_ARCH_INTERNLM2, + LLM_ARCH_MINICPM, + LLM_ARCH_GEMMA, ++ LLM_ARCH_GEMMA2, + LLM_ARCH_STARCODER2, + LLM_ARCH_MAMBA, + LLM_ARCH_XVERSE, +@@ -255,6 +256,7 @@ static const std::map LLM_ARCH_NAMES = { + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, ++ { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, +@@ -464,10 +466,12 @@ enum llm_tensor { + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_OUT_NORM, ++ LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_NORM, ++ LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, +@@ -960,6 +964,24 @@ static const std::map> LLM_TENSOR_NA + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, ++ { ++ LLM_ARCH_GEMMA2, ++ { ++ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, ++ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, ++ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, ++ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, ++ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, ++ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, ++ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, ++ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, ++ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, ++ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, ++ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, ++ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, ++ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, ++ }, ++ }, + { + LLM_ARCH_STARCODER2, + { +@@ -1941,6 +1963,8 @@ enum e_model { + MODEL_8x22B, + MODEL_16x12B, + MODEL_10B_128x3_66B, ++ MODEL_9B, ++ MODEL_27B, + }; + + static const size_t kiB = 1024; +@@ -2114,6 +2138,7 @@ struct llama_layer { + struct ggml_tensor * attn_out_norm_b; + struct ggml_tensor * attn_q_a_norm; + struct ggml_tensor * attn_kv_a_norm; ++ struct ggml_tensor * attn_post_norm; + + // attention + struct ggml_tensor * wq; +@@ -2136,6 +2161,7 @@ struct llama_layer { + // normalization + struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; ++ struct ggml_tensor * ffn_post_norm; + struct ggml_tensor * layer_out_norm; + struct ggml_tensor * layer_out_norm_b; + struct ggml_tensor * ffn_norm_exps; +@@ -4529,6 +4555,16 @@ static void 
llm_load_hparams( + } + } break; + case LLM_ARCH_GEMMA: ++ { ++ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ++ ++ switch (hparams.n_layer) { ++ case 18: model.type = e_model::MODEL_9B; break; ++ case 28: model.type = e_model::MODEL_27B; break; ++ default: model.type = e_model::MODEL_UNKNOWN; ++ } ++ } break; ++ case LLM_ARCH_GEMMA2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + +@@ -6305,6 +6341,40 @@ static bool llm_load_tensors( + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + } + } break; ++ case LLM_ARCH_GEMMA2: ++ { ++ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); ++ ++ // output ++ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); ++ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading ++ ++ const int64_t n_ff = hparams.n_ff; ++ const int64_t n_embd_head_k = hparams.n_embd_head_k; ++ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); ++ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); ++ ++ for (uint32_t i = 0; i < n_layer; ++i) { ++ ggml_context * ctx_layer = ctx_for_layer(i); ++ ggml_context * ctx_split = ctx_for_layer_split(i); ++ ++ auto & layer = model.layers[i]; ++ ++ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); ++ ++ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); ++ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); ++ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); ++ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); ++ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}); ++ ++ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); ++ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); ++ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); ++ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); ++ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}); ++ } ++ } break; + case LLM_ARCH_STARCODER2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); +@@ -10614,6 +10684,123 @@ struct llm_build_context { + return gf; + } + ++ struct ggml_cgraph * build_gemma2() { ++ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); ++ ++ const int64_t n_embd_head_k = hparams.n_embd_head_k; ++ ++ struct ggml_tensor * cur; ++ struct ggml_tensor * inpL; ++ ++ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); ++ ++ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); ++ cb(inpL, "inp_scaled", -1); ++ ++ // inp_pos - contains the positions ++ struct ggml_tensor * inp_pos = build_inp_pos(); ++ ++ // KQ_mask (mask for 1 head, it will be broadcasted to all heads) ++ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); ++ ++ for (int il = 0; il < n_layer; ++il) { ++ // norm ++ cur = 
llm_build_norm(ctx0, inpL, hparams, ++ model.layers[il].attn_norm, NULL, ++ LLM_NORM_RMS, cb, il); ++ cb(cur, "attn_norm", il); ++ ++ // self-attention ++ { ++ // compute Q and K and RoPE them ++ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); ++ cb(Qcur, "Qcur", il); ++ ++ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); ++ cb(Kcur, "Kcur", il); ++ ++ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); ++ cb(Vcur, "Vcur", il); ++ ++ Qcur = ggml_rope_ext( ++ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, ++ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, ++ ext_factor, attn_factor, beta_fast, beta_slow); ++ cb(Qcur, "Qcur", il); ++ ++ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); ++ cb(Qcur, "Qcur_scaled", il); ++ ++ Kcur = ggml_rope_ext( ++ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, ++ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, ++ ext_factor, attn_factor, beta_fast, beta_slow); ++ cb(Kcur, "Kcur", il); ++ ++ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, ++ model.layers[il].wo, NULL, ++ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); ++ } ++ ++ if (il == n_layer - 1) { ++ // skip computing output for unused tokens ++ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); ++ cur = ggml_get_rows(ctx0, cur, inp_out_ids); ++ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); ++ } ++ ++ cur = llm_build_norm(ctx0, cur, hparams, ++ model.layers[il].attn_post_norm, NULL, ++ LLM_NORM_RMS, cb, il); ++ cb(cur, "attn_post_norm", il); ++ ++ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); ++ cb(sa_out, "sa_out", il); ++ ++ cur = llm_build_norm(ctx0, sa_out, hparams, ++ model.layers[il].ffn_norm, NULL, ++ LLM_NORM_RMS, cb, il); ++ cb(cur, "ffn_norm", il); ++ ++ // feed-forward network ++ { ++ cur = llm_build_ffn(ctx0, cur, ++ model.layers[il].ffn_up, NULL, ++ model.layers[il].ffn_gate, NULL, ++ model.layers[il].ffn_down, NULL, ++ NULL, ++ LLM_FFN_GELU, LLM_FFN_PAR, cb, il); ++ cb(cur, "ffn_out", il); ++ } ++ ++ cur = llm_build_norm(ctx0, cur, hparams, ++ model.layers[il].ffn_post_norm, NULL, ++ LLM_NORM_RMS, cb, -1); ++ cb(cur, "ffn_post_norm", -1); ++ ++ cur = ggml_add(ctx0, cur, sa_out); ++ cb(cur, "l_out", il); ++ ++ // input for next layer ++ inpL = cur; ++ } ++ ++ cur = inpL; ++ ++ cur = llm_build_norm(ctx0, cur, hparams, ++ model.output_norm, NULL, ++ LLM_NORM_RMS, cb, -1); ++ cb(cur, "result_norm", -1); ++ ++ // lm_head ++ cur = ggml_mul_mat(ctx0, model.output, cur); ++ cb(cur, "result_output", -1); ++ ++ ggml_build_forward_expand(gf, cur); ++ ++ return gf; ++ } ++ + struct ggml_cgraph * build_starcoder2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + +@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph( + { + result = llm.build_gemma(); + } break; ++ case LLM_ARCH_GEMMA2: ++ { ++ result = llm.build_gemma2(); ++ } break; + case LLM_ARCH_STARCODER2: + { + result = llm.build_starcoder2(); +@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { + case LLM_ARCH_PHI2: + case LLM_ARCH_PHI3: + case LLM_ARCH_GEMMA: ++ case LLM_ARCH_GEMMA2: + case LLM_ARCH_STARCODER2: + case LLM_ARCH_GPTNEOX: + return LLAMA_ROPE_TYPE_NEOX; +@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal( + if (add_ass) { + ss << "assistant\n"; + } +- } else if (tmpl 
== "gemma" || tmpl.find("") != std::string::npos) { ++ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) { + // google/gemma-7b-it + std::string system_prompt = ""; + for (auto message : chat) { +-- +2.45.2 + From 123a722a6f541e300bc8e34297ac378ebe23f527 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 26 Jun 2024 21:38:21 -0700 Subject: [PATCH 41/43] zip: prevent extracting files into parent dirs (#5314) --- cmd/cmd.go | 6 +-- server/model.go | 57 ++++++++++++++++++--------- server/model_test.go | 92 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 22 deletions(-) create mode 100644 server/model_test.go diff --git a/cmd/cmd.go b/cmd/cmd.go index 89b551f40..909e8e4b2 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) { } defer tempfile.Close() - zipfile := zip.NewWriter(tempfile) - defer zipfile.Close() - detectContentType := func(path string) (string, error) { f, err := os.Open(path) if err != nil { @@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) { files = append(files, tks...) } + zipfile := zip.NewWriter(tempfile) + defer zipfile.Close() + for _, file := range files { f, err := os.Open(file) if err != nil { diff --git a/server/model.go b/server/model.go index 055ffd63a..d56e641ba 100644 --- a/server/model.go +++ b/server/model.go @@ -11,6 +11,7 @@ import ( "net/http" "os" "path/filepath" + "strings" "github.com/ollama/ollama/api" "github.com/ollama/ollama/convert" @@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe return layers, nil } -func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error { stat, err := file.Stat() if err != nil { - return nil, err + return err } r, err := zip.NewReader(file, stat.Size()) if err != nil { - return nil, err + return err } - tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "") - if err != nil { - return nil, err - } - defer os.RemoveAll(tempdir) - fn(api.ProgressResponse{Status: "unpacking model metadata"}) for _, f := range r.File { + n := filepath.Join(p, f.Name) + if !strings.HasPrefix(n, p) { + slog.Warn("skipped extracting file outside of context", "name", f.Name) + continue + } + + if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil { + return err + } + // TODO(mxyng): this should not write out all files to disk - outfile, err := os.Create(filepath.Join(tempdir, f.Name)) + outfile, err := os.Create(n) if err != nil { - return nil, err + return err } defer outfile.Close() infile, err := f.Open() if err != nil { - return nil, err + return err } defer infile.Close() if _, err = io.Copy(outfile, infile); err != nil { - return nil, err + return err } if err := outfile.Close(); err != nil { - return nil, err + return err } if err := infile.Close(); err != nil { - return nil, err + return err } } - mf, err := convert.GetModelFormat(tempdir) + return nil +} + +func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { + tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "") + if err != nil { + return nil, err + } + defer os.RemoveAll(tempDir) + + if err := extractFromZipFile(tempDir, file, fn); err != nil { + return nil, err + } + + mf, err := convert.GetModelFormat(tempDir) if err != nil { return 
nil, err } - params, err := mf.GetParams(tempdir) + params, err := mf.GetParams(tempDir) if err != nil { return nil, err } - mArch, err := mf.GetModelArch("", tempdir, params) + mArch, err := mf.GetModelArch("", tempDir, params) if err != nil { return nil, err } @@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a // TODO(mxyng): this should write directly into a layer // e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model") - temp, err := os.CreateTemp(tempdir, "fp16") + temp, err := os.CreateTemp(tempDir, "fp16") if err != nil { return nil, err } diff --git a/server/model_test.go b/server/model_test.go new file mode 100644 index 000000000..c3023eb2b --- /dev/null +++ b/server/model_test.go @@ -0,0 +1,92 @@ +package server + +import ( + "archive/zip" + "bytes" + "io" + "os" + "path/filepath" + "slices" + "testing" + + "github.com/ollama/ollama/api" +) + +func createZipFile(t *testing.T, name string) *os.File { + t.Helper() + + f, err := os.CreateTemp(t.TempDir(), "") + if err != nil { + t.Fatal(err) + } + + zf := zip.NewWriter(f) + defer zf.Close() + + zh, err := zf.CreateHeader(&zip.FileHeader{Name: name}) + if err != nil { + t.Fatal(err) + } + + if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil { + t.Fatal(err) + } + + return f +} + +func TestExtractFromZipFile(t *testing.T) { + cases := []struct { + name string + expect []string + }{ + { + name: "good", + expect: []string{"good"}, + }, + { + name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"), + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + f := createZipFile(t, tt.name) + defer f.Close() + + tempDir := t.TempDir() + if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil { + t.Fatal(err) + } + + var matches []string + if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error { + if err != nil { + return err + } + + if !fi.IsDir() { + matches = append(matches, p) + } + + return nil + }); err != nil { + t.Fatal(err) + } + + var actual []string + for _, match := range matches { + rel, err := filepath.Rel(tempDir, match) + if err != nil { + t.Error(err) + } + + actual = append(actual, rel) + } + + if !slices.Equal(actual, tt.expect) { + t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches)) + } + }) + } +} From 2cc7d050124929ae4745633fddf053585a22f0a2 Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 27 Jun 2024 12:45:16 -0400 Subject: [PATCH 42/43] update readme for gemma 2 (#5333) * update readme for gemma 2 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 978625731..72ed8fa5e 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,8 @@ Here are some example models that can be downloaded: | Llama 3 | 70B | 40GB | `ollama run llama3:70b` | | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | -| Gemma | 2B | 1.4GB | `ollama run gemma:2b` | -| Gemma | 7B | 4.8GB | `ollama run gemma:7b` | +| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | +| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` | | Mistral | 7B | 4.1GB | `ollama run mistral` | | Moondream 2 | 1.4B | 829MB | `ollama run moondream` | | Neural Chat | 7B | 4.1GB | `ollama run neural-chat` | From de2163dafd19b5ba2bed3d459354179662cc524d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 27 Jun 2024 10:52:25 -0700 Subject: [PATCH 
43/43] gemma2 graph --- llm/ggml.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/llm/ggml.go b/llm/ggml.go index d0d0b6ddc..cfead450d 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -366,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), ) } - case "gemma": - fullOffload = 4 * batch * (embedding + vocab) - partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128 + case "gemma", "gemma2": + fullOffload = max( + 4*batch*(embedding+vocab), + 4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads), + ) + + partialOffload = max( + 4*embedding*batch+embedding*vocab*105/128+4*vocab*batch, + 4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+ + 4*embeddingHeadsK*context*8+ + embedding*embeddingHeadsK*heads*9/16, + ) case "command-r": fullOffload = max( 4*batch*(embedding+vocab),