From dc5a645434f0ea6364c426c6ba112da1afa40cb2 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 13 Aug 2025 15:42:16 -0700
Subject: [PATCH 1/5] cuda: leverage JIT for smaller footprint (#11635)

Prior to this change our official binaries contained both JIT PTX code and
the cubin binary code for our chosen compute capabilities. This change
switches to compiling only the PTX code, relying on JIT at runtime to
generate the cubin specific to the user's GPU. The cubins are cached on the
user's system, so users should only see a small lag on the very first model
load for a given Ollama release.

This also adds the first generation of Blackwell GPUs so they aren't
reliant on the Hopper PTX.

This change reduces ggml-cuda.dll from 1.2G to 460M.
---
 CMakePresets.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index ab2cfe9d6..95ec0d799 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -22,7 +22,7 @@
       "name": "CUDA 12",
       "inherits": [ "CUDA" ],
       "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
+        "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual",
         "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
       }
     },
@@ -30,14 +30,14 @@
       "name": "JetPack 5",
       "inherits": [ "CUDA" ],
       "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "72;87"
+        "CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual"
       }
     },
     {
       "name": "JetPack 6",
       "inherits": [ "CUDA" ],
       "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "87"
+        "CMAKE_CUDA_ARCHITECTURES": "87-virtual"
       }
     },
     {

From a24f90604f883df01500b62992fb80b242022510 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 13 Aug 2025 15:42:36 -0700
Subject: [PATCH 2/5] int: adjust a few models for integration tests (#11872)

---
 integration/concurrency_test.go | 44 ++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index dbf1e6fa3..bb0348ebc 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -4,7 +4,9 @@ package integration

 import (
 	"context"
+	"fmt"
 	"log/slog"
+	"math"
 	"os"
 	"strconv"
 	"sync"
@@ -21,7 +23,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 	var (
 		req = [2]api.GenerateRequest{
 			{
-				Model:     "llama3.2:1b",
+				Model:     smol,
 				Prompt:    "why is the ocean blue?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "tinydolphin",
+				Model:     "qwen3:0.6b",
 				Prompt:    "what is the origin of the us thanksgiving holiday?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
@@ -132,16 +134,16 @@
 			size: 2876 * format.MebiByte,
 		},
 		{
-			name: "phi",
-			size: 2616 * format.MebiByte,
+			name: "qwen3:0.6b",
+			size: 1600 * format.MebiByte,
 		},
 		{
 			name: "gemma:2b",
 			size: 2364 * format.MebiByte,
 		},
 		{
-			name: "stable-code:3b",
-			size: 2608 * format.MebiByte,
+			name: "deepseek-r1:1.5b",
+			size: 2048 * format.MebiByte,
 		},
 		{
 			name: "starcoder2:3b",
 			size:
 		},
 	}
 	mediumModels := []model{
+		{
+			name: "qwen3:8b",
+			size: 6600 * format.MebiByte,
+		},
 		{
 			name: "llama2",
 			size: 5118 * format.MebiByte,
 		},
 		{
-			name: "mistral",
-			size: 4620 * format.MebiByte,
+			name: "deepseek-r1:7b",
+			size: 5600 * format.MebiByte,
 		},
 		{
-			name: "orca-mini:7b",
-			size: 5118 * format.MebiByte,
+			name: "mistral",
+			size: 4620 * format.MebiByte,
 		},
 		{
 			name: "dolphin-mistral",
@@ -254,7 +260,7 @@
 		}
 		go func() {
 			for {
-				time.Sleep(2 * time.Second)
+				time.Sleep(10 * time.Second)
 				select {
 				case <-ctx.Done():
 					return
@@ -265,7 +271,21 @@
 					continue
 				}
 				for _, m := range models.Models {
-					slog.Info("loaded model snapshot", "model", m)
+					var procStr string
+					switch {
+					case m.SizeVRAM == 0:
+						procStr = "100% CPU"
+					case m.SizeVRAM == m.Size:
+						procStr = "100% GPU"
+					case m.SizeVRAM > m.Size || m.Size == 0:
+						procStr = "Unknown"
+					default:
+						sizeCPU := m.Size - m.SizeVRAM
+						cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+						procStr = fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent))
+					}
+
+					slog.Info("loaded model snapshot", "model", m.Name, "CPU/GPU", procStr, "expires", format.HumanTime(m.ExpiresAt, "Never"))
 				}
 			}
 		}
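A note on the snapshot logging added in patch 2: the CPU/GPU split is derived purely from the two sizes reported for each loaded model — the total size and the portion resident in VRAM. The sketch below isolates that calculation so it can be sanity-checked on its own; it is a minimal restatement of the switch in the diff, with `procSplit` as a hypothetical standalone helper rather than anything in the test file.

```go
package main

import (
	"fmt"
	"math"
)

// procSplit mirrors the switch added in patch 2: given a model's total
// size and the bytes resident in VRAM, describe how the load is split.
// A zero total size, or VRAM exceeding the total, can't be classified.
func procSplit(size, sizeVRAM int64) string {
	switch {
	case sizeVRAM == 0:
		return "100% CPU"
	case sizeVRAM == size:
		return "100% GPU"
	case sizeVRAM > size || size == 0:
		return "Unknown"
	default:
		cpuPercent := math.Round(float64(size-sizeVRAM) / float64(size) * 100)
		return fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent))
	}
}

func main() {
	fmt.Println(procSplit(4620, 0))    // 100% CPU
	fmt.Println(procSplit(4620, 4620)) // 100% GPU
	fmt.Println(procSplit(4620, 3465)) // 25%/75%, i.e. CPU%/GPU%
}
```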
name: "orca-mini:7b", - size: 5118 * format.MebiByte, + name: "mistral", + size: 4620 * format.MebiByte, }, { name: "dolphin-mistral", @@ -254,7 +260,7 @@ func TestMultiModelStress(t *testing.T) { } go func() { for { - time.Sleep(2 * time.Second) + time.Sleep(10 * time.Second) select { case <-ctx.Done(): return @@ -265,7 +271,21 @@ func TestMultiModelStress(t *testing.T) { continue } for _, m := range models.Models { - slog.Info("loaded model snapshot", "model", m) + var procStr string + switch { + case m.SizeVRAM == 0: + procStr = "100% CPU" + case m.SizeVRAM == m.Size: + procStr = "100% GPU" + case m.SizeVRAM > m.Size || m.Size == 0: + procStr = "Unknown" + default: + sizeCPU := m.Size - m.SizeVRAM + cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100) + procStr = fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent)) + } + + slog.Info("loaded model snapshot", "model", m.Name, "CPU/GPU", procStr, "expires", format.HumanTime(m.ExpiresAt, "Never")) } } } From 837379a94c03e505bbad965a31eb1aa7976edb3c Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 13 Aug 2025 15:43:33 -0700 Subject: [PATCH 3/5] discovery: fix cudart driver version (#11614) We prefer the nvcuda library, which reports driver versions. When we dropped cuda v11, we added a safety check for too-old drivers. What we missed was the cudart fallback discovery logic didn't have driver version wired up. This fixes cudart discovery to expose the driver version as well so we no longer reject all GPUs if nvcuda didn't work. --- discover/gpu.go | 2 ++ discover/gpu_info_cudart.c | 9 +++------ discover/gpu_info_cudart.h | 7 ++----- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/discover/gpu.go b/discover/gpu.go index 15bad4466..f6e3c9cb1 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -263,6 +263,8 @@ func GetGPUInfo() GpuInfoList { var driverMinor int if cHandles.cudart != nil { C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo) + driverMajor = int(cHandles.cudart.driver_major) + driverMinor = int(cHandles.cudart.driver_minor) } else { C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo) driverMajor = int(cHandles.nvcuda.driver_major) diff --git a/discover/gpu_info_cudart.c b/discover/gpu_info_cudart.c index bc5115bfd..76c17b9d8 100644 --- a/discover/gpu_info_cudart.c +++ b/discover/gpu_info_cudart.c @@ -69,18 +69,15 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { } int version = 0; - cudartDriverVersion_t driverVersion; - driverVersion.major = 0; - driverVersion.minor = 0; // Report driver version if we're in verbose mode, ignore errors ret = (*resp->ch.cudaDriverGetVersion)(&version); if (ret != CUDART_SUCCESS) { LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret); } else { - driverVersion.major = version / 1000; - driverVersion.minor = (version - (driverVersion.major * 1000)) / 10; - LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor); + resp->ch.driver_major = version / 1000; + resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; + LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor); } ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices); diff --git a/discover/gpu_info_cudart.h b/discover/gpu_info_cudart.h index ff0c0af19..893f3f7bd 100644 --- a/discover/gpu_info_cudart.h +++ b/discover/gpu_info_cudart.h @@ -29,11 +29,6 @@ typedef struct cudartMemory_st { size_t used; } cudartMemory_t; -typedef struct cudartDriverVersion { - 
From c385ca86727ce2787d286a733cb35dd96c53bd16 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 14 Aug 2025 11:07:13 -0700
Subject: [PATCH 4/5] test: add valid responses (#11902)

Some of the new models need a few more valid responses to pass.
---
 integration/utils_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integration/utils_test.go b/integration/utils_test.go
index 727825a41..6375b1f97 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -574,8 +574,8 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		},
 	},
 	[][]string{
-		{"sunlight"},
-		{"soil", "organic", "earth", "black", "tan"},
+		{"sunlight", "scattering", "interact"},
+		{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"},
 		{"england", "english", "massachusetts", "pilgrims", "british"},
 		{"fourth", "july", "declaration", "independence"},
 		{"nitrogen", "oxygen", "carbon", "dioxide"},

From 7ccfd97a9381588562a4764e41bde27ae5197ace Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 14 Aug 2025 12:54:55 -0700
Subject: [PATCH 5/5] doc: clarify both rocm and main bundle necessary (#11900)

Some users expect the rocm bundles to be self-sufficient, but they are
designed to be additive.
---
 docs/linux.md   | 6 +++++-
 docs/windows.md | 6 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/linux.md b/docs/linux.md
index 0c19ef0b4..9a156d1dc 100644
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -34,7 +34,11 @@ ollama -v

 ### AMD GPU install

-If you have an AMD GPU, also download and extract the additional ROCm package:
+If you have an AMD GPU, **also** download and extract the additional ROCm package:
+
+> [!IMPORTANT]
+> The ROCm tgz contains only AMD-dependent libraries. You must extract **both** `ollama-linux-amd64.tgz` and `ollama-linux-amd64-rocm.tgz` into the same location.
+

 ```shell
 curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
diff --git a/docs/windows.md b/docs/windows.md
index 2e495e49d..eb067ed04 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -68,9 +68,9 @@ If you'd like to install or integrate Ollama as a service, a standalone
 `ollama-windows-amd64.zip` zip file is available containing only the Ollama
 CLI and GPU library dependencies for Nvidia. If you have an AMD GPU, also
 download and extract the additional ROCm package `ollama-windows-amd64-rocm.zip`
-into the same directory. This allows for embedding Ollama in existing
-applications, or running it as a system service via `ollama serve` with
-tools such as [NSSM](https://nssm.cc/).
+into the same directory. Both zip files are necessary for a complete AMD installation.
+This allows for embedding Ollama in existing applications, or running it as a
+system service via `ollama serve` with tools such as [NSSM](https://nssm.cc/).

 > [!NOTE]
 > If you are upgrading from a prior version, you should remove the old directories first.
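A closing note on patch 4: the integration harness accepts a generation if the response mentions any one of the keywords listed for that prompt, which is why broadening the lists lets more models pass. A rough sketch of such a check, assuming a case-insensitive substring match — the helper below is a hypothetical stand-in, not the harness's actual validation code:

```go
package main

import (
	"fmt"
	"strings"
)

// anyKeyword reports whether the response contains at least one of the
// accepted keywords, ignoring case. Hypothetical stand-in for the
// validation step in integration/utils_test.go.
func anyKeyword(response string, keywords []string) bool {
	lower := strings.ToLower(response)
	for _, kw := range keywords {
		if strings.Contains(lower, kw) {
			return true
		}
	}
	return false
}

func main() {
	valid := []string{"sunlight", "scattering", "interact"}
	fmt.Println(anyKeyword("The sky looks blue because of Rayleigh scattering.", valid)) // true
}
```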