Merge remote-tracking branch 'upstream/main' into vulkanV3

2025-08-14 22:11:08 +02:00 · 2025-08-14 22:11:08 +02:00 · d71c83f2ba
parent 6543213e6f 7ccfd97a93
commit d71c83f2ba
8 changed files with 52 additions and 32 deletions
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -22,7 +22,7 @@
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
+        "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual",
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
@ -30,14 +30,14 @@
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "72;87"
+        "CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual"
      }
    },
    {
      "name": "JetPack 6",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "87"
+        "CMAKE_CUDA_ARCHITECTURES": "87-virtual"
      }
    },
    {
--- a/discover/gpu.go
+++ b/discover/gpu.go
@ -300,6 +300,8 @@ func GetGPUInfo() GpuInfoList {
 				var driverMinor int
 				if cHandles.cudart != nil {
 					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
+					driverMajor = int(cHandles.cudart.driver_major)
+					driverMinor = int(cHandles.cudart.driver_minor)
 				} else {
 					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
 					driverMajor = int(cHandles.nvcuda.driver_major)
--- a/discover/gpu_info_cudart.c
+++ b/discover/gpu_info_cudart.c
@ -69,18 +69,15 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  }

  int version = 0;
-  cudartDriverVersion_t driverVersion;
-  driverVersion.major = 0;
-  driverVersion.minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cudaDriverGetVersion)(&version);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  } else {
-    driverVersion.major = version / 1000;
-    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
-    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+    resp->ch.driver_major = version / 1000;
+    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

  ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
--- a/discover/gpu_info_cudart.h
+++ b/discover/gpu_info_cudart.h
@ -29,11 +29,6 @@ typedef struct cudartMemory_st {
  size_t used;
 } cudartMemory_t;

-typedef struct cudartDriverVersion {
-  int major;
-  int minor;
-} cudartDriverVersion_t;
-
 typedef struct cudaUUID {
    unsigned char bytes[16];
 } cudaUUID_t;
@ -123,6 +118,8 @@ typedef struct cudaDeviceProp {
 typedef struct cudart_handle {
  void *handle;
  uint16_t verbose;
+  int driver_major;
+  int driver_minor;
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
--- a/docs/linux.md
+++ b/docs/linux.md
@ -34,7 +34,11 @@ ollama -v

 ### AMD GPU install

-If you have an AMD GPU, also download and extract the additional ROCm package:
+If you have an AMD GPU, **also** download and extract the additional ROCm package:
+
+> [!IMPORTANT]
+> The ROCm tgz contains only the AMD-dependent libraries.  You must extract **both** `ollama-linux-amd64.tgz` and `ollama-linux-amd64-rocm.tgz` into the same location.
+

 ```shell
 curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
--- a/docs/windows.md
+++ b/docs/windows.md
@ -68,9 +68,9 @@ If you'd like to install or integrate Ollama as a service, a standalone
 `ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
 and GPU library dependencies for Nvidia.  If you have an AMD GPU, also download
 and extract the additional ROCm package `ollama-windows-amd64-rocm.zip` into the
-same directory.  This allows for embedding Ollama in existing applications, or
-running it as a system service via `ollama serve` with tools such as
-[NSSM](https://nssm.cc/). 
+same directory.  Both zip files are necessary for a complete AMD installation.
+This allows for embedding Ollama in existing applications, or running it as a
+system service via `ollama serve` with tools such as [NSSM](https://nssm.cc/). 

 > [!NOTE]  
 > If you are upgrading from a prior version, you should remove the old directories first.
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@ -4,7 +4,9 @@ package integration

 import (
 	"context"
+	"fmt"
 	"log/slog"
+	"math"
 	"os"
 	"strconv"
 	"sync"
@ -21,7 +23,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 	var (
 		req = [2]api.GenerateRequest{
 			{
-				Model:     "llama3.2:1b",
+				Model:     smol,
 				Prompt:    "why is the ocean blue?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
@ -30,7 +32,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "tinydolphin",
+				Model:     "qwen3:0.6b",
 				Prompt:    "what is the origin of the us thanksgiving holiday?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
@ -132,16 +134,16 @@ func TestMultiModelStress(t *testing.T) {
 			size: 2876 * format.MebiByte,
 		},
 		{
-			name: "phi",
-			size: 2616 * format.MebiByte,
+			name: "qwen3:0.6b",
+			size: 1600 * format.MebiByte,
 		},
 		{
 			name: "gemma:2b",
 			size: 2364 * format.MebiByte,
 		},
 		{
-			name: "stable-code:3b",
-			size: 2608 * format.MebiByte,
+			name: "deepseek-r1:1.5b",
+			size: 2048 * format.MebiByte,
 		},
 		{
 			name: "starcoder2:3b",
@ -149,17 +151,21 @@ func TestMultiModelStress(t *testing.T) {
 		},
 	}
 	mediumModels := []model{
+		{
+			name: "qwen3:8b",
+			size: 6600 * format.MebiByte,
+		},
 		{
 			name: "llama2",
 			size: 5118 * format.MebiByte,
 		},
 		{
-			name: "mistral",
-			size: 4620 * format.MebiByte,
+			name: "deepseek-r1:7b",
+			size: 5600 * format.MebiByte,
 		},
 		{
-			name: "orca-mini:7b",
-			size: 5118 * format.MebiByte,
+			name: "mistral",
+			size: 4620 * format.MebiByte,
 		},
 		{
 			name: "dolphin-mistral",
@ -254,7 +260,7 @@ func TestMultiModelStress(t *testing.T) {
 	}
 	go func() {
 		for {
-			time.Sleep(2 * time.Second)
+			time.Sleep(10 * time.Second)
 			select {
 			case <-ctx.Done():
 				return
@ -265,7 +271,21 @@ func TestMultiModelStress(t *testing.T) {
 					continue
 				}
 				for _, m := range models.Models {
-					slog.Info("loaded model snapshot", "model", m)
+					var procStr string
+					switch {
+					case m.SizeVRAM == 0:
+						procStr = "100% CPU"
+					case m.SizeVRAM == m.Size:
+						procStr = "100% GPU"
+					case m.SizeVRAM > m.Size || m.Size == 0:
+						procStr = "Unknown"
+					default:
+						sizeCPU := m.Size - m.SizeVRAM
+						cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+						procStr = fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent))
+					}
+
+					slog.Info("loaded model snapshot", "model", m.Name, "CPU/GPU", procStr, "expires", format.HumanTime(m.ExpiresAt, "Never"))
 				}
 			}
 		}
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@ -574,8 +574,8 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 			},
 		},
 		[][]string{
-			{"sunlight"},
-			{"soil", "organic", "earth", "black", "tan"},
+			{"sunlight", "scattering", "interact"},
+			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"},
 			{"england", "english", "massachusetts", "pilgrims", "british"},
 			{"fourth", "july", "declaration", "independence"},
 			{"nitrogen", "oxygen", "carbon", "dioxide"},