diff --git a/CMakePresets.json b/CMakePresets.json index 82da950bc..0c9b67225 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -22,7 +22,7 @@ "name": "CUDA 12", "inherits": [ "CUDA" ], "cacheVariables": { - "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120", + "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual", "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, @@ -30,14 +30,14 @@ "name": "JetPack 5", "inherits": [ "CUDA" ], "cacheVariables": { - "CMAKE_CUDA_ARCHITECTURES": "72;87" + "CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual" } }, { "name": "JetPack 6", "inherits": [ "CUDA" ], "cacheVariables": { - "CMAKE_CUDA_ARCHITECTURES": "87" + "CMAKE_CUDA_ARCHITECTURES": "87-virtual" } }, { diff --git a/discover/gpu.go b/discover/gpu.go index 123177d3a..0ca0dde8d 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -300,6 +300,8 @@ func GetGPUInfo() GpuInfoList { var driverMinor int if cHandles.cudart != nil { C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo) + driverMajor = int(cHandles.cudart.driver_major) + driverMinor = int(cHandles.cudart.driver_minor) } else { C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo) driverMajor = int(cHandles.nvcuda.driver_major) diff --git a/discover/gpu_info_cudart.c b/discover/gpu_info_cudart.c index bc5115bfd..76c17b9d8 100644 --- a/discover/gpu_info_cudart.c +++ b/discover/gpu_info_cudart.c @@ -69,18 +69,15 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { } int version = 0; - cudartDriverVersion_t driverVersion; - driverVersion.major = 0; - driverVersion.minor = 0; // Report driver version if we're in verbose mode, ignore errors ret = (*resp->ch.cudaDriverGetVersion)(&version); if (ret != CUDART_SUCCESS) { LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret); } else { - driverVersion.major = version / 1000; - driverVersion.minor = (version - (driverVersion.major * 1000)) / 10; - LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor); + resp->ch.driver_major = version / 1000; + resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; + LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor); } ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices); diff --git a/discover/gpu_info_cudart.h b/discover/gpu_info_cudart.h index ff0c0af19..893f3f7bd 100644 --- a/discover/gpu_info_cudart.h +++ b/discover/gpu_info_cudart.h @@ -29,11 +29,6 @@ typedef struct cudartMemory_st { size_t used; } cudartMemory_t; -typedef struct cudartDriverVersion { - int major; - int minor; -} cudartDriverVersion_t; - typedef struct cudaUUID { unsigned char bytes[16]; } cudaUUID_t; @@ -123,6 +118,8 @@ typedef struct cudaDeviceProp { typedef struct cudart_handle { void *handle; uint16_t verbose; + int driver_major; + int driver_minor; cudartReturn_t (*cudaSetDevice)(int device); cudartReturn_t (*cudaDeviceSynchronize)(void); cudartReturn_t (*cudaDeviceReset)(void); diff --git a/docs/linux.md b/docs/linux.md index 0c19ef0b4..9a156d1dc 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -34,7 +34,11 @@ ollama -v ### AMD GPU install -If you have an AMD GPU, also download and extract the additional ROCm package: +If you have an AMD GPU, **also** download and extract the additional ROCm package: + +> [!IMPORTANT] +> The ROCm tgz contains only AMD dependent libraries. You must extract **both** `ollama-linux-amd64.tgz` and `ollama-linux-amd64-rocm.tgz` into the same location. + ```shell curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz diff --git a/docs/windows.md b/docs/windows.md index 2e495e49d..eb067ed04 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -68,9 +68,9 @@ If you'd like to install or integrate Ollama as a service, a standalone `ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI and GPU library dependencies for Nvidia. If you have an AMD GPU, also download and extract the additional ROCm package `ollama-windows-amd64-rocm.zip` into the -same directory. This allows for embedding Ollama in existing applications, or -running it as a system service via `ollama serve` with tools such as -[NSSM](https://nssm.cc/). +same directory. Both zip files are necessary for a complete AMD installation. +This allows for embedding Ollama in existing applications, or running it as a +system service via `ollama serve` with tools such as [NSSM](https://nssm.cc/). > [!NOTE] > If you are upgrading from a prior version, you should remove the old directories first. diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index dbf1e6fa3..bb0348ebc 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -4,7 +4,9 @@ package integration import ( "context" + "fmt" "log/slog" + "math" "os" "strconv" "sync" @@ -21,7 +23,7 @@ func TestMultiModelConcurrency(t *testing.T) { var ( req = [2]api.GenerateRequest{ { - Model: "llama3.2:1b", + Model: smol, Prompt: "why is the ocean blue?", Stream: &stream, KeepAlive: &api.Duration{Duration: 10 * time.Second}, @@ -30,7 +32,7 @@ func TestMultiModelConcurrency(t *testing.T) { "temperature": 0.0, }, }, { - Model: "tinydolphin", + Model: "qwen3:0.6b", Prompt: "what is the origin of the us thanksgiving holiday?", Stream: &stream, KeepAlive: &api.Duration{Duration: 10 * time.Second}, @@ -132,16 +134,16 @@ func TestMultiModelStress(t *testing.T) { size: 2876 * format.MebiByte, }, { - name: "phi", - size: 2616 * format.MebiByte, + name: "qwen3:0.6b", + size: 1600 * format.MebiByte, }, { name: "gemma:2b", size: 2364 * format.MebiByte, }, { - name: "stable-code:3b", - size: 2608 * format.MebiByte, + name: "deepseek-r1:1.5b", + size: 2048 * format.MebiByte, }, { name: "starcoder2:3b", @@ -149,17 +151,21 @@ func TestMultiModelStress(t *testing.T) { }, } mediumModels := []model{ + { + name: "qwen3:8b", + size: 6600 * format.MebiByte, + }, { name: "llama2", size: 5118 * format.MebiByte, }, { - name: "mistral", - size: 4620 * format.MebiByte, + name: "deepseek-r1:7b", + size: 5600 * format.MebiByte, }, { - name: "orca-mini:7b", - size: 5118 * format.MebiByte, + name: "mistral", + size: 4620 * format.MebiByte, }, { name: "dolphin-mistral", @@ -254,7 +260,7 @@ func TestMultiModelStress(t *testing.T) { } go func() { for { - time.Sleep(2 * time.Second) + time.Sleep(10 * time.Second) select { case <-ctx.Done(): return @@ -265,7 +271,21 @@ func TestMultiModelStress(t *testing.T) { continue } for _, m := range models.Models { - slog.Info("loaded model snapshot", "model", m) + var procStr string + switch { + case m.SizeVRAM == 0: + procStr = "100% CPU" + case m.SizeVRAM == m.Size: + procStr = "100% GPU" + case m.SizeVRAM > m.Size || m.Size == 0: + procStr = "Unknown" + default: + sizeCPU := m.Size - m.SizeVRAM + cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100) + procStr = fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent)) + } + + slog.Info("loaded model snapshot", "model", m.Name, "CPU/GPU", procStr, "expires", format.HumanTime(m.ExpiresAt, "Never")) } } } diff --git a/integration/utils_test.go b/integration/utils_test.go index 727825a41..6375b1f97 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -574,8 +574,8 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { }, }, [][]string{ - {"sunlight"}, - {"soil", "organic", "earth", "black", "tan"}, + {"sunlight", "scattering", "interact"}, + {"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"}, {"england", "english", "massachusetts", "pilgrims", "british"}, {"fourth", "july", "declaration", "independence"}, {"nitrogen", "oxygen", "carbon", "dioxide"},