add more search paths for cuda libs

2024-01-10 09:51:02 -05:00
8 changed files with 147 additions and 236 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -13,10 +13,7 @@ import "C"
 import (
 	"fmt"
 	"log"
-	"os"
-	"path/filepath"
 	"runtime"
-	"strings"
 	"sync"
 	"unsafe"
 )
@@ -32,79 +29,31 @@ var gpuHandles *handles = nil
 // With our current CUDA compile flags, 5.2 and older will not work properly
 const CudaComputeMajorMin = 6

-// Possible locations for the nvidia-ml library
-var CudaLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libnvidia-ml.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
-	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
-	"/usr/lib/wsl/lib/libnvidia-ml.so*",
-	"/opt/cuda/lib64/libnvidia-ml.so*",
-	"/usr/lib*/libnvidia-ml.so*",
-	"/usr/local/lib*/libnvidia-ml.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
-	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
-}
-
-var CudaWindowsGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var RocmLinuxGlobs = []string{
-	"/opt/rocm*/lib*/librocm_smi64.so*",
-}
-
-var RocmWindowsGlobs = []string{
-	"c:\\Windows\\System32\\rocm_smi64.dll",
-}
-
 // Note: gpuMutex must already be held
 func initGPUHandles() {
-
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-
-	var cudaMgmtName string
-	var cudaMgmtPatterns []string
-	var rocmMgmtName string
-	var rocmMgmtPatterns []string
-	switch runtime.GOOS {
-	case "windows":
-		cudaMgmtName = "nvml.dll"
-		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(cudaMgmtPatterns, CudaWindowsGlobs)
-		rocmMgmtName = "rocm_smi64.dll"
-		rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
-		copy(rocmMgmtPatterns, RocmWindowsGlobs)
-	case "linux":
-		cudaMgmtName = "libnvidia-ml.so"
-		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(cudaMgmtPatterns, CudaLinuxGlobs)
-		rocmMgmtName = "librocm_smi64.so"
-		rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
-		copy(rocmMgmtPatterns, RocmLinuxGlobs)
-	default:
-		return
-	}
-
 	log.Printf("Detecting GPU type")
 	gpuHandles = &handles{nil, nil}
-	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudaLibPaths) > 0 {
-		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cuda != nil {
-			log.Printf("Nvidia GPU detected")
-			gpuHandles.cuda = cuda
-			return
-		}
-	}
+	var resp C.cuda_init_resp_t
+	C.cuda_init(&resp)
+	if resp.err != nil {
+		log.Printf("CUDA not detected: %s", C.GoString(resp.err))
+		C.free(unsafe.Pointer(resp.err))

-	rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
-	if len(rocmLibPaths) > 0 {
-		rocm := LoadROCMMgmt(rocmLibPaths)
-		if rocm != nil {
+		var resp C.rocm_init_resp_t
+		C.rocm_init(&resp)
+		if resp.err != nil {
+			log.Printf("ROCm not detected: %s", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
 			log.Printf("Radeon GPU detected")
-			gpuHandles.rocm = rocm
-			return
+			rocm := resp.rh
+			gpuHandles.rocm = &rocm
 		}
+	} else {
+		log.Printf("Nvidia GPU detected")
+		cuda := resp.ch
+		gpuHandles.cuda = &cuda
 	}
 }

@@ -184,99 +133,13 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*512*1024*1024 {
-			overhead = gpus * 512 * 1024 * 1024
+		// leave 10% or 384MiB of VRAM free for unaccounted-for overhead
+		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
+		if overhead < 384*1024*1024 {
+			overhead = 384 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}

 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
 }
-
-func FindGPULibs(baseLibName string, patterns []string) []string {
-	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
-	var ldPaths []string
-	gpuLibPaths := []string{}
-	log.Printf("Searching for GPU management library %s", baseLibName)
-
-	switch runtime.GOOS {
-	case "windows":
-		ldPaths = strings.Split(os.Getenv("PATH"), ";")
-	case "linux":
-		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
-	default:
-		return gpuLibPaths
-	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
-	for _, ldPath := range ldPaths {
-		d, err := filepath.Abs(ldPath)
-		if err != nil {
-			continue
-		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
-	}
-	for _, pattern := range patterns {
-		// Ignore glob discovery errors
-		matches, _ := filepath.Glob(pattern)
-		for _, match := range matches {
-			// Resolve any links so we don't try the same lib multiple times
-			// and weed out any dups across globs
-			libPath := match
-			tmp := match
-			var err error
-			for ; err == nil; tmp, err = os.Readlink(libPath) {
-				if !filepath.IsAbs(tmp) {
-					tmp = filepath.Join(filepath.Dir(libPath), tmp)
-				}
-				libPath = tmp
-			}
-			new := true
-			for _, cmp := range gpuLibPaths {
-				if cmp == libPath {
-					new = false
-					break
-				}
-			}
-			if new {
-				gpuLibPaths = append(gpuLibPaths, libPath)
-			}
-		}
-	}
-	log.Printf("Discovered GPU libraries: %v", gpuLibPaths)
-	return gpuLibPaths
-}
-
-func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.cuda_init_resp_t
-	for _, libPath := range cudaLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.cuda_init(lib, &resp)
-		if resp.err != nil {
-			log.Printf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err))
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			return &resp.ch
-		}
-	}
-	return nil
-}
-
-func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
-	var resp C.rocm_init_resp_t
-	for _, libPath := range rocmLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.rocm_init(lib, &resp)
-		if resp.err != nil {
-			log.Printf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err))
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			return &resp.rh
-		}
-	}
-	return nil
-}
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,9 +4,33 @@

 #include <string.h>

+#ifndef _WIN32
+const char *cuda_lib_paths[] = {
+    "libnvidia-ml.so",
+    "/usr/lib/wsl/lib/libnvidia-ml.so",  // TODO Maybe glob?
+    "/usr/lib/wsl/lib/libnvidia-ml.so.1",
+    "/usr/local/cuda/lib64/libnvidia-ml.so",
+    "/usr/lib/libnvidia-ml.so",
+    "/usr/lib/libnvidia-ml.so.1",
+    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
+    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so",
+    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
+    "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so",
+    "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so",
+    "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
+    NULL,
+};
+#else
+const char *cuda_lib_paths[] = {
+    "nvml.dll",
+    "",
+    NULL,
+};
+#endif
+
 #define CUDA_LOOKUP_SIZE 6

-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+void cuda_init(cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
@@ -25,12 +49,16 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  };

-  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
+    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
+  }
  if (!resp->ch.handle) {
+    // TODO improve error message, as LOAD_ERR will typically have the
+    // final path that was checked, which might be confusing.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_path, msg);
+             cuda_lib_paths[0], msg);
    free(msg);
    resp->err = strdup(buf);
    return;
@@ -52,8 +80,6 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
  }
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -36,7 +36,7 @@ typedef struct cuda_compute_capability {
  int minor;
 } cuda_compute_capability_t;

-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
+void cuda_init(cuda_init_resp_t *resp);
 void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
 void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);

--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,7 +4,22 @@

 #include <string.h>

-void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
+#ifndef _WIN32
+const char *rocm_lib_paths[] = {
+    "librocm_smi64.so",
+    "/opt/rocm/lib/librocm_smi64.so",
+    NULL,
+};
+#else
+// TODO untested
+const char *rocm_lib_paths[] = {
+    "rocm_smi64.dll",
+    "/opt/rocm/lib/rocm_smi64.dll",
+    NULL,
+};
+#endif
+
+void rocm_init(rocm_init_resp_t *resp) {
  rsmi_status_t ret;
  resp->err = NULL;
  const int buflen = 256;
@@ -21,12 +36,14 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
  };

-  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
+  for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
+    resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
+  }
  if (!resp->rh.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
-             rocm_lib_path, msg);
+             rocm_lib_paths[0], msg);
    free(msg);
    resp->err = strdup(buf);
    return;
@@ -36,7 +53,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    if (!l[i].p) {
      UNLOAD_LIBRARY(resp->rh.handle);
-      resp->rh.handle = NULL;
      char *msg = LOAD_ERR();
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
@@ -48,8 +64,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {

  ret = (*resp->rh.initFn)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
-    UNLOAD_LIBRARY(resp->rh.handle);
-    resp->rh.handle = NULL;
    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
    resp->err = strdup(buf);
  }
@@ -69,7 +83,7 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  int i;

  if (h.handle == NULL) {
-    resp->err = strdup("rocm handle not initialized");
+    resp->err = strdup("nvml handle sn't initialized");
    return;
  }

--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -29,7 +29,7 @@ typedef struct rocm_init_resp {
  rocm_handle_t rh;
 } rocm_init_resp_t;

-void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
+void rocm_init(rocm_init_resp_t *resp);
 void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);

 #endif  // __GPU_INFO_ROCM_H__
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -111,10 +111,6 @@ void llama_server_stop() {
  // TODO - too verbose, remove once things are solid
  LOG_TEE("requesting llama server shutdown\n");
  ext_server_running = false;
-
-  // unblocks the update_slots() loop so it can clean up and exit
-  llama->request_cancel(0);
-
  ext_server_thread.join();
  delete llama;
  llama = NULL;
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -2,6 +2,7 @@ package llm

 import (
 	"context"
+	"fmt"
 	"log"
 	"os"
 	"runtime"
@@ -40,76 +41,88 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumCtx = 4
 	}

-	vram, _ := gpu.CheckVRAM()
-	size := ggml.Size
+	fmt.Println("size", ggml.Size)
+	fmt.Println("filetype", ggml.FileType())
+	fmt.Println("architecture", ggml.ModelFamily())
+	fmt.Println("type", ggml.ModelType())
+	fmt.Println("name", ggml.Name())
+	fmt.Println("embd", ggml.NumEmbed())
+	fmt.Println("head", ggml.NumHead())
+	fmt.Println("head_kv", ggml.NumHeadKv())
+	fmt.Println("gqa", ggml.NumGQA())
+
+	available, _ := gpu.CheckVRAM()
+
+	// For now assume filesize = model size
+	// TODO: use actual model size
+	requiredModel := ggml.Size

 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calcluations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.NumGQA()) * kv / 6
+	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
+
+	requiredTotal := requiredModel + requiredKv + requiredAlloc
+
+	log.Println("system memory bytes:", available)
+	log.Println("required model bytes:", requiredModel)
+	log.Println("required kv bytes:", requiredKv)
+	log.Println("required alloc bytes:", requiredAlloc)
+	log.Println("required total bytes:", requiredTotal)

 	info := gpu.GetGPUInfo()
 	library := info.Library
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}

-		if size+kv+graph > vram {
-			log.Println("not enough vram available, falling back to CPU only")
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = 1
-	default:
-		if library == "cpu" || library == "default" {
-			log.Println("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
-		}
-
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			library = "cpu"
-			break
-		}
-
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
-
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.NumLayers()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
-
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			log.Printf("not enough vram available, falling back to CPU only")
-			library = "cpu"
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = int(layers)
+	if opts.NumGPU == -1 {
+		// default to offloading all layers
+		opts.NumGPU = int(ggml.NumLayers()) + 1
 	}

+	// decide how many layers to put on the GPU
+	if opts.NumGPU > 0 {
+		switch runtime.GOOS {
+		case "darwin":
+			if requiredTotal > available {
+				log.Println("not enough vram available, falling back to CPU only")
+				opts.NumGPU = 0
+			}
+		default:
+			if library == "cpu" || library == "default" {
+				opts.NumGPU = 0
+				break
+			}
+
+			// no offloading required
+			if requiredTotal <= available {
+				break
+			}
+
+			// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
+			if requiredAlloc > available {
+				log.Printf("not enough vram available, falling back to CPU only")
+				library = "cpu"
+				opts.NumGPU = 0
+				break
+			}
+
+			available -= requiredAlloc
+
+			// fill remaining vram with layers
+			log.Println("splitting", available, "of available memory bytes into layers")
+			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
+			log.Println("bytes per layer:", bytesPerLayer)
+			layers := available / bytesPerLayer
+			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
+			if layers < int64(opts.NumGPU) {
+				opts.NumGPU = int(layers)
+			}
+		}
+	}
+
+	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	return newLlmServer(library, model, adapters, projectors, opts)
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -5,10 +5,9 @@ set -eu
 export VERSION=${VERSION:-0.0.0}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"

-BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 mkdir -p dist

-for TARGETARCH in ${BUILD_ARCH}; do
+for TARGETARCH in amd64 arm64; do
    docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH