diff --git a/gpu/gpu.go b/gpu/gpu.go
index 46359e340..11c72e151 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -24,20 +24,45 @@ import (
 	"github.com/ollama/ollama/format"
 )
 
-type handles struct {
+type cudaHandles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	nvml        *C.nvml_handle_t
+}
+
+type oneapiHandles struct {
 	oneapi      *C.oneapi_handle_t
-	vulkan      *C.vk_handle_t
+	deviceCount int
+}
+
+type vulkanHandles struct {
+	vulkan      *C.vulkan_handle_t
+	deviceCount int
 }
 
 const (
 	cudaMinimumMemory = 457 * format.MebiByte
 	rocmMinimumMemory = 457 * format.MebiByte
+	// TODO OneAPI minimum memory
 )
 
-var gpuMutex sync.Mutex
+var (
+	gpuMutex      sync.Mutex
+	bootstrapped  bool
+	cpuCapability CPUCapability
+	cpus          []CPUInfo
+	cudaGPUs      []CudaGPUInfo
+	nvcudaLibPath string
+	cudartLibPath string
+	oneapiLibPath string
+	vulkanLibPath string
+	libcapLibPath string
+	nvmlLibPath   string
+	rocmGPUs      []RocmGPUInfo
+	oneapiGPUs    []OneapiGPUInfo
+	vulkanGPUs    []VulkanGPUInfo
+)
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
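The package-level block added above turns discovery into cached state: the device lists and the library paths that loaded successfully are remembered under gpuMutex, so only the first caller pays for the full filesystem probe. A standalone sketch of that memoization shape, assuming discovery is expensive and its result is stable for the life of the process (discoverLibPath and its hard-coded return value are illustrative, not ollama APIs):

package main

import (
	"fmt"
	"sync"
)

var (
	mu      sync.Mutex
	found   bool
	libPath string // cached path of the first library that loaded; "" until discovery runs
)

// discoverLibPath stands in for the expensive glob-and-dlopen probe.
func discoverLibPath() string { return "/usr/lib/x86_64-linux-gnu/libcuda.so.1" }

// cachedLibPath runs discovery once and reuses the answer afterwards, which is
// the role nvcudaLibPath, cudartLibPath, nvmlLibPath and friends play in this file.
func cachedLibPath() string {
	mu.Lock()
	defer mu.Unlock()
	if !found {
		libPath = discoverLibPath()
		found = true
	}
	return libPath
}

func main() {
	fmt.Println(cachedLibPath()) // probes on the first call
	fmt.Println(cachedLibPath()) // returns the cached path
}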
@@ -47,152 +72,133 @@ var RocmComputeMin = 9
 
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
 
-var CudartLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var CudartWindowsGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvcudaLinuxGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var NvcudaWindowsGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiWindowsGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var OneapiLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
-var VulkanLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libvulkan.so*",
-	"/usr/lib*/libvulkan.so*",
-}
-
-var CapLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libcap.so*",
-	"/usr/lib*/libcap.so*",
-}
-
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-	gpuHandles := &handles{}
-	var cudartMgmtName string
-	var cudartMgmtPatterns []string
-	var nvcudaMgmtName string
-	var nvcudaMgmtPatterns []string
-	var vulkanMgmtName string
-	var vulkanMgmtPatterns []string
-	var libcapMgmtName string
-	var libcapMgmtPatterns []string
-
-	tmpDir, _ := PayloadsDir()
-	switch runtime.GOOS {
-	case "windows":
-		cudartMgmtName = "cudart64_*.dll"
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "nvcuda.dll"
-		nvcudaMgmtPatterns = NvcudaWindowsGlobs
-	case "linux":
-		cudartMgmtName = "libcudart.so*"
-		if tmpDir != "" {
-			// TODO - add "payloads" for subprocess
-			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
-		}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "libcuda.so*"
-		nvcudaMgmtPatterns = NvcudaLinuxGlobs
-
-		// Vulkan also needs libcap
-		vulkanMgmtName = "libvulkan.so*"
-		vulkanMgmtPatterns = VulkanLinuxGlobs
-		libcapMgmtName = "libcap.so*"
-		libcapMgmtPatterns = CapLinuxGlobs
-	default:
-		return gpuHandles
+	cHandles := &cudaHandles{}
+	// Short Circuit if we already know which library to use
+	if nvmlLibPath != "" {
+		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		return cHandles
+	}
+	if nvcudaLibPath != "" {
+		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		return cHandles
+	}
+	if cudartLibPath != "" {
+		cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		return cHandles
 	}
 
-	slog.Debug("Detecting GPUs")
-	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	slog.Debug("searching for GPU discovery libraries for NVIDIA")
+	var cudartMgmtPatterns []string
+
+	// Aligned with driver, we can't carry as payloads
+	nvcudaMgmtPatterns := NvcudaGlobs
+
+	if runtime.GOOS == "windows" {
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
+	}
+	tmpDir, _ := PayloadsDir()
+	if tmpDir != "" {
+		// TODO - add "payloads" for subprocess
+		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
+
+	if len(NvmlGlobs) > 0 {
+		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
+		if len(nvmlLibPaths) > 0 {
+			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			if nvml != nil {
+				slog.Debug("nvidia-ml loaded", "library", libPath)
+				cHandles.nvml = nvml
+				nvmlLibPath = libPath
+			}
+		}
+	}
+
+	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
-			gpuHandles.nvcuda = nvcuda
-			gpuHandles.deviceCount = deviceCount
-			return gpuHandles
+			cHandles.nvcuda = nvcuda
+			cHandles.deviceCount = deviceCount
+			nvcudaLibPath = libPath
+			return cHandles
 		}
 	}
 
-	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
-			gpuHandles.cudart = cudart
-			gpuHandles.deviceCount = deviceCount
-			return gpuHandles
+			cHandles.cudart = cudart
+			cHandles.deviceCount = deviceCount
+			cudartLibPath = libPath
+			return cHandles
 		}
 	}
 
-	vulkanLibPaths := FindGPULibs(vulkanMgmtName, vulkanMgmtPatterns)
+	return cHandles
+}
 
-	var libcapLibPaths []string
-	if runtime.GOOS == "linux" {
-		libcapLibPaths = FindGPULibs(libcapMgmtName, libcapMgmtPatterns)
+// Note: gpuMutex must already be held
+func initOneAPIHandles() *oneapiHandles {
+	oHandles := &oneapiHandles{}
+
+	// Short Circuit if we already know which library to use
+	if oneapiLibPath != "" {
+		oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
+		return oHandles
+	}
+
+	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
+	if len(oneapiLibPaths) > 0 {
+		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
+	}
+
+	return oHandles
+}
+
+// Note: gpuMutex must already be held
+func initVulkanHandles() *vulkanHandles {
+	vHandles := &vulkanHandles{}
+
+	// Short Circuit if we already know which library to use
+	if vulkanLibPath != "" && libcapLibPath != "" {
+		vHandles.deviceCount, vHandles.vulkan, _, _ = LoadVulkanMgmt([]string{vulkanLibPath}, []string{libcapLibPath})
+		return vHandles
+	}
+
+	vulkanPaths := FindGPULibs(VulkanMgmtName, VulkanGlobs)
+	libcapPaths := FindLibCapLibs()
+
+	if len(vulkanPaths) > 0 && len(libcapPaths) > 0 {
+		vHandles.deviceCount, vHandles.vulkan, vulkanLibPath, libcapLibPath = LoadVulkanMgmt(vulkanPaths, libcapPaths)
+	}
+
+	return vHandles
+}
+
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	if !bootstrapped {
+		gpuMutex.Unlock()
+		GetGPUInfo()
 	} else {
-		libcapLibPaths = []string{""}
+		gpuMutex.Unlock()
 	}
-
-	if len(vulkanLibPaths) > 0 && len(libcapLibPaths) > 0 {
-		deviceCount, vulkan, vkLibPath, capLibPath := LoadVulkanMgmt(vulkanLibPaths, libcapLibPaths)
-		if vulkan != nil {
-			slog.Debug("detected GPUs", "library", vkLibPath, capLibPath, "count", deviceCount)
-			gpuHandles.vulkan = vulkan
-			gpuHandles.deviceCount = deviceCount
-			return gpuHandles
-		}
-	}
-
-	return gpuHandles
+	return GpuInfoList{cpus[0].GpuInfo}
 }
 
 func GetGPUInfo() GpuInfoList {
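initCudaHandles above checks the cached library paths first and otherwise probes candidates in a fixed order: NVML is loaded when available for later free-memory queries, then the driver's libcuda (nvcuda) is tried, with cudart as the fallback, and the first one that loads wins. A standalone sketch of that try-in-order fallback, with hypothetical loader names rather than the cgo-backed LoadNVCUDAMgmt/LoadCUDARTMgmt:

package main

import (
	"errors"
	"fmt"
)

// loader is a stand-in for one management-library backend.
type loader struct {
	name string
	load func() error
}

// firstWorking returns the name of the first backend whose load succeeds,
// mirroring how initCudaHandles returns as soon as nvcuda or cudart is usable.
func firstWorking(candidates []loader) (string, error) {
	for _, c := range candidates {
		if err := c.load(); err == nil {
			return c.name, nil
		}
	}
	return "", errors.New("no CUDA management library could be loaded")
}

func main() {
	name, err := firstWorking([]loader{
		{"nvcuda", func() error { return errors.New("libcuda.so not found") }},
		{"cudart", func() error { return nil }}, // pretend the bundled runtime loads
	})
	fmt.Println(name, err) // cudart <nil>
}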
@@ -200,141 +206,300 @@ func GetGPUInfo() GpuInfoList {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-
-	gpuHandles := initGPUHandles()
+	needRefresh := true
+	var cHandles *cudaHandles
+	var oHandles *oneapiHandles
+	var vHandles *vulkanHandles
 	defer func() {
-		if gpuHandles.cudart != nil {
-			C.cudart_release(*gpuHandles.cudart)
+		if cHandles != nil {
+			if cHandles.cudart != nil {
+				C.cudart_release(*cHandles.cudart)
+			}
+			if cHandles.nvcuda != nil {
+				C.nvcuda_release(*cHandles.nvcuda)
+			}
+			if cHandles.nvml != nil {
+				C.nvml_release(*cHandles.nvml)
+			}
 		}
-		if gpuHandles.nvcuda != nil {
-			C.nvcuda_release(*gpuHandles.nvcuda)
-		}
-		if gpuHandles.vulkan != nil {
-			C.vk_release(*gpuHandles.vulkan)
+		if oHandles != nil {
+			if oHandles.oneapi != nil {
+				// TODO - is this needed?
+				C.oneapi_release(*oHandles.oneapi)
+			}
 		}
 	}()
 
-	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
-	cpuVariant := GetCPUVariant()
-	if cpuVariant == "" && runtime.GOARCH == "amd64" {
-		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
-	}
+	if !bootstrapped {
+		slog.Debug("Detecting GPUs")
+		needRefresh = false
+		cpuCapability = GetCPUCapability()
+		var memInfo C.mem_info_t
 
-	// On windows we bundle the nvidia library one level above the runner dir
-	depPath := ""
-	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
-		depPath = filepath.Dir(envconfig.RunnersDir)
-	}
-
-	var memInfo C.mem_info_t
-	resp := []GpuInfo{}
-
-	// NVIDIA and Vulkan first
-	for i := range gpuHandles.deviceCount {
-		// TODO once we support CPU compilation variants of GPU libraries refine this...
-		if cpuVariant == "" && runtime.GOARCH == "amd64" {
-			continue
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
 		}
-		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
-			gpuInfo := GpuInfo{
-				Library: "cuda",
+		cpus = []CPUInfo{CPUInfo{
+			GpuInfo: GpuInfo{
+				memInfo: mem,
+				Library: "cpu",
+				Variant: cpuCapability,
+				ID:      "0",
+			},
+		}}
+
+		// Fallback to CPU mode if we're lacking required vector extensions on x86
+		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
+			bootstrapped = true
+			// No need to do any GPU discovery, since we can't run on them
+			return GpuInfoList{cpus[0].GpuInfo}
+		}
+
+		// On windows we bundle the nvidia library one level above the runner dir
+		depPath := ""
+		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+			depPath = filepath.Dir(envconfig.RunnersDir)
+		}
+
+		// Load ALL libraries
+		cHandles = initCudaHandles()
+
+		// NVIDIA
+		for i := range cHandles.deviceCount {
+			if cHandles.cudart != nil || cHandles.nvcuda != nil {
+				gpuInfo := CudaGPUInfo{
+					GpuInfo: GpuInfo{
+						Library: "cuda",
+					},
+					index: i,
+				}
+				var driverMajor int
+				var driverMinor int
+				if cHandles.cudart != nil {
+					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
+				} else {
+					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
+					driverMajor = int(cHandles.nvcuda.driver_major)
+					driverMinor = int(cHandles.nvcuda.driver_minor)
+				}
+				if memInfo.err != nil {
+					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+					C.free(unsafe.Pointer(memInfo.err))
+					continue
+				}
+				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+					continue
+				}
+				gpuInfo.TotalMemory = uint64(memInfo.total)
+				gpuInfo.FreeMemory = uint64(memInfo.free)
+				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.MinimumMemory = cudaMinimumMemory
+				gpuInfo.DependencyPath = depPath
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.DriverMajor = driverMajor
+				gpuInfo.DriverMinor = driverMinor
+
+				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+				cudaGPUs = append(cudaGPUs, gpuInfo)
 			}
-	}
+
+		// Intel
+		oHandles = initOneAPIHandles()
+		for d := 0; oHandles.oneapi != nil && d < int(oHandles.oneapi.num_drivers); d++ {
+			if oHandles.oneapi == nil {
+				// shouldn't happen
+				slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
+				continue
+			}
+			devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
+			for i := range devCount {
+				gpuInfo := OneapiGPUInfo{
+					GpuInfo: GpuInfo{
+						Library: "oneapi",
+					},
+					driverIndex: d,
+					gpuIndex:    int(i),
+				}
+				// TODO - split bootstrapping from updating free memory
+				C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
+				// TODO - convert this to MinimumMemory based on testing...
+				var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+				memInfo.free = C.uint64_t(totalFreeMem)
+				gpuInfo.TotalMemory = uint64(memInfo.total)
+				gpuInfo.FreeMemory = uint64(memInfo.free)
+				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				// TODO dependency path?
+				oneapiGPUs = append(oneapiGPUs, gpuInfo)
+			}
+		}
+
+		// Vulkan
+		vHandles = initVulkanHandles()
+		for i := range vHandles.deviceCount {
+			if vHandles.vulkan != nil {
+				gpuInfo := VulkanGPUInfo{
+					GpuInfo: GpuInfo{
+						Library: "vulkan",
+					},
+					index: i,
+				}
+
+				C.vk_check_vram(*vHandles.vulkan, C.int(i), &memInfo)
+				if memInfo.err != nil {
+					slog.Info("error looking up vulkan GPU memory", "error", C.GoString(memInfo.err))
+					C.free(unsafe.Pointer(memInfo.err))
+					continue
+				}
+
+				gpuInfo.TotalMemory = uint64(memInfo.total)
+				gpuInfo.FreeMemory = uint64(memInfo.free)
+				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.MinimumMemory = 0
+				gpuInfo.DependencyPath = depPath
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.DriverMajor = int(memInfo.major)
+				gpuInfo.DriverMinor = int(memInfo.minor)
+
+				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+				vulkanGPUs = append(vulkanGPUs, gpuInfo)
+			}
+		}
+
+		rocmGPUs = AMDGetGPUInfo()
+		bootstrapped = true
+	}
+
+	// For detected GPUs, load library if not loaded
+
+	// Refresh free memory usage
+	if needRefresh {
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
+		var memInfo C.mem_info_t
+		if cHandles == nil && len(cudaGPUs) > 0 {
+			cHandles = initCudaHandles()
+		}
+		for i, gpu := range cudaGPUs {
+			if cHandles.nvml != nil {
+				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+			} else if cHandles.cudart != nil {
+				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
+			} else if cHandles.nvcuda != nil {
+				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
+				memInfo.used = memInfo.total - memInfo.free
 			} else {
-				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
-				driverMajor = int(gpuHandles.nvcuda.driver_major)
-				driverMinor = int(gpuHandles.nvcuda.driver_minor)
+				// shouldn't happen
+				slog.Warn("no valid cuda library loaded to refresh vram usage")
+				break
 			}
 			if memInfo.err != nil {
-				slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 				C.free(unsafe.Pointer(memInfo.err))
 				continue
 			}
-			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+			if memInfo.free == 0 {
+				slog.Warn("error looking up nvidia GPU memory")
 				continue
 			}
-			gpuInfo.TotalMemory = uint64(memInfo.total)
-			gpuInfo.FreeMemory = uint64(memInfo.free)
-			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-			gpuInfo.MinimumMemory = cudaMinimumMemory
-			gpuInfo.DependencyPath = depPath
-			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DriverMajor = driverMajor
-			gpuInfo.DriverMinor = driverMinor
-
-			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-			resp = append(resp, gpuInfo)
+			slog.Debug("updating cuda memory data",
+				"gpu", gpu.ID,
+				"name", gpu.Name,
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(gpu.TotalMemory),
+					"free", format.HumanBytes2(gpu.FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(uint64(memInfo.total)),
+					"free", format.HumanBytes2(uint64(memInfo.free)),
+					"used", format.HumanBytes2(uint64(memInfo.used)),
+				),
+			)
+			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
-		if gpuHandles.vulkan != nil {
-			gpuInfo := GpuInfo{
-				Library: "vulkan",
-			}
-
-			C.vk_check_vram(*gpuHandles.vulkan, C.int(i), &memInfo)
-			if memInfo.err != nil {
-				slog.Info("error looking up vulkan GPU memory", "error", C.GoString(memInfo.err))
-				C.free(unsafe.Pointer(memInfo.err))
+
+		if oHandles == nil && len(oneapiGPUs) > 0 {
+			oHandles = initOneAPIHandles()
+		}
+		for i, gpu := range oneapiGPUs {
+			if oHandles.oneapi == nil {
+				// shouldn't happen
+				slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
 				continue
 			}
+			C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
+			// TODO - convert this to MinimumMemory based on testing...
+			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+			memInfo.free = C.uint64_t(totalFreeMem)
+			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
 
-			gpuInfo.TotalMemory = uint64(memInfo.total)
-			gpuInfo.FreeMemory = uint64(memInfo.free)
-			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-			gpuInfo.MinimumMemory = 0
-			gpuInfo.DependencyPath = depPath
-			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DriverMajor = int(memInfo.major)
-			gpuInfo.DriverMinor = int(memInfo.minor)
+		if vHandles == nil && len(vulkanGPUs) > 0 {
+			vHandles = initVulkanHandles()
+		}
+		for i, gpu := range vulkanGPUs {
+			if vHandles.vulkan == nil {
+				// shouldn't happen
+				slog.Warn("nil vulkan handle with device count", "count", vHandles.deviceCount)
+				continue
+			}
+			C.vk_check_vram(*vHandles.vulkan, C.int(gpu.index), &memInfo)
+			// TODO - convert this to MinimumMemory based on testing...
+			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+			memInfo.free = C.uint64_t(totalFreeMem)
+			vulkanGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
 
-			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-			resp = append(resp, gpuInfo)
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		if err != nil {
+			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
 	}
 
-	// Then AMD
-	resp = append(resp, AMDGetGPUInfo()...)
-
+
+	resp := []GpuInfo{}
+	for _, gpu := range cudaGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range rocmGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range oneapiGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range vulkanGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
 	if len(resp) == 0 {
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return resp
-		}
-		gpuInfo := GpuInfo{
-			Library: "cpu",
-			Variant: cpuVariant,
-		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-
-		resp = append(resp, gpuInfo)
+		resp = append(resp, cpus[0].GpuInfo)
 	}
-
 	return resp
 }
 
-func GetCPUMem() (memInfo, error) {
-	var ret memInfo
-	var info C.mem_info_t
-	C.cpu_check_ram(&info)
-	if info.err != nil {
-		defer C.free(unsafe.Pointer(info.err))
-		return ret, fmt.Errorf(C.GoString(info.err))
-	}
-	ret.FreeMemory = uint64(info.free)
-	ret.TotalMemory = uint64(info.total)
-	return ret, nil
-}
-
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
@@ -431,8 +596,26 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }
 
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch, libPath
+		}
+	}
+	return nil, ""
+}
+
 func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
+	num_devices := 0
 	resp.oh.verbose = getVerboseState()
 	for _, libPath := range oneapiLibPaths {
 		lib := C.CString(libPath)
@@ -442,7 +625,10 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.oh, libPath
+			for i := range resp.oh.num_drivers {
+				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
+			}
+			return num_devices, &resp.oh, libPath
 		}
 	}
 	return 0, nil, ""
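Taken together, the gpu.go changes give GetGPUInfo a bootstrap-once, refresh-afterwards shape: the first call enumerates CUDA, oneAPI, Vulkan and ROCm devices and caches them, later calls only re-read free memory, and GetCPUInfo always reports the host as a single "cpu" entry. A rough usage sketch against the exported API shown above; whether it builds depends on the usual cgo and toolkit setup, so treat it as illustrative:

package main

import (
	"fmt"

	"github.com/ollama/ollama/gpu"
)

func main() {
	// First call bootstraps discovery and caches the per-library device lists.
	for _, g := range gpu.GetGPUInfo() {
		fmt.Printf("%s %s: %d / %d MiB free\n",
			g.Library, g.ID, g.FreeMemory>>20, g.TotalMemory>>20)
	}

	// Subsequent calls reuse the cached lists and only refresh free memory.
	_ = gpu.GetGPUInfo()

	// The CPU is always available as a fallback entry.
	fmt.Println(gpu.GetCPUInfo()[0].Library) // "cpu"
}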
diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go
index a099bf822..2e723c4da 100644
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -43,10 +43,26 @@ var OneapiGlobs = []string{
 	"/usr/lib*/libze_intel_gpu.so*",
 }
 
+var VulkanGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libvulkan.so*",
+	"/usr/lib*/libvulkan.so*",
+}
+
+var capLinuxGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libcap.so*",
+	"/usr/lib*/libcap.so*",
+}
+
 var CudartMgmtName = "libcudart.so*"
 var NvcudaMgmtName = "libcuda.so*"
 var NvmlMgmtName = "" // not currently wired on linux
 var OneapiMgmtName = "libze_intel_gpu.so"
+var VulkanMgmtName = "libvulkan.so*"
+var libcapMgmtName = "libcap.so*"
+
+func FindLibCapLibs() []string {
+	return FindGPULibs(libcapMgmtName, capLinuxGlobs)
+}
 
 func GetCPUMem() (memInfo, error) {
 	var mem memInfo
diff --git a/gpu/gpu_windows.go b/gpu/gpu_windows.go
index f8c2e76fe..328477440 100644
--- a/gpu/gpu_windows.go
+++ b/gpu/gpu_windows.go
@@ -45,6 +45,10 @@ var NvcudaMgmtName = "nvcuda.dll"
 var NvmlMgmtName = "nvml.dll"
 var OneapiMgmtName = "ze_intel_gpu64.dll"
 
+func FindLibCapLibs() []string {
+	return []string{""}
+}
+
 func GetCPUMem() (memInfo, error) {
 	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
 	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
diff --git a/gpu/types.go b/gpu/types.go
index 47355959c..b451c0f38 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -62,6 +62,13 @@ type OneapiGPUInfo struct {
 }
 type OneapiGPUInfoList []OneapiGPUInfo
 
+type VulkanGPUInfo struct {
+	GpuInfo
+	index int
+}
+
+type VulkanGPUInfoList []VulkanGPUInfo
+
 type GpuInfoList []GpuInfo
 
 // Split up the set of gpu info's by Library and variant
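VulkanGPUInfo in types.go follows the same pattern as the existing CudaGPUInfo and OneapiGPUInfo types: embed GpuInfo for the caller-facing fields and add just enough bookkeeping (here the device index) to re-query that device when free memory is refreshed. A standalone sketch of that embed-and-flatten pattern; the type and field names below are illustrative, not the ones in gpu/types.go:

package main

import "fmt"

// info plays the role of GpuInfo: the library-agnostic view returned to callers.
type info struct {
	Library string
	ID      string
	Free    uint64
}

// vulkanInfo plays the role of VulkanGPUInfo: the shared info plus the index
// needed to address this device through the Vulkan handle on refresh.
type vulkanInfo struct {
	info
	index int
}

func main() {
	vulkans := []vulkanInfo{
		{info: info{Library: "vulkan", ID: "0", Free: 8 << 30}, index: 0},
		{info: info{Library: "vulkan", ID: "1", Free: 4 << 30}, index: 1},
	}

	// Flatten back to the generic list, as GetGPUInfo does for cuda/rocm/oneapi/vulkan.
	var resp []info
	for _, g := range vulkans {
		resp = append(resp, g.info)
	}
	fmt.Println(len(resp), resp[0].Library, resp[0].Free>>30) // 2 vulkan 8
}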