From f8e4c812b9f89e2e4fb8cc6f7b0657798c5dc8a1 Mon Sep 17 00:00:00 2001 From: marx161-cmd <222994158+marx161-cmd@users.noreply.github.com> Date: Thu, 1 Jan 2026 21:18:53 +0100 Subject: [PATCH] Add experimental ROCm iGPU support --- README.md | 4 + cmake/CMakeDetermineHIPCompiler.cmake | 329 ++++++++++++++++++ docs/rocm-apu.md | 91 +++++ llm/server.go | 30 ++ .../ggml/ggml/src/ggml-cuda/vendors/hip.h | 328 +++++++++++++++++ ml/backend/ggml/ggml/src/mem_hip.cpp | 110 ++++-- ml/device.go | 4 + 7 files changed, 867 insertions(+), 29 deletions(-) create mode 100644 cmake/CMakeDetermineHIPCompiler.cmake create mode 100644 docs/rocm-apu.md diff --git a/README.md b/README.md index bb08819d6..7ab6ffb37 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,9 @@ curl -fsSL https://ollama.com/install.sh | sh [Manual install instructions](https://docs.ollama.com/linux#manual-install) +> [!TIP] +> Running ROCm on UMA-only AMD APUs (e.g., Radeon 760M) requires staging additional ROCm runtime files and enabling experimental discovery flags. See [docs/rocm-apu.md](docs/rocm-apu.md) for the exact build and runtime steps used on this branch. + ### Docker The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub. @@ -32,6 +35,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla - [ollama-python](https://github.com/ollama/ollama-python) - [ollama-js](https://github.com/ollama/ollama-js) +- [Experimental ROCm iGPU Guide](docs/rocm-apu.md) ### Community diff --git a/cmake/CMakeDetermineHIPCompiler.cmake b/cmake/CMakeDetermineHIPCompiler.cmake new file mode 100644 index 000000000..a17561766 --- /dev/null +++ b/cmake/CMakeDetermineHIPCompiler.cmake @@ -0,0 +1,329 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +include(${CMAKE_ROOT}/Modules/CMakeDetermineCompiler.cmake) +include(${CMAKE_ROOT}/Modules/CMakeParseImplicitLinkInfo.cmake) +include(${CMAKE_ROOT}/Modules/CMakeParseLibraryArchitecture.cmake) + +if(NOT ((CMAKE_GENERATOR MATCHES "Make") OR + (CMAKE_GENERATOR MATCHES "Ninja"))) + message(FATAL_ERROR "HIP language not currently supported by \"${CMAKE_GENERATOR}\" generator") +endif() + +if(NOT CMAKE_HIP_PLATFORM) + execute_process(COMMAND hipconfig --platform + OUTPUT_VARIABLE _CMAKE_HIPCONFIG_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT + ) + if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND _CMAKE_HIPCONFIG_PLATFORM MATCHES "^(nvidia|nvcc)$") + set(CMAKE_HIP_PLATFORM "nvidia" CACHE STRING "HIP platform" FORCE) + else() + set(CMAKE_HIP_PLATFORM "amd" CACHE STRING "HIP platform" FORCE) + endif() +endif() +if(NOT CMAKE_HIP_PLATFORM MATCHES "^(amd|nvidia)$") + message(FATAL_ERROR + "The CMAKE_HIP_PLATFORM has unsupported value:\n" + " '${CMAKE_HIP_PLATFORM}'\n" + "It must be 'amd' or 'nvidia'." + ) +endif() + +if(NOT CMAKE_HIP_COMPILER) + set(CMAKE_HIP_COMPILER_INIT NOTFOUND) + + # prefer the environment variable HIPCXX + if(NOT $ENV{HIPCXX} STREQUAL "") + get_filename_component(CMAKE_HIP_COMPILER_INIT $ENV{HIPCXX} PROGRAM PROGRAM_ARGS CMAKE_HIP_FLAGS_ENV_INIT) + if(CMAKE_HIP_FLAGS_ENV_INIT) + set(CMAKE_HIP_COMPILER_ARG1 "${CMAKE_HIP_FLAGS_ENV_INIT}" CACHE STRING "Arguments to CXX compiler") + endif() + if(NOT EXISTS ${CMAKE_HIP_COMPILER_INIT}) + message(FATAL_ERROR "Could not find compiler set in environment variable HIPCXX:\n$ENV{HIPCXX}.\n${CMAKE_HIP_COMPILER_INIT}") + endif() + endif() + + # finally list compilers to try + if(NOT CMAKE_HIP_COMPILER_INIT) + if(CMAKE_HIP_PLATFORM STREQUAL "nvidia") + set(CMAKE_HIP_COMPILER_LIST nvcc) + elseif(CMAKE_HIP_PLATFORM STREQUAL "amd") + set(CMAKE_HIP_COMPILER_LIST clang++) + + # Look for the Clang coming with ROCm to support HIP. + execute_process(COMMAND hipconfig --hipclangpath + OUTPUT_VARIABLE _CMAKE_HIPCONFIG_CLANGPATH + RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT + ) + if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_CLANGPATH}") + set(CMAKE_HIP_COMPILER_HINTS "${_CMAKE_HIPCONFIG_CLANGPATH}") + endif() + endif() + endif() + + _cmake_find_compiler(HIP) +else() + _cmake_find_compiler_path(HIP) +endif() + +mark_as_advanced(CMAKE_HIP_COMPILER) + +# Build a small source file to identify the compiler. +if(NOT CMAKE_HIP_COMPILER_ID_RUN) + set(CMAKE_HIP_COMPILER_ID_RUN 1) + + include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake) + + # We determine the vendor to use the right flags for detection right away. + # The main compiler identification is still needed below to extract other information. + list(APPEND CMAKE_HIP_COMPILER_ID_VENDORS NVIDIA Clang) + set(CMAKE_HIP_COMPILER_ID_VENDOR_REGEX_NVIDIA "nvcc: NVIDIA \\(R\\) Cuda compiler driver") + set(CMAKE_HIP_COMPILER_ID_VENDOR_REGEX_Clang "(clang version)") + CMAKE_DETERMINE_COMPILER_ID_VENDOR(HIP "--version") + + if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + # Find the CUDA toolkit to get: + # - CMAKE_HIP_COMPILER_CUDA_TOOLKIT_VERSION + # - CMAKE_HIP_COMPILER_CUDA_TOOLKIT_ROOT + # - CMAKE_HIP_COMPILER_CUDA_LIBRARY_ROOT + # We save them in CMakeHIPCompiler.cmake. + # Match arguments with cmake_cuda_architectures_all call. + include(Internal/CMakeCUDAFindToolkit) + cmake_cuda_find_toolkit(HIP CMAKE_HIP_COMPILER_CUDA_) + + # If the user set CMAKE_HIP_ARCHITECTURES, validate its value. + include(Internal/CMakeCUDAArchitecturesValidate) + cmake_cuda_architectures_validate(HIP) + + if(NOT CMAKE_HIP_HOST_COMPILER AND NOT $ENV{HIPHOSTCXX} STREQUAL "") + get_filename_component(CMAKE_HIP_HOST_COMPILER $ENV{HIPHOSTCXX} PROGRAM) + if(NOT EXISTS "${CMAKE_HIP_HOST_COMPILER}") + message(FATAL_ERROR "Could not find compiler set in environment variable HIPHOSTCXX:\n$ENV{HIPHOSTCXX}.\n${CMAKE_HIP_HOST_COMPILER}") + endif() + endif() + endif() + + if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang") + list(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST "-v") + elseif(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + # Tell nvcc to treat .hip files as CUDA sources. + list(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST "-x cu -v") + if(CMAKE_HIP_HOST_COMPILER) + string(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST " -ccbin=\"${CMAKE_HIP_HOST_COMPILER}\"") + endif() + endif() + + # We perform compiler identification for a second time to extract implicit linking info. + # We need to unset the compiler ID otherwise CMAKE_DETERMINE_COMPILER_ID() doesn't work. + set(CMAKE_HIP_COMPILER_ID) + set(CMAKE_HIP_PLATFORM_ID) + file(READ ${CMAKE_ROOT}/Modules/CMakePlatformId.h.in + CMAKE_HIP_COMPILER_ID_PLATFORM_CONTENT) + + CMAKE_DETERMINE_COMPILER_ID(HIP HIPFLAGS CMakeHIPCompilerId.hip) + + if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + include(Internal/CMakeCUDAArchitecturesAll) + # From CMAKE_HIP_COMPILER_CUDA_TOOLKIT_VERSION and CMAKE_HIP_COMPILER_{ID,VERSION}, get: + # - CMAKE_HIP_ARCHITECTURES_ALL + # - CMAKE_HIP_ARCHITECTURES_ALL_MAJOR + # Match arguments with cmake_cuda_find_toolkit call. + cmake_cuda_architectures_all(HIP CMAKE_HIP_COMPILER_CUDA_) + endif() + + _cmake_find_compiler_sysroot(HIP) +endif() + +if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT AND CMAKE_HIP_COMPILER_ID STREQUAL "Clang") + execute_process(COMMAND "${CMAKE_HIP_COMPILER}" -v -print-targets + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE _CMAKE_HIP_COMPILER_RESULT + OUTPUT_VARIABLE _CMAKE_HIP_COMPILER_STDOUT + ERROR_VARIABLE _CMAKE_HIP_COMPILER_STDERR + ) + + if(_CMAKE_HIP_COMPILER_RESULT EQUAL 0 AND _CMAKE_HIP_COMPILER_STDERR MATCHES "Found HIP installation: *([^,]*)[,\n]") + set(CMAKE_HIP_COMPILER_ROCM_ROOT "${CMAKE_MATCH_1}") + file(TO_CMAKE_PATH "${CMAKE_HIP_COMPILER_ROCM_ROOT}" CMAKE_HIP_COMPILER_ROCM_ROOT) + endif() +endif() +if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT) + execute_process( + COMMAND hipconfig --rocmpath + OUTPUT_VARIABLE _CMAKE_HIPCONFIG_ROCMPATH + RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT + ) + if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_ROCMPATH}") + set(CMAKE_HIP_COMPILER_ROCM_ROOT "${_CMAKE_HIPCONFIG_ROCMPATH}") + endif() +endif() +if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT) + message(FATAL_ERROR "Failed to find ROCm root directory.") +endif() + +if(CMAKE_HIP_PLATFORM STREQUAL "amd") + # For this platform we need the hip-lang cmake package. + + # Normally implicit link information is not detected until ABI detection, + # but we need to populate CMAKE_HIP_LIBRARY_ARCHITECTURE to find hip-lang. + cmake_parse_implicit_link_info("${CMAKE_HIP_COMPILER_PRODUCED_OUTPUT}" + _CMAKE_HIP_COMPILER_ID_IMPLICIT_LIBS + _CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS + _CMAKE_HIP_COMPILER_ID_IMPLICIT_FWKS + _CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG + "" LANGUAGE HIP) + message(CONFIGURE_LOG + "Parsed HIP implicit link information from compiler id output:\n${_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG}\n\n") + cmake_parse_library_architecture(HIP "${_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS}" "" CMAKE_HIP_LIBRARY_ARCHITECTURE) + if(CMAKE_HIP_LIBRARY_ARCHITECTURE) + message(CONFIGURE_LOG + "Parsed HIP library architecture from compiler id output: ${CMAKE_HIP_LIBRARY_ARCHITECTURE}\n") + endif() + unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_LIBS) + unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS) + unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_FWKS) + unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG) + + if(NOT CMAKE_HIP_COMPILER_ROCM_LIB) + set(_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS + "${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib" + "${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib64" + ) + if(CMAKE_HIP_LIBRARY_ARCHITECTURE) + list(APPEND _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS "${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib/${CMAKE_HIP_LIBRARY_ARCHITECTURE}") + endif() + foreach(dir IN LISTS _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS) + if(EXISTS "${dir}/cmake/hip-lang/hip-lang-config.cmake") + set(CMAKE_HIP_COMPILER_ROCM_LIB "${dir}") + break() + endif() + endforeach() + if(NOT CMAKE_HIP_COMPILER_ROCM_LIB) + list(TRANSFORM _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS APPEND "/cmake/hip-lang/hip-lang-config.cmake") + string(REPLACE ";" "\n " _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS "${_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS}") + message(FATAL_ERROR + "The ROCm root directory:\n" + " ${CMAKE_HIP_COMPILER_ROCM_ROOT}\n" + "does not contain the HIP runtime CMake package, expected at one of:\n" + " ${_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS}\n" + ) + endif() + unset(_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS) + endif() + if(NOT DEFINED CMAKE_SIZEOF_VOID_P) + # We have not yet determined the target ABI but we need 'find_package' to + # search lib64 directories to find hip-lang CMake package dependencies. + # This will be replaced by ABI detection later. + set(CMAKE_HIP_SIZEOF_DATA_PTR 8) + endif() +endif() + +if (NOT _CMAKE_TOOLCHAIN_LOCATION) + get_filename_component(_CMAKE_TOOLCHAIN_LOCATION "${CMAKE_HIP_COMPILER}" PATH) +endif () + +set(_CMAKE_PROCESSING_LANGUAGE "HIP") +include(CMakeFindBinUtils) +include(Compiler/${CMAKE_HIP_COMPILER_ID}-FindBinUtils OPTIONAL) +unset(_CMAKE_PROCESSING_LANGUAGE) + +if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang") + set(CMAKE_HIP_RUNTIME_LIBRARY_DEFAULT "SHARED") +elseif(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + include(Internal/CMakeNVCCParseImplicitInfo) + # Parse CMAKE_HIP_COMPILER_PRODUCED_OUTPUT to get: + # - CMAKE_HIP_ARCHITECTURES_DEFAULT + # - CMAKE_HIP_HOST_IMPLICIT_LINK_DIRECTORIES + # - CMAKE_HIP_HOST_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES + # - CMAKE_HIP_HOST_IMPLICIT_LINK_LIBRARIES + # - CMAKE_HIP_HOST_LINK_LAUNCHER + # - CMAKE_HIP_RUNTIME_LIBRARY_DEFAULT + # - CMAKE_HIP_CUDA_TOOLKIT_INCLUDE_DIRECTORIES + # Match arguments with cmake_nvcc_filter_implicit_info call in CMakeTestHIPCompiler. + cmake_nvcc_parse_implicit_info(HIP CMAKE_HIP_CUDA_) + + include(Internal/CMakeCUDAFilterImplicitLibs) + # Filter out implicit link libraries that should not be passed unconditionally. + cmake_cuda_filter_implicit_libs(CMAKE_HIP_HOST_IMPLICIT_LINK_LIBRARIES) +endif() + +if(CMAKE_HIP_COMPILER_SYSROOT) + string(CONCAT _SET_CMAKE_HIP_COMPILER_SYSROOT + "set(CMAKE_HIP_COMPILER_SYSROOT \"${CMAKE_HIP_COMPILER_SYSROOT}\")\n" + "set(CMAKE_COMPILER_SYSROOT \"${CMAKE_HIP_COMPILER_SYSROOT}\")") +else() + set(_SET_CMAKE_HIP_COMPILER_SYSROOT "") +endif() + +if(CMAKE_HIP_COMPILER_ARCHITECTURE_ID) + set(_SET_CMAKE_HIP_COMPILER_ARCHITECTURE_ID + "set(CMAKE_HIP_COMPILER_ARCHITECTURE_ID ${CMAKE_HIP_COMPILER_ARCHITECTURE_ID})") +else() + set(_SET_CMAKE_HIP_COMPILER_ARCHITECTURE_ID "") +endif() + +if(MSVC_HIP_ARCHITECTURE_ID) + set(SET_MSVC_HIP_ARCHITECTURE_ID + "set(MSVC_HIP_ARCHITECTURE_ID ${MSVC_HIP_ARCHITECTURE_ID})") +endif() + +if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + if(NOT "$ENV{CUDAARCHS}" STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "$ENV{CUDAARCHS}" CACHE STRING "CUDA architectures") + endif() + + # If the user did not set CMAKE_HIP_ARCHITECTURES, use the compiler's default. + if("${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES_DEFAULT}" CACHE STRING "HIP architectures" FORCE) + if(NOT CMAKE_HIP_ARCHITECTURES) + message(FATAL_ERROR "Failed to detect a default HIP architecture.\n\nCompiler output:\n${CMAKE_HIP_COMPILER_PRODUCED_OUTPUT}") + endif() + endif() + unset(CMAKE_HIP_ARCHITECTURES_DEFAULT) +elseif(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + # Use 'rocm_agent_enumerator' to get the current GPU architecture. + set(_CMAKE_HIP_ARCHITECTURES) + find_program(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR + NAMES rocm_agent_enumerator + HINTS "${CMAKE_HIP_COMPILER_ROCM_ROOT}/bin" + NO_CACHE) + if(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR) + execute_process(COMMAND "${_CMAKE_HIP_ROCM_AGENT_ENUMERATOR}" -t GPU + RESULT_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_RESULT + OUTPUT_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT + ERROR_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_STDERR + ) + if(_CMAKE_ROCM_AGENT_ENUMERATOR_RESULT EQUAL 0) + separate_arguments(_hip_archs NATIVE_COMMAND "${_CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT}") + foreach(_hip_arch ${_hip_archs}) + if(_hip_arch STREQUAL "gfx000") + continue() + endif() + string(FIND ${_hip_arch} ":" pos) + if(NOT pos STREQUAL "-1") + string(SUBSTRING ${_hip_arch} 0 ${pos} _hip_arch) + endif() + list(APPEND _CMAKE_HIP_ARCHITECTURES "${_hip_arch}") + endforeach() + endif() + unset(_CMAKE_ROCM_AGENT_ENUMERATOR_RESULT) + unset(_CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT) + unset(_CMAKE_ROCM_AGENT_ENUMERATOR_STDERR) + endif() + unset(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR) + if(_CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES "${_CMAKE_HIP_ARCHITECTURES}" CACHE STRING "HIP architectures") + elseif(CMAKE_HIP_COMPILER_PRODUCED_OUTPUT MATCHES " -target-cpu ([a-z0-9]+) ") + set(CMAKE_HIP_ARCHITECTURES "${CMAKE_MATCH_1}" CACHE STRING "HIP architectures") + else() + message(FATAL_ERROR "Failed to find a default HIP architecture.") + endif() + unset(_CMAKE_HIP_ARCHITECTURES) +endif() + +# configure variables set in this file for fast reload later on +configure_file(${CMAKE_ROOT}/Modules/CMakeHIPCompiler.cmake.in + ${CMAKE_PLATFORM_INFO_DIR}/CMakeHIPCompiler.cmake + @ONLY + ) +set(CMAKE_HIP_COMPILER_ENV_VAR "HIPCXX") diff --git a/docs/rocm-apu.md b/docs/rocm-apu.md new file mode 100644 index 000000000..a0bb645f2 --- /dev/null +++ b/docs/rocm-apu.md @@ -0,0 +1,91 @@ +# Experimental ROCm iGPU Support + +This branch adds a ROCm backend path geared toward AMD APUs that only expose a small VRAM aperture but share a large UMA pool with the CPU. The steps below outline how to reproduce the build and how to run Ollama with the staged ROCm runtime. + +> **Warning** +> Upstream ROCm does not officially support these APUs yet. Expect driver updates, kernel parameters, or environment variables such as `HSA_OVERRIDE_GFX_VERSION` to change between releases. + +## 1. Stage the ROCm runtime + +We avoid touching the system installation by unpacking the required RPMs into `build/rocm-stage`. + +```bash +mkdir -p build/rocm-stage build/rpm-tmp +cd build/rpm-tmp +dnf download \ + hipblas hipblas-devel hipblas-common-devel \ + rocblas rocblas-devel \ + rocsolver rocsolver-devel \ + rocm-hip-devel rocm-device-libs rocm-comgr rocm-comgr-devel + +cd ../rocm-stage +for rpm in ../rpm-tmp/*.rpm; do + echo "extracting ${rpm}" + rpm2cpio "${rpm}" | bsdtar -xf - +done +``` + +Important staged paths after extraction: + +| Purpose | Location | +| ------------------------ | ----------------------------------------------- | +| HIP/rocBLAS libraries | `build/rocm-stage/lib64` | +| Tensile kernels (rocBLAS)| `build/rocm-stage/lib64/rocblas/library` | +| Headers (`hip`, `rocblas`)| `build/rocm-stage/include` | + +## 2. Build the ROCm backend + +Configure CMake with the preset that targets ROCm 6.x and point it at the staged HIP compiler: + +```bash +cmake --preset "ROCm 6" -B build/rocm \ + -DGGML_VULKAN=OFF \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DCMAKE_HIP_COMPILER=/usr/bin/hipcc \ + -DCMAKE_PREFIX_PATH="$PWD/build/rocm-stage" + +cmake --build build/rocm --target ggml-hip -j$(nproc) +``` + +Artifacts land in `build/lib/ollama/rocm` (and mirrored in `dist/lib/ollama/rocm` when packaging). These include `libggml-hip.so`, CPU fallback variants, Vulkan, and `librocsolver.so`. + +## 3. Run Ollama on ROCm + +The runner needs to see both the GGML plugins and the staged ROCm runtime. The following environment block works for an AMD Radeon 760M with a UMA carve-out: + +```bash +export BASE=$HOME/ollama-gpu +export OLLAMA_LIBRARY_PATH=$BASE/build/lib/ollama/rocm:$BASE/build/lib/ollama +export LD_LIBRARY_PATH=$OLLAMA_LIBRARY_PATH:$BASE/build/rocm-stage/lib64:${LD_LIBRARY_PATH:-} +export ROCBLAS_TENSILE_LIBPATH=$BASE/build/rocm-stage/lib64/rocblas/library +export ROCBLAS_TENSILE_PATH=$ROCBLAS_TENSILE_LIBPATH + +export HSA_OVERRIDE_GFX_VERSION=11.0.0 # spoof gfx1100 for Phoenix +export GGML_HIP_FORCE_GTT=1 # force GTT allocations for UMA memory +export OLLAMA_GPU_DRIVER=rocm +export OLLAMA_GPU=100 # opt into GPU-only scheduling +export OLLAMA_LLM_LIBRARY=rocm # skip CUDA/Vulkan discovery noise +export OLLAMA_VULKAN=0 # optional: suppress Vulkan backend + +$BASE/build/ollama serve +``` + +On launch you should see log lines similar to: + +``` +library=ROCm compute=gfx1100 name=ROCm0 description="AMD Radeon 760M Graphics" +ggml_hip_get_device_memory using GTT memory for 0000:0e:00.0 (total=16352354304 free=15034097664) +``` + +If the runner crashes before enumerating devices: + +- Double-check that `ROCBLAS_TENSILE_LIBPATH` points to the staged `rocblas/library`. +- Ensure no other `LD_LIBRARY_PATH` entries override `libamdhip64.so`. +- Try unsetting `HSA_OVERRIDE_GFX_VERSION` to confirm whether the kernel patch is still needed on your system. + +## 4. Sharing this build + +- Keep the staged RPMs alongside the branch so others can reproduce the exact runtime. +- Include `/tmp/ollama_rocm_run.log` or similar discovery logs in issues/PRs to help maintainers understand the UMA setup. +- Mention any kernel parameters (e.g., large UMA buffer in firmware) when opening upstream tickets. + diff --git a/llm/server.go b/llm/server.go index c83bd5a40..8464e89f9 100644 --- a/llm/server.go +++ b/llm/server.go @@ -360,6 +360,36 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W // Note: we always put our dependency paths first // since these are the exact version we compiled/linked against + userLibs := []string{} + if existing, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH"); ok && existing != "" { + userLibs = filepath.SplitList(existing) + } + if len(userLibs) != 0 { + seen := make(map[string]struct{}, len(userLibs)+len(gpuLibs)) + merged := make([]string, 0, len(userLibs)+len(gpuLibs)) + for _, dir := range userLibs { + if dir == "" { + continue + } + if _, ok := seen[dir]; ok { + continue + } + seen[dir] = struct{}{} + merged = append(merged, dir) + } + for _, dir := range gpuLibs { + if dir == "" { + continue + } + if _, ok := seen[dir]; ok { + continue + } + seen[dir] = struct{}{} + merged = append(merged, dir) + } + gpuLibs = merged + } + libraryPaths := append([]string{}, gpuLibs...) if libraryPath, ok := os.LookupEnv(pathEnv); ok { libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h index d89e35a8e..9e92f3fd1 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h +++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h @@ -1,5 +1,107 @@ #pragma once +#include +#include + +#if defined(__HIP_PLATFORM_AMD__) +#include +#include + +static __host__ __device__ inline float ggml_hip_max_f32(float a, float b) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_fmax_f32(a, b); +#else + return a > b ? a : b; +#endif +} + +static __host__ __device__ inline double ggml_hip_max_f64(double a, double b) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_fmax_f64(a, b); +#else + return a > b ? a : b; +#endif +} + +static __host__ __device__ inline float ggml_hip_min_f32(float a, float b) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_fmin_f32(a, b); +#else + return a < b ? a : b; +#endif +} + +static __host__ __device__ inline double ggml_hip_min_f64(double a, double b) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_fmin_f64(a, b); +#else + return a < b ? a : b; +#endif +} + +static __host__ __device__ inline float ggml_hip_pow_f32(float base, float exp) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_pow_f32(base, exp); +#else + return std::pow(base, exp); +#endif +} + +static __host__ __device__ inline double ggml_hip_pow_f64(double base, double exp) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_pow_f64(base, exp); +#else + return std::pow(base, exp); +#endif +} + +template ::value && std::is_integral::value, int>::type = 0> +__host__ __device__ inline typename std::common_type::type ggml_hip_int_max(A a, B b) { + using R = typename std::common_type::type; + const R aa = static_cast(a); + const R bb = static_cast(b); + return aa > bb ? aa : bb; +} + +template ::value && std::is_integral::value, int>::type = 0> +__host__ __device__ inline typename std::common_type::type ggml_hip_int_min(A a, B b) { + using R = typename std::common_type::type; + const R aa = static_cast(a); + const R bb = static_cast(b); + return aa < bb ? aa : bb; +} + +__host__ __device__ inline float max(float a, float b) { + return ggml_hip_max_f32(a, b); +} + +__host__ __device__ inline double max(double a, double b) { + return ggml_hip_max_f64(a, b); +} + +template ::value && std::is_integral::value, int>::type = 0> +__host__ __device__ inline typename std::common_type::type max(A a, B b) { + return ggml_hip_int_max(a, b); +} + +__host__ __device__ inline float min(float a, float b) { + return ggml_hip_min_f32(a, b); +} + +__host__ __device__ inline double min(double a, double b) { + return ggml_hip_min_f64(a, b); +} + +template ::value && std::is_integral::value, int>::type = 0> +__host__ __device__ inline typename std::common_type::type min(A a, B b) { + return ggml_hip_int_min(a, b); +} +#endif // defined(__HIP_PLATFORM_AMD__) + #define HIP_DISABLE_WARP_SYNC_BUILTINS 1 #include #include @@ -8,6 +110,232 @@ // for rocblas_initialize() #include "rocblas/rocblas.h" +#if defined(__HIP_PLATFORM_AMD__) +#undef fmaxf +#define fmaxf(a, b) ggml_hip_max_f32((a), (b)) +#undef fminf +#define fminf(a, b) ggml_hip_min_f32((a), (b)) +#undef powf +#define powf(a, b) ggml_hip_pow_f32((a), (b)) + +static __host__ __device__ inline float ggml_hip_expf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_exp_f32(x); +#else + using std::exp; + return static_cast(exp(x)); +#endif +} +#undef expf +#define expf(x) ggml_hip_expf((x)) + +static __host__ __device__ inline float ggml_hip_expm1f(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_expm1_f32(x); +#else + using std::expm1; + return static_cast(expm1(x)); +#endif +} +#undef expm1f +#define expm1f(x) ggml_hip_expm1f((x)) + +static __host__ __device__ inline float ggml_hip_logf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_log_f32(x); +#else + using std::log; + return static_cast(log(x)); +#endif +} +#undef logf +#define logf(x) ggml_hip_logf((x)) + +static __host__ __device__ inline float ggml_hip_log1pf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_log1p_f32(x); +#else + using std::log1p; + return static_cast(log1p(x)); +#endif +} +#undef log1pf +#define log1pf(x) ggml_hip_log1pf((x)) + +static __host__ __device__ inline float ggml_hip_log2f(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_log2_f32(x); +#else + using std::log2; + return static_cast(log2(x)); +#endif +} +#undef log2f +#define log2f(x) ggml_hip_log2f((x)) + +static __host__ __device__ inline float ggml_hip_tanhf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_tanh_f32(x); +#else + using std::tanh; + return static_cast(tanh(x)); +#endif +} +#undef tanhf +#define tanhf(x) ggml_hip_tanhf((x)) + +static __host__ __device__ inline float ggml_hip_sinf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_sin_f32(x); +#else + using std::sin; + return static_cast(sin(x)); +#endif +} +#undef sinf +#define sinf(x) ggml_hip_sinf((x)) + +static __host__ __device__ inline float ggml_hip_cosf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_cos_f32(x); +#else + using std::cos; + return static_cast(cos(x)); +#endif +} +#undef cosf +#define cosf(x) ggml_hip_cosf((x)) + +static __host__ __device__ inline float ggml_hip_erff(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_erf_f32(x); +#else + using std::erf; + return static_cast(erf(x)); +#endif +} +#undef erff +#define erff(x) ggml_hip_erff((x)) + +static __host__ __device__ inline float ggml_hip_fabsf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_fabs_f32(x); +#else + using std::fabs; + return static_cast(fabs(x)); +#endif +} +#undef fabsf +#define fabsf(x) ggml_hip_fabsf((x)) + +static __host__ __device__ inline float ggml_hip_floorf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_floor_f32(x); +#else + using std::floor; + return static_cast(floor(x)); +#endif +} +#undef floorf +#define floorf(x) ggml_hip_floorf((x)) + +static __host__ __device__ inline float ggml_hip_ceilf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_ceil_f32(x); +#else + using std::ceil; + return static_cast(ceil(x)); +#endif +} +#undef ceilf +#define ceilf(x) ggml_hip_ceilf((x)) + +static __host__ __device__ inline float ggml_hip_roundf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_round_f32(x); +#else + using std::round; + return static_cast(round(x)); +#endif +} +#undef roundf +#define roundf(x) ggml_hip_roundf((x)) + +static __host__ __device__ inline float ggml_hip_round_scalar(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_round_f32(x); +#else + using std::round; + return static_cast(round(x)); +#endif +} + +static __host__ __device__ inline double ggml_hip_round_scalar(double x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_round_f64(x); +#else + using std::round; + return round(x); +#endif +} +#undef round +#define round(x) ggml_hip_round_scalar((x)) + +static __host__ __device__ inline float ggml_hip_sqrtf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_sqrt_f32(x); +#else + using std::sqrt; + return static_cast(sqrt(x)); +#endif +} +#undef sqrtf +#define sqrtf(x) ggml_hip_sqrtf((x)) + +static __host__ __device__ inline float ggml_hip_rsqrtf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_rsqrt_f32(x); +#else + using std::sqrt; + return 1.0f / static_cast(sqrt(x)); +#endif +} +#undef rsqrtf +#define rsqrtf(x) ggml_hip_rsqrtf((x)) + +static __host__ __device__ inline float ggml_hip_trunc_scalar(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_trunc_f32(x); +#else + using std::trunc; + return static_cast(trunc(x)); +#endif +} + +static __host__ __device__ inline double ggml_hip_trunc_scalar(double x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_trunc_f64(x); +#else + using std::trunc; + return trunc(x); +#endif +} +#undef trunc +#define trunc(x) ggml_hip_trunc_scalar((x)) + +static __host__ __device__ inline int ggml_hip_isinf(float x) { +#if defined(__HIP_DEVICE_COMPILE__) + return __ocml_isinf_f32(x); +#else + using std::isinf; + return static_cast(isinf(x)); +#endif +} +#undef isinf +#define isinf(x) ggml_hip_isinf((x)) + +#endif + #if defined(GGML_HIP_ROCWMMA_FATTN) #include #endif // defined(GGML_HIP_ROCWMMA_FATTN) diff --git a/ml/backend/ggml/ggml/src/mem_hip.cpp b/ml/backend/ggml/ggml/src/mem_hip.cpp index 23c765806..3703f6a40 100644 --- a/ml/backend/ggml/ggml/src/mem_hip.cpp +++ b/ml/backend/ggml/ggml/src/mem_hip.cpp @@ -442,6 +442,8 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool #include #include #include +#include +#include #include #include @@ -449,6 +451,60 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool #include namespace fs = std::filesystem; +namespace { + +static bool ggml_env_flag_enabled(const char *name) { + const char *value = getenv(name); + if (value == nullptr) { + return false; + } + if (*value == '\0') { + return true; + } + + if ((value[0] == '0' && value[1] == '\0') || + strcasecmp(value, "false") == 0 || + strcasecmp(value, "off") == 0) { + return false; + } + return true; +} + +static bool ggml_read_sysfs_value(const std::string &path, uint64_t &value) { + std::ifstream stream(path.c_str()); + if (!stream.is_open()) { + GGML_LOG_DEBUG("%s unable to open sysfs node %s\n", __func__, path.c_str()); + return false; + } + stream >> value; + if (stream.fail()) { + GGML_LOG_DEBUG("%s unable to parse sysfs node %s\n", __func__, path.c_str()); + return false; + } + return true; +} + +static bool ggml_should_use_gtt(uint64_t vram_total, uint64_t gtt_total) { + if (ggml_env_flag_enabled("GGML_HIP_DISABLE_GTT")) { + return false; + } + if (ggml_env_flag_enabled("GGML_HIP_FORCE_GTT")) { + return gtt_total > 0; + } + if (gtt_total == 0) { + return false; + } + + const uint64_t umaThreshold = 1024ull * 1024ull * 1024ull; // 1 GiB + if (vram_total == 0) { + return true; + } + + return vram_total <= umaThreshold && gtt_total > vram_total; +} + +} // namespace + extern "C" { int ggml_hip_mgmt_init() { @@ -460,9 +516,9 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool const std::string drmDeviceGlob = "/sys/class/drm/card*/device/uevent"; const std::string drmTotalMemoryFile = "mem_info_vram_total"; const std::string drmUsedMemoryFile = "mem_info_vram_used"; - const std::string drmGTTTotalMemoryFile = "mem_info_gtt_total"; - const std::string drmGTTUsedMemoryFile = "mem_info_gtt_used"; const std::string drmUeventPCISlotLabel = "PCI_SLOT_NAME="; + const std::string drmGttTotalFile = "mem_info_gtt_total"; + const std::string drmGttUsedFile = "mem_info_gtt_used"; glob_t glob_result; @@ -511,33 +567,29 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool uint64_t memoryUsed; usedFileStream >> memoryUsed; - if (is_integrated_gpu) { - std::string totalFile = dir + "/" + drmGTTTotalMemoryFile; - std::ifstream totalFileStream(totalFile.c_str()); - if (!totalFileStream.is_open()) { - GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, totalFile.c_str()); - file.close(); - globfree(&glob_result); - return 1; - } - uint64_t gtt; - totalFileStream >> gtt; - std::string usedFile = dir + "/" + drmGTTUsedMemoryFile; - std::ifstream usedFileStream(usedFile.c_str()); - if (!usedFileStream.is_open()) { - GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, usedFile.c_str()); - file.close(); - globfree(&glob_result); - return 1; - } - uint64_t gttUsed; - usedFileStream >> gttUsed; - memory += gtt; - memoryUsed += gttUsed; - } + uint64_t gttTotal = 0; + uint64_t gttUsed = 0; + bool hasGttTotal = ggml_read_sysfs_value(dir + "/" + drmGttTotalFile, gttTotal); + bool hasGttUsed = ggml_read_sysfs_value(dir + "/" + drmGttUsedFile, gttUsed); + bool useGtt = ggml_should_use_gtt(memory, hasGttTotal ? gttTotal : 0); - *total = memory; - *free = memory - memoryUsed; + if (useGtt && hasGttTotal) { + uint64_t freeGtt = gttTotal; + if (hasGttUsed && gttTotal > gttUsed) { + freeGtt = gttTotal - gttUsed; + } + *total = gttTotal; + *free = freeGtt; + GGML_LOG_INFO("%s using GTT memory for %s (total=%zu free=%zu)\n", + __func__, id, *total, *free); + } else { + *total = memory; + if (memory > memoryUsed) { + *free = memory - memoryUsed; + } else { + *free = 0; + } + } file.close(); globfree(&glob_result); @@ -555,4 +607,4 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool } // extern "C" -#endif // #ifdef _WIN32 \ No newline at end of file +#endif // #ifdef _WIN32 diff --git a/ml/device.go b/ml/device.go index 47e180d30..50558fc0e 100644 --- a/ml/device.go +++ b/ml/device.go @@ -17,6 +17,7 @@ import ( "strings" "time" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/logutil" ) @@ -538,6 +539,9 @@ func GetVisibleDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string { func (d DeviceInfo) NeedsInitValidation() bool { // ROCm: rocblas will crash on unsupported devices. // CUDA: verify CC is supported by the version of the library + if d.Library == "ROCm" && envconfig.HsaOverrideGfxVersion() != "" { + return false + } return d.Library == "ROCm" || d.Library == "CUDA" }