From c86af47ac0a8788a187c602377fc3911e9ce630f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Fri, 5 Sep 2025 08:25:03 -0700
Subject: [PATCH] WIP - wire up Vulkan with the new engine based discovery

Not a complete implementation - free VRAM is better, but not accurate on
windows
---
 CMakeLists.txt                                |  12 +-
 Dockerfile                                    |  20 +-
 discover/gpu.go                               |  38 +-
 discover/gpu_info_vulkan.c                    | 241 -----------
 discover/gpu_info_vulkan.h                    | 394 ------------------
 discover/runner.go                            |   5 +-
 discover/types.go                             |  12 +-
 llm/server.go                                 |   3 +-
 .../ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp | 148 ++++++-
 scripts/build_windows.ps1                     |   7 +-
 10 files changed, 192 insertions(+), 688 deletions(-)
 delete mode 100644 discover/gpu_info_vulkan.c
 delete mode 100644 discover/gpu_info_vulkan.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29fbd00cd..94114a709 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,7 +114,6 @@ if(CMAKE_HIP_COMPILER)
 
         target_compile_definitions(ggml-hip PRIVATE GGML_HIP_NO_VMM)
 
-        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
         install(TARGETS ggml-hip
             RUNTIME_DEPENDENCY_SET rocm
             RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
@@ -125,13 +124,13 @@ if(CMAKE_HIP_COMPILER)
                 PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                 PRE_EXCLUDE_REGEXES ".*"
                 POST_EXCLUDE_REGEXES "system32"
-            RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
-            LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
+            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
         )
 
         foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR})
             if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas)
-                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP)
+                install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP)
                 break()
             endif()
         endforeach()
@@ -141,12 +140,11 @@ endif()
 find_package(Vulkan)
 if(Vulkan_FOUND)
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
-    set(OLLAMA_VULKAN_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/vulkan)
     install(TARGETS ggml-vulkan
         RUNTIME_DEPENDENCIES
             PRE_INCLUDE_REGEXES vulkan
             PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_VULKAN_INSTALL_DIR} COMPONENT Vulkan
-        LIBRARY DESTINATION ${OLLAMA_VULKAN_INSTALL_DIR} COMPONENT Vulkan
+        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
+        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
     )
 endif()
diff --git a/Dockerfile b/Dockerfile
index aeab5947f..7478fbd95 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@ ARG ROCMVERSION=6.3.3
 ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2
-ARG VULKANVERSION=1.4.313.2
+ARG VULKANVERSION=1.4.321.1
 
 # We require gcc v10 minimum.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
@@ -88,7 +88,7 @@ FROM base AS rocm-6
 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'ROCm 6' \
+    cmake --preset 'ROCm 6' -DOLLAMA_RUNNER_DIR="rocm" \
         && cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
         && cmake --install build --component HIP --strip --parallel ${PARALLEL}
 
@@ -100,7 +100,7 @@ COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 5' \
+    cmake --preset 'JetPack 5' -DOLLAMA_RUNNER_DIR="cuda_jetpack5" \
         && cmake --build --parallel ${PARALLEL} --preset 'JetPack 5' \
         && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
 
@@ -112,13 +112,13 @@ COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'JetPack 6' \
+    cmake --preset 'JetPack 6' -DOLLAMA_RUNNER_DIR="cuda_jetpack6" \
         && cmake --build --parallel ${PARALLEL} --preset 'JetPack 6' \
         && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
 
 FROM base AS vulkan
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'Vulkan' \
+    cmake --preset 'Vulkan' -DOLLAMA_RUNNER_DIR="vulkan" \
         && cmake --build --parallel --preset 'Vulkan' \
         && cmake --install build --component Vulkan --strip --parallel 8 
 
@@ -140,15 +140,15 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
 FROM --platform=linux/amd64 scratch AS amd64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
 COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
-COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
-COPY --from=vulkan  dist/lib/ollama/vulkan  /lib/ollama/vulkan
+COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
+COPY --from=vulkan  dist/lib/ollama  /lib/ollama/
 
 FROM --platform=linux/arm64 scratch AS arm64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
 COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
-COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/
-COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6
+COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
+COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/
+COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/
 
 FROM scratch AS rocm
 COPY --from=rocm-6 dist/lib/ollama /lib/ollama
diff --git a/discover/gpu.go b/discover/gpu.go
index 872b06c64..0cae79005 100644
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -71,11 +71,9 @@ func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
 		} else {
 			info.Compute = fmt.Sprintf("%d.%d", dev.ComputeMajor, dev.ComputeMinor)
 		}
+		// TODO any special processing of Vulkan devices?
 		resp = append(resp, info)
 	}
-	for _, gpu := range vulkanGPUs {
-		resp = append(resp, gpu.GpuInfo)
-	}
 	if len(resp) == 0 {
 		mem, err := GetCPUMem()
 		if err != nil {
@@ -93,18 +91,20 @@ func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
 
 // Given the list of GPUs this instantiation is targeted for,
 // figure out the visible devices environment variable
-//
-// # If different libraries are detected, the first one is what we use
-//
-// TODO once we're purely running on the new runner, this level of device
-// filtering will no longer be necessary.  Instead the runner can be told which
-// of the set of GPUs to utilize and handle filtering itself, instead of relying
-// on the env var to hide devices from the underlying GPU libraries
 func (l GpuInfoList) GetVisibleDevicesEnv() []string {
 	if len(l) == 0 {
 		return nil
 	}
-	return []string{rocmGetVisibleDevicesEnv(l)}
+	res := []string{}
+	envVar := rocmGetVisibleDevicesEnv(l)
+	if envVar != "" {
+		res = append(res, envVar)
+	}
+	envVar = vkGetVisibleDevicesEnv(l)
+	if envVar != "" {
+		res = append(res, envVar)
+	}
+	return res
 }
 
 func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
@@ -134,6 +134,22 @@ func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
 	return envVar + strings.Join(ids, ",")
 }
 
+func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
+	ids := []string{}
+	for _, info := range gpuInfo {
+		if info.Library != "VULKAN" {
+			continue
+		}
+		ids = append(ids, info.ID)
+
+	}
+	if len(ids) == 0 {
+		return ""
+	}
+	envVar := "GGML_VK_VISIBLE_DEVICES="
+	return envVar + strings.Join(ids, ",")
+}
+
 // GetSystemInfo returns the last cached state of the GPUs on the system
 func GetSystemInfo() SystemInfo {
 	deviceMu.Lock()
diff --git a/discover/gpu_info_vulkan.c b/discover/gpu_info_vulkan.c
deleted file mode 100644
index 65033ad8a..000000000
--- a/discover/gpu_info_vulkan.c
+++ /dev/null
@@ -1,241 +0,0 @@
-#ifndef __APPLE__
-#include "gpu_info_vulkan.h"
-
-#include <string.h>
-
-int is_extension_supported(vk_handle_t* rh, VkPhysicalDevice device, char* extension) {
-  VkPhysicalDeviceProperties properties = {};
-  (*rh->vkGetPhysicalDeviceProperties)(device, &properties);
-
-  uint32_t extensionCount;
-  (*rh->vkEnumerateDeviceExtensionProperties)(device, NULL, &extensionCount, NULL);
-
-  if (extensionCount == 0) {
-    return 0;
-  }
-
-  VkExtensionProperties* extensions = malloc(extensionCount * sizeof(VkExtensionProperties));
-  if (extensions == NULL) {
-    return 0;
-  }
-
-  (*rh->vkEnumerateDeviceExtensionProperties)(device, NULL, &extensionCount, extensions);
-
-  for (int j = 0; j < extensionCount; j++) {
-    if (strcmp(extensions[j].extensionName, extension) == 0) {
-      free(extensions);
-      return 1;
-    }
-  }
-
-  free(extensions);
-  return 0;
-}
-
-void vk_init(char* vk_lib_path, vk_init_resp_t *resp) {
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"vkGetPhysicalDeviceProperties", (void *)&resp->ch.vkGetPhysicalDeviceProperties},
-      {"vkGetPhysicalDeviceProperties2", (void *)&resp->ch.vkGetPhysicalDeviceProperties2},
-      {"vkEnumerateDeviceExtensionProperties", (void *)&resp->ch.vkEnumerateDeviceExtensionProperties},
-      {"vkCreateInstance", (void *)&resp->ch.vkCreateInstance},
-      {"vkEnumeratePhysicalDevices", (void *)&resp->ch.vkEnumeratePhysicalDevices},
-      {"vkGetPhysicalDeviceMemoryProperties2", (void *)&resp->ch.vkGetPhysicalDeviceMemoryProperties2},
-      {"vkDestroyInstance", (void *)&resp->ch.vkDestroyInstance},
-      {NULL, NULL},
-  };
-
-  resp->ch.vk_handle = LOAD_LIBRARY(vk_lib_path, RTLD_LAZY);
-  if (!resp->ch.vk_handle) {
-    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", vk_lib_path, msg);
-    snprintf(buf, buflen,
-            "Unable to load %s library to query for Vulkan GPUs: %s",
-            vk_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  for (i = 0; l[i].s != NULL; i++) {
-    *l[i].p = LOAD_SYMBOL(resp->ch.vk_handle, l[i].s);
-    if (!*l[i].p) {
-      char *msg = LOAD_ERR();
-      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->ch.vk_handle);
-      resp->ch.vk_handle = NULL;
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
-              msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-
-  VkInstance instance;
-
-  VkApplicationInfo appInfo = {};
-  appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
-  appInfo.pNext = NULL;
-  appInfo.pApplicationName = "Ollama";
-  appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
-  appInfo.pEngineName = "No Engine";
-  appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
-  appInfo.apiVersion = VK_API_VERSION_1_2;
-
-  VkInstanceCreateInfo createInfo = {};
-  createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
-  createInfo.pNext = NULL;
-  createInfo.flags = 0;
-  createInfo.enabledExtensionCount = 1;
-  const char* extensions[] = { VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME };
-  createInfo.ppEnabledExtensionNames = extensions;
-  createInfo.pApplicationInfo = &appInfo;
-
-  VkResult result = (*resp->ch.vkCreateInstance)(&createInfo, NULL, &instance);
-  if (result != VK_SUCCESS) {
-    resp->err = strdup("failed to create instance");
-    return;
-  }
-
-  uint32_t deviceCount;
-  result = (*resp->ch.vkEnumeratePhysicalDevices)(instance, &deviceCount, NULL);
-  if (result != VK_SUCCESS) {
-    resp->err = strdup("failed to enumerate physical devices");
-    return;
-  }
-
-  resp->err = NULL;
-  resp->ch.vk = instance;
-  resp->ch.num_devices = deviceCount;
-  resp->num_devices = deviceCount;
-}
-
-int vk_check_flash_attention(vk_handle_t rh, int i) {
-  VkInstance instance = rh.vk;
-  uint32_t deviceCount = rh.num_devices;
-
-  VkPhysicalDevice* devices = malloc(deviceCount * sizeof(VkPhysicalDevice));
-  if (devices == NULL) {
-    return 0;
-  }
-
-  VkResult result = (*rh.vkEnumeratePhysicalDevices)(instance, &deviceCount, devices);
-  if (result != VK_SUCCESS) {
-    free(devices);
-    return 0;
-  }
-
-  VkPhysicalDeviceProperties properties = {};
-  (*rh.vkGetPhysicalDeviceProperties)(devices[i], &properties);
-
-  int supports_nv_coopmat2 = is_extension_supported(&rh, devices[i], VK_NV_COOPERATIVE_MATRIX_2_EXTENSION_NAME);
-  if (!supports_nv_coopmat2) {
-    free(devices);
-    return 1;
-  }
-
-  free(devices);
-  return 0;
-}
-
-void vk_check_vram(vk_handle_t rh, int i, mem_info_t *resp) {
-  VkInstance instance = rh.vk;
-  uint32_t deviceCount = rh.num_devices;
-
-  VkPhysicalDevice* devices = malloc(deviceCount * sizeof(VkPhysicalDevice));
-  if (devices == NULL) {
-    resp->err = strdup("memory allocation failed for devices array");
-    return;
-  }
-
-  VkResult result = (*rh.vkEnumeratePhysicalDevices)(instance, &deviceCount, devices);
-  if (result != VK_SUCCESS) {
-    free(devices);
-    resp->err = strdup("failed to enumerate physical devices");
-    return;
-  }
-
-  VkPhysicalDeviceProperties properties = {};
-  (*rh.vkGetPhysicalDeviceProperties)(devices[i], &properties);
-
-  int supports_budget = is_extension_supported(&rh, devices[i], VK_EXT_MEMORY_BUDGET_EXTENSION_NAME);
-  if (!supports_budget) {
-    free(devices);
-    resp->err = strdup("device does not support memory budget");
-    return;
-  }
-
-  if (properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU) {
-    free(devices);
-    resp->err = strdup("device is a CPU");
-    return;
-  }
-
-  VkPhysicalDeviceProperties2 device_props2 = {};
-  device_props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
-
-  VkPhysicalDeviceIDProperties id_props = {};
-  id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
-
-  device_props2.pNext = &id_props;
-  (*rh.vkGetPhysicalDeviceProperties2)(devices[i], &device_props2);
-
-  VkPhysicalDeviceMemoryBudgetPropertiesEXT physical_device_memory_budget_properties = {};
-  physical_device_memory_budget_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT;
-  physical_device_memory_budget_properties.pNext = NULL;
-
-  VkPhysicalDeviceMemoryProperties2 device_memory_properties = {};
-  device_memory_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2;
-  device_memory_properties.pNext = &physical_device_memory_budget_properties;
-
-  (*rh.vkGetPhysicalDeviceMemoryProperties2)(devices[i], &device_memory_properties);
-
-  VkDeviceSize device_memory_total_size  = 0;
-  VkDeviceSize device_memory_heap_budget = 0;
-
-  for (uint32_t j = 0; j < device_memory_properties.memoryProperties.memoryHeapCount; j++) {
-    VkMemoryHeap heap = device_memory_properties.memoryProperties.memoryHeaps[j];
-    if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
-      device_memory_total_size  += heap.size;
-      device_memory_heap_budget += physical_device_memory_budget_properties.heapBudget[j];
-    }
-  }
-
-  free(devices);
-
-  resp->err = NULL;
-  snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
-  resp->gpu_name[GPU_NAME_LEN - 1] = '\0';
-  strncpy(&resp->gpu_name[0], properties.deviceName, GPU_NAME_LEN - 1);
-  resp->gpu_name[GPU_NAME_LEN - 1] = '\0';
-  const uint8_t *uuid = id_props.deviceUUID;
-  snprintf(&resp->gpu_id[0], GPU_ID_LEN,
-      "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-      uuid[0], uuid[1], uuid[2], uuid[3],
-      uuid[4], uuid[5],
-      uuid[6], uuid[7],
-      uuid[8], uuid[9],
-      uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
-    );
-  resp->total = (uint64_t) device_memory_total_size;
-  resp->free = (uint64_t) device_memory_heap_budget;
-  resp->major = VK_API_VERSION_MAJOR(properties.apiVersion);
-  resp->minor = VK_API_VERSION_MINOR(properties.apiVersion);
-  resp->patch = VK_API_VERSION_PATCH(properties.apiVersion);
-}
-
-void vk_release(vk_handle_t rh) {
-  LOG(rh.verbose, "releasing vulkan library\n");
-  (*rh.vkDestroyInstance)(rh.vk, NULL);
-  UNLOAD_LIBRARY(rh.vk_handle);
-  rh.vk_handle = NULL;
-}
-
-#endif  // __APPLE__
diff --git a/discover/gpu_info_vulkan.h b/discover/gpu_info_vulkan.h
deleted file mode 100644
index 42e4b1610..000000000
--- a/discover/gpu_info_vulkan.h
+++ /dev/null
@@ -1,394 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_VULKAN_H__
-#define __GPU_INFO_VULKAN_H__
-
-#include "gpu_info.h"
-
-#define VK_DEFINE_HANDLE(object) typedef struct object##_T* object;
-VK_DEFINE_HANDLE(VkInstance)
-VK_DEFINE_HANDLE(VkPhysicalDevice)
-
-#define VK_MAX_EXTENSION_NAME_SIZE       256U
-#define VK_MAX_DESCRIPTION_SIZE          256U
-#define VK_LUID_SIZE                     8U
-#define VK_UUID_SIZE                     16U
-#define VK_MAX_MEMORY_TYPES              32U
-#define VK_MAX_MEMORY_HEAPS              16U
-#define VK_MAX_PHYSICAL_DEVICE_NAME_SIZE  256U
-
-#define VK_MAKE_VERSION(major, minor, patch) \
-    ((((uint32_t)(major)) << 22U) | (((uint32_t)(minor)) << 12U) | ((uint32_t)(patch)))
-
-#define VK_MAKE_API_VERSION(variant, major, minor, patch) \
-    ((((uint32_t)(variant)) << 29U) | (((uint32_t)(major)) << 22U) | (((uint32_t)(minor)) << 12U) | ((uint32_t)(patch)))
-
-#define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0
-#define VK_API_VERSION_1_1 VK_MAKE_API_VERSION(0, 1, 1, 0)// Patch version should always be set to 0
-#define VK_API_VERSION_1_2 VK_MAKE_API_VERSION(0, 1, 2, 0)// Patch version should always be set to 0
-#define VK_API_VERSION_1_3 VK_MAKE_API_VERSION(0, 1, 3, 0)// Patch version should always be set to 0
-#define VK_API_VERSION_MAJOR(version) (((uint32_t)(version) >> 22U) & 0x7FU)
-#define VK_API_VERSION_MINOR(version) (((uint32_t)(version) >> 12U) & 0x3FFU)
-#define VK_API_VERSION_PATCH(version) ((uint32_t)(version) & 0xFFFU)
-
-#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME "VK_KHR_get_physical_device_properties2"
-#define VK_NV_COOPERATIVE_MATRIX_2_EXTENSION_NAME "VK_NV_cooperative_matrix2"
-#define VK_EXT_MEMORY_BUDGET_EXTENSION_NAME "VK_EXT_memory_budget"
-
-typedef uint32_t VkFlags;
-typedef uint32_t VkBool32;
-typedef uint64_t VkDeviceSize;
-typedef uint32_t VkSampleMask;
-typedef VkFlags VkSampleCountFlags;
-typedef VkFlags VkMemoryPropertyFlags;
-typedef VkFlags VkMemoryHeapFlags;
-typedef VkFlags VkInstanceCreateFlags;
-
-typedef enum VkResult {
-    VK_SUCCESS = 0,
-    VK_NOT_READY = 1,
-    VK_TIMEOUT = 2,
-    VK_EVENT_SET = 3,
-    VK_EVENT_RESET = 4,
-    VK_INCOMPLETE = 5,
-    VK_ERROR_OUT_OF_HOST_MEMORY = -1,
-    VK_ERROR_OUT_OF_DEVICE_MEMORY = -2,
-    VK_ERROR_INITIALIZATION_FAILED = -3,
-    VK_ERROR_DEVICE_LOST = -4,
-    VK_ERROR_MEMORY_MAP_FAILED = -5,
-    VK_ERROR_LAYER_NOT_PRESENT = -6,
-    VK_ERROR_EXTENSION_NOT_PRESENT = -7,
-    VK_ERROR_FEATURE_NOT_PRESENT = -8,
-    VK_ERROR_INCOMPATIBLE_DRIVER = -9,
-    VK_ERROR_TOO_MANY_OBJECTS = -10,
-    VK_ERROR_FORMAT_NOT_SUPPORTED = -11,
-    VK_ERROR_FRAGMENTED_POOL = -12,
-    VK_ERROR_UNKNOWN = -13,
-    VK_ERROR_OUT_OF_POOL_MEMORY = -1000069000,
-    VK_ERROR_INVALID_EXTERNAL_HANDLE = -1000072003,
-    VK_ERROR_FRAGMENTATION = -1000168000,
-    VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS = -1000257000,
-    VK_PIPELINE_COMPILE_REQUIRED = 1000297000,
-    VK_ERROR_SURFACE_LOST_KHR = -1000000000,
-    VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001,
-    VK_SUBOPTIMAL_KHR = 1000001003,
-    VK_ERROR_OUT_OF_DATE_KHR = -1000001004,
-    VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001,
-    VK_ERROR_VALIDATION_FAILED_EXT = -1000011001,
-    VK_ERROR_INVALID_SHADER_NV = -1000012000,
-    VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR = -1000158000,
-    VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR = -1000158001,
-    VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR = -1000158002,
-    VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR = -1000158003,
-    VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR = -1000158004,
-    VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR = -1000158005,
-    VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT = -1000158006,
-    VK_ERROR_NOT_PERMITTED_KHR = -1000174001,
-    VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT = -1000255000,
-    VK_THREAD_IDLE_KHR = 1000268000,
-    VK_THREAD_DONE_KHR = 1000268001,
-    VK_OPERATION_DEFERRED_KHR = 1000268002,
-    VK_OPERATION_NOT_DEFERRED_KHR = 1000268003,
-    VK_ERROR_COMPRESSION_EXHAUSTED_EXT = -1000338000,
-    VK_RESULT_MAX_ENUM = 0x7FFFFFFF
-} VkResult;
-
-typedef enum VkStructureType {
-    VK_STRUCTURE_TYPE_APPLICATION_INFO = 0,
-    VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO = 1,
-    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2 = 1000059001,
-    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2 = 1000059006,
-    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES = 1000071004,
-    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT = 1000237000,
-    VK_STRUCTURE_TYPE_MAX_ENUM = 0x7FFFFFFF
-} VkStructureType;
-
-typedef enum VkPhysicalDeviceType {
-    VK_PHYSICAL_DEVICE_TYPE_OTHER = 0,
-    VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU = 1,
-    VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU = 2,
-    VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU = 3,
-    VK_PHYSICAL_DEVICE_TYPE_CPU = 4,
-    VK_PHYSICAL_DEVICE_TYPE_MAX_ENUM = 0x7FFFFFFF
-} VkPhysicalDeviceType;
-
-typedef enum VkSystemAllocationScope {
-    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND = 0,
-    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT = 1,
-    VK_SYSTEM_ALLOCATION_SCOPE_CACHE = 2,
-    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE = 3,
-    VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE = 4,
-    VK_SYSTEM_ALLOCATION_SCOPE_MAX_ENUM = 0x7FFFFFFF
-} VkSystemAllocationScope;
-
-typedef enum VkInternalAllocationType {
-    VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE = 0,
-    VK_INTERNAL_ALLOCATION_TYPE_NON_EXECUTABLE = 1,
-    VK_INTERNAL_ALLOCATION_TYPE_MAX_ENUM = 0x7FFFFFFF
-} VkInternalAllocationType;
-
-typedef enum VkMemoryHeapFlagBits {
-    VK_MEMORY_HEAP_DEVICE_LOCAL_BIT = 0x00000001,
-    VK_MEMORY_HEAP_MULTI_INSTANCE_BIT = 0x00000002,
-    VK_MEMORY_HEAP_TILE_MEMORY_BIT_QCOM = 0x00000008,
-    VK_MEMORY_HEAP_MULTI_INSTANCE_BIT_KHR = VK_MEMORY_HEAP_MULTI_INSTANCE_BIT,
-    VK_MEMORY_HEAP_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
-} VkMemoryHeapFlagBits;
-
-typedef struct VkExtensionProperties {
-    char extensionName[VK_MAX_EXTENSION_NAME_SIZE];
-    uint32_t specVersion;
-} VkExtensionProperties;
-
-typedef struct VkPhysicalDeviceLimits {
-    uint32_t              maxImageDimension1D;
-    uint32_t              maxImageDimension2D;
-    uint32_t              maxImageDimension3D;
-    uint32_t              maxImageDimensionCube;
-    uint32_t              maxImageArrayLayers;
-    uint32_t              maxTexelBufferElements;
-    uint32_t              maxUniformBufferRange;
-    uint32_t              maxStorageBufferRange;
-    uint32_t              maxPushConstantsSize;
-    uint32_t              maxMemoryAllocationCount;
-    uint32_t              maxSamplerAllocationCount;
-    VkDeviceSize          bufferImageGranularity;
-    VkDeviceSize          sparseAddressSpaceSize;
-    uint32_t              maxBoundDescriptorSets;
-    uint32_t              maxPerStageDescriptorSamplers;
-    uint32_t              maxPerStageDescriptorUniformBuffers;
-    uint32_t              maxPerStageDescriptorStorageBuffers;
-    uint32_t              maxPerStageDescriptorSampledImages;
-    uint32_t              maxPerStageDescriptorStorageImages;
-    uint32_t              maxPerStageDescriptorInputAttachments;
-    uint32_t              maxPerStageResources;
-    uint32_t              maxDescriptorSetSamplers;
-    uint32_t              maxDescriptorSetUniformBuffers;
-    uint32_t              maxDescriptorSetUniformBuffersDynamic;
-    uint32_t              maxDescriptorSetStorageBuffers;
-    uint32_t              maxDescriptorSetStorageBuffersDynamic;
-    uint32_t              maxDescriptorSetSampledImages;
-    uint32_t              maxDescriptorSetStorageImages;
-    uint32_t              maxDescriptorSetInputAttachments;
-    uint32_t              maxVertexInputAttributes;
-    uint32_t              maxVertexInputBindings;
-    uint32_t              maxVertexInputAttributeOffset;
-    uint32_t              maxVertexInputBindingStride;
-    uint32_t              maxVertexOutputComponents;
-    uint32_t              maxTessellationGenerationLevel;
-    uint32_t              maxTessellationPatchSize;
-    uint32_t              maxTessellationControlPerVertexInputComponents;
-    uint32_t              maxTessellationControlPerVertexOutputComponents;
-    uint32_t              maxTessellationControlPerPatchOutputComponents;
-    uint32_t              maxTessellationControlTotalOutputComponents;
-    uint32_t              maxTessellationEvaluationInputComponents;
-    uint32_t              maxTessellationEvaluationOutputComponents;
-    uint32_t              maxGeometryShaderInvocations;
-    uint32_t              maxGeometryInputComponents;
-    uint32_t              maxGeometryOutputComponents;
-    uint32_t              maxGeometryOutputVertices;
-    uint32_t              maxGeometryTotalOutputComponents;
-    uint32_t              maxFragmentInputComponents;
-    uint32_t              maxFragmentOutputAttachments;
-    uint32_t              maxFragmentDualSrcAttachments;
-    uint32_t              maxFragmentCombinedOutputResources;
-    uint32_t              maxComputeSharedMemorySize;
-    uint32_t              maxComputeWorkGroupCount[3];
-    uint32_t              maxComputeWorkGroupInvocations;
-    uint32_t              maxComputeWorkGroupSize[3];
-    uint32_t              subPixelPrecisionBits;
-    uint32_t              subTexelPrecisionBits;
-    uint32_t              mipmapPrecisionBits;
-    uint32_t              maxDrawIndexedIndexValue;
-    uint32_t              maxDrawIndirectCount;
-    float                 maxSamplerLodBias;
-    float                 maxSamplerAnisotropy;
-    uint32_t              maxViewports;
-    uint32_t              maxViewportDimensions[2];
-    float                 viewportBoundsRange[2];
-    uint32_t              viewportSubPixelBits;
-    size_t                minMemoryMapAlignment;
-    VkDeviceSize          minTexelBufferOffsetAlignment;
-    VkDeviceSize          minUniformBufferOffsetAlignment;
-    VkDeviceSize          minStorageBufferOffsetAlignment;
-    int32_t               minTexelOffset;
-    uint32_t              maxTexelOffset;
-    int32_t               minTexelGatherOffset;
-    uint32_t              maxTexelGatherOffset;
-    float                 minInterpolationOffset;
-    float                 maxInterpolationOffset;
-    uint32_t              subPixelInterpolationOffsetBits;
-    uint32_t              maxFramebufferWidth;
-    uint32_t              maxFramebufferHeight;
-    uint32_t              maxFramebufferLayers;
-    VkSampleCountFlags    framebufferColorSampleCounts;
-    VkSampleCountFlags    framebufferDepthSampleCounts;
-    VkSampleCountFlags    framebufferStencilSampleCounts;
-    VkSampleCountFlags    framebufferNoAttachmentsSampleCounts;
-    uint32_t              maxColorAttachments;
-    VkSampleCountFlags    sampledImageColorSampleCounts;
-    VkSampleCountFlags    sampledImageIntegerSampleCounts;
-    VkSampleCountFlags    sampledImageDepthSampleCounts;
-    VkSampleCountFlags    sampledImageStencilSampleCounts;
-    VkSampleCountFlags    storageImageSampleCounts;
-    uint32_t              maxSampleMaskWords;
-    VkBool32              timestampComputeAndGraphics;
-    float                 timestampPeriod;
-    uint32_t              maxClipDistances;
-    uint32_t              maxCullDistances;
-    uint32_t              maxCombinedClipAndCullDistances;
-    uint32_t              discreteQueuePriorities;
-    float                 pointSizeRange[2];
-    float                 lineWidthRange[2];
-    float                 pointSizeGranularity;
-    float                 lineWidthGranularity;
-    VkBool32              strictLines;
-    VkBool32              standardSampleLocations;
-    VkDeviceSize          optimalBufferCopyOffsetAlignment;
-    VkDeviceSize          optimalBufferCopyRowPitchAlignment;
-    VkDeviceSize          nonCoherentAtomSize;
-} VkPhysicalDeviceLimits;
-
-typedef struct VkPhysicalDeviceSparseProperties {
-    VkBool32    residencyStandard2DBlockShape;
-    VkBool32    residencyStandard2DMultisampleBlockShape;
-    VkBool32    residencyStandard3DBlockShape;
-    VkBool32    residencyAlignedMipSize;
-    VkBool32    residencyNonResidentStrict;
-} VkPhysicalDeviceSparseProperties;
-
-typedef struct VkPhysicalDeviceProperties {
-    uint32_t apiVersion;
-    uint32_t driverVersion;
-    uint32_t vendorID;
-    uint32_t deviceID;
-    uint32_t deviceType;
-    char deviceName[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
-    uint8_t pipelineCacheUUID[VK_UUID_SIZE];
-    VkPhysicalDeviceLimits limits;
-    VkPhysicalDeviceSparseProperties sparseProperties;
-} VkPhysicalDeviceProperties;
-
-typedef struct VkPhysicalDeviceProperties2 {
-    VkStructureType sType;
-    void* pNext;
-    VkPhysicalDeviceProperties properties;
-} VkPhysicalDeviceProperties2;
-
-typedef struct VkPhysicalDeviceIDProperties {
-    VkStructureType sType;
-    void* pNext;
-    uint8_t deviceUUID[VK_UUID_SIZE];
-    uint8_t driverUUID[VK_UUID_SIZE];
-    uint8_t deviceLUID[VK_LUID_SIZE];
-    uint32_t deviceNodeMask;
-    VkBool32 deviceLUIDValid;
-} VkPhysicalDeviceIDProperties;
-
-typedef struct VkMemoryType {
-    VkMemoryPropertyFlags propertyFlags;
-    uint32_t heapIndex;
-} VkMemoryType;
-
-typedef struct VkMemoryHeap {
-    VkDeviceSize size;
-    VkMemoryHeapFlags flags;
-} VkMemoryHeap;
-
-typedef struct VkPhysicalDeviceMemoryProperties {
-    uint32_t memoryTypeCount;
-    VkMemoryType memoryTypes[VK_MAX_MEMORY_TYPES];
-    uint32_t memoryHeapCount;
-    VkMemoryHeap memoryHeaps[VK_MAX_MEMORY_HEAPS];
-} VkPhysicalDeviceMemoryProperties;
-
-typedef struct VkPhysicalDeviceMemoryProperties2 {
-    VkStructureType sType;
-    void* pNext;
-    VkPhysicalDeviceMemoryProperties memoryProperties;
-} VkPhysicalDeviceMemoryProperties2;
-
-typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT {
-    VkStructureType sType;
-    void* pNext;
-    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
-    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
-} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
-
-typedef struct VkApplicationInfo {
-    VkStructureType sType;
-    const void* pNext;
-    const char* pApplicationName;
-    uint32_t applicationVersion;
-    const char* pEngineName;
-    uint32_t engineVersion;
-    uint32_t apiVersion;
-} VkApplicationInfo;
-
-typedef struct VkInstanceCreateInfo {
-    VkStructureType sType;
-    const void* pNext;
-    VkInstanceCreateFlags flags;
-    const VkApplicationInfo* pApplicationInfo;
-    uint32_t enabledLayerCount;
-    const char* const* ppEnabledLayerNames;
-    uint32_t enabledExtensionCount;
-    const char* const* ppEnabledExtensionNames;
-} VkInstanceCreateInfo;
-
-typedef struct VkAllocationCallbacks {
-    void* pUserData;
-    void* (*pfnAllocation)(void* pUserData, size_t size, size_t alignment, VkSystemAllocationScope allocationScope);
-    void* (*pfnReallocation)(void* pUserData, void* pOriginal, size_t size, size_t alignment, VkSystemAllocationScope allocationScope);
-    void (*pfnFree)(void* pUserData, void* pMemory);
-    void (*pfnInternalAllocation)(void* pUserData, size_t size, VkInternalAllocationType allocationType, VkSystemAllocationScope allocationScope);
-    void (*pfnInternalFree)(void* pUserData, size_t size, VkInternalAllocationType allocationType, VkSystemAllocationScope allocationScope);
-} VkAllocationCallbacks;
-
-typedef struct {
-  void* vk_handle;
-  uint16_t verbose;
-
-  VkInstance vk;
-  int num_devices;
-
-  void (*vkGetPhysicalDeviceProperties)(
-    VkPhysicalDevice                            physicalDevice,
-    VkPhysicalDeviceProperties*                 pProperties);
-  void (*vkGetPhysicalDeviceProperties2)(
-    VkPhysicalDevice                             physicalDevice,
-    VkPhysicalDeviceProperties2*                 pProperties);
-  VkResult (*vkEnumerateDeviceExtensionProperties)(
-      VkPhysicalDevice                            physicalDevice,
-      const char*                                 pLayerName,
-      uint32_t*                                   pPropertyCount,
-      VkExtensionProperties*                      pProperties);
-  VkResult (*vkCreateInstance)(
-      const VkInstanceCreateInfo*                 pCreateInfo,
-      const VkAllocationCallbacks*                pAllocator,
-      VkInstance*                                 pInstance);
-  VkResult (*vkEnumeratePhysicalDevices)(
-      VkInstance                                  instance,
-      uint32_t*                                   pPhysicalDeviceCount,
-      VkPhysicalDevice*                           pPhysicalDevices);
-  void (*vkGetPhysicalDeviceMemoryProperties2)(
-      VkPhysicalDevice                            physicalDevice,
-      VkPhysicalDeviceMemoryProperties2*          pMemoryProperties);
-  void (*vkDestroyInstance)(
-      VkInstance                                  instance,
-      const VkAllocationCallbacks*                pAllocator);
-} vk_handle_t;
-
-typedef struct vk_init_resp
-{
-  char *err; // If err is non-null handle is invalid
-  int num_devices;
-  vk_handle_t ch;
-} vk_init_resp_t;
-
-void vk_init(char* vk_lib_path, vk_init_resp_t *resp);
-void vk_check_vram(vk_handle_t rh, int i, mem_info_t *resp);
-int vk_check_flash_attention(vk_handle_t rh, int i);
-void vk_release(vk_handle_t rh);
-
-#endif
-#endif
\ No newline at end of file
diff --git a/discover/runner.go b/discover/runner.go
index 5e4e05f95..c9bbef7f5 100644
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -92,6 +92,7 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 		// are enumerated, but not actually supported.
 		// We run this in serial to avoid potentially initializing a GPU multiple
 		// times concurrently leading to memory contention
+		// TODO refactor so we group the lib dirs and do serial per version, but parallel for different libs
 		for dir := range libDirs {
 			var dirs []string
 			if dir == "" {
@@ -125,8 +126,10 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 					} else {
 						envVar = "ROCR_VISIBLE_DEVICES"
 					}
-				} else {
+				} else if devices[i].Library == "CUDA" {
 					envVar = "CUDA_VISIBLE_DEVICES"
+				} else if devices[i].Library == "VULKAN" {
+					envVar = "GGML_VK_VISIBLE_DEVICES"
 				}
 
 				extraEnvs := []string{
diff --git a/discover/types.go b/discover/types.go
index feb8c08e0..5a9ce1865 100644
--- a/discover/types.go
+++ b/discover/types.go
@@ -36,10 +36,11 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	UnreliableFreeMemory bool
 
 	// GPU information
-	ID       string `json:"gpu_id"` // string to use for selection of this specific GPU
-	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
-	Name     string `json:"name"`    // user friendly name if available
-	Compute  string `json:"compute"` // Compute Capability or gfx
+	ID             string `json:"gpu_id"` // string to use for selection of this specific GPU
+	filterID       string // AMD Workaround: The numeric ID of the device used to filter out other devices
+	Name           string `json:"name"`            // user friendly name if available
+	Compute        string `json:"compute"`         // Compute Capability or gfx
+	FlashAttention bool   `json:"flash_attention"` // is flash attention supported
 
 	// Driver Information - TODO no need to put this on each GPU
 	DriverMajor int `json:"driver_major,omitempty"`
@@ -174,7 +175,8 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 		supportsFA := gpu.Library == "cpu" ||
 			gpu.Name == "Metal" ||
 			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
-			gpu.Library == "HIP"
+			gpu.Library == "HIP" ||
+			gpu.Library == "VULKAN"
 
 		if !supportsFA {
 			return false
diff --git a/llm/server.go b/llm/server.go
index d3438e6c2..8757d4d56 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -561,10 +561,11 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		// Windows CUDA should not use mmap for best performance
 		// Linux  with a model larger than free space, mmap leads to thrashing
 		// For CPU loads we want the memory to be allocated, not FS cache
-		if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
+		if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
 			(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
 			(gpus[0].Library == "vulkan" && s.options.UseMMap == nil) ||
 			(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
+			(gpus[0].Library == "VULKAN" && s.options.UseMMap == nil) ||
 			(s.options.UseMMap != nil && !*s.options.UseMMap) {
 			s.loadRequest.UseMmap = false
 		}
diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d73cdf176..3b0a0891e 100644
--- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4123,7 +4123,6 @@ static void ggml_vk_instance_init() {
         }
     } else {
         std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
-
         // If no vulkan devices are found, return early
         if (devices.empty()) {
             GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
@@ -10821,14 +10820,90 @@ std::string ggml_backend_vk_get_device_id(int device) {
     return ggml_vk_get_device_id(dev_idx);
 }
 
-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+//////////////////////////
 
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+struct ggml_backend_vk_device_context {
+    size_t device;
+    std::string name;
+    std::string description;
+    std::string id;
+    std::string uuid;
+    int major;
+    int minor;
+    int driver_major;
+    int driver_minor;
+    int integrated;
+    int pci_bus_id;
+    int pci_device_id;
+    int pci_domain_id;
+};
+
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
+    GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
 
     vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+    vk::PhysicalDeviceProperties2 props2;
+    vkdev.getProperties2(&props2);
 
-    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
+    // Use vendor specific management libraries for best VRAM reporting if available
+    switch (props2.properties.vendorID) {
+    case VK_VENDOR_ID_AMD:
+        if (ggml_hip_mgmt_init() == 0) {
+            int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+            if (status == 0) {
+                GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                ggml_hip_mgmt_release();
+                return;
+            }
+            ggml_hip_mgmt_release();
+        }
+        break;
+    case VK_VENDOR_ID_NVIDIA:
+        if (ggml_nvml_init() == 0) {
+            int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
+            if (status == 0) {
+                GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                ggml_nvml_release();
+                return;
+            }
+            ggml_nvml_release();
+        }
+        break;
+    }
+    // else fallback to memory budget if supported
+
+    *total = 0;
+    *free = 0;
+    vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
+    vk::PhysicalDeviceMemoryProperties2 memprops2;
+    memprops2.pNext = &mem_budget_props;
+    vkdev.getMemoryProperties2(&memprops2);
+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
+        } else if (ctx->integrated) {
+            // Include shared memory on iGPUs
+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
+        }
+    }
+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+            *free += mem_budget_props.heapBudget[i];
+        } else if (ctx->integrated) {
+            *free += mem_budget_props.heapBudget[i];
+        }
+    }
+    if (*total > 0 && *free > 0) {
+        return;
+    } else if (*total > 0) {
+        *free = *total;
+        return;
+    }
+
+    // else just report the physical memory
+    for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
         if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
             *total = heap.size;
             *free = heap.size;
@@ -10837,14 +10912,6 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
     }
 }
 
-//////////////////////////
-
-struct ggml_backend_vk_device_context {
-    size_t device;
-    std::string name;
-    std::string description;
-    std::string id;
-};
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@@ -10863,7 +10930,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
 
 static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
-    ggml_backend_vk_get_device_memory(ctx->device, free, total);
+    ggml_backend_vk_get_device_memory(ctx, free, total);
 }
 
 static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -10881,6 +10948,7 @@ static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_d
     return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
+#define GGML_VULKAN_NAME "VULKAN"
 static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
     props->name        = ggml_backend_vk_device_get_name(dev);
     props->description = ggml_backend_vk_device_get_description(dev);
@@ -10893,6 +10961,18 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
         /* .buffer_from_host_ptr  = */ false,
         /* .events                = */ false,
     };
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    props->id = ctx->id.c_str();
+    props->compute_major = ctx->major;
+    props->compute_minor = ctx->minor;
+    props->driver_major = ctx->driver_major;
+    props->driver_minor = ctx->driver_minor;
+    props->integrated = ctx->integrated;
+    props->pci_bus_id = ctx->pci_bus_id;
+    props->pci_device_id = ctx->pci_device_id;
+    props->pci_domain_id = ctx->pci_domain_id;
+    props->library = GGML_VULKAN_NAME;
 }
 
 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -11296,6 +11376,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
+            std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
+
             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
                 char desc[256];
@@ -11309,6 +11391,44 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                     /* .reg     = */ reg,
                     /* .context = */ ctx,
                 });
+
+                // Gather additional information about the device
+                int dev_idx = vk_instance.device_indices[i];
+                vk::PhysicalDeviceProperties props1;
+                vk_devices[dev_idx].getProperties(&props1);
+                vk::PhysicalDeviceProperties2 props2;
+                vk::PhysicalDeviceIDProperties device_id_props;
+                vk::PhysicalDevicePCIBusInfoPropertiesEXT  pci_bus_props;
+                vk::PhysicalDeviceDriverProperties driver_props;
+                props2.pNext = &device_id_props;
+                device_id_props.pNext = &pci_bus_props;
+                pci_bus_props.pNext = &driver_props;
+                vk_devices[dev_idx].getProperties2(&props2);
+                std::ostringstream oss;
+                oss << std::hex << std::setfill('0');
+                oss << "GPU-";
+                int byteIdx = 0;
+                for (int i = 0; i < 16; ++i, ++byteIdx) {
+                    oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
+                    if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
+                        oss << '-';
+                    }
+                }
+                ctx->uuid = oss.str();
+                ctx->pci_bus_id = pci_bus_props.pciBus;
+                ctx->pci_device_id = pci_bus_props.pciDevice;
+                ctx->pci_domain_id = pci_bus_props.pciDomain;
+                ctx->id = std::to_string(i);
+                if (props1.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
+                    ctx->integrated = 1;
+                } else {
+                    ctx->integrated = 0;
+                }
+                ctx->major = 0;
+                ctx->minor = 0;
+                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
+                ctx->driver_major = 0;
+                ctx->driver_minor = 0;
             }
             initialized = true;
         }
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 3ca25a13c..0a3d7c888 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -165,12 +165,11 @@ function buildROCm() {
             $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
             $env:HIP_PLATFORM="amd"
             $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-            & cmake --fresh --preset "ROCm 6" -G Ninja `
+            & cmake --fresh --preset "ROCm 6" -G Ninja --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="rocm" `
                 -DCMAKE_C_COMPILER=clang `
                 -DCMAKE_CXX_COMPILER=clang++ `
                 -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
-                -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
-                --install-prefix $script:DIST_DIR
+                -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
             $env:HIPCXX=""
             $env:HIP_PLATFORM=""
@@ -186,7 +185,7 @@ function buildROCm() {
 function buildVulkan(){
     if ($env:VULKAN_SDK) {
         write-host "Building Vulkan backend libraries"
-        & cmake --fresh --preset Vulkan --install-prefix $script:DIST_DIR
+        & cmake --fresh --preset Vulkan --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="vulkan"
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         & cmake --build --preset Vulkan  --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}