OpenAI Delete Endpoint

2024-06-14 16:28:22 -07:00
42 changed files with 846 additions and 1978 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -124,7 +124,7 @@ jobs:
    strategy:
      matrix:
        rocm-version:
-          - '6.1.1'
+          - '6.0.2'
    runs-on: linux
    container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
    steps:
--- a/2
+++ b/2
@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
 ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
-ARG ROCM_VERSION=6.1.1
+ARG ROCM_VERSION=6.0.2

 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
--- a/api/types.go
+++ b/api/types.go
@@ -238,7 +238,6 @@ type ShowResponse struct {
 	System     string       `json:"system,omitempty"`
 	Details    ModelDetails `json:"details,omitempty"`
 	Messages   []Message    `json:"messages,omitempty"`
-	ModifiedAt time.Time    `json:"modified_at,omitempty"`
 }

 // CopyRequest is the request passed to [Client.Copy].
--- a/docs/development.md
+++ b/docs/development.md
@@ -114,18 +114,15 @@ If you have Docker available, you can build linux binaries with `./scripts/build

 ### Windows

-Note: The Windows build for Ollama is still under development.
+Note: The windows build for Ollama is still under development.

-First, install required tools:
+Install required tools:

 - MSVC toolchain - C/C++ and cmake as minimal requirements
 - Go version 1.22 or higher
 - MinGW (pick one variant) with GCC.
  - [MinGW-w64](https://www.mingw-w64.org/)
  - [MSYS2](https://www.msys2.org/)
- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
-
-Then, build the `ollama` binary:

 ```powershell
 $env:CGO_ENABLED="1"
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -8,7 +8,7 @@ Check your compute compatibility to see if your card is supported:
 | Compute Capability | Family              | Cards                                                                                                       |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
 | 9.0                | NVIDIA              | `H100`                                                                                                      |
-| 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
+| 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti`                                                           |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
 | 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060`         |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2`                          |
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -104,6 +104,7 @@ curl http://localhost:11434/v1/chat/completions \

 #### Notes

+- Setting `seed` will always set `temperature` to `0`
 - `finish_reason` will always be `stop`
 - `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -53,23 +53,8 @@ var (
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool
-
-	// Set via CUDA_VISIBLE_DEVICES in the environment
-	CudaVisibleDevices string
-	// Set via HIP_VISIBLE_DEVICES in the environment
-	HipVisibleDevices string
-	// Set via ROCR_VISIBLE_DEVICES in the environment
-	RocrVisibleDevices string
-	// Set via GPU_DEVICE_ORDINAL in the environment
-	GpuDeviceOrdinal string
-	// Set via HSA_OVERRIDE_GFX_VERSION in the environment
-	HsaOverrideGfxVersion string
 )

 type EnvVar struct {
@@ -79,7 +64,7 @@ type EnvVar struct {
 }

 func AsMap() map[string]EnvVar {
-	ret := map[string]EnvVar{
+	return map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
@@ -94,18 +79,8 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
-	if runtime.GOOS != "darwin" {
-		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
-		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
-		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
-		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
-		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
-	}
-	return ret
 }

 func Values() map[string]string {
@@ -216,15 +191,6 @@ func LoadConfig() {
 		NoHistory = true
 	}

-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
 	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
 		NoPrune = true
 	}
@@ -278,16 +244,6 @@ func LoadConfig() {
 	if err != nil {
 		slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
 	}
-
-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
 }

 func getModelsDir() (string, error) {
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -13,7 +13,6 @@ import (
 	"strconv"
 	"strings"

-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )

@@ -26,16 +25,7 @@ const (

 	// Prefix with the node dir
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
-
-	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
-	DRMTotalMemoryFile = "mem_info_vram_total"
-	DRMUsedMemoryFile  = "mem_info_vram_used"
-
-	// In hex; properties file is in decimal
-	DRMUniqueIDFile = "unique_id"
-	DRMVendorFile   = "vendor"
-	DRMDeviceFile   = "device"
+	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
 )

 var (
@@ -45,8 +35,8 @@ var (
 )

 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []RocmGPUInfo {
-	resp := []RocmGPUInfo{}
+func AMDGetGPUInfo() []GpuInfo {
+	resp := []GpuInfo{}
 	if !AMDDetected() {
 		return resp
 	}
@@ -60,9 +50,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {

 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
-	hipVD := envconfig.HipVisibleDevices   // zero based index only
-	rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
-	gpuDO := envconfig.GpuDeviceOrdinal    // zero based index
+	hipVD := os.Getenv("HIP_VISIBLE_DEVICES")   // zero based index only
+	rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
+	gpuDO := os.Getenv("GPU_DEVICE_ORDINAL")    // zero based index
 	switch {
 	// TODO is this priorty order right?
 	case hipVD != "":
@@ -75,7 +65,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		visibleDevices = strings.Split(gpuDO, ",")
 	}

-	gfxOverride := envconfig.HsaOverrideGfxVersion
+	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
 	var supported []string
 	libDir := ""

@@ -100,7 +90,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		scanner := bufio.NewScanner(fp)
 		isCPU := false
 		var major, minor, patch uint64
-		var vendor, device, uniqueID uint64
+		var vendor, device uint64
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -131,43 +121,30 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			} else if strings.HasPrefix(line, "vendor_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
-					slog.Debug("malformed", "vendor_id", line)
+					slog.Debug("malformed vendor_id", "vendor_id", line)
 					continue
 				}
-				vendor, err = strconv.ParseUint(ver[1], 10, 64)
+				vendor, err = strconv.ParseUint(ver[1], 10, 32)
 				if err != nil {
-					slog.Debug("malformed", "vendor_id", line, "error", err)
+					slog.Debug("malformed vendor_id" + line)
 				}
 			} else if strings.HasPrefix(line, "device_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
-					slog.Debug("malformed", "device_id", line)
+					slog.Debug("malformed device_id", "device_id", line)
 					continue
 				}
-				device, err = strconv.ParseUint(ver[1], 10, 64)
+				device, err = strconv.ParseUint(ver[1], 10, 32)
 				if err != nil {
-					slog.Debug("malformed", "device_id", line, "error", err)
-				}
-			} else if strings.HasPrefix(line, "unique_id") {
-				ver := strings.Fields(line)
-				if len(ver) != 2 {
-					slog.Debug("malformed", "unique_id", line)
-					continue
-				}
-				uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
-				if err != nil {
-					slog.Debug("malformed", "unique_id", line, "error", err)
+					slog.Debug("malformed device_id" + line)
 				}
 			}
+
 			// TODO - any other properties we want to extract and record?
 			// vendor_id + device_id -> pci lookup for "Name"
 			// Other metrics that may help us understand relative performance between multiple GPUs
 		}

-		// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
-		// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
-		// do reliably report VRAM usage.
-
 		if isCPU {
 			cpuCount++
 			continue
@@ -179,7 +156,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return nil
+			return []GpuInfo{}
 		}

 		if int(major) < RocmComputeMin {
@@ -190,68 +167,65 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
-		var usedFile string
-		mapping := []struct {
-			id       uint64
-			filename string
-		}{
-			{vendor, DRMVendorFile},
-			{device, DRMDeviceFile},
-			{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
+		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
+		propFiles, err := filepath.Glob(propGlob)
+		if err != nil {
+			slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
 		}
-		slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
-		// Map over to DRM location to find the total/free memory
-		drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
-		for _, devDir := range drmMatches {
-			matched := true
-			for _, m := range mapping {
-				if m.id == 0 {
-					// Null ID means it didn't populate, so we can't use it to match
-					continue
-				}
-				filename := filepath.Join(devDir, m.filename)
-				buf, err := os.ReadFile(filename)
-				if err != nil {
-					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
-				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
-				if err != nil {
-					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				if cmp != m.id {
-					matched = false
-					break
-				}
-			}
-			if !matched {
+		// 1 or more memory banks - sum the values of all of them
+		for _, propFile := range propFiles {
+			fp, err := os.Open(propFile)
+			if err != nil {
+				slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
 				continue
 			}
-
-			// Found the matching DRM directory
-			slog.Debug("matched", "amdgpu", match, "drm", devDir)
-			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
-			buf, err := os.ReadFile(totalFile)
-			if err != nil {
-				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
-				break
+			defer fp.Close()
+			scanner := bufio.NewScanner(fp)
+			for scanner.Scan() {
+				line := strings.TrimSpace(scanner.Text())
+				if strings.HasPrefix(line, "size_in_bytes") {
+					ver := strings.Fields(line)
+					if len(ver) != 2 {
+						slog.Warn("malformed " + line)
+						continue
+					}
+					bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
+					if err != nil {
+						slog.Warn("malformed int " + line)
+						continue
+					}
+					totalMemory += bankSizeInBytes
+				}
 			}
-			totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+		}
+		if totalMemory == 0 {
+			slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
+			continue
+		}
+		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
+		usedFiles, err := filepath.Glob(usedGlob)
+		if err != nil {
+			slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
+			continue
+		}
+		for _, usedFile := range usedFiles {
+			fp, err := os.Open(usedFile)
 			if err != nil {
-				slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
-				break
+				slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
+				continue
 			}
-
-			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
-			usedMemory, err = getFreeMemory(usedFile)
+			defer fp.Close()
+			data, err := io.ReadAll(fp)
 			if err != nil {
-				slog.Debug("failed to update used memory", "error", err)
+				slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
+				continue
 			}
-			break
+			used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+			if err != nil {
+				slog.Warn("malformed used memory", "data", string(data), "error", err)
+				continue
+			}
+			usedMemory += used
 		}

 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
@@ -267,21 +241,18 @@ func AMDGetGPUInfo() []RocmGPUInfo {

 		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
-		gpuInfo := RocmGPUInfo{
-			GpuInfo: GpuInfo{
-				Library: "rocm",
-				memInfo: memInfo{
-					TotalMemory: totalMemory,
-					FreeMemory:  (totalMemory - usedMemory),
-				},
-				ID:            strconv.Itoa(gpuID),
-				Name:          name,
-				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
-				MinimumMemory: rocmMinimumMemory,
-				DriverMajor:   driverMajor,
-				DriverMinor:   driverMinor,
+		gpuInfo := GpuInfo{
+			Library: "rocm",
+			memInfo: memInfo{
+				TotalMemory: totalMemory,
+				FreeMemory:  (totalMemory - usedMemory),
 			},
-			usedFilepath: usedFile,
+			ID:            fmt.Sprintf("%d", gpuID),
+			Name:          name,
+			Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
+			MinimumMemory: rocmMinimumMemory,
+			DriverMajor:   driverMajor,
+			DriverMinor:   driverMinor,
 		}

 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -305,7 +276,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return nil
+				return []GpuInfo{}
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -316,7 +287,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 				supported, err = GetSupportedGFX(libDir)
 				if err != nil {
 					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-					return nil
+					return []GpuInfo{}
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
@@ -333,11 +304,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 		}

-		// Check for env var workarounds
-		if name == "1002:687f" { // Vega RX 56
-			gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
-		}
-
 		// The GPU has passed all the verification steps and is supported
 		resp = append(resp, gpuInfo)
 	}
@@ -412,31 +378,3 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	}
 	return driverMajor, driverMinor, nil
 }
-
-func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
-	if len(gpus) == 0 {
-		return nil
-	}
-	for i := range gpus {
-		usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
-		if err != nil {
-			return err
-		}
-		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
-		gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
-	}
-	return nil
-}
-
-func getFreeMemory(usedFile string) (uint64, error) {
-	buf, err := os.ReadFile(usedFile)
-	if err != nil {
-		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
-	}
-	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
-	if err != nil {
-		slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
-		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
-	}
-	return usedMemory, nil
-}
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -7,10 +7,8 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
-	"strconv"
 	"strings"

-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )

@@ -26,8 +24,8 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
 )

-func AMDGetGPUInfo() []RocmGPUInfo {
-	resp := []RocmGPUInfo{}
+func AMDGetGPUInfo() []GpuInfo {
+	resp := []GpuInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
@@ -54,7 +52,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	}

 	var supported []string
-	gfxOverride := envconfig.HsaOverrideGfxVersion
+	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
@@ -119,24 +117,21 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-		gpuInfo := RocmGPUInfo{
-			GpuInfo: GpuInfo{
-				Library: "rocm",
-				memInfo: memInfo{
-					TotalMemory: totalMemory,
-					FreeMemory:  freeMemory,
-				},
-				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: libDir,
-				MinimumMemory:  rocmMinimumMemory,
-				Name:           name,
-				Compute:        gfx,
-
-				// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
-				// DriverMajor:    driverMajor,
-				// DriverMinor:    driverMinor,
+		gpuInfo := GpuInfo{
+			Library: "rocm",
+			memInfo: memInfo{
+				TotalMemory: totalMemory,
+				FreeMemory:  freeMemory,
 			},
-			index: i,
+			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
+			DependencyPath: libDir,
+			MinimumMemory:  rocmMinimumMemory,
+			Name:           name,
+			Compute:        gfx,
+
+			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+			// DriverMajor:    driverMajor,
+			// DriverMinor:    driverMinor,
 		}

 		resp = append(resp, gpuInfo)
@@ -164,30 +159,3 @@ func AMDValidateLibDir() (string, error) {
 	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
-
-func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
-	if len(gpus) == 0 {
-		return nil
-	}
-	hl, err := NewHipLib()
-	if err != nil {
-		slog.Debug(err.Error())
-		return nil
-	}
-	defer hl.Release()
-
-	for i := range gpus {
-		err := hl.HipSetDevice(gpus[i].index)
-		if err != nil {
-			return err
-		}
-		freeMemory, _, err := hl.HipMemGetInfo()
-		if err != nil {
-			slog.Warn("get mem info", "id", i, "error", err)
-			continue
-		}
-		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
-		gpus[i].FreeMemory = freeMemory
-	}
-	return nil
-}
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@@ -1,16 +1,21 @@
 package gpu

 import (
+	"log/slog"
+
 	"golang.org/x/sys/cpu"
 )

-func GetCPUCapability() CPUCapability {
+func GetCPUVariant() string {
 	if cpu.X86.HasAVX2 {
-		return CPUCapabilityAVX2
+		slog.Debug("CPU has AVX2")
+		return "avx2"
 	}
 	if cpu.X86.HasAVX {
-		return CPUCapabilityAVX
+		slog.Debug("CPU has AVX")
+		return "avx"
 	}
+	slog.Debug("CPU does not have vector extensions")
 	// else LCD
-	return CPUCapabilityNone
+	return ""
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -24,37 +24,19 @@ import (
 	"github.com/ollama/ollama/format"
 )

-type cudaHandles struct {
+type handles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
-	nvml        *C.nvml_handle_t
-}
-
-type oneapiHandles struct {
 	oneapi      *C.oneapi_handle_t
-	deviceCount int
 }

 const (
 	cudaMinimumMemory = 457 * format.MebiByte
 	rocmMinimumMemory = 457 * format.MebiByte
-	// TODO OneAPI minimum memory
 )

-var (
-	gpuMutex      sync.Mutex
-	bootstrapped  bool
-	cpuCapability CPUCapability
-	cpus          []CPUInfo
-	cudaGPUs      []CudaGPUInfo
-	nvcudaLibPath string
-	cudartLibPath string
-	oneapiLibPath string
-	nvmlLibPath   string
-	rocmGPUs      []RocmGPUInfo
-	oneapiGPUs    []OneapiGPUInfo
-)
+var gpuMutex sync.Mutex

 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -64,113 +46,113 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

+var CudartLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var CudartWindowsGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+var NvcudaLinuxGlobs = []string{
+	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
+	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+	"/usr/lib/*-linux-gnu/libcuda.so*",
+	"/usr/lib/wsl/lib/libcuda.so*",
+	"/usr/lib/wsl/drivers/*/libcuda.so*",
+	"/opt/cuda/lib*/libcuda.so*",
+	"/usr/local/cuda/lib*/libcuda.so*",
+	"/usr/lib*/libcuda.so*",
+	"/usr/local/lib*/libcuda.so*",
+}
+
+var NvcudaWindowsGlobs = []string{
+	"c:\\windows\\system*\\nvcuda.dll",
+}
+
+var OneapiWindowsGlobs = []string{
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var OneapiLinuxGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

 // Note: gpuMutex must already be held
-func initCudaHandles() *cudaHandles {
+func initGPUHandles() *handles {

 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

-	cHandles := &cudaHandles{}
-	// Short Circuit if we already know which library to use
-	if nvmlLibPath != "" {
-		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
-		return cHandles
-	}
-	if nvcudaLibPath != "" {
-		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
-		return cHandles
-	}
-	if cudartLibPath != "" {
-		cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
-		return cHandles
-	}
-
-	slog.Debug("searching for GPU discovery libraries for NVIDIA")
+	gpuHandles := &handles{}
+	var cudartMgmtName string
 	var cudartMgmtPatterns []string
+	var nvcudaMgmtName string
+	var nvcudaMgmtPatterns []string

-	// Aligned with driver, we can't carry as payloads
-	nvcudaMgmtPatterns := NvcudaGlobs
-
-	if runtime.GOOS == "windows" {
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
-	}
 	tmpDir, _ := PayloadsDir()
-	if tmpDir != "" {
-		// TODO - add "payloads" for subprocess
-		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
-	}
-	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
-
-	if len(NvmlGlobs) > 0 {
-		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
-		if len(nvmlLibPaths) > 0 {
-			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
-			if nvml != nil {
-				slog.Debug("nvidia-ml loaded", "library", libPath)
-				cHandles.nvml = nvml
-				nvmlLibPath = libPath
-			}
+	switch runtime.GOOS {
+	case "windows":
+		cudartMgmtName = "cudart64_*.dll"
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
+		// Aligned with driver, we can't carry as payloads
+		nvcudaMgmtName = "nvcuda.dll"
+		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+	case "linux":
+		cudartMgmtName = "libcudart.so*"
+		if tmpDir != "" {
+			// TODO - add "payloads" for subprocess
+			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
 		}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
+		// Aligned with driver, we can't carry as payloads
+		nvcudaMgmtName = "libcuda.so*"
+		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+	default:
+		return gpuHandles
 	}

-	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
+	slog.Debug("Detecting GPUs")
+	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
-			cHandles.nvcuda = nvcuda
-			cHandles.deviceCount = deviceCount
-			nvcudaLibPath = libPath
-			return cHandles
+			gpuHandles.nvcuda = nvcuda
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
 		}
 	}

-	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
+	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
-			cHandles.cudart = cudart
-			cHandles.deviceCount = deviceCount
-			cudartLibPath = libPath
-			return cHandles
+			gpuHandles.cudart = cudart
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
 		}
 	}

-	return cHandles
-}
-
-// Note: gpuMutex must already be held
-func initOneAPIHandles() *oneapiHandles {
-	oHandles := &oneapiHandles{}
-
-	// Short Circuit if we already know which library to use
-	if oneapiLibPath != "" {
-		oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
-		return oHandles
-	}
-
-	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
-	if len(oneapiLibPaths) > 0 {
-		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
-	}
-
-	return oHandles
-}
-
-func GetCPUInfo() GpuInfoList {
-	gpuMutex.Lock()
-	if !bootstrapped {
-		gpuMutex.Unlock()
-		GetGPUInfo()
-	} else {
-		gpuMutex.Unlock()
-	}
-	return GpuInfoList{cpus[0].GpuInfo}
+	return gpuHandles
 }

 func GetGPUInfo() GpuInfoList {
@@ -178,249 +160,112 @@ func GetGPUInfo() GpuInfoList {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-	needRefresh := true
-	var cHandles *cudaHandles
-	var oHandles *oneapiHandles
+
+	gpuHandles := initGPUHandles()
 	defer func() {
-		if cHandles != nil {
-			if cHandles.cudart != nil {
-				C.cudart_release(*cHandles.cudart)
-			}
-			if cHandles.nvcuda != nil {
-				C.nvcuda_release(*cHandles.nvcuda)
-			}
-			if cHandles.nvml != nil {
-				C.nvml_release(*cHandles.nvml)
-			}
+		if gpuHandles.cudart != nil {
+			C.cudart_release(*gpuHandles.cudart)
 		}
-		if oHandles != nil {
-			if oHandles.oneapi != nil {
-				// TODO - is this needed?
-				C.oneapi_release(*oHandles.oneapi)
-			}
+		if gpuHandles.nvcuda != nil {
+			C.nvcuda_release(*gpuHandles.nvcuda)
 		}
 	}()

-	if !bootstrapped {
-		slog.Debug("Detecting GPUs")
-		needRefresh = false
-		cpuCapability = GetCPUCapability()
-		var memInfo C.mem_info_t
-
-		mem, err := GetCPUMem()
-		if err != nil {
-			slog.Warn("error looking up system memory", "error", err)
-		}
-		cpus = []CPUInfo{CPUInfo{
-			GpuInfo: GpuInfo{
-				memInfo: mem,
-				Library: "cpu",
-				Variant: cpuCapability,
-				ID:      "0",
-			},
-		}}
-
-		// Fallback to CPU mode if we're lacking required vector extensions on x86
-		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
-			bootstrapped = true
-			// No need to do any GPU discovery, since we can't run on them
-			return GpuInfoList{cpus[0].GpuInfo}
-		}
-
-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
-			depPath = filepath.Dir(envconfig.RunnersDir)
-		}
-
-		// Load ALL libraries
-		cHandles = initCudaHandles()
-
-		// NVIDIA
-		for i := range cHandles.deviceCount {
-			if cHandles.cudart != nil || cHandles.nvcuda != nil {
-				gpuInfo := CudaGPUInfo{
-					GpuInfo: GpuInfo{
-						Library: "cuda",
-					},
-					index: i,
-				}
-				var driverMajor int
-				var driverMinor int
-				if cHandles.cudart != nil {
-					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
-				} else {
-					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
-					driverMajor = int(cHandles.nvcuda.driver_major)
-					driverMinor = int(cHandles.nvcuda.driver_minor)
-				}
-				if memInfo.err != nil {
-					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-					C.free(unsafe.Pointer(memInfo.err))
-					continue
-				}
-				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-					continue
-				}
-				gpuInfo.TotalMemory = uint64(memInfo.total)
-				gpuInfo.FreeMemory = uint64(memInfo.free)
-				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-				gpuInfo.MinimumMemory = cudaMinimumMemory
-				gpuInfo.DependencyPath = depPath
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.DriverMajor = driverMajor
-				gpuInfo.DriverMinor = driverMinor
-
-				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-				cudaGPUs = append(cudaGPUs, gpuInfo)
-			}
-		}
-
-		// Intel
-		if envconfig.IntelGpu {
-			oHandles = initOneAPIHandles()
-			for d := range oHandles.oneapi.num_drivers {
-				if oHandles.oneapi == nil {
-					// shouldn't happen
-					slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
-					continue
-				}
-				devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
-				for i := range devCount {
-					gpuInfo := OneapiGPUInfo{
-						GpuInfo: GpuInfo{
-							Library: "oneapi",
-						},
-						driverIndex: int(d),
-						gpuIndex:    int(i),
-					}
-					// TODO - split bootstrapping from updating free memory
-					C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
-					// TODO - convert this to MinimumMemory based on testing...
-					var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
-					memInfo.free = C.uint64_t(totalFreeMem)
-					gpuInfo.TotalMemory = uint64(memInfo.total)
-					gpuInfo.FreeMemory = uint64(memInfo.free)
-					gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-					gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-					// TODO dependency path?
-					oneapiGPUs = append(oneapiGPUs, gpuInfo)
-				}
-			}
-		}
-
-		rocmGPUs = AMDGetGPUInfo()
-		bootstrapped = true
+	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
+	cpuVariant := GetCPUVariant()
+	if cpuVariant == "" && runtime.GOARCH == "amd64" {
+		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
 	}

-	// For detected GPUs, load library if not loaded
+	// On windows we bundle the nvidia library one level above the runner dir
+	depPath := ""
+	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+		depPath = filepath.Dir(envconfig.RunnersDir)
+	}

-	// Refresh free memory usage
-	if needRefresh {
-		mem, err := GetCPUMem()
-		if err != nil {
-			slog.Warn("error looking up system memory", "error", err)
-		} else {
-			slog.Debug("updating system memory data",
-				slog.Group(
-					"before",
-					"total", format.HumanBytes2(cpus[0].TotalMemory),
-					"free", format.HumanBytes2(cpus[0].FreeMemory),
-				),
-				slog.Group(
-					"now",
-					"total", format.HumanBytes2(mem.TotalMemory),
-					"free", format.HumanBytes2(mem.FreeMemory),
-				),
-			)
-			cpus[0].FreeMemory = mem.FreeMemory
-		}
+	var memInfo C.mem_info_t
+	resp := []GpuInfo{}

-		var memInfo C.mem_info_t
-		if cHandles == nil && len(cudaGPUs) > 0 {
-			cHandles = initCudaHandles()
+	// NVIDIA first
+	for i := range gpuHandles.deviceCount {
+		// TODO once we support CPU compilation variants of GPU libraries refine this...
+		if cpuVariant == "" && runtime.GOARCH == "amd64" {
+			continue
 		}
-		for i, gpu := range cudaGPUs {
-			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
-			} else if cHandles.cudart != nil {
-				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
-			} else if cHandles.nvcuda != nil {
-				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
-				memInfo.used = memInfo.total - memInfo.free
+		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
+			gpuInfo := GpuInfo{
+				Library: "cuda",
+			}
+			var driverMajor int
+			var driverMinor int
+			if gpuHandles.cudart != nil {
+				C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
 			} else {
-				// shouldn't happen
-				slog.Warn("no valid cuda library loaded to refresh vram usage")
-				break
+				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+				driverMajor = int(gpuHandles.nvcuda.driver_major)
+				driverMinor = int(gpuHandles.nvcuda.driver_minor)
 			}
 			if memInfo.err != nil {
-				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 				C.free(unsafe.Pointer(memInfo.err))
 				continue
 			}
-			if memInfo.free == 0 {
-				slog.Warn("error looking up nvidia GPU memory")
+			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
 				continue
 			}
-			slog.Debug("updating cuda memory data",
-				"gpu", gpu.ID,
-				"name", gpu.Name,
-				slog.Group(
-					"before",
-					"total", format.HumanBytes2(gpu.TotalMemory),
-					"free", format.HumanBytes2(gpu.FreeMemory),
-				),
-				slog.Group(
-					"now",
-					"total", format.HumanBytes2(uint64(memInfo.total)),
-					"free", format.HumanBytes2(uint64(memInfo.free)),
-					"used", format.HumanBytes2(uint64(memInfo.used)),
-				),
-			)
-			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
-		}
+			gpuInfo.TotalMemory = uint64(memInfo.total)
+			gpuInfo.FreeMemory = uint64(memInfo.free)
+			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+			gpuInfo.MinimumMemory = cudaMinimumMemory
+			gpuInfo.DependencyPath = depPath
+			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+			gpuInfo.DriverMajor = driverMajor
+			gpuInfo.DriverMinor = driverMinor

-		if oHandles == nil && len(oneapiGPUs) > 0 {
-			oHandles = initOneAPIHandles()
-		}
-		for i, gpu := range oneapiGPUs {
-			if oHandles.oneapi == nil {
-				// shouldn't happen
-				slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
-				continue
-			}
-			C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
-			// TODO - convert this to MinimumMemory based on testing...
-			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
-			memInfo.free = C.uint64_t(totalFreeMem)
-			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
-		}
-
-		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
-		if err != nil {
-			slog.Debug("problem refreshing ROCm free memory", "error", err)
+			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+			resp = append(resp, gpuInfo)
 		}
 	}

-	resp := []GpuInfo{}
-	for _, gpu := range cudaGPUs {
-		resp = append(resp, gpu.GpuInfo)
-	}
-	for _, gpu := range rocmGPUs {
-		resp = append(resp, gpu.GpuInfo)
-	}
-	for _, gpu := range oneapiGPUs {
-		resp = append(resp, gpu.GpuInfo)
-	}
+	// Then AMD
+	resp = append(resp, AMDGetGPUInfo()...)
+
 	if len(resp) == 0 {
-		resp = append(resp, cpus[0].GpuInfo)
+		C.cpu_check_ram(&memInfo)
+		if memInfo.err != nil {
+			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+			return resp
+		}
+		gpuInfo := GpuInfo{
+			Library: "cpu",
+			Variant: cpuVariant,
+		}
+		gpuInfo.TotalMemory = uint64(memInfo.total)
+		gpuInfo.FreeMemory = uint64(memInfo.free)
+		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+
+		resp = append(resp, gpuInfo)
 	}
+
 	return resp
 }

+func GetCPUMem() (memInfo, error) {
+	var ret memInfo
+	var info C.mem_info_t
+	C.cpu_check_ram(&info)
+	if info.err != nil {
+		defer C.free(unsafe.Pointer(info.err))
+		return ret, fmt.Errorf(C.GoString(info.err))
+	}
+	ret.FreeMemory = uint64(info.free)
+	ret.TotalMemory = uint64(info.total)
+	return ret, nil
+}
+
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
@@ -517,26 +362,8 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }

-func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
-	var resp C.nvml_init_resp_t
-	resp.ch.verbose = getVerboseState()
-	for _, libPath := range nvmlLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.nvml_init(lib, &resp)
-		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			return &resp.ch, libPath
-		}
-	}
-	return nil, ""
-}
-
 func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
-	num_devices := 0
 	resp.oh.verbose = getVerboseState()
 	for _, libPath := range oneapiLibPaths {
 		lib := C.CString(libPath)
@@ -546,10 +373,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			for i := range resp.oh.num_drivers {
-				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
-			}
-			return num_devices, &resp.oh, libPath
+			return int(resp.num_devices), &resp.oh, libPath
 		}
 	}
 	return 0, nil, ""
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability(),
+				Variant: GetCPUVariant(),
 				memInfo: mem,
 			},
 		}
@@ -42,17 +42,6 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{info}
 }

-func GetCPUInfo() GpuInfoList {
-	mem, _ := GetCPUMem()
-	return []GpuInfo{
-		{
-			Library: "cpu",
-			Variant: GetCPUCapability(),
-			memInfo: mem,
-		},
-	}
-}
-
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -47,7 +47,6 @@ typedef struct mem_info {
  char gpu_name[GPU_NAME_LEN];
  uint64_t total;
  uint64_t free;
-  uint64_t used;

  // Compute Capability
  int major; 
@@ -63,7 +62,6 @@ void cpu_check_ram(mem_info_t *resp);

 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
-#include "gpu_info_nvml.h"
 #include "gpu_info_oneapi.h"

 #endif  // __GPU_INFO_H__
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -0,0 +1,45 @@
+#include "gpu_info.h"
+// Fallbacks for CPU mode
+
+#ifdef _WIN32
+#include <sysinfoapi.h>
+void cpu_check_ram(mem_info_t *resp) {
+  resp->err = NULL;
+  MEMORYSTATUSEX info;
+  info.dwLength = sizeof(info);
+  if (GlobalMemoryStatusEx(&info) != 0) {
+    resp->total = info.ullTotalPhys;
+    resp->free = info.ullAvailPhys;
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
+  } else {
+    resp->err = LOAD_ERR();
+  }
+  return;
+}
+
+#elif __linux__
+#include <errno.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+void cpu_check_ram(mem_info_t *resp) {
+  struct sysinfo info;
+  resp->err = NULL;
+  if (sysinfo(&info) != 0) {
+    resp->err = strdup(strerror(errno));
+  } else {
+    resp->total = info.totalram * info.mem_unit;
+    resp->free = info.freeram * info.mem_unit;
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
+  }
+  return;
+}
+
+#elif __APPLE__
+// TODO consider an Apple implementation that does something useful
+// mem_info_t cpu_check_ram() {
+//   mem_info_t resp = {0, 0, NULL};
+//   return resp;
+// }
+#else
+#error "Unsupported platform"
+#endif
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 }


-void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  cudartMemory_t memInfo = {0,0,0};
  cudartReturn_t ret;
@@ -166,11 +166,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {

  resp->total = memInfo.total;
  resp->free = memInfo.free;
-  resp->used = memInfo.used;

  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
-  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }

--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -140,8 +140,7 @@ typedef struct cudart_init_resp {
 } cudart_init_resp_t;

 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
-// TODO - if we keep this library longer term, add cudart_get_free
+void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
 void cudart_release(cudart_handle_t ch);

 #endif  // __GPU_INFO_CUDART_H__
--- a/gpu/gpu_info_nvcuda.c
+++ b/gpu/gpu_info_nvcuda.c
@@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
 }

 const int buflen = 256;
-void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
+void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  nvcudaMemory_t memInfo = {0,0};
  CUresult ret;
@@ -168,7 +168,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
+    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
    resp->err = strdup(buf);
    return;
  }
@@ -193,42 +193,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release device context %d", ret);
-  }
-}
-
-void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
-  CUresult ret;
-  CUcontext ctx = NULL;
-  CUdevice device = -1;
-  *free = 0;
-  *total = 0;
-
-  ret = (*h.cuDeviceGet)(&device, i);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda device failed to initialize");
-    return;
-  }
-
-
-  // To get memory we have to set (and release) a context
-  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to get device context %d", ret);
-    return;
-  }
-
-  ret = (*h.cuMemGetInfo_v2)(free, total);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda device memory info lookup failure %d", ret);
-    // Best effort on failure...
-    (*h.cuCtxDestroy)(ctx);
-    return;
-  }
-
-  ret = (*h.cuCtxDestroy)(ctx);
-  if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release device context %d", ret);
+    LOG(1, "nvcuda failed to release primary device context %d", ret);
  }
 }

--- a/gpu/gpu_info_nvcuda.h
+++ b/gpu/gpu_info_nvcuda.h
@@ -67,8 +67,7 @@ typedef struct nvcuda_init_resp {
 } nvcuda_init_resp_t;

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
-void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free, uint64_t *total);
+void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
 void nvcuda_release(nvcuda_handle_t ch);

 #endif  // __GPU_INFO_NVCUDA_H__
--- a/gpu/gpu_info_nvml.c
+++ b/gpu/gpu_info_nvml.c
@@ -1,104 +0,0 @@
-#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
-
-#include <string.h>
-
-#include "gpu_info_nvml.h"
-
-void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
-  nvmlReturn_t ret;
-  resp->err = NULL;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
-      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
-      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
-      {NULL, NULL},
-  };
-
-  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
-  if (!resp->ch.handle) {
-    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
-    snprintf(buf, buflen,
-             "Unable to load %s library to query for Nvidia GPUs: %s",
-             nvml_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  // TODO once we've squashed the remaining corner cases remove this log
-  // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
-  
-  for (i = 0; l[i].s != NULL; i++) {
-    // TODO once we've squashed the remaining corner cases remove this log
-    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
-
-    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
-      resp->ch.handle = NULL;
-      char *msg = LOAD_ERR();
-      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->ch.handle);
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
-               msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-
-  ret = (*resp->ch.nvmlInit_v2)();
-  if (ret != NVML_SUCCESS) {
-    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-}
-
-
-void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
-    nvmlDevice_t device;
-    nvmlMemory_t memInfo = {0};
-    nvmlReturn_t ret;
-    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
-    if (ret != NVML_SUCCESS) {
-        LOG(1, "unable to get device handle %d: %d", device_id, ret);
-        *free = 0;
-        return;
-    }
-
-    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
-    if (ret != NVML_SUCCESS) {
-        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
-        *free = 0;
-        return;
-    }
-    *free = memInfo.free;
-    *total = memInfo.total;
-    *used = memInfo.used;
-}
-
-
-void nvml_release(nvml_handle_t h) {
-  LOG(h.verbose, "releasing nvml library\n");
-  nvmlReturn_t ret;
-  ret = (*h.nvmlShutdown)();
-  if (ret != NVML_SUCCESS) {
-    LOG(1, "error during nvmlShutdown %d", ret);
-  }
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}
-
-#endif  // __APPLE__
--- a/gpu/gpu_info_nvml.h
+++ b/gpu/gpu_info_nvml.h
@@ -1,48 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_NVML_H__
-#define __GPU_INFO_NVML_H__
-#include "gpu_info.h"
-
-// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum nvmlReturn_enum {
-  NVML_SUCCESS = 0,
-  // Other values omitted for now...
-} nvmlReturn_t;
-typedef void *nvmlDevice_t;  // Opaque is sufficient
-typedef struct nvmlMemory_st {
-  unsigned long long total;
-  unsigned long long free;
-  unsigned long long used;
-} nvmlMemory_t;
-
-typedef enum nvmlBrandType_enum
-{
-    NVML_BRAND_UNKNOWN          = 0,
-} nvmlBrandType_t;
-
-typedef struct nvml_handle {
-  void *handle;
-  uint16_t verbose;
-  nvmlReturn_t (*nvmlInit_v2)(void);
-  nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
-  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
-} nvml_handle_t;
-
-typedef struct nvml_init_resp {
-  char *err;  // If err is non-null handle is invalid
-  nvml_handle_t ch;
-} nvml_init_resp_t;
-
-typedef struct nvml_compute_capability {
-  char *err;
-  int major;
-  int minor;
-} nvml_compute_capability_t;
-
-void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
-void nvml_release(nvml_handle_t ch);
-
-#endif  // __GPU_INFO_NVML_H__
-#endif  // __APPLE__
--- a/gpu/gpu_info_oneapi.c
+++ b/gpu/gpu_info_oneapi.c
@@ -4,17 +4,15 @@

 #include <string.h>

-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
+{
  ze_result_t ret;
  resp->err = NULL;
-  resp->oh.devices = NULL;
-  resp->oh.num_devices = NULL;
-  resp->oh.drivers = NULL;
-  resp->oh.num_drivers = 0;
  const int buflen = 256;
  char buf[buflen + 1];
-  int i, d;
-  struct lookup {
+  int i;
+  struct lookup
+  {
    char *s;
    void **p;
  } l[] = {
@@ -30,7 +28,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
  };

  resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle) {
+  if (!resp->oh.handle)
+  {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Intel GPUs: %s\n",
@@ -45,12 +44,14 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
      "wiring Level-Zero management library functions in %s\n",
      oneapi_lib_path);

-  for (i = 0; l[i].s != NULL; i++) {
+  for (i = 0; l[i].s != NULL; i++)
+  {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p) {
+    if (!l[i].p)
+    {
      resp->oh.handle = NULL;
      char *msg = LOAD_ERR();
      LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -62,70 +63,23 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
    }
  }

-  LOG(resp->oh.verbose, "calling zesInit\n");
-
  ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
-    snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->oh.handle);
+    resp->oh.handle = NULL;
+    snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
  }

-  LOG(resp->oh.verbose, "calling zesDriverGet\n");
-  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
-    snprintf(buf, buflen, "unable to get driver count: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-  LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
-  resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
-  resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
-  memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
-  resp->oh.devices =
-      malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
-  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
-    snprintf(buf, buflen, "unable to get driver count: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-
-  for (d = 0; d < resp->oh.num_drivers; d++) {
-    LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]);
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
-                                   &resp->oh.num_devices[d], NULL);
-    if (ret != ZE_RESULT_SUCCESS) {
-      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
-      snprintf(buf, buflen, "unable to get device count: %x", ret);
-      resp->err = strdup(buf);
-      oneapi_release(resp->oh);
-      return;
-    }
-    resp->oh.devices[d] =
-        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
-    ret = (*resp->oh.zesDeviceGet)(
-        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
-    if (ret != ZE_RESULT_SUCCESS) {
-      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
-      snprintf(buf, buflen, "unable to get device count: %x", ret);
-      resp->err = strdup(buf);
-      oneapi_release(resp->oh);
-      return;
-    }
-  }
+  (*resp->oh.zesDriverGet)(&resp->num_devices, NULL);

  return;
 }

-void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
-                       mem_info_t *resp) {
+void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
+{
  ze_result_t ret;
  resp->err = NULL;
  uint64_t totalMem = 0;
@@ -134,126 +88,127 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
  char buf[buflen + 1];
  int i, d, m;

-  if (h.handle == NULL) {
+  if (h.handle == NULL)
+  {
    resp->err = strdup("Level-Zero handle not initialized");
    return;
  }

-  if (driver > h.num_drivers || device > h.num_devices[driver]) {
-    resp->err = strdup("driver of device index out of bounds");
+  uint32_t driversCount = 0;
+  ret = (*h.zesDriverGet)(&driversCount, NULL);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    snprintf(buf, buflen, "unable to get driver count: %d", ret);
+    resp->err = strdup(buf);
    return;
  }
+  LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
+
+  zes_driver_handle_t *allDrivers =
+      malloc(driversCount * sizeof(zes_driver_handle_t));
+  (*h.zesDriverGet)(&driversCount, allDrivers);

  resp->total = 0;
  resp->free = 0;

-  zes_device_ext_properties_t ext_props;
-  ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
-  ext_props.pNext = NULL;
-
-  zes_device_properties_t props;
-  props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-  props.pNext = &ext_props;
-
-  ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
-  if (ret != ZE_RESULT_SUCCESS) {
-    snprintf(buf, buflen, "unable to get device properties: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
-
-  // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
-  // (this is probably wrong...)
-  // TODO - the driver isn't included - what if there are multiple drivers?
-  snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
-
-  if (h.verbose) {
-    // When in verbose mode, report more information about
-    // the card we discover.
-    LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
-        props.modelName);
-    LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
-        props.brandName);
-    LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
-        props.vendorName);
-    LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
-        props.serialNumber);
-    LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
-        props.boardNumber);
-  }
-
-  // TODO
-  // Compute Capability equivalent in resp->major, resp->minor, resp->patch
-
-  uint32_t memCount = 0;
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
-                                        NULL);
-  if (ret != ZE_RESULT_SUCCESS) {
-    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
-             ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
-
-  zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
-  (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
-
-  for (m = 0; m < memCount; m++) {
-    zes_mem_state_t state;
-    state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
-    state.pNext = NULL;
-    ret = (*h.zesMemoryGetState)(mems[m], &state);
-    if (ret != ZE_RESULT_SUCCESS) {
-      snprintf(buf, buflen, "unable to get memory state: %x", ret);
+  for (d = 0; d < driversCount; d++)
+  {
+    uint32_t deviceCount = 0;
+    ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
+    if (ret != ZE_RESULT_SUCCESS)
+    {
+      snprintf(buf, buflen, "unable to get device count: %d", ret);
      resp->err = strdup(buf);
-      free(mems);
+      free(allDrivers);
      return;
    }

-    resp->total += state.size;
-    resp->free += state.free;
-  }
+    LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);

-  free(mems);
-}
+    zes_device_handle_t *devices =
+        malloc(deviceCount * sizeof(zes_device_handle_t));
+    (*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);

-void oneapi_release(oneapi_handle_t h) {
-  int d;
-  LOG(h.verbose, "releasing oneapi library\n");
-  for (d = 0; d < h.num_drivers; d++) {
-    if (h.devices != NULL && h.devices[d] != NULL) {
-      free(h.devices[d]);
+    for (i = 0; i < deviceCount; i++)
+    {
+      zes_device_ext_properties_t ext_props;
+      ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
+      ext_props.pNext = NULL;
+
+      zes_device_properties_t props;
+      props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+      props.pNext = &ext_props;
+
+      ret = (*h.zesDeviceGetProperties)(devices[i], &props);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen, "unable to get device properties: %d", ret);
+        resp->err = strdup(buf);
+        free(allDrivers);
+        free(devices);
+        return;
+      }
+
+      if (h.verbose)
+      {
+        // When in verbose mode, report more information about
+        // the card we discover.
+        LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
+            props.modelName);
+        LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
+            props.brandName);
+        LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
+            props.vendorName);
+        LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
+            props.serialNumber);
+        LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
+            props.boardNumber);
+      }
+
+      uint32_t memCount = 0;
+      ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen,
+                 "unable to enumerate Level-Zero memory modules: %d", ret);
+        resp->err = strdup(buf);
+        free(allDrivers);
+        free(devices);
+        return;
+      }
+
+      LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
+
+      zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
+      (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
+
+      for (m = 0; m < memCount; m++)
+      {
+        zes_mem_state_t state;
+        state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
+        state.pNext = NULL;
+        ret = (*h.zesMemoryGetState)(mems[m], &state);
+        if (ret != ZE_RESULT_SUCCESS)
+        {
+          snprintf(buf, buflen, "unable to get memory state: %d", ret);
+          resp->err = strdup(buf);
+          free(allDrivers);
+          free(devices);
+          free(mems);
+          return;
+        }
+
+        resp->total += state.size;
+        resp->free += state.free;
+      }
+
+      free(mems);
    }
-  }
-  if (h.devices != NULL) {
-    free(h.devices);
-    h.devices = NULL;
-  }
-  if (h.num_devices != NULL) {
-    free(h.num_devices);
-    h.num_devices = NULL;
-  }
-  if (h.drivers != NULL) {
-    free(h.drivers);
-    h.drivers = NULL;
-  }
-  h.num_drivers = 0;
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}

-int oneapi_get_device_count(oneapi_handle_t h, int driver) {
-  if (h.handle == NULL || h.num_devices == NULL) {
-    return 0;
+    free(devices);
  }
-  if (driver > h.num_drivers) {
-    return 0;
-  }
-  return (int)h.num_devices[driver];
+
+  free(allDrivers);
 }

 #endif // __APPLE__
--- a/gpu/gpu_info_oneapi.h
+++ b/gpu/gpu_info_oneapi.h
@@ -9,7 +9,8 @@
 #define ZE_BIT(_i) (1 << _i)

 // Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t {
+typedef enum ze_result_t
+{
  ZE_RESULT_SUCCESS = 0,
  // Other values omitted for now...
 } ze_result_t;
@@ -19,11 +20,13 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
 typedef struct _zes_device_handle_t *zes_device_handle_t;
 typedef struct _zes_mem_handle_t *zes_mem_handle_t;

-typedef enum _ze_structure_type_t {
+typedef enum _ze_structure_type_t
+{
  ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_structure_type_t;

-typedef enum _zes_structure_type_t {
+typedef enum _zes_structure_type_t
+{
  ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
  ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
  ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -31,29 +34,35 @@ typedef enum _zes_structure_type_t {
  ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_structure_type_t;

-typedef enum _zes_mem_type_t {
+typedef enum _zes_mem_type_t
+{
  ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_mem_type_t;

-typedef enum _zes_mem_loc_t {
+typedef enum _zes_mem_loc_t
+{
  ZES_MEM_LOC_SYSTEM = 0,
  ZES_MEM_LOC_DEVICE = 1,
  ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 } zes_mem_loc_t;

-typedef enum _zes_mem_health_t {
+typedef enum _zes_mem_health_t
+{
  ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 } zes_mem_health_t;

-typedef struct _ze_device_uuid_t {
+typedef struct _ze_device_uuid_t
+{
  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } ze_device_uuid_t;

-typedef struct _zes_uuid_t {
+typedef struct _zes_uuid_t
+{
  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } zes_uuid_t;

-typedef enum _ze_device_type_t {
+typedef enum _ze_device_type_t
+{
  ZE_DEVICE_TYPE_GPU = 1,
  ZE_DEVICE_TYPE_CPU = 2,
  ZE_DEVICE_TYPE_FPGA = 3,
@@ -62,7 +71,8 @@ typedef enum _ze_device_type_t {
  ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_device_type_t;

-typedef enum _zes_device_type_t {
+typedef enum _zes_device_type_t
+{
  ZES_DEVICE_TYPE_GPU = 1,
  ZES_DEVICE_TYPE_CPU = 2,
  ZES_DEVICE_TYPE_FPGA = 3,
@@ -72,7 +82,8 @@ typedef enum _zes_device_type_t {
 } zes_device_type_t;

 typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t {
+typedef enum _ze_device_property_flag_t
+{
  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
  ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
  ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -81,7 +92,8 @@ typedef enum _ze_device_property_flag_t {
 } ze_device_property_flag_t;

 typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t {
+typedef enum _zes_device_property_flag_t
+{
  ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
  ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
  ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -89,7 +101,8 @@ typedef enum _zes_device_property_flag_t {
  ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } zes_device_property_flag_t;

-typedef struct _ze_device_properties_t {
+typedef struct _ze_device_properties_t
+{
  ze_structure_type_t stype;
  void *pNext;
  ze_device_type_t type;
@@ -113,7 +126,8 @@ typedef struct _ze_device_properties_t {
  char name[ZE_MAX_DEVICE_NAME];
 } ze_device_properties_t;

-typedef struct _zes_device_properties_t {
+typedef struct _zes_device_properties_t
+{
  zes_structure_type_t stype;
  void *pNext;
  ze_device_properties_t core;
@@ -126,7 +140,8 @@ typedef struct _zes_device_properties_t {
  char driverVersion[ZES_STRING_PROPERTY_SIZE];
 } zes_device_properties_t;

-typedef struct _zes_device_ext_properties_t {
+typedef struct _zes_device_ext_properties_t
+{
  zes_structure_type_t stype;
  void *pNext;
  zes_uuid_t uuid;
@@ -134,7 +149,8 @@ typedef struct _zes_device_ext_properties_t {
  zes_device_property_flags_t flags;
 } zes_device_ext_properties_t;

-typedef struct _zes_mem_properties_t {
+typedef struct _zes_mem_properties_t
+{
  zes_structure_type_t stype;
  void *pNext;
  zes_mem_type_t type;
@@ -146,7 +162,8 @@ typedef struct _zes_mem_properties_t {
  int32_t numChannels;
 } zes_mem_properties_t;

-typedef struct _zes_mem_state_t {
+typedef struct _zes_mem_state_t
+{
  zes_structure_type_t stype;
  const void *pNext;
  zes_mem_health_t health;
@@ -154,19 +171,10 @@ typedef struct _zes_mem_state_t {
  uint64_t size;
 } zes_mem_state_t;

-typedef struct oneapi_handle {
+typedef struct oneapi_handle
+{
  void *handle;
  uint16_t verbose;
-
-  uint32_t num_drivers;
-  zes_driver_handle_t *drivers;
-  uint32_t *num_devices;
-  zes_device_handle_t **devices;
-
-  // TODO Driver major, minor information
-  // int driver_major;
-  // int driver_minor;
-
  ze_result_t (*zesInit)(int);
  ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
  ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
@@ -183,21 +191,21 @@ typedef struct oneapi_handle {

 } oneapi_handle_t;

-typedef struct oneapi_init_resp {
+typedef struct oneapi_init_resp
+{
  char *err; // If err is non-null handle is invalid
+  int num_devices;
  oneapi_handle_t oh;
 } oneapi_init_resp_t;

-typedef struct oneapi_version_resp {
+typedef struct oneapi_version_resp
+{
  ze_result_t status;
  char *str; // Contains version or error string if status != 0
 } oneapi_version_resp_t;

 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
-                       mem_info_t *resp);
-void oneapi_release(oneapi_handle_t h);
-int oneapi_get_device_count(oneapi_handle_t h, int driver);
+void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);

 #endif // __GPU_INFO_INTEL_H__
 #endif // __APPLE__
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -1,89 +0,0 @@
-package gpu
-
-import (
-	"bufio"
-	"fmt"
-	"os"
-	"strings"
-
-	"github.com/ollama/ollama/format"
-)
-
-var CudartGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var NvmlGlobs = []string{}
-
-var NvcudaGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var OneapiGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
-var CudartMgmtName = "libcudart.so*"
-var NvcudaMgmtName = "libcuda.so*"
-var NvmlMgmtName = "" // not currently wired on linux
-var OneapiMgmtName = "libze_intel_gpu.so"
-
-func GetCPUMem() (memInfo, error) {
-	var mem memInfo
-	var total, available, free, buffers, cached uint64
-	f, err := os.Open("/proc/meminfo")
-	if err != nil {
-		return mem, err
-	}
-	defer f.Close()
-	s := bufio.NewScanner(f)
-	for s.Scan() {
-		line := s.Text()
-		switch {
-		case strings.HasPrefix(line, "MemTotal:"):
-			_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
-		case strings.HasPrefix(line, "MemAvailable:"):
-			_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
-		case strings.HasPrefix(line, "MemFree:"):
-			_, err = fmt.Sscanf(line, "MemFree:%d", &free)
-		case strings.HasPrefix(line, "Buffers:"):
-			_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
-		case strings.HasPrefix(line, "Cached:"):
-			_, err = fmt.Sscanf(line, "Cached:%d", &cached)
-		default:
-			continue
-		}
-		if err != nil {
-			return mem, err
-		}
-
-		if total > 0 && available > 0 {
-			mem.TotalMemory = total * format.KibiByte
-			mem.FreeMemory = available * format.KibiByte
-			return mem, nil
-		}
-	}
-	mem.TotalMemory = total * format.KibiByte
-	mem.FreeMemory = (free + buffers + cached) * format.KibiByte
-	return mem, nil
-}
--- a/gpu/gpu_windows.go
+++ b/gpu/gpu_windows.go
@@ -1,55 +0,0 @@
-package gpu
-
-import (
-	"fmt"
-	"syscall"
-	"unsafe"
-)
-
-type MEMORYSTATUSEX struct {
-	length               uint32
-	MemoryLoad           uint32
-	TotalPhys            uint64
-	AvailPhys            uint64
-	TotalPageFile        uint64
-	AvailPageFile        uint64
-	TotalVirtual         uint64
-	AvailVirtual         uint64
-	AvailExtendedVirtual uint64
-}
-
-var (
-	k32                      = syscall.NewLazyDLL("kernel32.dll")
-	globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
-	sizeofMemoryStatusEx     = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
-)
-
-var CudartGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvmlGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var NvcudaGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var CudartMgmtName = "cudart64_*.dll"
-var NvcudaMgmtName = "nvcuda.dll"
-var NvmlMgmtName = "nvml.dll"
-var OneapiMgmtName = "ze_intel_gpu64.dll"
-
-func GetCPUMem() (memInfo, error) {
-	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
-	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
-	if r1 == 0 {
-		return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
-	}
-	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
-}
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -18,7 +18,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant CPUCapability `json:"variant"`
+	Variant string `json:"variant,omitempty"`

 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -26,9 +26,6 @@ type GpuInfo struct {
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath string `json:"lib_path,omitempty"`

-	// Extra environment variables specific to the GPU as list of [key,value]
-	EnvWorkarounds [][2]string `json:"envs,omitempty"`
-
 	// GPU information
 	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
 	Name    string `json:"name"`    // user friendly name if available
@@ -41,30 +38,6 @@ type GpuInfo struct {
 	// TODO other performance capability info to help in scheduling decisions
 }

-type CPUInfo struct {
-	GpuInfo
-}
-
-type CudaGPUInfo struct {
-	GpuInfo
-	index int //nolint:unused,nolintlint
-}
-type CudaGPUInfoList []CudaGPUInfo
-
-type RocmGPUInfo struct {
-	GpuInfo
-	usedFilepath string //nolint:unused,nolintlint
-	index        int    //nolint:unused,nolintlint
-}
-type RocmGPUInfoList []RocmGPUInfo
-
-type OneapiGPUInfo struct {
-	GpuInfo
-	driverIndex int //nolint:unused,nolintlint
-	gpuIndex    int //nolint:unused,nolintlint
-}
-type OneapiGPUInfoList []OneapiGPUInfo
-
 type GpuInfoList []GpuInfo

 // Split up the set of gpu info's by Library and variant
@@ -74,8 +47,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone {
-			requested += "_" + info.Variant.String()
+		if info.Variant != "" {
+			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -113,26 +86,3 @@ type ByFreeMemory []GpuInfo
 func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
-
-type CPUCapability uint32
-
-// Override at build time when building base GPU runners
-var GPURunnerCPUCapability = CPUCapabilityAVX
-
-const (
-	CPUCapabilityNone CPUCapability = iota
-	CPUCapabilityAVX
-	CPUCapabilityAVX2
-	// TODO AVX512
-)
-
-func (c CPUCapability) String() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "avx"
-	case CPUCapabilityAVX2:
-		return "avx2"
-	default:
-		return "no vector extensions"
-	}
-}
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -19,19 +19,17 @@ func TestMultiModelConcurrency(t *testing.T) {
 	var (
 		req = [2]api.GenerateRequest{
 			{
-				Model:     "orca-mini",
-				Prompt:    "why is the ocean blue?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "orca-mini",
+				Prompt: "why is the ocean blue?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "tinydolphin",
-				Prompt:    "what is the origin of the us thanksgiving holiday?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "tinydolphin",
+				Prompt: "what is the origin of the us thanksgiving holiday?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
@@ -40,64 +38,42 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
+			[]string{"england", "english", "massachusetts", "pilgrims"},
 		}
 	)
 	var wg sync.WaitGroup
 	wg.Add(len(req))
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
 	defer cancel()
-
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	for i := 0; i < len(req); i++ {
-		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
-	}
-
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
+			GenerateTestHelper(ctx, t, req[i], resp[i])
 		}(i)
 	}
 	wg.Wait()
 }

 func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
-	req, resp := GenerateRequests()
-	reqLimit := len(req)
-	iterLimit := 5
-
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
-	if vram != "" {
-		max, err := strconv.ParseUint(vram, 10, 64)
-		require.NoError(t, err)
-		// Don't hammer on small VRAM cards...
-		if max < 4*1024*1024*1024 {
-			reqLimit = min(reqLimit, 2)
-			iterLimit = 2
-		}
-	}
-
-	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

+	req, resp := GenerateRequests()
 	// Get the server running (if applicable) warm the model up with a single initial request
-	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
+	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)

 	var wg sync.WaitGroup
-	wg.Add(reqLimit)
-	for i := 0; i < reqLimit; i++ {
+	wg.Add(len(req))
+	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			for j := 0; j < iterLimit; j++ {
+			for j := 0; j < 5; j++ {
 				slog.Info("Starting", "req", i, "iter", j)
-				// On slower GPUs it can take a while to process the concurrent requests
+				// On slower GPUs it can take a while to process the 4 concurrent requests
 				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
+				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
 			}
 		}(i)
 	}
@@ -245,23 +221,5 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
-	go func() {
-		for {
-			time.Sleep(2 * time.Second)
-			select {
-			case <-ctx.Done():
-				return
-			default:
-				models, err := client.ListRunning(ctx)
-				if err != nil {
-					slog.Warn("failed to list running models", "error", err)
-					continue
-				}
-				for _, m := range models.Models {
-					slog.Info("loaded model snapshot", "model", m)
-				}
-			}
-		}
-	}()
 	wg.Wait()
 }
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -11,8 +11,7 @@ import (
 )

 func TestContextExhaustion(t *testing.T) {
-	// Longer needed for small footprint GPUs
-	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -32,11 +32,7 @@ func TestIntegrationMultimodal(t *testing.T) {
 	resp := "the ollam"
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
-	// llava models on CPU can be quite slow to start,
-	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
+	GenerateTestHelper(ctx, t, req, []string{resp})
 }

 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -140,7 +140,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er

 	showCtx, cancel := context.WithDeadlineCause(
 		ctx,
-		time.Now().Add(10*time.Second),
+		time.Now().Add(5*time.Second),
 		fmt.Errorf("show for existing model %s took too long", modelName),
 	)
 	defer cancel()
@@ -287,46 +287,41 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 	return []api.GenerateRequest{
 			{
-				Model:     "orca-mini",
-				Prompt:    "why is the ocean blue?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "orca-mini",
+				Prompt: "why is the ocean blue?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "orca-mini",
-				Prompt:    "why is the color of dirt brown?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "orca-mini",
+				Prompt: "why is the color of dirt brown?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "orca-mini",
-				Prompt:    "what is the origin of the us thanksgiving holiday?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "orca-mini",
+				Prompt: "what is the origin of the us thanksgiving holiday?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "orca-mini",
-				Prompt:    "what is the origin of independence day?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "orca-mini",
+				Prompt: "what is the origin of independence day?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:     "orca-mini",
-				Prompt:    "what is the composition of air?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Model:  "orca-mini",
+				Prompt: "what is the composition of air?",
+				Stream: &stream,
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
@@ -336,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			[]string{"sunlight"},
 			[]string{"soil", "organic", "earth", "black", "tan"},
-			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
+			[]string{"england", "english", "massachusetts", "pilgrims"},
 			[]string{"fourth", "july", "declaration", "independence"},
 			[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -2335,9 +2335,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-#ifndef GGML_USE_CUDA
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS
        }
        else if (arg == "--tensor-split" || arg == "-ts")
        {
@@ -2346,7 +2346,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
            std::string arg_next = argv[i];

            // split string by , and /
@@ -2367,8 +2367,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                }
            }
 #else
-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUDA
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUBLAS
        }
        else if (arg == "--main-gpu" || arg == "-mg")
        {
@@ -2377,7 +2377,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
            params.main_gpu = std::stoi(argv[i]);
 #else
            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -1,5 +1,7 @@
 #!powershell

+$ErrorActionPreference = "Stop"
+
 function amdGPUs {
    if ($env:AMDGPU_TARGETS) {
        return $env:AMDGPU_TARGETS
@@ -83,9 +85,9 @@ function init_vars {
 function git_module_setup {
    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
    & git submodule init
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    & git submodule update --force "${script:llamacppDir}"
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }

 function apply_patches {
@@ -119,15 +121,10 @@ function build {
    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
    & cmake --version
    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
-    if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
-    if ($cmakeDefs -contains "-G") {
-        $extra=@("-j8")
-    } else {
-        $extra= @("--", "/p:CL_MPcount=8")
-    }
-    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
-    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
-    if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)}
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
+    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    # Rearrange output to be consistent between different generators
    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
@@ -141,7 +138,7 @@ function sign {
        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
-            if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
    }
 }
@@ -216,13 +213,7 @@ function build_static() {
    }
 }

-function build_cpu() {
-    if ($script:ARCH -eq "arm64") {
-        $gen_arch = "ARM64"
-    } else { # amd64
-        $gen_arch = "x64"
-    }
-
+function build_cpu($gen_arch) {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
        # remaining llama.cpp builds use MSVC 
        init_vars
@@ -279,15 +270,7 @@ function build_cuda() {
        init_vars
        $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
        $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
-        $script:cmakeDefs += @(
-            "-A", "x64",
-            "-DLLAMA_CUDA=ON",
-            "-DLLAMA_AVX=on",
-            "-DLLAMA_AVX2=off",
-            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-            "-DCMAKE_CUDA_FLAGS=-t8"
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
-            )
+        $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
            $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
@@ -408,29 +391,16 @@ init_vars
 if ($($args.count) -eq 0) {
    git_module_setup
    apply_patches
-
-    $tasks = @("build_static", "build_cpu")
-    $jobs = @()
-    if ($script:ARCH -ne "arm64") {
-        $tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm")
-    }
-    foreach ($t in $tasks) {
-        $jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t)
-    }
-    get-job
-    foreach ($job in $jobs) {
-        write-host "----" $job.Name output follows
-        receive-job -wait -job $job
-        write-host "----" $job.Name $job.State
-        write-host ""
-        if ($job.State -contains 'Failed') {
-            cleanup
-            write-host "Terminating remaining jobs (this takes a while, you can ^C)"
-            # TODO find some way to kill the spawned cmake processes faster
-            remove-job -force -job $jobs
-            exit(-1)
-        }
-        get-job
+    build_static
+    if ($script:ARCH -eq "arm64") {
+        build_cpu("ARM64")
+    } else { # amd64
+        build_cpu("x64")
+        build_cpu_avx
+        build_cpu_avx2
+        build_cuda
+        build_oneapi
+        build_rocm
    }

    cleanup
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -307,7 +307,6 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui

 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
-			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,11 +1,11 @@
 package llm

 import (
+	"fmt"
 	"log/slog"
-	"strconv"
-	"strings"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -16,8 +16,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -31,64 +30,24 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }

-type MemoryEstimate struct {
-	// How many layers we predict we can load
-	Layers int
-
-	// The size of the graph which occupies the main GPU
-	Graph uint64
-
-	// How much VRAM will be allocated given the number of layers we predict
-	VRAMSize uint64
-
-	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
-	TotalSize uint64
-
-	// For multi-GPU scenarios, this provides the tensor split parameter
-	TensorSplit string
-
-	// For multi-GPU scenarios, this is the size in bytes per GPU
-	GPUSizes []uint64
-}
-
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
-	// Graph size for a partial offload, applies to all GPUs
-	var graphPartialOffload uint64
-
-	// Graph size when all layers are offloaded, applies to all GPUs
-	var graphFullOffload uint64
-
-	// Final graph offload once we know full or partial
-	var graphOffload uint64
-
-	// Projectors loaded into GPU0 only
-	var projectorSize uint64
-
-	// Conditional output size on GPU 0
-	var memoryLayerOutput uint64
-
-	// The sizes of a layer
-	var layerSize uint64
-
-	// The sum of all the layer sizes (just for logging)
-	var memoryWeights uint64
-
-	// True if all the layers are loaded
-	var fullyLoaded bool
-
-	// Overflow that didn't fit into the GPU
-	var overflow uint64
-
-	availableList := make([]string, len(gpus))
-	for i, gpu := range gpus {
-		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
+	var memoryAvailable uint64
+	for _, info := range gpus {
+		memoryAvailable += info.FreeMemory
 	}
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
+	}
+
+	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+
+	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
+	memoryMinimum := gpus[0].MinimumMemory

 	for _, projector := range projectors {
-		projectorSize += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -97,160 +56,79 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.size()
-	} else {
-		slog.Warn("model missing blk.0 layer size")
+		memoryMinimum += blk0.size()
 	}

 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()

-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
-	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
+
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}

+	graphFullOffload *= uint64(len(gpus))
+	graphPartialOffload *= uint64(len(gpus))
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
-	} else if len(gpus) > 1 {
-		// multigpu should always use the partial graph size
-		graphFullOffload = graphPartialOffload
 	}

+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
+
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+
+	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}
+
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
 		memoryLayerOutput += layer.size()
 	}

-	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorSize
+	if gpus[0].Library == "metal" && opts.UseMMap {
+		// memory is preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
+		memoryRequiredPartial += memoryLayerOutput
+	}

-	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
-	layerCounts := make([]int, len(gpus))
-	gpuAllocations := make([]uint64, len(gpus))
-	type gs struct {
-		i int
-		g *gpu.GpuInfo
-	}
-	gpusWithSpace := []gs{}
-	for i := range gpus {
-		var gzo uint64
-		if len(gpusWithSpace) == 0 {
-			gzo = gpuZeroOverhead
-		}
-		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
-			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
-			continue
-		}
-		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
-	}
-
-	var gpuZeroID int
-	if len(gpusWithSpace) > 0 {
-		gpuZeroID = gpusWithSpace[0].i
-		gpuAllocations[gpuZeroID] += gpuZeroOverhead
-	}
-
-	// For all the layers, find where they can fit on the GPU(s)
 	for i := range int(ggml.KV().BlockCount()) {
-		memoryWeights += layerSize
+		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+			memoryLayer := blk.size()

-		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
-			// Stop allocating on GPU(s) once we hit the users target NumGPU
-			continue
-		}
+			// KV is proportional to the number of layers
+			memoryLayer += kv / ggml.KV().BlockCount()

-		// distribute the layers across the GPU(s) that have space
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[i%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
-				gpuAllocations[g.i] += layerSize
-				layerCounts[g.i]++
+			memoryRequiredTotal += memoryLayer
+			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+				memoryRequiredPartial += memoryLayer
 				layerCount++
-				break
-			} else {
-				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
 	}
-	if layerCount >= int(ggml.KV().BlockCount()) {
-		fullyLoaded = true
-	} else {
-		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
-			overflow += layerSize
-		}
+
+	if gpus[0].Library != "metal" || !opts.UseMMap {
+		// memory was not preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
 	}

-	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[layerCount%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
-				gpuAllocations[g.i] += memoryLayerOutput
-				layerCounts[g.i]++
-				layerCount++
-				break
-			}
-		}
-
-		if layerCount < int(ggml.KV().BlockCount())+1 {
-			fullyLoaded = false
-			overflow += memoryLayerOutput
-		}
+	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
 	}

-	// Add the applicable (full or partial) graph allocations
-	for i := range gpus {
-		if layerCounts[i] <= 0 {
-			continue
-		}
-		if fullyLoaded {
-			gpuAllocations[i] += graphFullOffload
-		} else {
-			gpuAllocations[i] += graphPartialOffload
-		}
-	}
-	if fullyLoaded {
-		graphOffload = graphFullOffload
-	} else {
-		graphOffload = graphPartialOffload
-	}
-
-	// Summaries for the log
-	var memoryRequiredPartial, memoryRequiredTotal uint64
-	for i := range gpuAllocations {
-		memoryRequiredPartial += gpuAllocations[i]
-	}
-	memoryRequiredTotal = memoryRequiredPartial + overflow
-
-	tensorSplit := ""
-	if len(gpus) > 1 {
-		splits := make([]string, len(gpus))
-		for i, count := range layerCounts {
-			splits[i] = strconv.Itoa(count)
-		}
-		tensorSplit = strings.Join(splits, ",")
-	}
-	allocationsList := []string{}
-	for _, a := range gpuAllocations {
-		allocationsList = append(allocationsList, format.HumanBytes2(a))
-	}
+	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv

 	slog.Info(
 		"offload to gpu",
@@ -258,17 +136,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
-			// The number of layers the model has (including output)
-			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"offload", layerCount,
-			// multi-gpu split for tesnors
-			"split", tensorSplit,
+			"real", layerCount,
 		),
 		slog.Group(
 			"memory",
-			// memory available by GPU for offloading
-			"available", availableList,
+			// memory available for offloading
+			"available", format.HumanBytes2(memoryAvailable),
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -277,8 +151,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
-				// Allocations across the GPUs
-				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -299,31 +171,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
+		return 0, 0, memoryRequiredTotal
 	}
-	if layerCount == 0 {
+	if memoryRequiredPartial > memoryAvailable {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
+		return 0, 0, memoryRequiredTotal
 	}

-	return MemoryEstimate{
-		Layers:      layerCount,
-		Graph:       graphOffload,
-		VRAMSize:    memoryRequiredPartial,
-		TotalSize:   memoryRequiredTotal,
-		TensorSplit: tensorSplit,
-		GPUSizes:    gpuAllocations,
-	}
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -1,127 +0,0 @@
-package llm
-
-import (
-	"bytes"
-	"encoding/binary"
-	"fmt"
-	"os"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-func TestEstimateGPULayers(t *testing.T) {
-	envconfig.Debug = true
-	modelName := "dummy"
-	f, err := os.CreateTemp(t.TempDir(), modelName)
-	require.NoError(t, err)
-	defer f.Close()
-	gguf := NewGGUFV3(binary.LittleEndian)
-	inputLayerCount := 5
-	tensors := []Tensor{
-		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-	}
-	assert.Len(t, tensors, inputLayerCount+1)
-	err = gguf.Encode(f, KV{
-		"general.architecture":          "llama",
-		"general.name":                  "name",
-		"llama.context_length":          uint32(32),
-		"llama.embedding_length":        uint32(4096),
-		"llama.block_count":             uint32(inputLayerCount),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(32),
-		"tokenizer.ggml.tokens":         []string{" "},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, tensors)
-	require.NoError(t, err)
-
-	ggml, err := LoadModel(f.Name())
-	require.NoError(t, err)
-
-	// Simple CPU scenario
-	gpus := []gpu.GpuInfo{
-		{
-			Library: "cpu",
-		},
-	}
-	projectors := []string{}
-	opts := api.DefaultOptions()
-	t.Run("cpu", func(t *testing.T) {
-		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-		assert.Equal(t, 0, estimate.Layers)
-		assert.Equal(t, uint64(0), estimate.Graph)
-	})
-
-	// derived from the dummy ggml file above
-	graphPartialOffload := uint64(202377216)
-	graphFullOffload := uint64(171968512)
-	layerSize := uint64(33554436)
-	projectorSize := uint64(0)
-	memoryLayerOutput := uint64(4)
-
-	// Dual CUDA scenario with assymetry
-	gpuMinimumMemory := uint64(2048)
-	gpus = []gpu.GpuInfo{
-		{
-			Library:       "cuda",
-			MinimumMemory: gpuMinimumMemory,
-		},
-		{
-			Library:       "cuda",
-			MinimumMemory: gpuMinimumMemory,
-		},
-	}
-	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
-	for i, s := range []struct {
-		layer0, layer1   uint64
-		expect0, expect1 uint64
-	}{
-		{1, 1, 1, 1},
-		{2, 1, 2, 1},
-		{2, 2, 2, 2},
-		{1, 2, 1, 2},
-		{3, 3, 3, 3},
-		{4, 4, 3, 3},
-		{6, 6, 3, 3},
-		{0, 3, 0, 3},
-	} {
-		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
-			gpus[0].FreeMemory = 0
-			gpus[1].FreeMemory = 0
-			gpus[0].FreeMemory += projectorSize
-			if s.layer0 > 0 {
-				gpus[0].FreeMemory += memoryLayerOutput
-			} else {
-				gpus[1].FreeMemory += memoryLayerOutput
-			}
-			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
-			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
-			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
-			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
-			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
-			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
-			var layerSums uint64
-			for _, b := range estimate.GPUSizes {
-				layerSums += b
-			}
-			if estimate.Layers < inputLayerCount+1 {
-				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-			} else {
-				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-			}
-		})
-	}
-}
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := availableServers()
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone {
-		requested += "_" + info.Variant.String()
+	if info.Variant != "" {
+		requested += "_" + info.Variant
 	}

 	servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {

 	// Load up the best CPU variant if not primary requested
 	if info.Library != "cpu" {
-		variant := gpu.GetCPUCapability()
+		variant := gpu.GetCPUVariant()
 		// If no variant, then we fall back to default
 		// If we have a variant, try that if we find an exact match
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
-		if variant != gpu.CPUCapabilityNone {
+		if variant != "" {
 			for cmp := range availableServers {
-				if cmp == "cpu_"+variant.String() {
+				if cmp == "cpu_"+variant {
 					servers = append(servers, cmp)
 					break
 				}
@@ -146,11 +146,11 @@ func serverForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUCapability()
+	variant := gpu.GetCPUVariant()
 	availableServers := availableServers()
-	if variant != gpu.CPUCapabilityNone {
+	if variant != "" {
 		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
+			if cmp == "cpu_"+variant {
 				return cmp
 			}
 		}
--- a/llm/server.go
+++ b/llm/server.go
@@ -37,9 +37,8 @@ type LlamaServer interface {
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
-	EstimatedVRAM() uint64 // Total VRAM across all GPUs
+	EstimatedVRAM() uint64
 	EstimatedTotal() uint64
-	EstimatedVRAMByGPU(gpuID string) uint64
 }

 // llmServer is an instance of the llama.cpp server
@@ -50,12 +49,13 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options

-	estimate    MemoryEstimate
-	totalLayers uint64
-	// gpuCount     int
-	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-	loadDuration time.Duration   // Record how long it took the model to load
-	loadProgress float32
+	// TODO - this should be broken down by GPU
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
+	loadDuration   time.Duration // Record how long it took the model to load
+	loadProgress   float32

 	sem *semaphore.Weighted
 }
@@ -80,16 +80,16 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
-	var estimate MemoryEstimate
+	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
+	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
+		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner

-	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
-	if opts.NumGPU == 0 {
-		gpus = gpu.GetCPUInfo()
-	}
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = serverForCpu()
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		gpuCount = 0
+		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,19 +100,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
 			}
 		}
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		var layers int
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)

 		switch {
-		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
+		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		case gpus[0].Library != "metal" && estimate.Layers == 0:
+		case gpus[0].Library != "metal" && layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
-			gpus = gpu.GetCPUInfo()
-		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
-			opts.NumGPU = estimate.Layers
+			gpuCount = 0
+		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
+			opts.NumGPU = layers
 		}
 	}

@@ -231,14 +232,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

-	if estimate.TensorSplit != "" {
-		params = append(params, "--tensor-split", estimate.TensorSplit)
-	}
-
-	if estimate.TensorSplit != "" {
-		params = append(params, "--tensor-split", estimate.TensorSplit)
-	}
-
 	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
@@ -249,7 +242,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		if strings.HasPrefix(servers[i], "cpu") {
-			gpus = gpu.GetCPUInfo()
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
 		}

 		// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race
@@ -305,25 +299,22 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		s := &llmServer{
-			port:        port,
-			cmd:         exec.Command(server, finalParams...),
-			status:      NewStatusWriter(os.Stderr),
-			options:     opts,
-			estimate:    estimate,
-			sem:         semaphore.NewWeighted(int64(numParallel)),
-			totalLayers: ggml.KV().BlockCount() + 1,
-			gpus:        gpus,
-			done:        make(chan error, 1),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
+			done:           make(chan error, 1),
 		}

 		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status

-		envWorkarounds := [][2]string{}
-		for _, gpu := range gpus {
-			envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-		}
 		visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
 		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))

@@ -338,12 +329,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
 				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
 				devicesNeeded = false
-			} else if len(envWorkarounds) != 0 {
-				for _, kv := range envWorkarounds {
-					if strings.EqualFold(cmp[0], kv[0]) {
-						s.cmd.Env[i] = kv[0] + "=" + kv[1]
-					}
-				}
 			}
 		}
 		if pathNeeded {
@@ -1019,20 +1004,11 @@ func (s *llmServer) Close() error {
 }

 func (s *llmServer) EstimatedVRAM() uint64 {
-	return s.estimate.VRAMSize
+	return s.estimatedVRAM
 }

 func (s *llmServer) EstimatedTotal() uint64 {
-	return s.estimate.TotalSize
-}
-
-func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
-	for i, gpu := range s.gpus {
-		if gpu.ID == gpuID {
-			return s.estimate.GPUSizes[i]
-		}
-	}
-	return 0
+	return s.estimatedTotal
 }

 func parseDurationMs(ms float64) time.Duration {
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -178,6 +178,9 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {

 	if r.Seed != nil {
 		options["seed"] = *r.Seed
+
+		// temperature=0 is required for reproducible outputs
+		options["temperature"] = 0.0
 	}

 	if r.FrequencyPenalty != nil {
@@ -208,6 +211,29 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
 	}
 }

+type DeleteCompletion struct {
+	Id      string `json:"id"`
+	Object  string `json:"object"`
+	Deleted bool   `json:"deleted"`
+}
+
+func toDeleteCompletion(model string) DeleteCompletion {
+	return DeleteCompletion{
+		Id:      model,
+		Object:  "model",
+		Deleted: true,
+	}
+}
+
+type BaseWriter struct {
+	gin.ResponseWriter
+}
+
+type DeleteWriter struct {
+	BaseWriter
+	model string
+}
+
 type writer struct {
 	stream bool
 	id     string
@@ -312,3 +338,69 @@ func Middleware() gin.HandlerFunc {
 		c.Next()
 	}
 }
+
+func (w *DeleteWriter) writeError(code int, data []byte) (int, error) {
+	var serr api.StatusError
+	err := json.Unmarshal(data, &serr)
+	if err != nil {
+		return 0, err
+	}
+
+	w.ResponseWriter.Header().Set("Content-Type", "application/json")
+	err = json.NewEncoder(w.ResponseWriter).Encode(NewError(http.StatusInternalServerError, serr.Error()))
+	if err != nil {
+		return 0, err
+	}
+
+	return len(data), nil
+}
+
+func (w *DeleteWriter) writeResponse(data []byte) (int, error) {
+	// delete completion
+	w.ResponseWriter.Header().Set("Content-Type", "application/json")
+	err := json.NewEncoder(w.ResponseWriter).Encode(toDeleteCompletion(w.model))
+	if err != nil {
+		return 0, err
+	}
+
+	return len(data), nil
+}
+
+func (w *DeleteWriter) Write(data []byte) (int, error) {
+	code := w.ResponseWriter.Status()
+	if code != http.StatusOK {
+		return w.writeError(code, data)
+	}
+
+	return w.writeResponse(data)
+}
+
+func DeleteMiddleware() gin.HandlerFunc {
+	return func(c *gin.Context) {
+		var b bytes.Buffer
+		if err := json.NewEncoder(&b).Encode(api.DeleteRequest{Model: c.Param("model")}); err != nil {
+			c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
+			return
+		}
+
+		c.Request.Body = io.NopCloser(&b)
+
+		// response writer
+		w := &DeleteWriter{
+			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
+			model:      c.Param("model"),
+		}
+
+		c.Writer = w
+
+		c.Next()
+
+		// If the status code is OK, write the DeleteCompletion response
+		if c.Writer.Status() == http.StatusOK {
+			_, err := w.writeResponse(nil)
+			if err != nil {
+				c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
+			}
+		}
+	}
+}
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -159,8 +159,8 @@ check_gpu() {
            esac ;;
        lshw)
            case $2 in
-                nvidia) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
-                amdgpu) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[1002\]' || return 1 ;;
+                nvidia) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
+                amdgpu) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[1002\]' || return 1 ;;
            esac ;;
        nvidia-smi) available nvidia-smi || return 1 ;;
    esac
--- a/server/routes.go
+++ b/server/routes.go
@@ -646,12 +646,9 @@ func (s *Server) ShowModelHandler(c *gin.Context) {

 	resp, err := GetModelInfo(req)
 	if err != nil {
-		switch {
-		case os.IsNotExist(err):
+		if os.IsNotExist(err) {
 			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
-		case err.Error() == "invalid model name":
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		default:
+		} else {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		}
 		return
@@ -661,55 +658,44 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
 }

 func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
-	m, err := GetModel(req.Model)
+	model, err := GetModel(req.Model)
 	if err != nil {
 		return nil, err
 	}

 	modelDetails := api.ModelDetails{
-		ParentModel:       m.ParentModel,
-		Format:            m.Config.ModelFormat,
-		Family:            m.Config.ModelFamily,
-		Families:          m.Config.ModelFamilies,
-		ParameterSize:     m.Config.ModelType,
-		QuantizationLevel: m.Config.FileType,
+		ParentModel:       model.ParentModel,
+		Format:            model.Config.ModelFormat,
+		Family:            model.Config.ModelFamily,
+		Families:          model.Config.ModelFamilies,
+		ParameterSize:     model.Config.ModelType,
+		QuantizationLevel: model.Config.FileType,
 	}

 	if req.System != "" {
-		m.System = req.System
+		model.System = req.System
 	}

 	if req.Template != "" {
-		m.Template = req.Template
+		model.Template = req.Template
 	}

 	msgs := make([]api.Message, 0)
-	for _, msg := range m.Messages {
+	for _, msg := range model.Messages {
 		msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content})
 	}

-	n := model.ParseName(req.Model)
-	if !n.IsValid() {
-		return nil, fmt.Errorf("invalid model name")
-	}
-
-	manifest, err := ParseNamedManifest(n)
-	if err != nil {
-		return nil, err
-	}
-
 	resp := &api.ShowResponse{
-		License:    strings.Join(m.License, "\n"),
-		System:     m.System,
-		Template:   m.Template,
-		Details:    modelDetails,
-		Messages:   msgs,
-		ModifiedAt: manifest.fi.ModTime(),
+		License:  strings.Join(model.License, "\n"),
+		System:   model.System,
+		Template: model.Template,
+		Details:  modelDetails,
+		Messages: msgs,
 	}

 	var params []string
 	cs := 30
-	for k, v := range m.Options {
+	for k, v := range model.Options {
 		switch val := v.(type) {
 		case []interface{}:
 			for _, nv := range val {
@@ -723,15 +709,15 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {

 	for k, v := range req.Options {
 		if _, ok := req.Options[k]; ok {
-			m.Options[k] = v
+			model.Options[k] = v
 		}
 	}

 	var sb strings.Builder
 	fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
 	fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
-	fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName)
-	fmt.Fprint(&sb, m.String())
+	fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
+	fmt.Fprint(&sb, model.String())
 	resp.Modelfile = sb.String()

 	return resp, nil
@@ -1001,6 +987,7 @@ func (s *Server) GenerateRoutes() http.Handler {

 	// Compatibility endpoints
 	r.POST("/v1/chat/completions", openai.Middleware(), s.ChatHandler)
+	r.DELETE("/v1/models/:model", openai.DeleteMiddleware(), s.DeleteModelHandler)

 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
--- a/server/sched.go
+++ b/server/sched.go
@@ -7,6 +7,7 @@ import (
 	"log/slog"
 	"reflect"
 	"runtime"
+	"slices"
 	"sort"
 	"strings"
 	"sync"
@@ -26,7 +27,6 @@ type LlmRequest struct {
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
-	schedAttempts   uint
 }

 type Scheduler struct {
@@ -38,11 +38,9 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
-	reschedDelay time.Duration
+	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
+	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
+	getGpuFn    func() gpu.GpuInfoList
 }

 var ErrMaxQueue = fmt.Errorf("server busy, please try again.  maximum pending requests exceeded")
@@ -56,8 +54,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
-		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
 	return sched
@@ -109,7 +105,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			return
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
-			pending.schedAttempts++

 			if pending.ctx.Err() != nil {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
@@ -136,12 +131,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					var gpus gpu.GpuInfoList
-					if pending.opts.NumGPU == 0 {
-						gpus = s.getCpuFn()
-					} else {
-						gpus = s.getGpuFn()
-					}
+					gpus := s.getGpuFn()

 					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -150,22 +140,16 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

-					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
-					if len(gpus) == 1 && gpus[0].Library == "cpu" {
-						if loadedCount == 0 {
-							slog.Debug("cpu mode with first model, loading")
-							s.loadFn(pending, ggml, gpus)
-							break
-						}
-						runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
-						if runnerToExpire == nil {
-							slog.Debug("cpu mode with available system memory or first model, loading")
-							s.loadFn(pending, ggml, gpus)
-							break
-						}
-						// else we need to expire a runner
-					} else if loadedCount == 0 {
-						// No models loaded. Load the model but prefer the best fit.
+					// If we're CPU only mode, just limit by envconfig.MaxRunners above
+					// TODO handle system memory exhaustion
+					if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
+						slog.Debug("cpu mode with existing models, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+
+					// No models loaded. Load the model but prefer the best fit.
+					if loadedCount == 0 {
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
 						g := pickBestFitGPUs(pending, ggml, gpus)
 						if g != nil {
@@ -175,44 +159,16 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

-					if runnerToExpire == nil {
-						// More than one loaded model, so we have to see if the
-						// new one fits
-						//
-						// We want to avoid loading on any GPUs that have other
-						// models still loading on them to avoid potential races
-						// with VRAM consumption ramping up during load
-						availGpus := s.filterGPUsWithoutLoadingModels(gpus)
-
-						// Update free memory from currently loaded models
-						s.updateFreeSpace(availGpus)
-						fitGpus := pickBestFitGPUs(pending, ggml, availGpus)
-						if fitGpus != nil {
-							slog.Debug("new model fits with existing models, loading")
-							s.loadFn(pending, ggml, fitGpus)
-							break
-						}
-
-						// We couldn't find a set of GPUs to fully load the new
-						// model. If no other models are loading (both GPU lists
-						// are the same) then we need to unload another model to
-						// make room
-						if len(availGpus) < len(gpus) {
-							// There are other requests pending, and this one
-							// needs more time, so put it on the back of the
-							// queue so that we might satisfy other pending
-							// requests that aren't blocked
-							go func() {
-								// Process in a go routine to avoid deadlocking
-								// the scheduler if our queue is full
-								slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
-								time.Sleep(s.reschedDelay)
-								s.pendingReqCh <- pending
-							}()
-							break
-						}
-						runnerToExpire = s.findRunnerToUnload()
+					// More than one loaded model, so we have to see if the new one fits
+					// Update free memory from currently loaded models
+					s.updateFreeSpace(gpus)
+					gpus = pickBestFitGPUs(pending, ggml, gpus)
+					if gpus != nil {
+						slog.Debug("new model fits with existing models, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
 					}
+					runnerToExpire = s.findRunnerToUnload()
 				}

 				if runnerToExpire == nil {
@@ -412,9 +368,17 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 	s.loadedMu.Lock()
 	for _, r := range s.loaded {
 		r.refMu.Lock()
+		gpuIDs := make([]string, 0, len(r.gpus))
 		if r.llama != nil {
+			// TODO this should be broken down by GPU instead of assuming uniform spread
+			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
+			for _, gpu := range r.gpus {
+				gpuIDs = append(gpuIDs, gpu.ID)
+			}
 			for _, gpu := range allGpus {
-				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
+				if slices.Contains(gpuIDs, gpu.ID) {
+					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
+				}
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -437,36 +401,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 				// after we start our first runner, then we'll never acount for that, so picking the smallest free value seems prudent.
 				allGpus[i].FreeMemory = allGpus[i].TotalMemory - p
 			}
-			slog.Info("updated VRAM based on existing loaded models", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
+			slog.Info("updated VRAM", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
 		}
 	}
 }

-// While models are loading the VRAM consumption numbers will be indeterminate, so we have
-// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
-// This routine returns the set of GPUs that do not have an active loading model.
-// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
-	s.loadedMu.Lock()
-	defer s.loadedMu.Unlock()
-	for _, runner := range s.loaded {
-		if runner.loading {
-			slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
-			for _, busyGPU := range runner.gpus {
-				for i := range ret {
-					if ret[i].ID == busyGPU.ID {
-						ret = append(ret[:i], ret[i+1:]...)
-						break
-					}
-				}
-			}
-		}
-	}
-	return ret
-}
-
-// TODO consolidate sched_types.go
 type runnerRef struct {
 	refMu sync.Mutex
 	// refCond   sync.Cond // Signaled on transition from 1 -> 0 refCount
@@ -548,11 +487,8 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	finished := make(chan interface{}, 1)

-	// CPU or Metal don't need checking, so no waiting required
-	// windows can page VRAM, only cuda currently can report accurate used vram usage
-	if len(runner.gpus) == 0 ||
-		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
-		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
+	// CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
+	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
 		finished <- struct{}{}
 		return finished
 	}
@@ -572,7 +508,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 		for {
 			<-ticker.C
 			if time.Now().After(expiresAt) {
-				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "model", runner.modelPath)
+				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
 				finished <- struct{}{}
 			}

@@ -585,7 +521,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 			// If we're within ~80% of the estimated memory usage recovered, bail out
 			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
-				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "model", runner.modelPath)
+				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
 				finished <- struct{}{}
 				return
 			}
@@ -622,12 +558,10 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))

 		// First attempt to fit the model into a single GPU
-		if !envconfig.SchedSpread {
-			for _, g := range sgl {
-				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-					return []gpu.GpuInfo{g}
-				}
+		for _, g := range sgl {
+			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+				return []gpu.GpuInfo{g}
 			}
 		}

@@ -652,10 +586,6 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
 		runnerList = append(runnerList, r)
 	}
 	s.loadedMu.Unlock()
-	if len(runnerList) == 0 {
-		slog.Debug("no loaded runner to unload")
-		return nil
-	}

 	// In the future we can enhance the algorithm to be smarter about picking the optimal runner to unload
 	// e.g., if we have multiple options, will one make room for the request?
@@ -686,18 +616,3 @@ func (s *Scheduler) unloadAllRunners() {
 		}
 	}
 }
-
-// If other runners are loaded, make sure the pending request will fit in system memory
-// If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
-	slog.Debug("evaluating if CPU model load will fit in available system memory")
-	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
-	if estimate.TotalSize <= gpus[0].FreeMemory {
-		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
-		return nil
-	}
-
-	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
-
-	return s.findRunnerToUnload()
-}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")

-	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
+	server := &mockLlm{estimatedVRAM: 10}
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
 		return server, nil
 	}
@@ -129,7 +129,6 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.token_type":     []int32{0},
 	}, []llm.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	})
 	require.NoError(t, err)

@@ -146,17 +145,17 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
+	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
 	return scenario
 }

 func TestRequests(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
+	ctx, done := context.WithTimeout(context.Background(), time.Second)
 	defer done()

 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 5 * time.Millisecond
+	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
@@ -167,7 +166,6 @@ func TestRequests(t *testing.T) {
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
-	scenario2a.req.sessionDuration = 5 * time.Millisecond

 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -183,12 +181,6 @@ func TestRequests(t *testing.T) {
 		g.FreeMemory = 12 * format.GigaByte
 		return []gpu.GpuInfo{g}
 	}
-	s.getCpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "cpu"}
-		g.TotalMemory = 32 * format.GigaByte
-		g.FreeMemory = 26 * format.GigaByte
-		return []gpu.GpuInfo{g}
-	}
 	s.newServerFn = scenario1a.newServer
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
@@ -317,6 +309,7 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()

+	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
 	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
@@ -426,7 +419,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: 2,
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -459,8 +452,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
-	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
+	llm1 := &mockLlm{estimatedVRAM: 100}
+	llm2 := &mockLlm{estimatedVRAM: 200}
 	r1 := &runnerRef{llama: llm1, gpus: gpus}
 	r2 := &runnerRef{llama: llm2, gpus: gpus}

@@ -471,42 +464,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	s.loadedMu.Unlock()

 	s.updateFreeSpace(gpus)
-	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
-	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
-}
-
-func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
-	defer done()
-	gpus := gpu.GpuInfoList{
-		{
-			Library: "cuda",
-			ID:      "0",
-		},
-		{
-			Library: "cuda",
-			ID:      "1",
-		},
-	}
-	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
-
-	s := InitScheduler(ctx)
-	s.loadedMu.Lock()
-	s.loaded["a"] = r1
-	s.loadedMu.Unlock()
-
-	tmp := s.filterGPUsWithoutLoadingModels(gpus)
-	require.Len(t, tmp, 1)
-	require.Equal(t, "1", tmp[0].ID)
-
-	r1.gpus = gpu.GpuInfoList{gpus[1]}
-	tmp = s.filterGPUsWithoutLoadingModels(gpus)
-	require.Len(t, tmp, 1)
-	require.Equal(t, "0", tmp[0].ID)
-
-	r1.gpus = gpu.GpuInfoList{}
-	tmp = s.filterGPUsWithoutLoadingModels(gpus)
-	require.Len(t, tmp, 2)
+	require.Equal(t, uint64(850), gpus[0].FreeMemory)
+	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
 }

 func TestFindRunnerToUnload(t *testing.T) {
@@ -533,7 +492,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()

-	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm := &mockLlm{}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model:   &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@@ -576,8 +535,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()

-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
-	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{}
+	llm2 := &mockLlm{}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()

@@ -595,7 +554,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }

 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{}
 	r1 := &runnerRef{llama: llm1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
 	r1.unload()
@@ -605,20 +564,19 @@ func TestUnload(t *testing.T) {
 }

 type mockLlm struct {
-	pingResp           error
-	waitResp           error
-	completionResp     error
-	embeddingResp      []float64
-	embeddingRespErr   error
-	tokenizeResp       []int
-	tokenizeRespErr    error
-	detokenizeResp     string
-	detonekizeRespErr  error
-	closeResp          error
-	closeCalled        bool
-	estimatedVRAM      uint64
-	estimatedTotal     uint64
-	estimatedVRAMByGPU map[string]uint64
+	pingResp          error
+	waitResp          error
+	completionResp    error
+	embeddingResp     []float64
+	embeddingRespErr  error
+	tokenizeResp      []int
+	tokenizeRespErr   error
+	detokenizeResp    string
+	detonekizeRespErr error
+	closeResp         error
+	closeCalled       bool
+	estimatedVRAM     uint64
+	estimatedTotal    uint64
 }

 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
@@ -639,6 +597,5 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
-func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
-func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
-func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
+func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }