update tests

drop float16 dependency
goos: darwin
goarch: arm64
pkg: github.com/ollama/ollama/convert/float16
cpu: Apple M3 Max
BenchmarkFloat16/x448/float16-16    159    7398462 ns/op
BenchmarkFloat16/simple-16          512    2327098 ns/op
PASS
ok  github.com/ollama/ollama/convert/float16  2.553s
2025-08-14 15:04:26 -07:00 · 2025-08-14 15:04:26 -07:00 · 2025-08-14 15:04:26 -07:00
50 changed files with 1450 additions and 2784 deletions
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -22,7 +22,7 @@
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
+        "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;120-virtual",
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
@@ -30,14 +30,14 @@
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "72;87"
+        "CMAKE_CUDA_ARCHITECTURES": "72-virtual;87-virtual"
      }
    },
    {
      "name": "JetPack 6",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "87"
+        "CMAKE_CUDA_ARCHITECTURES": "87-virtual"
      }
    },
    {
--- a/2
+++ b/2
@@ -86,8 +86,6 @@ RUN go mod download
 COPY . .
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
-ARG CGO_CFLAGS
-ARG CGO_CXXFLAGS
 RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

--- a/README.md
+++ b/README.md
@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
 - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)

 ### Cloud

@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
 - [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))

 ### Mobile

--- a/api/types.go
+++ b/api/types.go
@@ -90,10 +90,6 @@ type GenerateRequest struct {
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
 	Think *ThinkValue `json:"think,omitempty"`
-
-	// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-	// template instead of calling the model.
-	DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
 	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
 	// for supported models.
 	Think *ThinkValue `json:"think,omitempty"`
-
-	// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-	// template instead of calling the model.
-	DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
 }

 type Tools []Tool
@@ -316,19 +308,6 @@ type ChatResponse struct {
 	Metrics
 }

-// DebugInfo contains debug information for template rendering
-type DebugInfo struct {
-	RenderedTemplate string `json:"rendered_template"`
-	ImageCount       int    `json:"image_count,omitempty"`
-}
-
-// DebugTemplateResponse is returned when _debug_render_only is set to true
-type DebugTemplateResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	DebugInfo DebugInfo `json:"_debug_info"`
-}
-
 type Metrics struct {
 	TotalDuration      time.Duration `json:"total_duration,omitempty"`
 	LoadDuration       time.Duration `json:"load_duration,omitempty"`
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1612,7 +1612,6 @@ func NewCLI() *cobra.Command {
 			appendEnvDocs(cmd, []envconfig.EnvVar{
 				envVars["OLLAMA_DEBUG"],
 				envVars["OLLAMA_HOST"],
-				envVars["OLLAMA_CONTEXT_LENGTH"],
 				envVars["OLLAMA_KEEP_ALIVE"],
 				envVars["OLLAMA_MAX_LOADED_MODELS"],
 				envVars["OLLAMA_MAX_QUEUE"],
--- a/convert/bfloat16/bfloat16.go
+++ b/convert/bfloat16/bfloat16.go
@@ -0,0 +1,34 @@
+// Package bfloat16 converts between float32 values and bfloat16 values stored
+// as raw uint16 bit patterns.
+package bfloat16
+
+import "math"
+
+// FromFloat32s converts a slice of float32 values to a slice of bfloat16 values, represented as uint16s.
+//
+// Each bfloat16 is the upper 16 bits of the float32 bit pattern; the lower 16
+// mantissa bits are truncated rather than rounded. NaN inputs always produce a
+// NaN.
+func FromFloat32s(f32s []float32) (u16s []uint16) {
+	u16s = make([]uint16, len(f32s))
+	for i := range f32s {
+		u16 := uint16(math.Float32bits(f32s[i]) >> 16)
+		if math.IsNaN(float64(f32s[i])) && u16&0x007F == 0 {
+			// The NaN payload lived only in the discarded low bits: keep a
+			// mantissa bit set so the result does not read back as infinity.
+			u16 |= 0x0001
+		}
+		u16s[i] = u16
+	}
+	return u16s
+}
+
+// Float32s converts a slice of bfloat16 values, represented as uint16s, back to a slice of float32 values.
+// Each bfloat16 becomes the upper 16 bits of the float32; the lower 16 bits are zero.
+func Float32s(u16s []uint16) (f32s []float32) {
+	f32s = make([]float32, len(u16s))
+	for i := range u16s {
+		f32s[i] = math.Float32frombits(uint32(u16s[i]) << 16)
+	}
+	return f32s
+}
--- a/convert/bfloat16/bfloat16_test.go
+++ b/convert/bfloat16/bfloat16_test.go
@@ -0,0 +1,82 @@
+package bfloat16
+
+import (
+	"math"
+	"math/rand/v2"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestBfloat16(t *testing.T) {
+	cases := []struct {
+		name  string
+		input uint16
+		want  uint32
+	}{
+		// Zero cases
+		{"positive zero", 0x0000, 0x0},
+		{"negative zero", 0x8000, 0x80000000},
+
+		// Normal numbers
+		{"one", 0x3F80, 0x3F800000},
+		{"negative one", 0xBF80, 0xBF800000},
+		{"two", 0x4000, 0x40000000},
+		{"half", 0x3F00, 0x3F000000},
+		{"quarter", 0x3E80, 0x3E800000},
+		{"max finite", 0x7F7F, 0x7F7F0000},
+		{"min positive normal", 0x0080, 0x00800000},
+
+		// Infinity cases
+		{"positive infinity", 0x7F80, 0x7F800000},
+		{"negative infinity", 0xFF80, 0xFF800000},
+
+		// NaN cases
+		{"NaN", 0x7FC0, 0x7FC00000},
+		{"NaN with payload", 0x7FC1, 0x7FC10000},
+
+		// Subnormal cases
+		{"min positive subnormal", 0x0001, 0x00010000},
+		{"max subnormal", 0x007F, 0x007F0000},
+
+		// Powers of 2
+		{"2^10", 0x4480, 0x44800000},
+		{"2^-10", 0x3A80, 0x3A800000},
+		{"2^20", 0x4B80, 0x4B800000},
+
+		// Common approximations in BF16
+		{"pi approximation", 0x4049, 0x40490000},
+		{"e approximation", 0x402E, 0x402E0000},
+		{"sqrt(2) approximation", 0x3FB5, 0x3FB50000},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			t.Run("Float32s", func(t *testing.T) {
+				got := Float32s([]uint16{tt.input})[0]
+				if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
+					t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
+				}
+			})
+
+			t.Run("FromFloat32s", func(t *testing.T) {
+				got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
+				if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
+					t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
+				}
+			})
+		})
+	}
+}
+
+func BenchmarkBfloat16(b *testing.B) {
+	f32s := make([]float32, 1_000_000)
+	for i := range f32s {
+		f32s[i] = rand.Float32()
+	}
+	for b.Loop() {
+		Float32s(FromFloat32s(f32s))
+	}
+}
--- a/convert/float16/float16.go
+++ b/convert/float16/float16.go
@@ -0,0 +1,115 @@
+// Package float16 converts between float32 values and IEEE 754 half-precision
+// (binary16) values stored as raw uint16 bit patterns.
+package float16
+
+import (
+	"math"
+)
+
+// FromFloat32s converts a slice of float32 values to a slice of float16 values,
+// represented as uint16s.
+//
+// Mantissa bits that do not fit in float16 are truncated rather than rounded.
+// Magnitudes too large for float16 become infinity, magnitudes too small become
+// subnormals or signed zero, and NaN inputs always produce a NaN.
+func FromFloat32s(f32s []float32) (u16s []uint16) {
+	u16s = make([]uint16, len(f32s))
+	for i := range f32s {
+		bits := math.Float32bits(f32s[i])
+		sign := (bits >> 31) & 0x1
+		exponent := (bits >> 23) & 0xFF
+		mantissa := bits & 0x7FFFFF
+		if exponent == 0xFF {
+			if mantissa == 0 {
+				// Infinity
+				u16s[i] = uint16((sign << 15) | 0x7C00)
+			} else {
+				// NaN - keep the top 10 payload bits, but never let the payload
+				// truncate to zero: exponent 0x1F with a zero mantissa is infinity.
+				payload := mantissa >> 13
+				if payload == 0 {
+					payload = 1
+				}
+				u16s[i] = uint16((sign << 15) | 0x7C00 | payload)
+			}
+		} else if exponent == 0 && mantissa == 0 {
+			// Zero
+			u16s[i] = uint16(sign << 15)
+		} else {
+			// Convert exponent from FP32 bias (127) to FP16 bias (15)
+			exponent := int(exponent) - 127 + 15
+			if exponent >= 31 {
+				// Overflow to infinity
+				u16s[i] = uint16((sign << 15) | 0x7C00)
+			} else if exponent <= 0 {
+				// Underflow - create subnormal or zero
+				if exponent < -10 {
+					u16s[i] = uint16(sign << 15) // Zero
+				} else {
+					// Subnormal number: restore the implicit leading 1, then
+					// shift right so the value lines up with the fixed FP16
+					// subnormal exponent
+					mantissa = (mantissa | 0x800000) >> uint(-exponent+1)
+					u16s[i] = uint16((sign << 15) | (mantissa >> 13))
+				}
+			} else {
+				// Normal number - truncate mantissa from 23 to 10 bits
+				u16s[i] = uint16((sign << 15) | (uint32(exponent) << 10) | (mantissa >> 13))
+			}
+		}
+	}
+
+	return u16s
+}
+
+// Float32s converts a slice of float16 values, represented as uint16s, back to
+// a slice of float32 values. Every float16 value is exactly representable as a
+// float32, so no precision is lost.
+func Float32s(u16s []uint16) (f32s []float32) {
+	f32s = make([]float32, len(u16s))
+	for i := range u16s {
+		sign := (u16s[i] >> 15) & 0x1
+		exponent := (u16s[i] >> 10) & 0x1F
+		mantissa := u16s[i] & 0x3FF
+
+		var u32 uint32
+		switch exponent {
+		case 0:
+			if mantissa == 0 {
+				// Zero
+				u32 = uint32(sign) << 31
+			} else {
+				// Subnormal - convert to normal
+				// Find leading 1 bit
+				shift := 0
+				temp := mantissa
+				for temp&0x400 == 0 {
+					temp <<= 1
+					shift++
+				}
+
+				exponent := 127 - 15 + 1 - shift
+				mantissa := (uint32(temp&0x3FF) << 13)
+
+				u32 = (uint32(sign) << 31) | (uint32(exponent) << 23) | mantissa
+			}
+		case 0x1F:
+			if mantissa == 0 {
+				// Infinity
+				u32 = (uint32(sign) << 31) | 0x7F800000
+			} else {
+				// NaN
+				u32 = (uint32(sign) << 31) | 0x7F800000 | (uint32(mantissa) << 13)
+			}
+		default:
+			// Normal number
+			exponent := uint32(exponent) - 15 + 127
+			mantissa := uint32(mantissa) << 13
+
+			u32 = (uint32(sign) << 31) | (exponent << 23) | mantissa
+		}
+
+		f32s[i] = math.Float32frombits(u32)
+	}
+	return f32s
+}
--- a/convert/float16/float16_test.go
+++ b/convert/float16/float16_test.go
@@ -0,0 +1,75 @@
+package float16
+
+import (
+	"math"
+	"math/rand/v2"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestFloat16(t *testing.T) {
+	cases := []struct {
+		name  string
+		input uint16
+		want  uint32
+	}{
+		// Zero cases
+		{"positive zero", 0x0000, 0x0},
+		{"negative zero", 0x8000, 0x80000000},
+
+		// Normal numbers
+		{"one", 0x3C00, 0x3F800000},
+		{"negative one", 0xBC00, 0xBF800000},
+		{"two", 0x4000, 0x40000000},
+		{"half", 0x3800, 0x3F000000},
+		{"max normal", 0x7BFF, 0x477fe000},
+		{"min positive normal", 0x0400, 0x38800000},
+
+		// Infinity cases
+		{"positive infinity", 0x7C00, 0x7F800000},
+		{"negative infinity", 0xFC00, 0xFF800000},
+
+		// NaN cases
+		{"NaN", 0x7C01, 0x7f802000},
+		{"NaN with payload", 0x7E00, 0x7FC00000},
+
+		// Subnormal cases
+		{"min positive subnormal", 0x0001, 0x33800000},
+		{"max subnormal", 0x03FF, 0x387fc000},
+
+		// Common values
+		{"pi approximation", 0x4248, 0x40490000},
+		{"e approximation", 0x416F, 0x402de000},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			t.Run("Float32s", func(t *testing.T) {
+				got := Float32s([]uint16{tt.input})[0]
+				if diff := cmp.Diff(tt.want, math.Float32bits(got)); diff != "" {
+					t.Errorf("Float32s mismatch (-want +got):\n%s", diff)
+				}
+			})
+
+			t.Run("FromFloat32s", func(t *testing.T) {
+				got := FromFloat32s([]float32{math.Float32frombits(tt.want)})
+				if diff := cmp.Diff([]uint16{tt.input}, got); diff != "" {
+					t.Errorf("FromFloat32s mismatch (-want +got):\n%s", diff)
+				}
+			})
+		})
+	}
+}
+
+func BenchmarkFloat16(b *testing.B) {
+	f32s := make([]float32, 1_000_000)
+	for i := range f32s {
+		f32s[i] = rand.Float32()
+	}
+	for b.Loop() {
+		Float32s(FromFloat32s(f32s))
+	}
+}
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -13,8 +13,8 @@ import (
 	"slices"
 	"strings"

-	"github.com/d4l3k/go-bfloat16"
-	"github.com/x448/float16"
+	"github.com/ollama/ollama/convert/bfloat16"
+	"github.com/ollama/ollama/convert/float16"
 )

 type safetensorMetadata struct {
@@ -163,18 +163,16 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 			return 0, err
 		}

-		f32s = make([]float32, len(u16s))
-		for i := range u16s {
-			f32s[i] = float16.Frombits(u16s[i]).Float32()
-		}
+		f32s = float16.Float32s(u16s)

 	case "BF16":
-		u8s := make([]uint8, st.size)
-		if err = binary.Read(br, binary.LittleEndian, u8s); err != nil {
+		u16s := make([]uint16, st.size/2)
+		if err = binary.Read(br, binary.LittleEndian, u16s); err != nil {
 			return 0, err
 		}

-		f32s = bfloat16.DecodeFloat32(u8s)
+		f32s = bfloat16.Float32s(u16s)
+
 	default:
 		return 0, fmt.Errorf("unknown data type: %s", st.dtype)
 	}
@@ -190,15 +188,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	case tensorKindFP32:
 		return 0, binary.Write(w, binary.LittleEndian, f32s)
 	case tensorKindFP16:
-		f16s := make([]uint16, len(f32s))
-		for i := range f32s {
-			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
-		}
-
-		return 0, binary.Write(w, binary.LittleEndian, f16s)
+		return 0, binary.Write(w, binary.LittleEndian, float16.FromFloat32s(f32s))
 	case tensorKindBF16:
-		u8s := bfloat16.EncodeFloat32(f32s)
-		return 0, binary.Write(w, binary.LittleEndian, u8s)
+		return 0, binary.Write(w, binary.LittleEndian, bfloat16.FromFloat32s(f32s))
 	default:
 		return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
 	}
--- a/convert/reader_test.go
+++ b/convert/reader_test.go
@@ -7,9 +7,9 @@ import (
 	"path/filepath"
 	"testing"

-	"github.com/d4l3k/go-bfloat16"
 	"github.com/google/go-cmp/cmp"
-	"github.com/x448/float16"
+	"github.com/ollama/ollama/convert/bfloat16"
+	"github.com/ollama/ollama/convert/float16"
 )

 func TestSafetensors(t *testing.T) {
@@ -21,6 +21,11 @@ func TestSafetensors(t *testing.T) {
 	}
 	defer root.Close()

+	f32s := make([]float32, 32)
+	for i := range f32s {
+		f32s[i] = float32(i)
+	}
+
 	cases := []struct {
 		name,
 		dtype string
@@ -36,11 +41,6 @@ func TestSafetensors(t *testing.T) {
 			size:  32 * 4, // 32 floats, each 4 bytes
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
-				f32s := make([]float32, 32)
-				for i := range f32s {
-					f32s[i] = float32(i)
-				}
-
 				if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
 					t.Fatal(err)
 				}
@@ -62,11 +62,6 @@ func TestSafetensors(t *testing.T) {
 			size:  32 * 4, // 32 floats, each 4 bytes
 			shape: []uint64{16, 2},
 			setup: func(t *testing.T, f *os.File) {
-				f32s := make([]float32, 32)
-				for i := range f32s {
-					f32s[i] = float32(i)
-				}
-
 				if err := binary.Write(f, binary.LittleEndian, f32s); err != nil {
 					t.Fatal(err)
 				}
@@ -84,12 +79,7 @@ func TestSafetensors(t *testing.T) {
 			size:  32 * 2, // 32 floats, each 2 bytes
 			shape: []uint64{16, 2},
 			setup: func(t *testing.T, f *os.File) {
-				u16s := make([]uint16, 32)
-				for i := range u16s {
-					u16s[i] = float16.Fromfloat32(float32(i)).Bits()
-				}
-
-				if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
+				if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
 					t.Fatal(err)
 				}
 			},
@@ -106,12 +96,7 @@ func TestSafetensors(t *testing.T) {
 			size:  32 * 2, // 32 floats, each 2 bytes
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
-				u16s := make([]uint16, 32)
-				for i := range u16s {
-					u16s[i] = float16.Fromfloat32(float32(i)).Bits()
-				}
-
-				if err := binary.Write(f, binary.LittleEndian, u16s); err != nil {
+				if err := binary.Write(f, binary.LittleEndian, float16.FromFloat32s(f32s)); err != nil {
 					t.Fatal(err)
 				}
 			},
@@ -132,12 +117,7 @@ func TestSafetensors(t *testing.T) {
 			size:  32 * 2, // 32 brain floats, each 2 bytes
 			shape: []uint64{16, 2},
 			setup: func(t *testing.T, f *os.File) {
-				f32s := make([]float32, 32)
-				for i := range f32s {
-					f32s[i] = float32(i)
-				}
-
-				if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
+				if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
 					t.Fatal(err)
 				}
 			},
@@ -154,12 +134,7 @@ func TestSafetensors(t *testing.T) {
 			size:  32 * 2, // 32 brain floats, each 2 bytes
 			shape: []uint64{32},
 			setup: func(t *testing.T, f *os.File) {
-				f32s := make([]float32, 32)
-				for i := range f32s {
-					f32s[i] = float32(i)
-				}
-
-				if err := binary.Write(f, binary.LittleEndian, bfloat16.EncodeFloat32(f32s)); err != nil {
+				if err := binary.Write(f, binary.LittleEndian, bfloat16.FromFloat32s(f32s)); err != nil {
 					t.Fatal(err)
 				}
 			},
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -97,7 +97,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		return a < b
 	})
 	gpuCount := 0
-	gpuOrdinalID := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)
 		fp, err := os.Open(match)
@@ -188,6 +187,10 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			continue
 		}

+		// Keep track of numeric IDs based on valid GPUs
+		gpuID := gpuCount
+		gpuCount += 1
+
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
@@ -266,7 +269,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		if uniqueID != 0 {
 			ID = fmt.Sprintf("GPU-%016x", uniqueID)
 		} else {
-			ID = strconv.Itoa(gpuOrdinalID)
+			ID = strconv.Itoa(gpuID)
 		}

 		gpuInfo := RocmGPUInfo{
@@ -284,40 +287,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				DriverMinor:   driverMinor,
 			},
 			usedFilepath: usedFile,
-			index:        gpuCount,
+			index:        gpuID,
 		}

-		// Keep track of numeric IDs based on valid GPUs
-		gpuCount += 1
-
-		// If the user wants to filter to a subset of devices, filter out if we aren't a match
-		if len(visibleDevices) > 0 {
-			include := false
-			for _, visible := range visibleDevices {
-				if (uniqueID != 0 && visible == gpuInfo.ID) || visible == strconv.Itoa(gpuInfo.index) {
-					include = true
-					break
-				}
-			}
-			if !include {
-				reason := "filtering out device per user request"
-				slog.Info(reason, "id", gpuInfo.ID, "index", gpuInfo.index, "visible_devices", visibleDevices)
-				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-					GpuInfo: gpuInfo.GpuInfo,
-					Reason:  reason,
-				})
-
-				continue
-			}
-		}
-
-		// Ordinal IDs are based on the visible GPUs
-		gpuOrdinalID += 1
-
 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
 		if totalMemory < IGPUMemLimit {
 			reason := "unsupported Radeon iGPU detected skipping"
-			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
+			slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 				GpuInfo: gpuInfo.GpuInfo,
 				Reason:  reason,
@@ -330,7 +306,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		}
 		if int(major) < minVer {
 			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
-			slog.Warn(reason, "gpu", gpuInfo.ID)
+			slog.Warn(reason, "gpu", gpuID)
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
 				GpuInfo: gpuInfo.GpuInfo,
 				Reason:  reason,
@@ -339,8 +315,29 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			continue
 		}

-		slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", gpuInfo.ID, "available", format.HumanBytes2(totalMemory-usedMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
+
+		// If the user wants to filter to a subset of devices, filter out if we aren't a match
+		if len(visibleDevices) > 0 {
+			include := false
+			for _, visible := range visibleDevices {
+				if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
+					include = true
+					break
+				}
+			}
+			if !include {
+				reason := "filtering out device per user request"
+				slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
+				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+					GpuInfo: gpuInfo.GpuInfo,
+					Reason:  reason,
+				})
+
+				continue
+			}
+		}

 		// Final validation is gfx compatibility - load the library if we haven't already loaded it
 		// even if the user overrides, we still need to validate the library
--- a/docs/turbo.md
+++ b/docs/turbo.md
@@ -75,7 +75,7 @@ for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
 import { Ollama } from 'ollama';

 const ollama = new Ollama({
-  host: 'https://ollama.com',
+  host: 'https://ollama.com',
  headers: {
 	  Authorization: "Bearer <api key>"
  }
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -185,8 +185,6 @@ var (
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
-	// Enable the new memory estimation logic
-	NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
 )

 func String(s string) func() string {
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
 		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-		"OLLAMA_NEW_ESTIMATES":     {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},

 		// Informational
 		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -480,8 +480,6 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 }

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
-	context *= uint64(numParallel)
-
 	embedding := f.KV().EmbeddingLength()
 	heads := f.KV().HeadCountMax()
 	headsKV := f.KV().HeadCountKVMax()
@@ -752,11 +750,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {

 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
-	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
-		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types ", "mode", arch)
-		return cacheType == "f16"
-	}
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }

--- a/go.mod
+++ b/go.mod
@@ -10,13 +10,11 @@ require (
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
-	github.com/x448/float16 v0.8.4
 	golang.org/x/sync v0.12.0
 )

 require (
 	github.com/agnivade/levenshtein v1.1.1
-	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.7.0
--- a/go.sum
+++ b/go.sum
@@ -35,8 +35,6 @@ github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARu
 github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
-github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
-github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -197,8 +195,6 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
 github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
-github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
-github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
 github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -7,7 +7,6 @@ import (
 	"fmt"
 	"log/slog"
 	"math"
-	"math/rand"
 	"os"
 	"strconv"
 	"sync"
@@ -17,157 +16,245 @@ import (
 	"github.com/stretchr/testify/require"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )

-// Send multiple requests in parallel (concurrently) to a single model and ensure responses are expected
-func TestConcurrentGenerate(t *testing.T) {
-	// Assumes all requests have the same model
-	req, resp := GenerateRequests()
-	numParallel := int(envconfig.NumParallel() + 1)
-	iterLimit := 3
+func TestMultiModelConcurrency(t *testing.T) {
+	var (
+		req = [2]api.GenerateRequest{
+			{
+				Model:     smol,
+				Prompt:    "why is the ocean blue?",
+				Stream:    &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
+			}, {
+				Model:     "qwen3:0.6b",
+				Prompt:    "what is the origin of the us thanksgiving holiday?",
+				Stream:    &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
+			},
+		}
+		resp = [2][]string{
+			{"sunlight"},
+			{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
+		}
+	)
+	var wg sync.WaitGroup
+	wg.Add(len(req))
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
+	defer cancel()

-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for i := 0; i < len(req); i++ {
+		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
+	}
+
+	for i := 0; i < len(req); i++ {
+		go func(i int) {
+			defer wg.Done()
+			// Note: CPU based inference can crawl so don't give up too quickly
+			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
+		}(i)
+	}
+	wg.Wait()
+}
+
+func TestIntegrationConcurrentPredict(t *testing.T) {
+	req, resp := GenerateRequests()
+	reqLimit := len(req)
+	iterLimit := 5
+
+	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
+		maxVram, err := strconv.ParseUint(s, 10, 64)
+		require.NoError(t, err)
+		// Don't hammer on small VRAM cards...
+		if maxVram < 4*format.GibiByte {
+			reqLimit = min(reqLimit, 2)
+			iterLimit = 2
+		}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

 	// Get the server running (if applicable) warm the model up with a single initial request
-	slog.Info("loading", "model", req[0].Model)
-	err := client.Generate(ctx,
-		&api.GenerateRequest{Model: req[0].Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
-		func(response api.GenerateResponse) error { return nil },
-	)
-	if err != nil {
-		t.Fatalf("failed to load model %s: %s", req[0].Model, err)
-	}
+	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)

 	var wg sync.WaitGroup
-	r := rand.New(rand.NewSource(0))
-	wg.Add(numParallel)
-	for i := range numParallel {
+	wg.Add(reqLimit)
+	for i := 0; i < reqLimit; i++ {
 		go func(i int) {
 			defer wg.Done()
 			for j := 0; j < iterLimit; j++ {
-				if time.Now().Sub(started) > softTimeout {
-					slog.Info("exceeded soft timeout, winding down test")
-					return
-				}
-				k := r.Int() % len(req)
-				slog.Info("Starting", "thread", i, "iter", j)
+				slog.Info("Starting", "req", i, "iter", j)
 				// On slower GPUs it can take a while to process the concurrent requests
 				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
+				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
 			}
 		}(i)
 	}
 	wg.Wait()
 }

-// Stress the scheduler and attempt to load more models than will fit to cause thrashing
-// This test will always load at least 2 models even on CPU based systems
+// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
 func TestMultiModelStress(t *testing.T) {
-	s := os.Getenv("OLLAMA_MAX_VRAM")
+	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if s == "" {
-		s = "0"
+		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
 	}

 	maxVram, err := strconv.ParseUint(s, 10, 64)
 	if err != nil {
 		t.Fatal(err)
 	}
-
-	smallModels := []string{
-		"llama3.2:1b",
-		"qwen3:0.6b",
-		"gemma:2b",
-		"deepseek-r1:1.5b",
-		"starcoder2:3b",
-	}
-	mediumModels := []string{
-		"qwen3:8b",
-		"llama2",
-		"deepseek-r1:7b",
-		"mistral",
-		"dolphin-mistral",
-		"gemma:7b",
-		"codellama:7b",
+	if maxVram < 2*format.GibiByte {
+		t.Skip("VRAM less than 2G, skipping model stress tests")
 	}

-	var chosenModels []string
+	type model struct {
+		name string
+		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
+	}
+
+	smallModels := []model{
+		{
+			name: "llama3.2:1b",
+			size: 2876 * format.MebiByte,
+		},
+		{
+			name: "qwen3:0.6b",
+			size: 1600 * format.MebiByte,
+		},
+		{
+			name: "gemma:2b",
+			size: 2364 * format.MebiByte,
+		},
+		{
+			name: "deepseek-r1:1.5b",
+			size: 2048 * format.MebiByte,
+		},
+		{
+			name: "starcoder2:3b",
+			size: 2166 * format.MebiByte,
+		},
+	}
+	mediumModels := []model{
+		{
+			name: "qwen3:8b",
+			size: 6600 * format.MebiByte,
+		},
+		{
+			name: "llama2",
+			size: 5118 * format.MebiByte,
+		},
+		{
+			name: "deepseek-r1:7b",
+			size: 5600 * format.MebiByte,
+		},
+		{
+			name: "mistral",
+			size: 4620 * format.MebiByte,
+		},
+		{
+			name: "dolphin-mistral",
+			size: 4620 * format.MebiByte,
+		},
+		{
+			name: "gemma:7b",
+			size: 5000 * format.MebiByte,
+		},
+		{
+			name: "codellama:7b",
+			size: 5118 * format.MebiByte,
+		},
+	}
+
+	// These seem to be too slow to be useful...
+	// largeModels := []model{
+	// 	{
+	// 		name: "llama2:13b",
+	// 		size: 7400 * format.MebiByte,
+	// 	},
+	// 	{
+	// 		name: "codellama:13b",
+	// 		size: 7400 * format.MebiByte,
+	// 	},
+	// 	{
+	// 		name: "orca-mini:13b",
+	// 		size: 7400 * format.MebiByte,
+	// 	},
+	// 	{
+	// 		name: "gemma:7b",
+	// 		size: 5000 * format.MebiByte,
+	// 	},
+	// 	{
+	// 		name: "starcoder2:15b",
+	// 		size: 9100 * format.MebiByte,
+	// 	},
+	// }
+
+	var chosenModels []model
 	switch {
 	case maxVram < 10000*format.MebiByte:
 		slog.Info("selecting small models")
 		chosenModels = smallModels
+	// case maxVram < 30000*format.MebiByte:
 	default:
 		slog.Info("selecting medium models")
 		chosenModels = mediumModels
+		// default:
+		// 	slog.Info("selecting large models")
+		// 	chosenModels = largeModels
 	}

-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	req, resp := GenerateRequests()
+
+	for i := range req {
+		if i >= len(chosenModels) { // i == len(chosenModels) is already out of range
+			break
+		}
+		req[i].Model = chosenModels[i].name
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

 	// Make sure all the models are pulled before we get started
-	for _, model := range chosenModels {
-		require.NoError(t, PullIfMissing(ctx, client, model))
+	for _, r := range req {
+		require.NoError(t, PullIfMissing(ctx, client, r.Model))
 	}

-	// Determine how many models we can load in parallel before we exceed VRAM
-	// The intent is to go 1 over what can fit so we force the scheduler to thrash
-	targetLoadCount := 0
-	slog.Info("Loading models to find how many can fit in VRAM before overflowing")
-	for i, model := range chosenModels {
-		req := &api.GenerateRequest{Model: model}
-		slog.Info("loading", "model", model)
-		err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
-		if err != nil {
-			t.Fatalf("failed to load model %s: %s", model, err)
-		}
-		targetLoadCount++
-		if i > 0 {
-			models, err := client.ListRunning(ctx)
-			if err != nil {
-				t.Fatalf("failed to list running models: %s", err)
-			}
-			if len(models.Models) < targetLoadCount {
-				loaded := []string{}
-				for _, m := range models.Models {
-					loaded = append(loaded, m.Name)
-				}
-				slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
-				break
-			}
-		}
-	}
-	if targetLoadCount == len(chosenModels) {
-		// TODO consider retrying the medium models
-		slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
-	}
-
-	r := rand.New(rand.NewSource(0))
 	var wg sync.WaitGroup
-	for i := range targetLoadCount {
+	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
+	for i := 0; i < len(req); i++ {
+		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
+		if i > 1 && consumed > maxVram {
+			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
+			break
+		}
+		consumed += chosenModels[i].size
+		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
+
 		wg.Add(1)
 		go func(i int) {
 			defer wg.Done()
-			reqs, resps := GenerateRequests()
 			for j := 0; j < 3; j++ {
-				if time.Now().Sub(started) > softTimeout {
-					slog.Info("exceeded soft timeout, winding down test")
-					return
-				}
-				k := r.Int() % len(reqs)
-				reqs[k].Model = chosenModels[i]
-				slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Prompt)
-				DoGenerate(ctx, t, client, reqs[k], resps[k],
-					120*time.Second, // Be extra patient for the model to load initially
-					10*time.Second,  // Once results start streaming, fail if they stall
-				)
+				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
+				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
 			}
 		}(i)
 	}
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -4,8 +4,6 @@ package integration

 import (
 	"context"
-	"log/slog"
-	"sync"
 	"testing"
 	"time"

@@ -65,51 +63,3 @@ func TestContextExhaustion(t *testing.T) {
 	}
 	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
 }
-
-// Send multiple requests with prior context and ensure the response is coherant and expected
-func TestGenerateWithHistory(t *testing.T) {
-	modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model
-	req, resp := GenerateRequests()
-	numParallel := 2
-	iterLimit := 2
-
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	// Get the server running (if applicable) warm the model up with a single initial request
-	slog.Info("loading", "model", modelOverride)
-	err := client.Generate(ctx,
-		&api.GenerateRequest{Model: modelOverride, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
-		func(response api.GenerateResponse) error { return nil },
-	)
-	if err != nil {
-		t.Fatalf("failed to load model %s: %s", modelOverride, err)
-	}
-
-	var wg sync.WaitGroup
-	wg.Add(numParallel)
-	for i := range numParallel {
-		go func(i int) {
-			defer wg.Done()
-			k := i % len(req)
-			req[k].Model = modelOverride
-			for j := 0; j < iterLimit; j++ {
-				if time.Now().Sub(started) > softTimeout {
-					slog.Info("exceeded soft timeout, winding down test")
-					return
-				}
-				slog.Info("Starting", "thread", i, "iter", j)
-				// On slower GPUs it can take a while to process the concurrent requests
-				// so we allow a much longer initial timeout
-				c := DoGenerate(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
-				req[k].Context = c
-				req[k].Prompt = "tell me more!"
-			}
-		}(i)
-	}
-	wg.Wait()
-
-}
--- a/integration/testdata/embed.json
+++ b/integration/testdata/embed.json
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -472,19 +472,15 @@ func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRe
 	DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
 }

-func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) []int {
+func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) {
 	stallTimer := time.NewTimer(initialTimeout)
 	var buf bytes.Buffer
-	var context []int
 	fn := func(response api.GenerateResponse) error {
 		// fmt.Print(".")
 		buf.Write([]byte(response.Response))
 		if !stallTimer.Reset(streamTimeout) {
 			return errors.New("stall was detected while streaming response, aborting")
 		}
-		if len(response.Context) > 0 {
-			context = response.Context
-		}
 		return nil
 	}

@@ -507,7 +503,7 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 	case <-done:
 		if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
 			slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
-			return context
+			return
 		}
 		require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
 		// Verify the response contains the expected data
@@ -524,7 +520,6 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 	case <-ctx.Done():
 		t.Error("outer test context done while waiting for generate")
 	}
-	return context
 }

 // Generate a set of requests
@@ -533,35 +528,55 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 	return []api.GenerateRequest{
 			{
 				Model:     smol,
-				Prompt:    "why is the ocean blue? Be brief but factual in your reply",
+				Prompt:    "why is the ocean blue?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
 			}, {
 				Model:     smol,
-				Prompt:    "why is the color of dirt brown? Be brief but factual in your reply",
+				Prompt:    "why is the color of dirt brown?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
 			}, {
 				Model:     smol,
-				Prompt:    "what is the origin of the US thanksgiving holiday? Be brief but factual in your reply",
+				Prompt:    "what is the origin of the us thanksgiving holiday?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
 			}, {
 				Model:     smol,
-				Prompt:    "what is the origin of independence day? Be brief but factual in your reply",
+				Prompt:    "what is the origin of independence day?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
 			}, {
 				Model:     smol,
-				Prompt:    "what is the composition of air? Be brief but factual in your reply",
+				Prompt:    "what is the composition of air?",
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]any{
+					"seed":        42,
+					"temperature": 0.0,
+				},
 			},
 		},
 		[][]string{
-			{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
-			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
-			{"england", "english", "massachusetts", "pilgrims", "colonists", "independence", "british", "feast", "family", "gatherings", "traditions", "turkey", "colonial", "period", "harvest", "agricultural", "european settlers", "american revolution", "civil war", "16th century", "17th century", "native american", "united states"},
+			{"sunlight", "scattering", "interact"},
+			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles"},
+			{"england", "english", "massachusetts", "pilgrims", "british"},
 			{"fourth", "july", "declaration", "independence"},
 			{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
    const int64_t n_vocab = vocab.n_tokens();
    const int64_t n_embd  = hparams.n_embd;

-    const bool output_all = false;
+    // when computing embeddings, all tokens are output
+    const bool output_all = cparams.embeddings;

    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -62,22 +62,6 @@ func BackendInit() {
 	C.llama_backend_init()
 }

-func EnumerateGPUs() []string {
-	var ids []string
-
-	for i := range C.ggml_backend_dev_count() {
-		device := C.ggml_backend_dev_get(i)
-
-		if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
-			var props C.struct_ggml_backend_dev_props
-			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, C.GoString(props.id))
-		}
-	}
-
-	return ids
-}
-
 func GetModelArch(modelPath string) (string, error) {
 	mp := C.CString(modelPath)
 	defer C.free(unsafe.Pointer(mp))
--- a/llama/patches/0016-temporary-prevent-rocm-cuda-mixed-loading.patch
+++ b/llama/patches/0016-temporary-prevent-rocm-cuda-mixed-loading.patch
@@ -0,0 +1,32 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Sun, 22 Jun 2025 09:22:05 -0700
+Subject: [PATCH] temporary prevent rocm+cuda mixed loading
+
+---
+ ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
+index 3040b2aa..f1e9c180 100644
+--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
+@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
+ 
+     ggml_backend_load_best("blas", silent, dir_path);
+     ggml_backend_load_best("cann", silent, dir_path);
+-    ggml_backend_load_best("cuda", silent, dir_path);
+-    ggml_backend_load_best("hip", silent, dir_path);
+
+    // Avoid mixed hip+cuda configurations
+    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+    if (!hip_devices && !rocr_devices) {
+        ggml_backend_load_best("cuda", silent, dir_path);
+    } else {
+        ggml_backend_load_best("hip", silent, dir_path);
+    }
+
+     ggml_backend_load_best("metal", silent, dir_path);
+     ggml_backend_load_best("rpc", silent, dir_path);
+     ggml_backend_load_best("sycl", silent, dir_path);
--- a/llama/patches/0017-add-C-API-for-mtmd_input_text.patch
+++ b/llama/patches/0017-add-C-API-for-mtmd_input_text.patch
--- a/llama/patches/0018-no-power-throttling-win32-with-gnuc.patch
+++ b/llama/patches/0018-no-power-throttling-win32-with-gnuc.patch
--- a/llama/patches/0019-BF16-macos-version-guard.patch
+++ b/llama/patches/0019-BF16-macos-version-guard.patch
--- a/llama/patches/0020-Enable-CUDA-Graphs-for-gemma3n.patch
+++ b/llama/patches/0020-Enable-CUDA-Graphs-for-gemma3n.patch
@@ -13,7 +13,7 @@ checks.
 1 file changed, 18 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 57eae461..c7f9dc3a 100644
+index 57eae461..9db0c8b5 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
--- a/llama/patches/0021-Disable-ggml-blas-on-macos-v13-and-older.patch
+++ b/llama/patches/0021-Disable-ggml-blas-on-macos-v13-and-older.patch
--- a/llama/patches/0022-fix-mtmd-audio.cpp-build-on-windows.patch
+++ b/llama/patches/0022-fix-mtmd-audio.cpp-build-on-windows.patch
--- a/llama/patches/0023-decode-disable-output_all.patch
+++ b/llama/patches/0023-decode-disable-output_all.patch
@@ -1,23 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <git@mxy.ng>
-Date: Mon, 18 Aug 2025 16:58:39 -0700
-Subject: [PATCH] decode: disable output_all
-
---
- src/llama-context.cpp | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
-
-diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
-+++ b/src/llama-context.cpp
-@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
-     const int64_t n_vocab = vocab.n_tokens();
-     const int64_t n_embd  = hparams.n_embd;
- 
-    // when computing embeddings, all tokens are output
-    const bool output_all = cparams.embeddings;
-+    const bool output_all = false;
- 
-     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
-         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
--- a/llama/patches/0023-ggml-No-alloc-mode.patch
+++ b/llama/patches/0023-ggml-No-alloc-mode.patch
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"sort"
+	"strconv"
 	"strings"

 	"github.com/ollama/ollama/api"
@@ -14,79 +14,13 @@ import (
 	"github.com/ollama/ollama/fs/ggml"
 )

-// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
-// The list of GPUs returned will always be the same brand (library)
-// If the model can not be fit fully within the available GPU(s) nil is returned
-func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
-	for _, gl := range gpus.ByLibrary() {
-		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
-
-		// TODO - potentially sort by performance capability, existing models loaded, etc.
-		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-		// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
-
-		if !envconfig.SchedSpread() {
-			// Try to pack into as few GPUs as possible, starting from 1 GPU
-			for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
-				gpuSubset := sgl[:numGPUs]
-				ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
-
-				if ok {
-					slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
-						"model", modelPath,
-						"library", sgl[0].Library,
-						"parallel", numParallel,
-						"required", format.HumanBytes2(estimatedVRAM),
-						"gpus", numGPUs)
-					return gpuSubset
-				}
-			}
-		} else {
-			// TODO future refinements
-			// - if multiple Libraries, see if any single GPU in any Library will fit
-			// - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
-			if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
-				slog.Info("new model will fit in available VRAM, loading",
-					"model", modelPath,
-					"library", sgl[0].Library,
-					"parallel", numParallel,
-					"required", format.HumanBytes2(estimatedVRAM),
-					"gpus", len(sgl))
-				return sgl
-			}
-		}
-	}
-	return nil
-}
-
-// If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
-	byLibrary := gpus.ByLibrary()
-	if len(byLibrary) <= 1 {
-		return gpus
-	}
-	var bestEstimate uint64
-	var bestFit int
-	for i, gl := range byLibrary {
-		_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
-		if estimatedVRAM > bestEstimate {
-			bestEstimate = estimatedVRAM
-			bestFit = i
-		}
-	}
-	return byLibrary[bestFit]
-}
-
 // This algorithm looks for a complete fit to determine if we need to unload other models
 func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
+		estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
 		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
@@ -115,7 +49,7 @@ type MemoryEstimate struct {
 	TotalSize uint64

 	// For multi-GPU scenarios, this provides the tensor split parameter
-	TensorSplit []int
+	TensorSplit string

 	// For multi-GPU scenarios, this is the size in bytes per GPU
 	GPUSizes []uint64
@@ -137,7 +71,7 @@ type MemoryEstimate struct {

 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64

@@ -178,9 +112,13 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

 	for _, projector := range projectors {
 		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 	if llamaEngineProjectorWeights == 0 {
 		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

 	layers := f.Tensors().GroupLayers()
@@ -246,7 +184,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
-	tensorSplit := make([]int, len(gpus))
+	layerCounts := make([]int, len(gpus))
 	gpuAllocations := make([]uint64, len(gpus))
 	type gs struct {
 		i int
@@ -310,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
 			if g.g.FreeMemory > overhead+used+layerSize {
 				gpuAllocations[g.i] += layerSize
-				tensorSplit[g.i]++
+				layerCounts[g.i]++
 				layerCount++
 				break
 			} else {
@@ -335,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
 				if g.g.FreeMemory > overhead+used+memoryLastLayer {
 					gpuAllocations[g.i] += memoryLastLayer
-					tensorSplit[g.i]++
+					layerCounts[g.i]++
 					layerCount++
 					break
 				}
@@ -350,7 +288,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

 	// Add the applicable (full or partial) graph allocations
 	for i := range gpus {
-		if tensorSplit[i] <= 0 {
+		if layerCounts[i] <= 0 {
 			continue
 		}
 		if fullyLoaded {
@@ -372,6 +310,14 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 	memoryRequiredTotal = memoryRequiredPartial + overflow

+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
 	allocationsList := []string{}
 	for _, a := range gpuAllocations {
 		allocationsList = append(allocationsList, format.HumanBytes2(a))
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	projectors := []string{}
 	opts := api.DefaultOptions()
 	t.Run("cpu", func(t *testing.T) {
-		estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
 		assert.Equal(t, 0, estimate.Layers)
 		assert.Equal(t, uint64(0), estimate.Graph)
 	})
@@ -88,7 +88,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
 	for i, s := range []struct {
 		layer0, layer1   uint64
-		expect0, expect1 int
+		expect0, expect1 uint64
 	}{
 		{1, 1, 1, 1},
 		{2, 1, 2, 1},
@@ -112,9 +112,9 @@ func TestEstimateGPULayers(t *testing.T) {
 			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
 			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
 			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
-			estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
-			assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
-			assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
+			estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
+			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
+			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
 			var layerSums uint64
 			for _, b := range estimate.GPUSizes {
 				layerSums += b
--- a/llm/server.go
+++ b/llm/server.go
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -8,178 +8,9 @@ import (
 	"testing"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
-	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/ml"
 	"golang.org/x/sync/semaphore"
 )

-func TestLLMServerFitGPU(t *testing.T) {
-	type gpu struct {
-		library string
-		free    int
-	}
-
-	tests := []struct {
-		name        string
-		gpus        []gpu
-		layers      []int
-		numGPU      int
-		requireFull bool
-		expected    ml.GPULayersList
-		expectedErr error
-	}{
-		{
-			name:     "No GPU",
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{},
-		},
-		{
-			name:     "Full single GPU",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
-		},
-		{
-			name:     "Partial single GPU",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
-			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
-		},
-		{
-			name:     "Single GPU with numGPU 1",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
-		},
-		{
-			name:     "Single GPU with numGPU 0",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   0,
-			expected: ml.GPULayersList{},
-		},
-		{
-			name:     "Single GPU with numGPU 999",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
-			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
-			numGPU:   999,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
-		},
-		{
-			name:     "Multi GPU fits on one",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
-		},
-		{
-			name:     "Multi GPU split",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
-			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
-		},
-		{
-			name:     "Multi GPU partial",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
-			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
-		},
-		{
-			name:     "Multi GPU numGPU 1",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
-		},
-		{
-			name:     "Multi GPU numGPU 2",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
-			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   2,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
-		},
-		{
-			name:     "Multi GPU numGPU 999",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
-			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   999,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
-		},
-		{
-			name:     "Multi GPU different libraries",
-			gpus:     []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
-			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
-		},
-		{
-			name:        "requireFull",
-			gpus:        []gpu{{free: 256 * format.MebiByte}},
-			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
-			numGPU:      -1,
-			requireFull: true,
-			expectedErr: ErrLoadRequiredFull,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			var systemInfo discover.SystemInfo
-			systemInfo.System.TotalMemory = format.GibiByte
-			systemInfo.System.FreeMemory = 512 * format.MebiByte
-			systemInfo.System.FreeSwap = 256 * format.MebiByte
-
-			gpus := make(discover.GpuInfoList, len(tt.gpus))
-			for i := range tt.gpus {
-				gpus[i].ID = fmt.Sprintf("gpu%d", i)
-				gpus[i].Library = tt.gpus[i].library
-				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
-			}
-
-			s := &ollamaServer{
-				llmServer: llmServer{
-					totalLayers: uint64(len(tt.layers)),
-					options: api.Options{
-						Runner: api.Runner{
-							NumGPU: tt.numGPU,
-						},
-					},
-				},
-			}
-
-			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
-				Weights: make([]ml.Memory, s.totalLayers),
-				Cache:   make([]ml.Memory, s.totalLayers),
-			}, GPUs: make([]ml.DeviceMemory, len(gpus))}
-
-			for i := range tt.layers {
-				s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
-			}
-
-			for i := range s.mem.GPUs {
-				s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
-				s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
-				s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
-			}
-
-			gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
-			if err != tt.expectedErr {
-				t.Fatalf("fitGPU returned error: %v", err)
-			}
-			if gpuLayers.Hash() != tt.expected.Hash() {
-				t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
-			}
-		})
-	}
-}
-
 func TestLLMServerCompletionFormat(t *testing.T) {
 	// This test was written to fix an already deployed issue. It is a bit
 	// of a mess, and but it's good enough, until we can refactoring the
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -5,14 +5,12 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
-	"hash/maphash"
 	"log/slog"
 	"math"
 	"slices"
 	"strconv"
 	"strings"

-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs"
 )

@@ -60,89 +58,19 @@ type CacheConfig struct {
 	MaskBatchPadding int
 }

-// GPULayers is a set of layers to be allocated on a single GPU
-type GPULayers struct {
-	// ID is the identifier of the GPU, as reported in DeviceMemory
-	ID string
-
-	// Layers is a set of layer indicies to load
-	Layers []int
-}
-
-func (g GPULayers) String() string {
-	if len(g.Layers) == 0 {
-		return ""
-	}
-
-	slices.Sort(g.Layers)
-
-	contiguous := true
-	base := g.Layers[0]
-	for i := range g.Layers {
-		if g.Layers[i] != base+i {
-			contiguous = false
-			break
-		}
-	}
-
-	if contiguous {
-		return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
-	} else {
-		return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
-	}
-}
-
-// GPULayersList is a set of layer allocations across multiple GPUs
-type GPULayersList []GPULayers
-
-func (l GPULayersList) String() string {
-	if l.Sum() > 0 {
-		return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
-	} else {
-		return fmt.Sprintf("%v", []GPULayers(l))
-	}
-}
-
-// Sum is the total number of layers assigned across all GPUs
-func (l GPULayersList) Sum() int {
-	var sum int
-
-	for _, g := range l {
-		sum += len(g.Layers)
-	}
-
-	return sum
-}
-
-var h maphash.Hash
-
-// Hash is an identifier of this layer assignment
-func (l GPULayersList) Hash() uint64 {
-	h.Reset()
-	for _, g := range l {
-		if len(g.Layers) > 0 {
-			h.WriteString(g.ID)
-			for _, l := range g.Layers {
-				binary.Write(&h, binary.NativeEndian, int64(l))
-			}
-		}
-	}
-
-	return h.Sum64()
-}
-
 // BackendParams controls how the backend loads and executes models
 type BackendParams struct {
-	// AllocMemory causes the backend to allocate memory for the model. If
-	// false, this is only being used for discovering the required amount of
-	// memory and cannot load the model for running.
-	AllocMemory bool
-
 	// NumThreads sets the number of threads to use if running on the CPU
 	NumThreads int

-	// GPULayers is the set of layers to offload to GPUs
-	GPULayers GPULayersList
+	// MainGPU is the index of the primary GPU to use
+	MainGPU int
+
+	// NumGPULayers is the number of layers to offload to GPUs
+	NumGPULayers int
+
+	// TensorSplit is the fraction of the model to offload to each GPU
+	TensorSplit []float32

 	// FlashAttention indicates that we should use a fused flash attention kernel
 	FlashAttention bool
@@ -213,28 +141,6 @@ type DeviceMemory struct {
 	Graph Memory
 }

-// Allocated returns the total size of the memory that has been successfully
-// allocated on this device
-func (m DeviceMemory) Allocated() uint64 {
-	var mem uint64
-
-	for _, w := range m.Weights {
-		if w.Status == Allocated {
-			mem += w.Size
-		}
-	}
-	for _, c := range m.Cache {
-		if c.Status == Allocated {
-			mem += c.Size
-		}
-	}
-	if m.Graph.Status == Allocated {
-		mem += m.Graph.Size
-	}
-
-	return mem
-}
-
 func memoryPresent(mem []Memory) bool {
 	return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
 }
@@ -291,58 +197,6 @@ func (m BackendMemory) LogValue() slog.Value {
 	return slog.GroupValue(attrs...)
 }

-func sumMemory(mem []Memory) uint64 {
-	var sum uint64
-
-	for _, m := range mem {
-		sum += m.Size
-	}
-
-	return sum
-}
-
-// Log prints a high level summary of the memory (allocated or not)
-func (m BackendMemory) Log(level slog.Level) {
-	var total uint64
-
-	for _, gpu := range m.GPUs {
-		if sum := sumMemory(gpu.Weights); sum > 0 {
-			slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
-			total += sum
-		}
-	}
-	if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
-		slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
-		total += sum
-	}
-
-	for _, gpu := range m.GPUs {
-		if sum := sumMemory(gpu.Cache); sum > 0 {
-			slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
-			total += sum
-		}
-	}
-	if sum := sumMemory(m.CPU.Cache); sum > 0 {
-		slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
-		total += sum
-	}
-
-	for _, gpu := range m.GPUs {
-		if sum := gpu.Graph.Size; sum > 0 {
-			slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
-			total += sum
-		}
-	}
-	if sum := m.CPU.Graph.Size; sum > 0 {
-		slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
-		total += sum
-	}
-
-	if total > 0 {
-		slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
-	}
-}
-
 var backends = make(map[string]func(string, BackendParams) (Backend, error))

 func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -10,7 +10,6 @@ import "C"

 import (
 	"context"
-	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -63,21 +62,12 @@ var initDevices = sync.OnceFunc(func() {
 	}
 })

-type layerDevice struct {
-	d  C.ggml_backend_dev_t
-	bt C.ggml_backend_buffer_type_t
-}
-
 type Backend struct {
 	// modelPath is the location of the model data
 	modelPath string

 	meta *fsggml.GGML

-	// allocMemory means that memory should be allocated for tensors and not
-	// just a dry run
-	allocMemory bool
-
 	// tensorLoadTargets maps from the name of the tensor in the file
 	// to the name that is used by the model definition
 	tensorLoadTargets map[string][]string
@@ -88,14 +78,11 @@ type Backend struct {

 	tensors map[string]*C.struct_ggml_tensor

-	// input is the backend buffer type used for inputs
+	// input is the backend buffer type used for inputs
 	input C.ggml_backend_buffer_type_t

-	// output is the backend device used for outputs
-	output C.ggml_backend_dev_t
-
 	// layers is the backend used for repeating layers
-	layers map[int]layerDevice
+	layers map[int]C.ggml_backend_buffer_type_t

 	// requiredMemory is the cumulative memory allocations needed by the backend
 	requiredMemory *ml.BackendMemory
@@ -112,8 +99,6 @@ type Backend struct {
 	weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
 }

-var once sync.Once
-
 func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	r, err := os.Open(modelPath)
 	if err != nil {
@@ -126,17 +111,15 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		return nil, err
 	}

-	once.Do(func() {
-		slog.Info(
-			"",
-			"architecture", meta.KV().Architecture(),
-			"file_type", meta.KV().FileType(),
-			"name", meta.KV().String("general.name"),
-			"description", meta.KV().String("general.description"),
-			"num_tensors", len(meta.Tensors().Items()),
-			"num_key_values", len(meta.KV()),
-		)
-	})
+	slog.Info(
+		"",
+		"architecture", meta.KV().Architecture(),
+		"file_type", meta.KV().FileType(),
+		"name", meta.KV().String("general.name"),
+		"description", meta.KV().String("general.description"),
+		"num_tensors", len(meta.Tensors().Items()),
+		"num_key_values", len(meta.KV()),
+	)

 	initDevices()

@@ -156,10 +139,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		switch C.ggml_backend_dev_type(d) {
 		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
 			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
-			bt := C.ggml_backend_dev_buffer_type(d)
-			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, bt)
-			C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
-
+			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
 			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
 		}
 	}
@@ -180,8 +160,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			d:   d,
 			bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
 		})
-		C.ggml_backend_buft_set_alloc(bt, C.bool(params.AllocMemory))
-
 		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
 		var props C.struct_ggml_backend_dev_props
@@ -191,25 +169,56 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}

+	useDefaultSplit := true
+	for _, s := range params.TensorSplit {
+		if s != 0 {
+			useDefaultSplit = false
+			break
+		}
+	}
+
+	// calculate splits
+	splits := make([]float32, len(gpus))
+	if useDefaultSplit {
+		// default: split on free memory
+		for i := range splits {
+			var free, total C.size_t
+			C.ggml_backend_dev_memory(gpus[i], &free, &total)
+			splits[i] = float32(free)
+		}
+	} else {
+		splits = params.TensorSplit
+	}
+
+	var sum float32
+	// cumulative sum of all splits
+	for i := range splits {
+		sum += splits[i]
+		splits[i] = sum
+	}
+
+	// normalize splits
+	for i := range splits {
+		splits[i] /= sum
+	}
+
 	// inputs always use cpu
 	input := cpuDeviceBufferType

-	assignLayer := func(layer int) deviceBufferType {
-		for _, p := range params.GPULayers {
-			for _, l := range p.Layers {
-				if l == layer {
-					for i := range requiredMemory.GPUs {
-						if requiredMemory.GPUs[i].ID == p.ID {
-							return gpuDeviceBufferTypes[i]
-						}
-					}
-
-					return cpuDeviceBufferType
-				}
-			}
+	// define a range of gpu layers. anything outside of this range is assigned to the cpu
+	gpuRangeStart := max(0, blocks-params.NumGPULayers)
+	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
+	assignLayer := func(i int) deviceBufferType {
+		if i < gpuRangeStart || i >= gpuRangeStop {
+			return cpuDeviceBufferType
 		}

-		return cpuDeviceBufferType
+		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
+		if index < 0 || index >= len(gpuDeviceBufferTypes) {
+			return cpuDeviceBufferType
+		}
+
+		return gpuDeviceBufferTypes[index]
 	}

 	// repeating layers are assigned based on their index in reverse order, e.g. i / (block_count + 1)
@@ -275,9 +284,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
 			if layer == -1 {
 				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
-				if params.AllocMemory {
-					requiredMemory.InputWeights.Status = ml.Allocated
-				}
+				requiredMemory.InputWeights.Status = ml.Allocated
 				requiredMemory.InputWeights.Size += uint64(size)
 			} else {
 				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
@@ -348,14 +355,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}

 		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
-		if params.AllocMemory {
-			for i := range btDeviceMemory[bt].Weights {
-				if btDeviceMemory[bt].Weights[i].Size != 0 {
-					if b != nil {
-						btDeviceMemory[bt].Weights[i].Status = ml.Allocated
-					} else {
-						btDeviceMemory[bt].Weights[i].Status = ml.Failed
-					}
+		for i := range btDeviceMemory[bt].Weights {
+			if btDeviceMemory[bt].Weights[i].Size != 0 {
+				if b != nil {
+					btDeviceMemory[bt].Weights[i].Status = ml.Allocated
+				} else {
+					btDeviceMemory[bt].Weights[i].Status = ml.Failed
 				}
 			}
 		}
@@ -376,9 +381,28 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		bbs[c] = b
 	}

+	// Mimic llama runner logs summarizing layers and memory
+	gpuLayers := 0
+	for _, layer := range layers {
+		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
+			gpuLayers++
+		}
+	}
+	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
+
+	switch C.ggml_backend_dev_type(output.d) {
+	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+		slog.Info("offloading output layer to CPU")
+	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+		slog.Info("offloading output layer to GPU")
+		gpuLayers++
+	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+		slog.Info("offloading output layer to ACCEL")
+	}
+	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
+
 	for bs := range maps.Values(bbs) {
-		slog.Log(context.TODO(), logutil.LevelTrace, "model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)),
-			"size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
+		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
 	}

 	// map tensor names to tensors for easy lookup later
@@ -399,13 +423,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		b := backends[d]
 		bt := C.ggml_backend_get_default_buffer_type(b)

-		// Always include CPU as a fallback but otherwise, just use the devices where we assigned layers
-		if !slices.Contains(cpuDeviceBufferType.bts, bt) {
-			if c, ok := ctxs[bt]; !ok || C.ggml_get_first_tensor(c) == nil {
-				continue
-			}
-		}
-
 		deviceBufferTypes[d] = bt

 		schedBackends = append(schedBackends, b)
@@ -420,7 +437,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
 	return &Backend{
 		modelPath:         modelPath,
-		allocMemory:       params.AllocMemory,
 		flashAttention:    params.FlashAttention,
 		meta:              meta,
 		tensorLoadTargets: targets,
@@ -436,14 +452,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		schedBackends: schedBackends,
 		schedBufts:    schedBufts,
 		input:         deviceBufferTypes[input.d],
-		output:        output.d,
-		layers: func() map[int]layerDevice {
-			m := make(map[int]layerDevice)
+		layers: func() map[int]C.ggml_backend_buffer_type_t {
+			m := make(map[int]C.ggml_backend_buffer_type_t)
 			for i, layer := range layers {
-				m[i] = layerDevice{
-					d:  layer.d,
-					bt: deviceBufferTypes[layer.d],
-				}
+				m[i] = deviceBufferTypes[layer.d]
 			}
 			return m
 		}(),
@@ -472,30 +484,6 @@ func (b *Backend) Close() {
 }

 func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
-	if !b.allocMemory {
-		return errors.New("cannot load model without memory allocation")
-	}
-
-	// Mimic llama runner logs summarizing layers and memory
-	gpuLayers := 0
-	for layer := range maps.Values(b.layers) {
-		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
-			gpuLayers++
-		}
-	}
-	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
-
-	switch C.ggml_backend_dev_type(b.output) {
-	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-		slog.Info("offloading output layer to CPU")
-	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-		slog.Info("offloading output layer to GPU")
-		gpuLayers++
-	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
-		slog.Info("offloading output layer to ACCEL")
-	}
-	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
-
 	var doneBytes atomic.Uint64
 	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset

@@ -742,11 +730,11 @@ func (c *Context) Input() ml.Context {
 }

 func (c *Context) Layer(i int) ml.Context {
-	if layer, ok := c.b.layers[i]; ok {
+	if buft, ok := c.b.layers[i]; ok {
 		return &Context{
 			b:                c.b,
 			ctx:              c.ctx,
-			buft:             layer.bt,
+			buft:             buft,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
 			layer:            i,
@@ -804,16 +792,14 @@ func (c *Context) Reserve() {

 		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
 		graph.Size += uint64(bufferStatus.size)
-		if c.b.allocMemory {
-			if bufferStatus.allocated && graph.Status != ml.Failed {
-				graph.Status = ml.Allocated
-			} else {
-				graph.Status = ml.Failed
-			}
+		if bufferStatus.allocated && graph.Status != ml.Failed {
+			graph.Status = ml.Allocated
+		} else {
+			graph.Status = ml.Failed
 		}

-		slog.Log(context.TODO(), logutil.LevelTrace, "compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
-			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
+		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
+			"size", format.HumanBytes2(uint64(bufferStatus.size)))
 	}

 	if !reserved {
@@ -882,12 +868,10 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]

 		cache.Size += uint64(size)
-		if c.b.allocMemory {
-			if b != nil {
-				cache.Status = ml.Allocated
-			} else {
-				cache.Status = ml.Failed
-			}
+		if b != nil {
+			cache.Status = ml.Allocated
+		} else {
+			cache.Status = ml.Failed
 		}
 	}

@@ -906,9 +890,7 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {

 func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	t := c.newTensor(dtype, shape)
-	if c.b.allocMemory {
-		C.ggml_set_zero(t.(*Tensor).t)
-	}
+	C.ggml_set_zero(t.(*Tensor).t)
 	return t
 }

@@ -933,7 +915,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {

 	t := c.newTensor(ml.DTypeF32, shape)

-	if c.b.allocMemory && len(s) > 0 {
+	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

@@ -945,7 +927,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {

 	t := c.newTensor(ml.DTypeI32, shape)

-	if c.b.allocMemory && len(s) > 0 {
+	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

@@ -1568,7 +1550,7 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
 func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
 	// Unchecked to handle quantized types
 	t := c.newTensor(dtype, shape)
-	if c.b.allocMemory && len(s) > 0 {
+	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -581,8 +581,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {

    ggml_backend_load_best("blas", silent, dir_path);
    ggml_backend_load_best("cann", silent, dir_path);
-    ggml_backend_load_best("cuda", silent, dir_path);
-    ggml_backend_load_best("hip", silent, dir_path);
+
+    // Avoid mixed hip+cuda configurations
+    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+    if (!hip_devices && !rocr_devices) {
+        ggml_backend_load_best("cuda", silent, dir_path);
+    } else {
+        ggml_backend_load_best("hip", silent, dir_path);
+    }
+
    ggml_backend_load_best("metal", silent, dir_path);
    ggml_backend_load_best("rpc", silent, dir_path);
    ggml_backend_load_best("sycl", silent, dir_path);
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/arm.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/arm.go
@@ -1,5 +1,5 @@
 package arm

 // #cgo CXXFLAGS: -std=c++17
-// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include -DHWCAP2_SVE2="2"
+// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include
 import "C"
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -12,6 +12,7 @@ import (
 	"net/http"
 	"os"
 	"regexp"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -215,12 +216,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 }

 type Server struct {
-	// modelPath is the location of the model to be loaded
-	modelPath string
-
-	// loadMu prevents more than one load attempt from occurring at a time
-	loadMu sync.Mutex
-
 	// is the server ready to process requests?
 	// protects access to model and image
 	ready sync.WaitGroup
@@ -728,12 +723,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }

-// loadModel allocates memory based on the given parameters and loads the weights. The
-// memory allocated is worst case for text models but not for vision.
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath []string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	kvCacheType string,
@@ -753,10 +757,12 @@ func (s *Server) loadModel(
 		panic(err)
 	}

-	for _, path := range lpath {
-		err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}

@@ -777,81 +783,26 @@ func (s *Server) loadModel(
 	s.ready.Done()
 }

-// load is the handler called by the Ollama server to process different
-// load operations
-func (s *Server) load(w http.ResponseWriter, r *http.Request) {
-	s.loadMu.Lock()
-	defer s.loadMu.Unlock()
-
-	w.Header().Set("Content-Type", "application/json")
-
-	if s.status != llm.ServerStatusLaunched {
-		http.Error(w, "model already loaded", http.StatusInternalServerError)
-		return
-	}
-
-	var req llm.LoadRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		http.Error(w, "bad request", http.StatusBadRequest)
-		return
-	}
-
-	slog.Info("load", "request", req)
-
-	switch req.Operation {
-	// LoadOperationFit and LoadOperationAlloc have no meaning here - just return a successful response
-
-	case llm.LoadOperationCommit:
-		s.batchSize = req.BatchSize
-		s.parallel = req.Parallel
-		s.seqs = make([]*Sequence, s.parallel)
-		s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
-
-		gpuIDs := llama.EnumerateGPUs()
-		tensorSplit := make([]float32, len(gpuIDs))
-		numGPU := 0
-		for i := range gpuIDs {
-			for _, layers := range req.GPULayers {
-				if gpuIDs[i] == layers.ID {
-					tensorSplit[i] = float32(len(layers.Layers))
-					numGPU += len(layers.Layers)
-				}
-			}
-		}
-
-		params := llama.ModelParams{
-			NumGpuLayers: numGPU,
-			MainGpu:      req.MainGPU,
-			UseMmap:      req.UseMmap && len(req.LoraPath) == 0,
-			TensorSplit:  tensorSplit,
-			Progress: func(progress float32) {
-				s.progress = progress
-			},
-		}
-
-		s.status = llm.ServerStatusLoadingModel
-		go s.loadModel(params, s.modelPath, req.LoraPath, req.ProjectorPath, req.KvSize, req.KvCacheType, req.FlashAttention, req.NumThreads, req.MultiUserCache)
-
-	case llm.LoadOperationClose:
-		// No-op for us
-		if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
-			http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-		}
-		return
-	}
-
-	resp := llm.LoadResponse{Success: true}
-	if err := json.NewEncoder(w).Encode(&resp); err != nil {
-		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-		return
-	}
-}
-
 func Execute(args []string) error {
 	fs := flag.NewFlagSet("runner", flag.ExitOnError)
 	mpath := fs.String("model", "", "Path to model binary file")
+	ppath := fs.String("mmproj", "", "Path to projector binary file")
+	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := fs.Int("batch-size", 512, "Batch size")
+	nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := fs.Int("main-gpu", 0, "Main GPU")
+	flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
+	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
+	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
+
+	var lpaths multiLPath
+	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")

 	fs.Usage = func() {
 		fmt.Fprintf(fs.Output(), "Runner usage\n")
@@ -866,11 +817,35 @@ func Execute(args []string) error {
 	llama.BackendInit()

 	server := &Server{
-		modelPath: *mpath,
-		status:    llm.ServerStatusLaunched,
+		batchSize: *batchSize,
+		parallel:  *parallel,
+		seqs:      make([]*Sequence, *parallel),
+		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
+		status:    llm.ServerStatusLoadingModel,
+	}
+
+	var tensorSplitFloats []float32
+	if *tensorSplit != "" {
+		splits := strings.Split(*tensorSplit, ",")
+		tensorSplitFloats = make([]float32, len(splits))
+		for i, s := range splits {
+			f, _ := strconv.ParseFloat(s, 32)
+			tensorSplitFloats[i] = float32(f)
+		}
+	}
+
+	params := llama.ModelParams{
+		NumGpuLayers: *nGpuLayers,
+		MainGpu:      *mainGpu,
+		UseMmap:      !*noMmap && lpaths.String() == "",
+		TensorSplit:  tensorSplitFloats,
+		Progress: func(progress float32) {
+			server.progress = progress
+		},
 	}

 	server.ready.Add(1)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)

 	server.cond = sync.NewCond(&server.mu)

@@ -888,7 +863,6 @@ func Execute(args []string) error {
 	defer listener.Close()

 	mux := http.NewServeMux()
-	mux.HandleFunc("POST /load", server.load)
 	mux.HandleFunc("/embedding", server.embeddings)
 	mux.HandleFunc("/completion", server.completion)
 	mux.HandleFunc("/health", server.health)
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -14,7 +14,6 @@ import (
 	"net"
 	"net/http"
 	"os"
-	"reflect"
 	"regexp"
 	"runtime"
 	"strconv"
@@ -260,16 +259,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 }

 type Server struct {
-	// modelPath is the location of the model to be loaded
-	modelPath string
-
-	// loadMu prevents more than one load attempt from occurring at a time
-	loadMu sync.Mutex
-
-	// lastLoad is the load request from the previous load attempt. Used to
-	// detect if we can reuse an existing memory allocation.
-	lastLoad llm.LoadRequest
-
 	// is the server ready to process requests?
 	// protects access to model and image
 	ready sync.WaitGroup
@@ -731,6 +720,17 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }

+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
@@ -828,28 +828,15 @@ func (s *Server) reserveWorstCaseGraph() error {
 	return nil
 }

-// allocModel pre-allocates the maximum needed memory for a model
-// based on the given parameters
-func (s *Server) allocModel(
+func (s *Server) initModel(
 	mpath string,
 	params ml.BackendParams,
-	loraPath []string,
+	lpath multiLPath,
 	parallel int,
 	kvCacheType string,
 	kvSize int,
 	multiUserCache bool,
-) (panicErr error) {
-	// Convert memory allocation panics to errors
-	defer func() {
-		if r := recover(); r != nil {
-			if err, ok := r.(error); ok {
-				panicErr = err
-			} else {
-				panic(r)
-			}
-		}
-	}()
-
+) error {
 	var err error
 	s.model, err = model.New(mpath, params)
 	if err != nil {
@@ -857,7 +844,7 @@ func (s *Server) allocModel(
 	}

 	// TODO(jessegross): LoRA loading
-	if len(loraPath) > 0 {
+	if lpath.String() != "" {
 		return errors.New("loras are not yet implemented")
 	}

@@ -878,122 +865,63 @@ func (s *Server) allocModel(
 	return s.reserveWorstCaseGraph()
 }

-// closeModel frees all memory associated with a model
-func (s *Server) closeModel() {
-	s.cache.Close()
-	s.cache = nil
-	if s.model != nil {
-		s.model.Backend().Close()
-		s.model = nil
-	}
-}
+func (s *Server) load(
+	ctx context.Context,
+	mpath string,
+	params ml.BackendParams,
+	lpath multiLPath,
+	parallel int,
+	kvCacheType string,
+	kvSize int,
+	multiUserCache bool,
+) {
+	err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
+	if err != nil {
+		var noMem ml.ErrNoMem
+		if errors.As(err, &noMem) {
+			// We can't yet handle this but in the future we will
+			s.cache.Close()
+			if s.model != nil {
+				s.model.Backend().Close()
+			}
+		}

-// loadModel loads the weights for a model. The memory must already
-// have been allocated with allocModel
-func (s *Server) loadModel() {
-	err := s.model.Backend().Load(context.TODO(),
+		panic(err)
+	}
+
+	slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
+
+	err = s.model.Backend().Load(ctx,
 		func(progress float32) {
 			s.progress = progress
 		})
 	if err != nil {
-		panic(fmt.Errorf("failed to load model: %v", err))
+		panic(err)
 	}

 	s.status = llm.ServerStatusReady
 	s.ready.Done()
 }

-// load is the handler called by the Ollama server to process different
-// load operations
-func (s *Server) load(w http.ResponseWriter, r *http.Request) {
-	s.loadMu.Lock()
-	defer s.loadMu.Unlock()
-
-	w.Header().Set("Content-Type", "application/json")
-
-	if s.status != llm.ServerStatusLaunched {
-		http.Error(w, "model already loaded", http.StatusInternalServerError)
-		return
-	}
-
-	var req llm.LoadRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		http.Error(w, "bad request", http.StatusBadRequest)
-		return
-	}
-
-	slog.Info("load", "request", req)
-
-	if req.Operation == llm.LoadOperationClose {
-		s.closeModel()
-		if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
-			http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-		}
-		return
-	}
-
-	s.lastLoad.Operation = req.Operation
-	loadModel := s.model == nil || !reflect.DeepEqual(req, s.lastLoad)
-
-	s.lastLoad = req
-
-	if loadModel {
-		s.closeModel()
-
-		params := ml.BackendParams{
-			AllocMemory:    req.Operation != llm.LoadOperationFit,
-			NumThreads:     req.NumThreads,
-			GPULayers:      req.GPULayers,
-			FlashAttention: req.FlashAttention,
-		}
-
-		s.batchSize = req.BatchSize
-
-		err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
-		if err != nil {
-			s.closeModel()
-
-			var noMem ml.ErrNoMem
-			if errors.As(err, &noMem) {
-				resp := llm.LoadResponse{Success: false, Memory: noMem.BackendMemory}
-				if err := json.NewEncoder(w).Encode(&resp); err != nil {
-					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-				}
-
-				return
-			}
-
-			http.Error(w, fmt.Sprintf("failed to initialize model: %v", err), http.StatusInternalServerError)
-			return
-		}
-	}
-
-	mem := s.model.Backend().BackendMemory()
-
-	switch req.Operation {
-	case llm.LoadOperationFit:
-		// LoadOperationFit can't be used for anything else, so just close it
-		s.closeModel()
-
-	// LoadOperationAlloc should stay open for future operations
-
-	case llm.LoadOperationCommit:
-		s.status = llm.ServerStatusLoadingModel
-		go s.loadModel()
-	}
-
-	resp := llm.LoadResponse{Success: true, Memory: mem}
-	if err := json.NewEncoder(w).Encode(&resp); err != nil {
-		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-		return
-	}
-}
-
 func Execute(args []string) error {
 	fs := flag.NewFlagSet("runner", flag.ExitOnError)
 	mpath := fs.String("model", "", "Path to model binary file")
+	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := fs.Int("batch-size", 512, "Batch size")
+	numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGPU := fs.Int("main-gpu", 0, "Main GPU")
+	flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
+	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
+	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
+
+	var lpaths multiLPath
+	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")

 	fs.Usage = func() {
 		fmt.Fprintf(fs.Output(), "Runner usage\n")
@@ -1005,17 +933,44 @@ func Execute(args []string) error {
 	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
 	slog.Info("starting ollama engine")
 
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
 	server := &Server{
-		modelPath: *mpath,
-		status:    llm.ServerStatusLaunched,
+		batchSize: *batchSize,
+		status:    llm.ServerStatusLoadingModel,
 	}
 
 	server.cond = sync.NewCond(&server.mu)
 	server.ready.Add(1)
 
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// TODO(jessegross): Parameters that need to be implemented:
+	//	no-mmap
+
+	var tensorSplitFloats []float32
+	if *tensorSplit != "" {
+		splits := strings.Split(*tensorSplit, ",")
+		tensorSplitFloats = make([]float32, len(splits))
+		for i, s := range splits {
+			// Fail fast on a malformed proportion, like every other flag
+			// does, instead of silently keeping a value parsed with an error.
+			f, err := strconv.ParseFloat(strings.TrimSpace(s), 32)
+			if err != nil {
+				return fmt.Errorf("invalid tensor-split value %q: %w", s, err)
+			}
+			tensorSplitFloats[i] = float32(f)
+		}
+	}
+
+	params := ml.BackendParams{
+		NumThreads:     *threads,
+		NumGPULayers:   *numGPULayers,
+		MainGPU:        *mainGPU,
+		TensorSplit:    tensorSplitFloats,
+		FlashAttention: *flashAttention,
+	}
+
+	go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
 	go server.run(ctx)
 
 	addr := "127.0.0.1:" + strconv.Itoa(*port)
@@ -1028,7 +983,6 @@ func Execute(args []string) error {
 
 	mux := http.NewServeMux()
 	// TODO: support embeddings
-	mux.HandleFunc("POST /load", server.load)
 	mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
 	})
--- a/server/harmonyparser.go
+++ b/server/harmonyparser.go
@@ -2,7 +2,6 @@ package server

 import (
 	"context"
-	"fmt"
 	"log/slog"
 	"slices"
 	"strings"
@@ -276,9 +275,8 @@ const (
 // HarmonyMessageHandler processes harmony events and accumulates content appropriately.
 // This is a higher level interface that maps harmony concepts into ollama concepts
 type HarmonyMessageHandler struct {
-	state           harmonyMessageState
-	harmonyParser   *HarmonyParser
-	functionNameMap *FunctionNameMap
+	state         harmonyMessageState
+	harmonyParser *HarmonyParser
 }

 // NewHarmonyMessageHandler creates a new message handler
@@ -290,7 +288,6 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler {
 			MessageEndTag:   "<|end|>",
 			HeaderEndTag:    "<|message|>",
 		},
-		functionNameMap: NewFunctionNameMap(),
 	}
 }

@@ -381,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
 func (a *HarmonyToolCallAccumulator) Content() string {
 	return a.acc.String()
 }
-
-// FunctionNameMap maps a user-specified function name to a valid function
-// name for harmony (which look like TypeScript identifiers). This is needed to
-// transform user-specified function names, which might contain characters that
-// are not allowed in TypeScript identifiers
-type FunctionNameMap struct {
-	userToHarmony map[string]string
-	harmonyToUser map[string]string
-}
-
-func NewFunctionNameMap() *FunctionNameMap {
-	return &FunctionNameMap{
-		userToHarmony: make(map[string]string),
-		harmonyToUser: make(map[string]string),
-	}
-}
-
-func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
-	harmonyFunctionName := m.deriveName(userFunctionName)
-	m.userToHarmony[userFunctionName] = harmonyFunctionName
-	m.harmonyToUser[harmonyFunctionName] = userFunctionName
-	return harmonyFunctionName
-}
-
-// OriginalFromConverted looks up the reverse-mapping of a previously-converted
-// user->harmony function name. To unmap reliably, the mapping must exist, as
-// the conversion process is not reversible without the appropriate state
-func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
-	if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
-		return userFunctionName
-	}
-	slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
-	// fallback to the original function name if we can't find a mapping
-	return harmonyFunctionName
-}
-
-// convertToValidChars converts a user-specified function name to a valid
-// TypeScript identifier.
-//
-// Limitations:
-//
-//   - This doesn't restrict reserved TypeScript keywords.
-//   - We don't perform a real ID_Start/ID_Continue check, and instead use the more
-//     restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
-//     identifiers these models were trained on, so in the end we might want to
-//     convert unicode-heavy identifiers to their closest ASCII equivalents.
-func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
-	mapper := func(r rune) rune {
-		// first, replace certain characters with underscores
-		if r == ' ' || r == '-' || r == '.' {
-			return '_'
-		}
-
-		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
-			return r
-		}
-
-		// finally, remove any other characters
-		return -1
-	}
-	candidate := strings.Map(mapper, userFunctionName)
-
-	// set a default name if we end up with nothing left
-	if candidate == "" {
-		return "unnamed"
-	}
-
-	// if the candidate starts with a number, prepend an underscore to make it a
-	// valid identifier
-	if unicode.IsDigit(rune(candidate[0])) {
-		candidate = "_" + candidate
-	}
-
-	return candidate
-}
-
-func (m *FunctionNameMap) deriveName(userFunctionName string) string {
-	originalCandidate := m.convertToValidChars(userFunctionName)
-	candidate := originalCandidate
-
-	// Check for dupes, and if so, add a number to the end.
-	// We start at 2 because if we have dupes and the first is never renamed, it
-	// makes sense for them to be named, say, `f`, `f_2`, `f_3`
-	count := 2
-	for {
-		if _, exists := m.harmonyToUser[candidate]; !exists {
-			break
-		}
-		candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
-		count++
-	}
-
-	return candidate
-}
--- a/server/harmonyparser_test.go
+++ b/server/harmonyparser_test.go
@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
 		})
 	}
 }
-
-// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
-// handle any saving (and therefore no dupe handling)
-func TestFunctionConvertToValidChars(t *testing.T) {
-	tests := []struct {
-		name string
-		in   string
-		want string
-	}{
-		{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
-		{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
-		{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
-		{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
-		{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
-		{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
-		{name: "leading number", in: "123", want: "_123"},
-		{name: "$ allowed", in: "$", want: "$"},
-		// show that we allow weird unicode letter characters, though we might want
-		// to convert them to their closest ASCII equivalents in the future
-		{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
-		// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
-		{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
-	}
-
-	for i, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			parser := NewFunctionNameMap()
-			got := parser.convertToValidChars(tt.in)
-			if got != tt.want {
-				t.Errorf("case %d: got %q, want %q", i, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestFunctionConvertAndAdd(t *testing.T) {
-	// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
-	tests := []struct {
-		name string
-		in   []string
-		want []string
-	}{
-		{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
-		{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
-		{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
-		{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
-	}
-
-	for i, tt := range tests {
-		parser := NewFunctionNameMap()
-		t.Run(tt.name, func(t *testing.T) {
-			for j, in := range tt.in {
-				got := parser.ConvertAndAdd(in)
-				want := tt.want[j]
-				if got != want {
-					t.Errorf("case %d: got %q, want %q", i, got, want)
-				}
-				// check that the maps are correct
-				if parser.userToHarmony[in] != want {
-					t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
-				}
-				if parser.harmonyToUser[want] != in {
-					t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
-				}
-			}
-		})
-	}
-}
--- a/server/routes.go
+++ b/server/routes.go
@@ -314,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}

-	// If debug mode is enabled, return the rendered template instead of calling the model
-	if req.DebugRenderOnly {
-		c.JSON(http.StatusOK, api.DebugTemplateResponse{
-			Model:     req.Model,
-			CreatedAt: time.Now().UTC(),
-			DebugInfo: api.DebugInfo{
-				RenderedTemplate: prompt,
-				ImageCount:       len(images),
-			},
-		})
-		return
-	}
-
 	var thinkingState *thinking.Parser
 	if !useHarmony {
 		openingTag, closingTag := thinking.InferTags(m.Template.Template)
@@ -1490,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) {
 		mr := api.ProcessModelResponse{
 			Model:     model.ShortName,
 			Name:      model.ShortName,
-			Size:      int64(v.totalSize),
-			SizeVRAM:  int64(v.vramSize),
+			Size:      int64(v.estimatedTotal),
+			SizeVRAM:  int64(v.estimatedVRAM),
 			Digest:    model.Digest,
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
 		if v.Options != nil {
-			mr.ContextLength = v.Options.NumCtx
+			mr.ContextLength = v.Options.NumCtx / v.numParallel
 		}
 		// The scheduler waits to set expiresAt, so if a model is loading it's
 		// possible that it will be set to the unix epoch. For those cases, just
@@ -1603,12 +1590,24 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 	msgs = filterThinkTags(msgs, m)

-	var harmonyMessageHandler *HarmonyMessageHandler
-	var harmonyToolParser *HarmonyToolCallAccumulator
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
+	if err != nil {
+		slog.Error("chat prompt error", "error", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}

 	useHarmony := shouldUseHarmony(*m)

-	processedTools := req.Tools
+	// Validate Think value: string values currently only allowed for gptoss models
+	if req.Think != nil && req.Think.IsString() && !useHarmony {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
+		return
+	}
+
+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator
+
 	if useHarmony {
 		harmonyMessageHandler = NewHarmonyMessageHandler()
 		var lastMessage *api.Message
@@ -1617,40 +1616,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 		harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
 		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
-
-		// make a copy of tools to pass to the chat prompt. Function names may be
-		// renamed to be valid Harmony function names.
-		processedTools = make([]api.Tool, len(req.Tools))
-		copy(processedTools, req.Tools)
-		for i, tool := range processedTools {
-			processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
-		}
-	}
-
-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
-	if err != nil {
-		slog.Error("chat prompt error", "error", err)
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	// If debug mode is enabled, return the rendered template instead of calling the model
-	if req.DebugRenderOnly {
-		c.JSON(http.StatusOK, api.DebugTemplateResponse{
-			Model:     req.Model,
-			CreatedAt: time.Now().UTC(),
-			DebugInfo: api.DebugInfo{
-				RenderedTemplate: prompt,
-				ImageCount:       len(images),
-			},
-		})
-		return
-	}
-
-	// Validate Think value: string values currently only allowed for gptoss models
-	if req.Think != nil && req.Think.IsString() && !useHarmony {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
-		return
 	}

 	var thinkingState *thinking.Parser
@@ -1705,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					toolName, toolContent := harmonyToolParser.Drain()
 					if toolName != nil {
 						*toolName = strings.TrimPrefix(*toolName, "functions.")
-						*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
 						var args api.ToolCallFunctionArguments
 						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
 							errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
--- a/server/routes_debug_test.go
+++ b/server/routes_debug_test.go
@@ -1,413 +0,0 @@
-package server
-
-import (
-	"bytes"
-	"encoding/json"
-	"net/http"
-	"testing"
-	"time"
-
-	"github.com/gin-gonic/gin"
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/llm"
-)
-
-func TestGenerateDebugRenderOnly(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         llm.DoneReasonStop,
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      discover.GetGPUInfo,
-			getCpuFn:      discover.GetCPUInfo,
-			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
-				// add small delay to simulate loading
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	// Create a test model
-	stream := false
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:    "test-model",
-		Files:    map[string]string{"file.gguf": digest},
-		Template: "{{ .Prompt }}",
-		Stream:   &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	tests := []struct {
-		name            string
-		request         api.GenerateRequest
-		expectDebug     bool
-		expectTemplate  string
-		expectNumImages int
-	}{
-		{
-			name: "debug render only enabled",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Hello, world!",
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "Hello, world!",
-		},
-		{
-			name: "debug render only disabled",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Hello, world!",
-				DebugRenderOnly: false,
-			},
-			expectDebug: false,
-		},
-		{
-			name: "debug render only with system prompt",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "User question",
-				System:          "You are a helpful assistant",
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "User question",
-		},
-		{
-			name: "debug render only with template",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Hello",
-				Template:        "PROMPT: {{ .Prompt }}",
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "PROMPT: Hello",
-		},
-		{
-			name: "debug render only with images",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Describe this image",
-				Images:          []api.ImageData{[]byte("fake-image-data")},
-				DebugRenderOnly: true,
-			},
-			expectDebug:     true,
-			expectTemplate:  "[img-0]\n\nDescribe this image",
-			expectNumImages: 1,
-		},
-		{
-			name: "debug render only with raw mode",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Raw prompt text",
-				Raw:             true,
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "Raw prompt text",
-		},
-	}
-
-	for _, tt := range tests {
-		// Test both with and without streaming
-		streamValues := []bool{false, true}
-		for _, stream := range streamValues {
-			streamSuffix := ""
-			if stream {
-				streamSuffix = " (streaming)"
-			}
-			t.Run(tt.name+streamSuffix, func(t *testing.T) {
-				req := tt.request
-				req.Stream = &stream
-				w := createRequest(t, s.GenerateHandler, req)
-
-				if tt.expectDebug {
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
-					}
-
-					var response api.DebugTemplateResponse
-					if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
-						t.Fatalf("failed to unmarshal response: %v", err)
-					}
-
-					if response.Model != tt.request.Model {
-						t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
-					}
-
-					if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
-						t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
-					}
-
-					if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
-						t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
-					}
-				} else {
-					// When debug is disabled, it should attempt normal processing
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
-					}
-				}
-			})
-		}
-	}
-}
-
-func TestChatDebugRenderOnly(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         llm.DoneReasonStop,
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      discover.GetGPUInfo,
-			getCpuFn:      discover.GetCPUInfo,
-			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
-				// add small delay to simulate loading
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	// Create a test model
-	stream := false
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:    "test-model",
-		Files:    map[string]string{"file.gguf": digest},
-		Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
-		Stream:   &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	tests := []struct {
-		name            string
-		request         api.ChatRequest
-		expectDebug     bool
-		expectTemplate  string
-		expectNumImages int
-	}{
-		{
-			name: "chat debug render only enabled",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "system", Content: "You are a helpful assistant"},
-					{Role: "user", Content: "Hello"},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
-		},
-		{
-			name: "chat debug render only disabled",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-				},
-				DebugRenderOnly: false,
-			},
-			expectDebug: false,
-		},
-		{
-			name: "chat debug with assistant message",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-					{Role: "assistant", Content: "Hi there!"},
-					{Role: "user", Content: "How are you?"},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
-		},
-		{
-			name: "chat debug with images",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "What's in this image?",
-						Images:  []api.ImageData{[]byte("fake-image-data")},
-					},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:     true,
-			expectTemplate:  "user: [img-0]What's in this image?\n",
-			expectNumImages: 1,
-		},
-		{
-			name: "chat debug with tools",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Get the weather"},
-				},
-				Tools: api.Tools{
-					{
-						Type: "function",
-						Function: api.ToolFunction{
-							Name:        "get_weather",
-							Description: "Get weather information",
-						},
-					},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
-		},
-	}
-
-	for _, tt := range tests {
-		// Test both with and without streaming
-		streamValues := []bool{false, true}
-		for _, stream := range streamValues {
-			streamSuffix := ""
-			if stream {
-				streamSuffix = " (streaming)"
-			}
-			t.Run(tt.name+streamSuffix, func(t *testing.T) {
-				req := tt.request
-				req.Stream = &stream
-				w := createRequest(t, s.ChatHandler, req)
-
-				if tt.expectDebug {
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
-					}
-
-					var response api.DebugTemplateResponse
-					if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
-						t.Fatalf("failed to unmarshal response: %v", err)
-					}
-
-					if response.Model != tt.request.Model {
-						t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
-					}
-
-					if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
-						t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
-					}
-
-					if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
-						t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
-					}
-				} else {
-					// When debug is disabled, it should attempt normal processing
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
-					}
-				}
-			})
-		}
-	}
-}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -77,13 +77,12 @@ func TestGenerateChat(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
-				return false
 			},
 		},
 	}
@@ -621,13 +620,12 @@ func TestGenerate(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
-				return false
 			},
 		},
 	}
--- a/server/routes_harmony_streaming_test.go
+++ b/server/routes_harmony_streaming_test.go
@@ -277,11 +277,10 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
 					getGpuFn:      discover.GetGPUInfo,
 					getCpuFn:      discover.GetCPUInfo,
 					reschedDelay:  100 * time.Millisecond,
-					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 						req.successCh <- &runnerRef{
 							llama: &mock,
 						}
-						return false
 					},
 				},
 			}
@@ -428,11 +427,10 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  100 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
-				return false
 			},
 		},
 	}
@@ -610,11 +608,10 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
 					getGpuFn:      discover.GetGPUInfo,
 					getCpuFn:      discover.GetCPUInfo,
 					reschedDelay:  250 * time.Millisecond,
-					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 						req.successCh <- &runnerRef{
 							llama: &mock,
 						}
-						return false
 					},
 				},
 			}
--- a/server/sched.go
+++ b/server/sched.go
@@ -28,6 +28,7 @@ type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
 	opts            api.Options
+	origNumCtx      int // Track the initial ctx request
 	sessionDuration *api.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
@@ -40,17 +41,10 @@ type Scheduler struct {
 	expiredCh     chan *runnerRef
 	unloadedCh    chan any

-	// loadedMu protects loaded and activeLoading
+	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	// activeLoading is the model that we are currently working on loading,
-	// including by evicting one or more other models. We can only load
-	// one model at a time but new requests to models that already loaded can
-	// happen in parallel
-	activeLoading llm.LlamaServer
-	loaded        map[string]*runnerRef
-
-	loadFn       func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
+	loadFn       func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
 	newServerFn  func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 	getGpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
@@ -62,6 +56,9 @@ type Scheduler struct {
 // on a large GPU can cause stalling
 var defaultModelsPerGPU = 3

+// Default automatic value for parallel setting
+var defaultParallel = 1
+
 var ErrMaxQueue = errors.New("server busy, please try again.  maximum pending requests exceeded")

 func InitScheduler(ctx context.Context) *Scheduler {
@@ -82,36 +79,24 @@ func InitScheduler(ctx context.Context) *Scheduler {
 }

 // context must be canceled to decrement ref count and release the runner
-func (s *Scheduler) GetRunner(c context.Context, m *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
+func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
 	if opts.NumCtx < 4 {
 		opts.NumCtx = 4
 	}

-	if m.CheckCapabilities(model.CapabilityVision) == nil {
-		// multimodal models require at least 2048 context
-		opts.NumCtx = max(opts.NumCtx, 2048)
-	}
-
 	req := &LlmRequest{
 		ctx:             c,
-		model:           m,
+		model:           model,
 		opts:            opts,
 		sessionDuration: sessionDuration,
-		successCh:       make(chan *runnerRef, 1),
+		successCh:       make(chan *runnerRef),
 		errCh:           make(chan error, 1),
 	}

-	s.loadedMu.Lock()
-	runner := s.loaded[req.model.ModelPath]
-	s.loadedMu.Unlock()
-	if runner != nil && !runner.needsReload(c, req) {
-		req.useLoadedRunner(runner, s.finishedReqCh)
-	} else {
-		select {
-		case s.pendingReqCh <- req:
-		default:
-			req.errCh <- ErrMaxQueue
-		}
+	select {
+	case s.pendingReqCh <- req:
+	default:
+		req.errCh <- ErrMaxQueue
 	}
 	return req.successCh, req.errCh
 }
@@ -137,11 +122,21 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
 			pending.schedAttempts++
+			if pending.origNumCtx == 0 {
+				pending.origNumCtx = pending.opts.NumCtx
+			}

 			if pending.ctx.Err() != nil {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
 				continue
 			}
+			numParallel := int(envconfig.NumParallel())
+			// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+			// ref: https://github.com/ollama/ollama/issues/4165
+			if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
+				numParallel = 1
+				slog.Warn("mllama does not currently support parallel requests")
+			}

 			for {
 				var runnerToExpire *runnerRef
@@ -200,26 +195,84 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

-					// Update free memory from currently loaded models
-					s.updateFreeSpace(gpus)
+					// Embedding models should always be loaded with parallel=1
+					if pending.model.CheckCapabilities(model.CapabilityCompletion) != nil {
+						numParallel = 1
+					}

-					if loadedCount == 0 {
+					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
+					if len(gpus) == 1 && gpus[0].Library == "cpu" {
+						// simplifying assumption of defaultParallel when in CPU mode
+						if numParallel <= 0 {
+							numParallel = defaultParallel
+						}
+
+						pending.opts.NumCtx = pending.origNumCtx * numParallel
+
+						if loadedCount == 0 {
+							slog.Debug("cpu mode with first model, loading")
+							s.loadFn(pending, ggml, gpus, numParallel)
+							break
+						}
+						runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+						if runnerToExpire == nil {
+							slog.Debug("cpu mode with available system memory or first model, loading")
+							s.loadFn(pending, ggml, gpus, numParallel)
+							break
+						}
+						// else we need to expire a runner
+					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						s.loadFn(pending, ggml, gpus, false)
+						g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
+						if g != nil {
+							gpus = g
+						} else {
+							// Only allow partial loads when this is the first model
+							gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
+						}
+						s.loadFn(pending, ggml, gpus, numParallel)
 						break
 					}

-					// More than one loaded model, so we have to see if the
-					// new one fits
+					if runnerToExpire == nil {
+						// More than one loaded model, so we have to see if the
+						// new one fits
+						//
+						// We want to avoid loading on any GPUs that have other
+						// models still loading on them to avoid potential races
+						// with VRAM consumption ramping up during load
+						availGpus := s.filterGPUsWithoutLoadingModels(gpus)

-					needEvict := s.loadFn(pending, ggml, gpus, true)
-					if !needEvict {
-						slog.Debug("new model fits with existing models, loading")
-						break
+						// Update free memory from currently loaded models
+						s.updateFreeSpace(availGpus)
+						fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
+						if fitGpus != nil {
+							slog.Debug("new model fits with existing models, loading")
+							s.loadFn(pending, ggml, fitGpus, numParallel)
+							break
+						}
+
+						// We couldn't find a set of GPUs to fully load the new
+						// model. If no other models are loading (both GPU lists
+						// are the same) then we need to unload another model to
+						// make room
+						if len(availGpus) < len(gpus) {
+							// There are other requests pending, and this one
+							// needs more time, so put it on the back of the
+							// queue so that we might satisfy other pending
+							// requests that aren't blocked
+							go func() {
+								// Process in a go routine to avoid deadlocking
+								// the scheduler if our queue is full
+								slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
+								time.Sleep(s.reschedDelay)
+								s.pendingReqCh <- pending
+							}()
+							break
+						}
+						runnerToExpire = s.findRunnerToUnload()
 					}
-
-					runnerToExpire = s.findRunnerToUnload()
 				}

 				if runnerToExpire == nil {
@@ -240,6 +293,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				}
 				runnerToExpire.refMu.Unlock()
 				// Wait for the unload to happen
+				// Note: at this point we're queueing up all incoming requests, even if they were for
+				// a different model that's loaded and not scheduled to be removed.
 				slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire)
 				select {
 				case <-ctx.Done():
@@ -379,72 +434,26 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
-// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
-func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
-	numParallel := int(envconfig.NumParallel())
+func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
-
-	// Embedding models should always be loaded with parallel=1
-	if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
-		numParallel = 1
-	}
-
-	// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
-	// ref: https://github.com/ollama/ollama/issues/4165
-	if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
-		numParallel = 1
-		slog.Warn("mllama does not currently support parallel requests")
-	}
-
 	sessionDuration := envconfig.KeepAlive()
 	if req.sessionDuration != nil {
 		sessionDuration = req.sessionDuration.Duration
 	}
-
-	s.loadedMu.Lock()
-	llama := s.activeLoading
-
-	if llama == nil {
-		var err error
-		llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
-		if err != nil {
-			// some older models are not compatible with newer versions of llama.cpp
-			// show a generalized compatibility error until there is a better way to
-			// check for model compatibility
-			if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
-				err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
-			}
-			slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
-			req.errCh <- err
-			s.loadedMu.Unlock()
-			return false
-		}
-
-		s.activeLoading = llama
-	} else {
-		if s.activeLoading.ModelPath() != req.model.ModelPath {
-			panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
-		}
-	}
-
-	s.loadedMu.Unlock()
-
-	err := llama.Load(req.ctx, gpus, requireFull)
+	llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
-		if errors.Is(err, llm.ErrLoadRequiredFull) {
-			return true
+		// some older models are not compatible with newer versions of llama.cpp
+		// show a generalized compatibility error until there is a better way to
+		// check for model compatibility
+		if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
-
-		slog.Info("Load failed", "model", req.model.ModelPath, "error", err)
-		s.activeLoading.Close()
-		s.activeLoading = nil
+		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
 		req.errCh <- err
-		return false
+		return
 	}
-
 	runner := &runnerRef{
 		model:           req.model,
 		modelPath:       req.model.ModelPath,
@@ -452,8 +461,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		Options:         &req.opts,
 		sessionDuration: sessionDuration,
 		gpus:            gpus,
-		vramSize:        llama.VRAMSize(),
-		totalSize:       llama.TotalSize(),
+		estimatedVRAM:   llama.EstimatedVRAM(),
+		estimatedTotal:  llama.EstimatedTotal(),
 		loading:         true,
 		pid:             llama.Pid(),
 	}
@@ -468,7 +477,6 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		oldRunner.unload()
 		oldRunner.refMu.Unlock()
 	}
-	s.activeLoading = nil
 	s.loaded[req.model.ModelPath] = runner
 	slog.Info("loaded runners", "count", len(s.loaded))
 	s.loadedMu.Unlock()
@@ -495,8 +503,6 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		}()
 		req.successCh <- runner
 	}()
-
-	return false
 }

 func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
@@ -515,7 +521,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 		r.refMu.Lock()
 		if r.llama != nil {
 			for _, gpu := range allGpus {
-				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.VRAMByGPU(gpu.ID)
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -542,17 +548,41 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	}
 }

+// filterGPUsWithoutLoadingModels returns a copy of allGpus minus every GPU that a runner still in
+// its loading state was provisioned on: VRAM numbers are indeterminate while a model loads.
+// GPUs are matched on Library and ID, the same key updateFreeSpace uses for its predictions.
+// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
+	s.loadedMu.Lock()
+	defer s.loadedMu.Unlock()
+	for _, runner := range s.loaded {
+		if runner.loading {
+			slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
+			for _, busyGPU := range runner.gpus {
+				for i := range ret {
+					if ret[i].Library == busyGPU.Library && ret[i].ID == busyGPU.ID {
+						ret = append(ret[:i], ret[i+1:]...)
+						break
+					}
+				}
+			}
+		}
+	}
+	return ret
+}
+
 // TODO consolidate sched_types.go
 type runnerRef struct {
 	refMu    sync.Mutex
 	refCount uint // prevent unloading if > 0

-	llama     llm.LlamaServer
-	pid       int
-	loading   bool                 // True only during initial load, then false forever
-	gpus      discover.GpuInfoList // Recorded at time of provisioning
-	vramSize  uint64
-	totalSize uint64
+	llama          llm.LlamaServer
+	pid            int
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
+	estimatedVRAM  uint64
+	estimatedTotal uint64

 	sessionDuration time.Duration
 	expireTimer     *time.Timer
@@ -601,6 +631,9 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 		optsNew.NumGPU = -1
 	}

+	// Normalize the NumCtx for parallelism
+	optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
+
 	ctx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
 	if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
@@ -661,7 +694,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
 				freeMemoryNow += gpu.FreeMemory
 			}
 			// If we're within ~80% of the estimated memory usage recovered, bail out
-			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.8 {
+			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
 				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner)
 				finished <- struct{}{}
 				return
@@ -686,8 +719,8 @@ func (runner *runnerRef) LogValue() slog.Value {
 		)
 	}
 	attrs = append(attrs,
-		slog.String("size", format.HumanBytes2(runner.totalSize)),
-		slog.String("vram", format.HumanBytes2(runner.vramSize)),
+		slog.String("size", format.HumanBytes2(runner.estimatedTotal)),
+		slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)),
 		slog.Int("parallel", runner.numParallel),
 		slog.Int("pid", runner.pid),
 		slog.String("model", runner.modelPath),
@@ -717,7 +750,95 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // type BySize []*runnerRef
 // func (a BySize) Len() int           { return len(a) }
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
-// func (a BySize) Less(i, j int) bool { return a[i].vramSize < a[j].vramSize }
+// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
+
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// The list of GPUs returned will always be the same brand (library)
+// If the model can not be fit fully within the available GPU(s) nil is returned
+// If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
+// opts.NumCtx accordingly. On success *numParallel is set to the parallel value that fit
+func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+	// An explicit parallel setting is tried as-is
+	numParallelToTry := []int{*numParallel}
+	if *numParallel <= 0 {
+		// If no specific parallel setting was provided, try larger then smaller, always end with 1.
+		// Compact drops the repeated attempt when defaultParallel is already 1
+		numParallelToTry = slices.Compact([]int{defaultParallel, 1})
+	}
+
+	for _, gl := range gpus.ByLibrary() {
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
+
+		// TODO - potentially sort by performance capability, existing models loaded, etc.
+		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
+		// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
+
+		if !envconfig.SchedSpread() {
+			for _, p := range numParallelToTry {
+				req.opts.NumCtx = req.origNumCtx * p
+				// Try to pack into as few GPUs as possible, starting from 1 GPU
+				for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+					gpuSubset := sgl[:numGPUs]
+					ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+					if ok {
+						slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+							"model", req.model.ModelPath,
+							"library", sgl[0].Library,
+							"parallel", p,
+							"required", format.HumanBytes2(estimatedVRAM),
+							"gpus", numGPUs)
+						*numParallel = p
+						return gpuSubset
+					}
+				}
+			}
+		} else {
+			// TODO future refinements
+			// - if multiple Libraries, see if any single GPU in any Library will fit
+			// - try subsets of GPUs instead of just falling back to 1 or all in a family
+
+			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+			for _, p := range numParallelToTry {
+				req.opts.NumCtx = req.origNumCtx * p
+				if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+					slog.Info("new model will fit in available VRAM, loading",
+						"model", req.model.ModelPath,
+						"library", sgl[0].Library,
+						"parallel", p,
+						"required", format.HumanBytes2(estimatedVRAM),
+						"gpus", len(sgl))
+					*numParallel = p
+					return sgl
+				}
+			}
+		}
+	}
+	return nil
+}
+
+// pickBestPartialFitByLibrary returns gpus unchanged when they belong to at most one library;
+// otherwise it returns the per-library group with the largest PredictServerFit VRAM estimate
+func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+	if *numParallel <= 0 {
+		// No explicit parallel setting: use a single slot with the originally requested context
+		req.opts.NumCtx = req.origNumCtx
+		*numParallel = 1
+	}
+	groups := gpus.ByLibrary()
+	if len(groups) < 2 {
+		return gpus
+	}
+	winner, largest := 0, uint64(0)
+	for idx := range groups {
+		_, vram := llm.PredictServerFit(groups[idx], f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
+		if vram > largest {
+			winner, largest = idx, vram
+		}
+	}
+	return groups[winner]
+}

 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
@@ -754,13 +875,6 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
 func (s *Scheduler) unloadAllRunners() {
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
-
-	if s.activeLoading != nil {
-		slog.Debug("shutting down currently loading runner")
-		s.activeLoading.Close()
-		s.activeLoading = nil
-	}
-
 	for model, runner := range s.loaded {
 		if runner.llama != nil {
 			slog.Debug("shutting down runner", "model", model)
@@ -787,3 +901,18 @@ func (s *Scheduler) expireRunner(model *Model) {
 		runner.refMu.Unlock()
 	}
 }
+
+// maybeFindCPURunnerToUnload checks whether the pending request fits in the free memory reported
+// by gpus[0]. It returns nil when the request can be loaded, otherwise a runner to unload
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
+	slog.Debug("evaluating if CPU model load will fit in available system memory")
+	parallel := req.opts.NumCtx / req.origNumCtx // the scheduler scaled NumCtx by the parallel count
+	needed := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, parallel).TotalSize
+	if available := gpus[0].FreeMemory; needed <= available {
+		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(needed), "available", format.HumanBytes2(available))
+		return nil
+	}
+
+	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
+	return s.findRunnerToUnload()
+}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -52,7 +52,7 @@ func TestLoad(t *testing.T) {
 		return nil, errors.New("something failed to load model blah")
 	}
 	gpus := discover.GpuInfoList{}
-	s.load(req, f, gpus, false)
+	s.load(req, f, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
@@ -61,17 +61,16 @@ func TestLoad(t *testing.T) {
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")

-	server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
+	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
 	case resp := <-req.successCh:
-		require.Equal(t, uint64(10), resp.vramSize)
+		require.Equal(t, uint64(10), resp.estimatedVRAM)
 		require.Equal(t, uint(1), resp.refCount)
 		s.loadedMu.Lock()
 		require.Len(t, s.loaded, 1)
@@ -80,7 +79,7 @@ func TestLoad(t *testing.T) {

 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, f, gpus, false)
+	s.load(req, f, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -105,11 +104,10 @@ type reqBundle struct {
 }

 func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-	scenario.srv.modelPath = model
 	return scenario.srv, nil
 }

-func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
+func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
 	b := &reqBundle{}
 	b.ctx, b.ctxDone = context.WithCancel(ctx)
 	t.Helper()
@@ -146,7 +144,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
+	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }

@@ -264,10 +262,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {

 	// Multiple loaded models
 	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
-	b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
-	c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
-	c.req.opts.NumGPU = 0                                                       // CPU load, will be allowed
-	d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
+	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
+	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
+	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
+	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded

 	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 	s.newServerFn = a.newServer
@@ -420,12 +418,11 @@ func TestExpireRunner(t *testing.T) {

 	var f *ggml.GGML
 	gpus := discover.GpuInfoList{}
-	server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
+	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, gpus, 0)

 	select {
 	case err := <-req.errCh:
@@ -509,7 +506,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2},
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -544,8 +541,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
-	llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
 	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
 	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

@@ -560,6 +557,40 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }

+func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
+	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
+	defer done()
+	// Two GPUs from the same library, told apart only by ID
+	gpus := discover.GpuInfoList{
+		{Library: "cuda", ID: "0"},
+		{Library: "cuda", ID: "1"},
+	}
+	// A single runner that stays in the loading state; only its GPU assignment
+	// changes between the checks below
+	busy := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
+
+	s := InitScheduler(ctx)
+	s.loadedMu.Lock()
+	s.loaded["a"] = busy
+	s.loadedMu.Unlock()
+
+	// Loading on GPU 0 leaves only GPU 1 available
+	remaining := s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, remaining, 1)
+	require.Equal(t, "1", remaining[0].ID)
+
+	// Loading on GPU 1 leaves only GPU 0 available
+	busy.gpus = discover.GpuInfoList{gpus[1]}
+	remaining = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, remaining, 1)
+	require.Equal(t, "0", remaining[0].ID)
+
+	// A loading runner with no GPUs assigned filters nothing out
+	busy.gpus = discover.GpuInfoList{}
+	remaining = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, remaining, 2)
+}
+
 func TestFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
@@ -584,7 +615,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()

-	llm := &mockLlm{vramByGPU: map[string]uint64{}}
+	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model: &Model{
@@ -631,8 +662,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()

-	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
-	llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()

@@ -650,7 +681,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }

 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, numParallel: 1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
 	r1.unload()
@@ -676,40 +707,62 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }

-type mockLlm struct {
-	modelPath         string
-	pingResp          error
-	waitResp          error
-	completionResp    error
-	embeddingResp     []float32
-	embeddingRespErr  error
-	tokenizeResp      []int
-	tokenizeRespErr   error
-	detokenizeResp    string
-	detonekizeRespErr error
-	closeResp         error
-	closeCalled       bool
-	vramSize          uint64
-	totalSize         uint64
-	vramByGPU         map[string]uint64
-}
+func TestHomogeneousGPUs(t *testing.T) {
+	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)

-func (s *mockLlm) ModelPath() string {
-	return s.modelPath
-}
-
-func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
-	if requireFull {
-		for _, g := range gpus {
-			if g.FreeMemory >= s.vramSize {
-				return nil
-			}
+	s.getGpuFn = func() discover.GpuInfoList {
+		// Set memory values to require the model to be spread
+		gpus := []discover.GpuInfo{
+			{Library: "cuda"},
+			{Library: "rocm"},
 		}
-
-		return llm.ErrLoadRequiredFull
+		gpus[0].TotalMemory = 1 * format.GibiByte
+		gpus[0].FreeMemory = 256 * format.MebiByte
+		gpus[1].TotalMemory = 1 * format.GibiByte
+		gpus[1].FreeMemory = 256 * format.MebiByte
+		return gpus
+	}
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		require.Len(t, gpus, 1)
+		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
+	}
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	require.Len(t, s.pendingReqCh, 1)
+	s.Run(ctx)
+	select {
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
+		require.Empty(t, s.pendingReqCh)
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
+		t.Fatal(err.Error())
+	case <-ctx.Done():
+		t.Fatal("timeout")
 	}
-	return nil
 }
+
+type mockLlm struct {
+	pingResp           error // returned by Ping
+	waitResp           error // returned by WaitUntilRunning
+	completionResp     error
+	embeddingResp      []float32
+	embeddingRespErr   error
+	tokenizeResp       []int
+	tokenizeRespErr    error
+	detokenizeResp     string
+	detonekizeRespErr  error
+	closeResp          error             // returned by Close
+	closeCalled        bool              // set to true by Close
+	estimatedVRAM      uint64            // returned by EstimatedVRAM
+	estimatedTotal     uint64            // returned by EstimatedTotal
+	estimatedVRAMByGPU map[string]uint64 // looked up by GPU ID in EstimatedVRAMByGPU
+}
+
 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
 func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
@@ -732,7 +785,7 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
-func (s *mockLlm) VRAMSize() uint64              { return s.vramSize }
-func (s *mockLlm) TotalSize() uint64             { return s.totalSize }
-func (s *mockLlm) VRAMByGPU(gpuid string) uint64 { return s.vramByGPU[gpuid] }
-func (s *mockLlm) Pid() int                      { return -1 }
+func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
+func (s *mockLlm) Pid() int                               { return -1 }