From 3515cc377ce2506c95a0ea408fd5d15d306fc6aa Mon Sep 17 00:00:00 2001 From: Yoshi <70424721+yoshihyoda@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:19:13 -0700 Subject: [PATCH 1/6] docs: fix typos and remove trailing whitespaces (#11554) --- docs/api.md | 4 ++-- docs/development.md | 2 +- docs/openai.md | 2 +- docs/troubleshooting.md | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/api.md b/docs/api.md index 41858885b..683db3573 100644 --- a/docs/api.md +++ b/docs/api.md @@ -500,11 +500,11 @@ The `message` object has the following fields: - `thinking`: (for thinking models) the model's thinking process - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) - `tool_calls` (optional): a list of tools in JSON that the model wants to use -- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result +- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result Advanced parameters (optional): -- `format`: the format to return a response in. Format can be `json` or a JSON schema. +- `format`: the format to return a response in. Format can be `json` or a JSON schema. - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) diff --git a/docs/development.md b/docs/development.md index 24bcba194..9726b5d91 100644 --- a/docs/development.md +++ b/docs/development.md @@ -118,7 +118,7 @@ To run tests, use `go test`: go test ./... ``` -> NOTE: In rare cirumstances, you may need to change a package using the new +> NOTE: In rare circumstances, you may need to change a package using the new > "synctest" package in go1.24. > > If you do not have the "synctest" package enabled, you will not see build or diff --git a/docs/openai.md b/docs/openai.md index d0bac4cd3..26930124c 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -72,7 +72,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama") # Define the schema for the response class FriendInfo(BaseModel): name: str - age: int + age: int is_available: bool class FriendList(BaseModel): diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 995b33aca..6fdd3e85b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log On **Linux** systems with systemd, the logs can be found with this command: ```shell -journalctl -u ollama --no-pager --follow --pager-end +journalctl -u ollama --no-pager --follow --pager-end ``` When you run Ollama in a **container**, the logs go to stdout/stderr in the container: @@ -23,7 +23,7 @@ docker logs If manually running `ollama serve` in a terminal, the logs will be on that terminal. When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in: -- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log` +- `explorer %LOCALAPPDATA%\Ollama` to view logs. 
The most recent server logs will be in `server.log` and older logs will be in `server-#.log` - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored @@ -38,7 +38,7 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs. ## LLM libraries -Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. +Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. In the server log, you will see a message that looks something like this (varies from release to release): @@ -97,7 +97,7 @@ If none of those resolve the problem, gather additional information and file an On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log. -When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` +When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure. - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems From c116a7523ddc067db2b86aab38172c05ad01c710 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 28 Jul 2025 11:29:25 -0700 Subject: [PATCH 2/6] kvcache: Don't shift empty batches When we context shift, we delete half the context and apply RoPE with an offset to the other half. We used to RoPE across the entire context in a single pass with a zero offset for the deleted section. With the change to shifting in batches, we can skip any batches where all of the offsets would be zero. 
This typically reduces the number of operations by half. --- kvcache/causal.go | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index 8b101a817..496eeaa64 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -646,18 +646,31 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { seqRange := c.cellRanges[seq] for start := seqRange.min; start <= seqRange.max; start += c.maxBatch { - ctx := c.backend.NewContext() - size := min(seqRange.max-start+1, c.maxBatch) offsets := make([]int32, size) + + var batchFirst, batchLast int + + batchFirst = -1 for i := range offsets { cell := c.cells[start+i] if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { offsets[i] = offset + if batchFirst < 0 { + batchFirst = i + } + batchLast = i } } + if batchFirst < 0 { + continue + } + + offsets = offsets[batchFirst : batchLast+1] + + ctx := c.backend.NewContext() kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) for i, key := range c.keys { @@ -669,10 +682,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { numKVHeads := key.Dim(1) rowSize := key.Stride(2) - key = key.View(ctx, rowSize*start, + key = key.View(ctx, rowSize*(start+batchFirst), kHeadDim, key.Stride(1), numKVHeads, key.Stride(2), - size, + len(offsets), ) roped, err := c.shiftFn(ctx, i, key, kShift) From ea85e27bbd76a342ad390576fc2e717a72ce96de Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Tue, 29 Jul 2025 21:37:06 +0200 Subject: [PATCH 3/6] Increase performance for Gemma3n models on NVGPUs by enabling CUDA Graph execution (#11525) * Enable CUDA Graphs for gemma3n. Similar to https://github.com/ggml-org/llama.cpp/pull/14741, though ollama has a slightly different model graph than llama.cpp which requires different workaround checks. 
* Remove residual check by reshaping differently in gemma3n model This should make the heuristics more robust --- .../0019-metal-add-mean-kernel-14267.patch | 2 +- .../0020-CUDA-add-mean-operation-14313.patch | 2 +- .../0021-Enable-CUDA-Graphs-for-gemma3n.patch | 50 +++++++++++++++++++ .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++-- model/models/gemma3n/model_text.go | 7 ++- 5 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch index a52f0fdfe..e65aeb7b4 100644 --- a/llama/patches/0019-metal-add-mean-kernel-14267.patch +++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch @@ -16,7 +16,7 @@ ggml-ci 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index ee4f2dcb..f20f5615 100644 +index a9eeebc6..110c9ece 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type { diff --git a/llama/patches/0020-CUDA-add-mean-operation-14313.patch b/llama/patches/0020-CUDA-add-mean-operation-14313.patch index efcb1e8bc..2f4e37949 100644 --- a/llama/patches/0020-CUDA-add-mean-operation-14313.patch +++ b/llama/patches/0020-CUDA-add-mean-operation-14313.patch @@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644 static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 4c829153..9e64e5ae 100644 +index d6960174..2b9fabf4 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -35,6 +35,7 @@ diff --git a/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch new file mode 100644 index 000000000..b9dd6cdc6 --- /dev/null +++ b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch @@ -0,0 +1,50 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Oliver Simons +Date: Tue, 22 Jul 2025 11:02:28 +0200 +Subject: [PATCH] Enable CUDA Graphs for gemma3n. + +Similar to +https://github.com/ggml-org/llama.cpp/pull/14741, +though ollama has a slightly different model graph +than llama.cpp which requires different workaround +checks. +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 2b9fabf4..28ccf4be 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + ++ const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)"; ++ const std::string gemma3n_node_name = "node_"; ++ + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + +@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { +- // disable CUDA graphs for batch size > 1 for now. +- // Changes in batch size or context size can cause changes to the grid size of some kernels. 
++ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n ++ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here ++ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256 ++ && node->ne[2] == 1 ++ && node->ne[3] == 1 ++ && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false ++ && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) { ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + #endif + } + diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 2b9fabf4f..28ccf4bef 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)"; + const std::string gemma3n_node_name = "node_"; + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. + // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n + // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here + if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256 + && node->ne[2] == 1 + && node->ne[3] == 1 + && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false + && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) { + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go index 715b8a0ea..b75a2abb3 100644 --- a/model/models/gemma3n/model_text.go +++ b/model/models/gemma3n/model_text.go @@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions coefficients := a.PredictionCoefficient.Forward(ctx, modalities) coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2)) - hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) - predictions := coefficients.Mulmat(ctx, hiddenStates) - predictions = predictions.Add(ctx, hiddenStates) - return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)) + predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + return predictions.Add(ctx, hiddenStates) } func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor { From 8afa6e83f2cace42cc1421737f9f9b235e8e33b7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 29 Jul 2025 16:41:25 -0700 Subject: [PATCH 4/6] CI: switch back to x86 macos builder (#11572) --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 40871e644..4acb283b0 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -23,7 +23,7 @@ jobs: echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT darwin-build: - runs-on: macos-13-xlarge + runs-on: macos-13 environment: release needs: setup-environment strategy: From 25911a6e6bd5a0cf209d871c721aa7bc74f59509 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 30 Jul 2025 08:50:54 -0700 Subject: [PATCH 5/6] mac: disable bf16 on unsupported OS versions (#11585) Support for bf16 was added in MacOS v14+ and attempting to enable on older versions causes runtime failures. 
--- .../0019-metal-add-mean-kernel-14267.patch | 4 +-- .../0022-BF16-macos-version-guard.patch | 27 +++++++++++++++++++ .../ggml/ggml/src/ggml-metal/ggml-metal.m | 6 ++++- 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 llama/patches/0022-BF16-macos-version-guard.patch diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch index e65aeb7b4..f20e854b2 100644 --- a/llama/patches/0019-metal-add-mean-kernel-14267.patch +++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch @@ -19,7 +19,7 @@ diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a9eeebc6..110c9ece 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type { +@@ -489,6 +489,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, GGML_METAL_KERNEL_TYPE_SUM_ROWS, @@ -27,7 +27,7 @@ index a9eeebc6..110c9ece 100644 GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, GGML_METAL_KERNEL_TYPE_ARGMAX, -@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de +@@ -1436,6 +1437,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); diff --git a/llama/patches/0022-BF16-macos-version-guard.patch b/llama/patches/0022-BF16-macos-version-guard.patch new file mode 100644 index 000000000..68aac0bb0 --- /dev/null +++ b/llama/patches/0022-BF16-macos-version-guard.patch @@ -0,0 +1,27 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Wed, 30 Jul 2025 08:43:46 -0700 +Subject: [PATCH] BF16 macos version guard + +Only enable BF16 on supported MacOS versions (v14+) +--- + ggml/src/ggml-metal/ggml-metal.m | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index 110c9ece..ab46f6e3 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -89,7 +89,11 @@ + ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; + + #if defined(GGML_METAL_USE_BF16) +- ctx->use_bfloat = ctx->has_bfloat; ++ if (@available(macOS 14.0, *)) { ++ ctx->use_bfloat = ctx->has_bfloat; ++ } else { ++ ctx->use_bfloat = false; ++ } + #else + ctx->use_bfloat = false; + #endif diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 110c9ece9..ab46f6e3a 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -89,7 +89,11 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; #if defined(GGML_METAL_USE_BF16) - ctx->use_bfloat = ctx->has_bfloat; + if (@available(macOS 14.0, *)) { + ctx->use_bfloat = ctx->has_bfloat; + } else { + ctx->use_bfloat = false; + } #else ctx->use_bfloat = false; #endif From 6dcc5dfb9c0a033e4e8dde627d55580600418fb6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 30 Jul 2025 08:56:01 -0700 Subject: [PATCH 6/6] Revert "CI: switch back to x86 macos builder" (#11588) This reverts commit 9d071e6089319b37acf62bb739e3430dcb2ac0c3. 
--- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4acb283b0..40871e644 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -23,7 +23,7 @@ jobs: echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT darwin-build: - runs-on: macos-13 + runs-on: macos-13-xlarge environment: release needs: setup-environment strategy:
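
The following is an editor's sketch, in standalone Go, of the batch-skipping idea from the kvcache patch above ("kvcache: Don't shift empty batches"): walk a sequence's cells in fixed-size windows, record a non-zero RoPE offset only for cells that actually move, skip any window in which every offset is zero, and trim the rest down to the cells that need shifting. The `window` struct, the `needsShift` predicate, and the numbers in `main` are invented for illustration only; the real implementation in `kvcache/causal.go` operates on tensor views and applies the shift through `shiftFn`, as the diff shows.

```go
package main

import "fmt"

// window is an illustrative stand-in for one trimmed shift batch:
// the first cell that moves and the per-cell offsets to apply.
type window struct {
	start   int     // first cell index covered by the trimmed window
	offsets []int32 // per-cell offsets, trimmed to the cells that actually move
}

// shiftWindows walks numCells cells in maxBatch-sized windows, assigns the
// shift offset to cells that need it, and skips windows where nothing moves.
func shiftWindows(numCells, maxBatch int, needsShift func(cell int) bool, offset int32) []window {
	var out []window
	for start := 0; start < numCells; start += maxBatch {
		size := min(numCells-start, maxBatch)
		offsets := make([]int32, size)

		first, last := -1, 0
		for i := 0; i < size; i++ {
			if needsShift(start + i) {
				offsets[i] = offset
				if first < 0 {
					first = i
				}
				last = i
			}
		}
		if first < 0 {
			continue // nothing in this window moves: no context, no kernel launch
		}
		out = append(out, window{start: start + first, offsets: offsets[first : last+1]})
	}
	return out
}

func main() {
	// Toy input: cells 0-5 were deleted (offset stays 0); cells 6-11 shift by -6.
	ws := shiftWindows(12, 4, func(cell int) bool { return cell >= 6 }, -6)
	for _, w := range ws {
		fmt.Printf("shift %d cells starting at %d: %v\n", len(w.offsets), w.start, w.offsets)
	}
}
```

With this toy input the window covering cells 0-3 is skipped outright and the window covering cells 4-7 is trimmed to cells 6-7, which is where the reduction in operations described in the commit message comes from.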