Merge remote-tracking branch 'origin/main' into GraniteFour

* origin/main:
ggml: Report ordinal IDs for AMD GPUs on Windows
doc: add MacOS docs (#11334)
Reduce default parallelism to 1 (#11330)
API/CLI context enhancements (#11331)
add `tool_name` to api.md (#11326)
template: add tool result compatibility (#11294)
ci: modularization (#11324)
Revert "ggml: Temporarily disable reporting UUIDs"
readme: update Ollama icon size
int: add performance integration tests (#11173)
doc: add NVIDIA blackwell to supported list (#11307)
Update base image to Ubuntu 24.04 LTS (#9681)
doc: Update link for mac install (#11288)
mimic logs for layers on new engine (#11278)
readme: add NativeMind to community integrations (#11242)
tools: fix parsing tool calls with empty arguments, missing required fields (#11233)
readme: add ollama-bash-toolshed to community integrations (#11224)
Gabe Goodhart 2025-07-10 14:01:24 -06:00
commit e61826c180
28 changed files with 125358 additions and 147 deletions


@ -225,7 +225,7 @@ jobs:
CGO_CFLAGS=${{ env.CGO_CFLAGS }}
CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
cache-from: type=registry,ref=ollama/ollama:latest
cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
cache-to: type=inline
- run: |
for COMPONENT in bin/* lib/ollama/*; do
@ -298,8 +298,8 @@ jobs:
context: .
platforms: ${{ matrix.os }}/${{ matrix.arch }}
build-args: ${{ matrix.build-args }}
outputs: type=image,name=ollama/ollama,push-by-digest=true,name-canonical=true,push=true
cache-from: type=registry,ref=ollama/ollama:latest
outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
cache-to: type=inline
- run: |
mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
@ -331,7 +331,7 @@ jobs:
latest=false
suffix=${{ matrix.suffix }}
images: |
ollama/ollama
${{ vars.DOCKER_REPO }}
tags: |
type=ref,enable=true,priority=600,prefix=pr-,event=pr
type=semver,pattern={{version}}
@ -341,8 +341,8 @@ jobs:
path: ${{ runner.temp }}
merge-multiple: true
- run: |
docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf 'ollama/ollama@%s ')
docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf '${{ vars.DOCKER_REPO }}@%s ')
docker buildx imagetools inspect ${{ vars.DOCKER_REPO }}:${{ steps.metadata.outputs.version }}
working-directory: ${{ runner.temp }}
# Trigger downstream release process
@ -380,4 +380,4 @@ jobs:
-H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"publish\": \"1\"}}"
-d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"origin\": \"${GITHUB_REPOSITORY}\", \"publish\": \"1\"}}"


@ -104,7 +104,7 @@ FROM ${FLAVOR} AS archive
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=build /bin/ollama /bin/ollama
FROM ubuntu:20.04
FROM ubuntu:24.04
RUN apt-get update \
&& apt-get install -y ca-certificates \
&& apt-get clean \


@ -1,6 +1,6 @@
<div align="center">
  <a href="https://ollama.com">
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
<img alt="ollama" width="240" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</a>
</div>
@ -10,7 +10,7 @@ Get up and running with large language models.
### macOS
[Download](https://ollama.com/download/Ollama-darwin.zip)
[Download](https://ollama.com/download/Ollama.dmg)
### Windows
@ -455,6 +455,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
- [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool-using models. Add new tools to your shed with ease. Runs on Ollama.
### Apple Vision Pro
@ -593,6 +594,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
### Supported backends


@ -143,6 +143,7 @@ type Message struct {
Thinking string `json:"thinking,omitempty"`
Images []ImageData `json:"images,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolName string `json:"tool_name,omitempty"`
}
func (m *Message) UnmarshalJSON(b []byte) error {
@ -467,13 +468,14 @@ type ListModelResponse struct {
// ProcessModelResponse is a single model description in [ProcessResponse].
type ProcessModelResponse struct {
Name string `json:"name"`
Model string `json:"model"`
Size int64 `json:"size"`
Digest string `json:"digest"`
Details ModelDetails `json:"details,omitempty"`
ExpiresAt time.Time `json:"expires_at"`
SizeVRAM int64 `json:"size_vram"`
Name string `json:"name"`
Model string `json:"model"`
Size int64 `json:"size"`
Digest string `json:"digest"`
Details ModelDetails `json:"details,omitempty"`
ExpiresAt time.Time `json:"expires_at"`
SizeVRAM int64 `json:"size_vram"`
ContextLength int `json:"context_length"`
}
type TokenResponse struct {


@ -583,12 +583,13 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
} else {
until = format.HumanTime(m.ExpiresAt, "Never")
}
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
ctxStr := strconv.Itoa(m.ContextLength)
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, ctxStr, until})
}
}
table := tablewriter.NewWriter(os.Stdout)
table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "CONTEXT", "UNTIL"})
table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
table.SetAlignment(tablewriter.ALIGN_LEFT)
table.SetHeaderLine(false)
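The new `CONTEXT` column is populated from the `ContextLength` field added to `ProcessModelResponse`. As a rough sketch (not part of this change), the same value can be read programmatically with the Go client's `ListRunning` call used elsewhere in this diff:
```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	running, err := client.ListRunning(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	for _, m := range running.Models {
		// ContextLength reports the per-request context window
		// (num_ctx divided by the model's parallel slots).
		fmt.Printf("%s\tcontext=%d\tvram=%d\n", m.Name, m.ContextLength, m.SizeVRAM)
	}
}
```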


@ -4,6 +4,7 @@
* [Quickstart](../README.md#quickstart)
* [Examples](./examples.md)
* [Importing models](./import.md)
* [MacOS Documentation](./macos.md)
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](./docker.md)


@ -500,6 +500,7 @@ The `message` object has the following fields:
- `thinking`: (for thinking models) the model's thinking process
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
- `tool_name` (optional): the name of the tool that was executed, used to inform the model which tool produced the result
Advanced parameters (optional):
@ -508,13 +509,21 @@ Advanced parameters (optional):
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
### Tool calling
Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
[See models with tool calling capabilities](https://ollama.com/search?c=tool).
### Structured outputs
Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
### Examples
#### Chat Request (Streaming)
#### Chat request (Streaming)
##### Request
@ -569,6 +578,88 @@ Final response:
}
```
#### Chat request (Streaming with tools)
##### Request
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"messages": [
{
"role": "user",
"content": "what is the weather in tokyo?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
},
"required": ["city"]
}
}
}
],
"stream": true
}'
```
##### Response
A stream of JSON objects is returned:
```json
{
"model": "llama3.2",
"created_at": "2025-07-07T20:22:19.184789Z",
"message": {
"role": "assistant",
"content": "",
"tool_calls": [
{
"function": {
"name": "get_weather",
"arguments": {
"city": "Tokyo"
}
}
}
]
},
"done": false
}
```
Final response:
```json
{
"model":"llama3.2",
"created_at":"2025-07-07T20:22:19.19314Z",
"message": {
"role": "assistant",
"content": ""
},
"done_reason": "stop",
"done": true,
"total_duration": 182242375,
"load_duration": 41295167,
"prompt_eval_count": 169,
"prompt_eval_duration": 24573166,
"eval_count": 15,
"eval_duration": 115959084
}
```
#### Chat request (No streaming)
##### Request
@ -606,6 +697,74 @@ curl http://localhost:11434/api/chat -d '{
}
```
#### Chat request (No streaming, with tools)
##### Request
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"messages": [
{
"role": "user",
"content": "what is the weather in tokyo?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
},
"required": ["city"]
}
}
}
],
"stream": false
}'
```
##### Response
```json
{
"model": "llama3.2",
"created_at": "2025-07-07T20:32:53.844124Z",
"message": {
"role": "assistant",
"content": "",
"tool_calls": [
{
"function": {
"name": "get_weather",
"arguments": {
"city": "Tokyo"
}
}
}
]
},
"done_reason": "stop",
"done": true,
"total_duration": 3244883583,
"load_duration": 2969184542,
"prompt_eval_count": 169,
"prompt_eval_duration": 141656333,
"eval_count": 18,
"eval_duration": 133293625
}
```
#### Chat request (Structured outputs)
##### Request
@ -712,6 +871,87 @@ Final response:
}
```
#### Chat request (With history, with tools)
##### Request
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"messages": [
{
"role": "user",
"content": "what is the weather in Toronto?"
},
// the message from the model appended to history
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"function": {
"name": "get_temperature",
"arguments": {
"city": "Toronto"
}
}
}
]
},
// the tool call result appended to history
{
"role": "tool",
"content": "11 degrees celsius",
"tool_name": "get_temperature",
}
],
"stream": false,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
},
"required": ["city"]
}
}
}
]
}'
```
##### Response
```json
{
"model": "llama3.2",
"created_at": "2025-07-07T20:43:37.688511Z",
"message": {
"role": "assistant",
"content": "The current temperature in Toronto is 11°C."
},
"done_reason": "stop",
"done": true,
"total_duration": 890771750,
"load_duration": 707634750,
"prompt_eval_count": 94,
"prompt_eval_duration": 91703208,
"eval_count": 11,
"eval_duration": 90282125
}
```
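The curl examples above exercise the tool round trip over HTTP. A minimal Go sketch of the same history, using the `api` package from this repository and the new `ToolName` field (model name and tool result are placeholders; error handling is trimmed):
```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	stream := false
	messages := []api.Message{
		{Role: "user", Content: "what is the weather in Toronto?"},
		// the assistant turn that requested the tool, normally copied from the previous response
		{Role: "assistant", ToolCalls: []api.ToolCall{{
			Function: api.ToolCallFunction{
				Name:      "get_temperature",
				Arguments: api.ToolCallFunctionArguments{"city": "Toronto"},
			},
		}}},
		// the tool result, tagged with the tool that produced it via ToolName
		{Role: "tool", Content: "11 degrees celsius", ToolName: "get_temperature"},
	}

	err = client.Chat(context.Background(), &api.ChatRequest{
		Model:    "llama3.2", // placeholder model
		Messages: messages,
		Stream:   &stream,
	}, func(resp api.ChatResponse) error {
		fmt.Println(resp.Message.Content)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```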
#### Chat request (with images)
##### Request


@ -292,7 +292,7 @@ If too many requests are sent to the server, it will respond with a 503 error in
## How does Ollama handle concurrent requests?
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.
If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
@ -301,7 +301,7 @@ Parallel request processing for a given model results in increasing the context
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 * the number of GPUs or 3 for CPU inference.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default is 1, meaning each model handles one request at a time.
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512.
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
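As a rough illustration (not part of this change), the sketch below issues several requests against one model with the Go client; with the new `OLLAMA_NUM_PARALLEL` default of 1 they are served one at a time per loaded model, while raising the variable allows them to be processed in parallel. The model name and prompts are placeholders:
```go
package main

import (
	"context"
	"fmt"
	"log"
	"sync"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	prompts := []string{"why is the sky blue?", "write a haiku about GPUs", "what is 2+2?"}

	var wg sync.WaitGroup
	for i, p := range prompts {
		wg.Add(1)
		go func(i int, prompt string) {
			defer wg.Done()
			stream := false
			req := &api.GenerateRequest{Model: "llama3.2", Prompt: prompt, Stream: &stream}
			err := client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
				fmt.Printf("[%d] %s\n", i, resp.Response)
				return nil
			})
			if err != nil {
				log.Printf("[%d] %v", i, err)
			}
		}(i, p)
	}
	wg.Wait()
}
```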
@ -333,3 +333,16 @@ The currently available K/V cache quantization types are:
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
## How can I stop Ollama from starting when I log in to my computer?
Ollama for Windows and macOS registers as a login item during installation. You can disable this if you prefer not to have Ollama start automatically. Ollama will respect this setting across upgrades, unless you uninstall the application.
**Windows**
- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
**macOS Monterey (v12)**
- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
**macOS Ventura (v13) and later**
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.


@ -7,6 +7,8 @@ Check your compute compatibility to see if your card is supported:
| Compute Capability | Family | Cards |
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| 12.0 | GeForce RTX 50xx | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090` |
| | NVIDIA Professional | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell` |
| 9.0 | NVIDIA | `H200` `H100` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |

docs/macos.md Normal file

@ -0,0 +1,42 @@
# Ollama for macOS
## System Requirements
* macOS Monterey (v12) or newer
* Apple M series (CPU and GPU support) or x86 (CPU only)
## Filesystem Requirements
The preferred method of installation is to mount the `ollama.dmg` and drag and drop the Ollama application into the system-wide `Applications` folder. Upon startup, the Ollama app will verify the `ollama` CLI is present in your PATH, and if it is not detected, will prompt for permission to create a link in `/usr/local/bin`.
Once you've installed Ollama, you'll need additional space for storing the large language models, which can be tens to hundreds of GB in size. If your home directory doesn't have enough space, you can change where the binaries are installed and where the models are stored.
### Changing Install Location
To install the Ollama application somewhere other than `Applications`, place the Ollama application in the desired location, and ensure the CLI `Ollama.app/Contents/Resources/ollama` or a symlink to it can be found in your path. Upon first start, decline the "Move to Applications?" request.
## Troubleshooting
Ollama on macOS stores files in a few different locations:
- `~/.ollama` contains models and configuration
- `~/.ollama/logs` contains logs
- *app.log* contains the most recent logs from the GUI application
- *server.log* contains the most recent server logs
- `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
## Uninstall
To fully remove Ollama from your system, remove the following files and folders:
```
sudo rm -rf /Applications/Ollama.app
sudo rm /usr/local/bin/ollama
rm -rf "~/Library/Application Support/Ollama"
rm -rf "~/Library/Saved Application State/com.electron.ollama.savedState"
rm -rf ~/Library/Caches/com.electron.ollama/
rm -rf ~/Library/Caches/ollama
rm -rf ~/Library/WebKit/com.electron.ollama
rm -rf ~/.ollama
```


@ -30,20 +30,6 @@ To install the Ollama application in a location different than your home directo
OllamaSetup.exe /DIR="d:\some\location"
```
### Changing Model Location
To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
2. Click on _Edit environment variables for your account_.
3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
4. Click OK/Apply to save.
If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
## API Access
Here's a quick example showing API access from `powershell`


@ -219,7 +219,7 @@ func Uint(key string, defaultValue uint) func() uint {
var (
// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 1)
// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
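For context, `Uint` returns a getter that reads the environment variable and falls back to the default when it is unset or unparsable. A minimal sketch of that pattern under the signature shown above (not the repository's exact implementation):
```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// Uint returns a getter for an unsigned integer environment variable,
// falling back to defaultValue when the variable is unset or invalid.
func Uint(key string, defaultValue uint) func() uint {
	return func() uint {
		if s := os.Getenv(key); s != "" {
			if v, err := strconv.ParseUint(s, 10, 64); err == nil {
				return uint(v)
			}
		}
		return defaultValue
	}
}

func main() {
	numParallel := Uint("OLLAMA_NUM_PARALLEL", 1)
	fmt.Println("OLLAMA_NUM_PARALLEL:", numParallel()) // 1 unless the environment overrides it
}
```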


@ -19,37 +19,6 @@ import (
"github.com/ollama/ollama/format"
)
var (
started = time.Now()
chatModels = []string{
"granite3-moe:latest",
"granite-code:latest",
"nemotron-mini:latest",
"command-r:latest",
"gemma2:latest",
"gemma:latest",
"internlm2:latest",
"phi3.5:latest",
"phi3:latest",
// "phi:latest", // flaky, sometimes generates no response on first query
"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
"falcon:latest",
"falcon2:latest",
"minicpm-v:latest",
"mistral:latest",
"orca-mini:latest",
"llama2:latest",
"llama3.1:latest",
"llama3.2:latest",
"llama3.2-vision:latest",
"qwen2.5-coder:latest",
"qwen:latest",
"solar-pro:latest",
"codellama:latest",
"nous-hermes:latest",
}
)
func TestModelsGenerate(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
@ -70,6 +39,13 @@ func TestModelsGenerate(t *testing.T) {
slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
}
var chatModels []string
if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
chatModels = ollamaEngineChatModels
} else {
chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
}
for _, model := range chatModels {
t.Run(model, func(t *testing.T) {
if time.Now().Sub(started) > softTimeout {


@ -0,0 +1,266 @@
//go:build integration && perf
package integration
import (
"context"
"fmt"
"io/ioutil"
"log/slog"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
)
var (
// Models that don't work reliably with the large context prompt in this test case
longContextFlakes = []string{
"granite-code:latest",
"nemotron-mini:latest",
"falcon:latest", // 2k model
"falcon2:latest", // 2k model
"minicpm-v:latest",
"qwen:latest",
"solar-pro:latest",
}
)
// Note: this test case can take a long time to run, particularly on models with
// large contexts. Run with -timeout set to a large value to get reasonable coverage
// Example usage:
//
// go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf 2>&1 | tee int.log
// cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
// cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
func TestModelsPerf(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// TODO use info API eventually
var maxVram uint64
var err error
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err = strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatalf("invalid OLLAMA_MAX_VRAM %v", err)
}
} else {
slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
}
data, err := ioutil.ReadFile(filepath.Join("testdata", "shakespeare.txt"))
if err != nil {
t.Fatalf("failed to open test data file: %s", err)
}
longPrompt := "summarize the following: " + string(data)
var chatModels []string
if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
chatModels = ollamaEngineChatModels
} else {
chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
}
for _, model := range chatModels {
t.Run(model, func(t *testing.T) {
if time.Now().Sub(started) > softTimeout {
t.Skip("skipping remaining tests to avoid excessive runtime")
}
if err := PullIfMissing(ctx, client, model); err != nil {
t.Fatalf("pull failed %s", err)
}
var maxContext int
resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
if err != nil {
t.Fatalf("show failed: %s", err)
}
arch := resp.ModelInfo["general.architecture"].(string)
maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))
if maxVram > 0 {
resp, err := client.List(ctx)
if err != nil {
t.Fatalf("list models failed %v", err)
}
for _, m := range resp.Models {
// For these tests we want to exercise some amount of overflow on the CPU
if m.Name == model && float32(m.Size)*0.75 > float32(maxVram) {
t.Skipf("model %s is too large %s for available VRAM %s", model, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
}
}
}
slog.Info("scneario", "model", model, "max_context", maxContext)
loaded := false
defer func() {
// best effort unload once we're done with the model
if loaded {
client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
}
}()
// Some models don't handle the long context data well so skip them to avoid flaky test results
longContextFlake := false
for _, flake := range longContextFlakes {
if model == flake {
longContextFlake = true
break
}
}
// iterate through a few context sizes for coverage without excessive runtime
var contexts []int
keepGoing := true
if maxContext > 16384 {
contexts = []int{4096, 8192, 16384, maxContext}
} else if maxContext > 8192 {
contexts = []int{4096, 8192, maxContext}
} else if maxContext > 4096 {
contexts = []int{4096, maxContext}
} else if maxContext > 0 {
contexts = []int{maxContext}
} else {
t.Fatal("unknown max context size")
}
for _, numCtx := range contexts {
if !keepGoing && numCtx > 8192 { // Always try up to 8k before bailing out
break
}
skipLongPrompt := false
// Workaround bug 11172 temporarily...
maxPrompt := longPrompt
// If we fill the context too full with the prompt, many models
// quickly hit context shifting and go bad.
if len(maxPrompt) > numCtx*2 { // typically yields ~1/2 full context
maxPrompt = maxPrompt[:numCtx*2]
}
testCases := []struct {
prompt string
anyResp []string
}{
{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
}
var gpuPercent int
for _, tc := range testCases {
if len(tc.prompt) > 100 && (longContextFlake || skipLongPrompt) {
slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
continue
}
req := api.GenerateRequest{
Model: model,
Prompt: tc.prompt,
KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_ctx": numCtx,
},
}
atLeastOne := false
var resp api.GenerateResponse
stream := false
req.Stream = &stream
// Avoid potentially getting stuck indefinitely
limit := 5 * time.Minute
genCtx, cancel := context.WithDeadlineCause(
ctx,
time.Now().Add(limit),
fmt.Errorf("generate on model %s with ctx %d took longer than %v", model, numCtx, limit),
)
defer cancel()
err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
resp = rsp
return nil
})
if err != nil {
// Avoid excessive test runs, but don't consider a failure with massive context
if numCtx > 16384 && strings.Contains(err.Error(), "took longer") {
slog.Warn("max context was taking too long, skipping", "error", err)
keepGoing = false
skipLongPrompt = true
continue
}
t.Fatalf("generate error: ctx:%d err:%s", numCtx, err)
}
loaded = true
for _, expResp := range tc.anyResp {
if strings.Contains(strings.ToLower(resp.Response), expResp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("response didn't contain expected values: ctx:%d expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
}
models, err := client.ListRunning(ctx)
if err != nil {
slog.Warn("failed to list running models", "error", err)
continue
}
if len(models.Models) > 1 {
slog.Warn("multiple models loaded, may impact performance results", "loaded", models.Models)
}
for _, m := range models.Models {
if m.Name == model {
if m.SizeVRAM == 0 {
slog.Info("Model fully loaded into CPU")
gpuPercent = 0
keepGoing = false
skipLongPrompt = true
} else if m.SizeVRAM == m.Size {
slog.Info("Model fully loaded into GPU")
gpuPercent = 100
} else {
sizeCPU := m.Size - m.SizeVRAM
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
gpuPercent = int(100 - cpuPercent)
slog.Info("Model split between CPU/GPU", "CPU", cpuPercent, "GPU", gpuPercent)
keepGoing = false
// Heuristic to avoid excessive test run time
if gpuPercent < 90 {
skipLongPrompt = true
}
}
}
}
fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
"MODEL",
"CONTEXT",
"GPU PERCENT",
"PROMPT COUNT",
"LOAD TIME",
"PROMPT EVAL TPS",
"EVAL TPS",
)
fmt.Fprintf(os.Stderr, "MODEL_PERF_DATA:%s,%d,%d,%d,%0.2f,%0.2f,%0.2f\n",
model,
numCtx,
gpuPercent,
resp.PromptEvalCount,
float64(resp.LoadDuration)/1000000000.0,
float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
)
}
}
})
}
}

integration/testdata/shakespeare.txt vendored Normal file

File diff suppressed because it is too large


@ -32,6 +32,48 @@ const (
smol = "llama3.2:1b"
)
var (
started = time.Now()
// Note: add newer models at the top of the list to test them first
ollamaEngineChatModels = []string{
"gemma3n:e2b",
"mistral-small3.2:latest",
"deepseek-r1:1.5b",
"llama3.2-vision:latest",
"qwen2.5-coder:latest",
"qwen2.5vl:3b",
"qwen3:0.6b", // dense
"qwen3:30b", // MOE
"gemma3:1b",
"llama3.1:latest",
"llama3.2:latest",
"gemma2:latest",
"minicpm-v:latest", // arch=qwen2
"granite-code:latest", // arch=llama
}
llamaRunnerChatModels = []string{
"mistral:latest",
"falcon3:latest",
"granite3-moe:latest",
"command-r:latest",
"nemotron-mini:latest",
"phi3.5:latest",
"solar-pro:latest",
"internlm2:latest",
"codellama:latest", // arch=llama
"phi3:latest",
"falcon2:latest",
"gemma:latest",
"llama2:latest",
"nous-hermes:latest",
"orca-mini:latest",
"qwen:latest",
"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
"falcon:latest",
}
)
func Init() {
lifecycle.InitLogging()
}


@ -7,19 +7,19 @@ This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 39 ++++++++++++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.m | 1 +
3 files changed, 35 insertions(+)
3 files changed, 41 insertions(+)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 74e46716..a880df33 100644
index 74e46716..48839339 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
+ const char * uuid;
+ const char * id;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
@ -31,17 +31,17 @@ index b6cca93f..09ce299c 100644
int device;
std::string name;
std::string description;
+ std::string uuid;
+ std::string id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2951,6 +2952,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->uuid.c_str();
+ return ctx->id.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@ -51,17 +51,17 @@ index b6cca93f..09ce299c 100644
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
+ props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3535,6 +3542,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ #if !defined(GGML_USE_HIP)
+ char uuid[64];
+ snprintf(uuid, sizeof(uuid),
+ char id[64];
+ snprintf(id, sizeof(id),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ (unsigned char)prop.uuid.bytes[0],
+ (unsigned char)prop.uuid.bytes[1],
@ -80,9 +80,15 @@ index b6cca93f..09ce299c 100644
+ (unsigned char)prop.uuid.bytes[14],
+ (unsigned char)prop.uuid.bytes[15]
+ );
+ dev_ctx->uuid = uuid;
+ dev_ctx->id = id;
+ #else
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+ #ifdef _WIN32
+ char id[16];
+ snprintf(id, sizeof(id), "%d", i);
+ dev_ctx->id = id;
+ #else
+ dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
+ #endif
+ #endif
+
ggml_backend_dev_t dev = new ggml_backend_device {
@ -96,7 +102,7 @@ index 74fd6654..ea2d6218 100644
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->uuid = "0";
+ props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {


@ -124,9 +124,9 @@ type DeviceMemory struct {
// may not be persistent across instances of the runner.
Name string
// UUID is a unique persistent identifier for the device for matching
// with system management libraries
UUID string
// ID is an identifier for the device for matching with system
// management libraries.
ID string
// Weights is the per-layer memory needed for the model weights.
Weights []Memory
@ -156,8 +156,8 @@ func (m DeviceMemory) LogValue() slog.Value {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.UUID != "" {
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
if len(attrs) > 0 && m.ID != "" {
attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
}
return slog.GroupValue(attrs...)


@ -138,10 +138,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
// Bug #11211: Reporting of UUIDs is temporarily disabled due to causing segfaults
// This only affects debug information until the new memory management code is in place
// requiredMemory.CPU.UUID = C.GoString(props.uuid)
requiredMemory.CPU.ID = C.GoString(props.id)
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
@ -158,7 +155,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
// requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
requiredMemory.GPUs[i].ID = C.GoString(props.id)
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
}
@ -358,6 +355,24 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
bbs[c] = b
}
// Mimic llama runner logs summarizing layers and memory
slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", max(0, params.NumGPULayers-1)))
gpuLayers := 0
switch C.ggml_backend_dev_type(output.d) {
case 0: // CPU
slog.Info("offloading output layer to CPU")
case 1: // GPU
slog.Info("offloading output layer to GPU")
gpuLayers++
case 2: // ACCEL
slog.Info("offloading output layer to ACCEL")
}
for _, layer := range layers {
if C.ggml_backend_dev_type(layer.d) == 1 {
gpuLayers++
}
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
for bs := range maps.Values(bbs) {
slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
}


@ -152,7 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
const char * uuid;
const char * id;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;


@ -2939,7 +2939,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string uuid;
std::string id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@ -2952,9 +2952,9 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->uuid.c_str();
return ctx->id.c_str();
}
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@ -2971,7 +2971,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@ -3543,8 +3543,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->description = prop.name;
#if !defined(GGML_USE_HIP)
char uuid[64];
snprintf(uuid, sizeof(uuid),
char id[64];
snprintf(id, sizeof(id),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
(unsigned char)prop.uuid.bytes[0],
(unsigned char)prop.uuid.bytes[1],
@ -3563,9 +3563,15 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
(unsigned char)prop.uuid.bytes[14],
(unsigned char)prop.uuid.bytes[15]
);
dev_ctx->uuid = uuid;
dev_ctx->id = id;
#else
dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
#ifdef _WIN32
char id[16];
snprintf(id, sizeof(id), "%d", i);
dev_ctx->id = id;
#else
dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
#endif
#endif
ggml_backend_dev_t dev = new ggml_backend_device {


@ -5985,7 +5985,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->uuid = "0";
props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {


@ -1404,6 +1404,9 @@ func (s *Server) PsHandler(c *gin.Context) {
Details: modelDetails,
ExpiresAt: v.expiresAt,
}
if v.Options != nil {
mr.ContextLength = v.Options.NumCtx / v.numParallel
}
// The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just
// calculate the time w/ the sessionDuration instead.


@ -57,9 +57,7 @@ type Scheduler struct {
var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
// Model will still need to fit in VRAM. If this setting won't fit
// we'll back off down to 1 to try to get it to fit
var defaultParallel = 2
var defaultParallel = 1
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")


@ -310,21 +310,23 @@ func (t *Template) Execute(w io.Writer, v Values) error {
}
// collate messages based on role. consecutive messages of the same role are merged
// into a single message. collate also collects and returns all system messages.
// into a single message (except for tool messages which preserve individual metadata).
// collate also collects and returns all system messages.
// collate mutates message content adding image tags ([img-%d]) as needed
// todo(parthsareen): revisit for contextual image support
func collate(msgs []api.Message) (string, []*api.Message) {
var system []string
var collated []*api.Message
for i := range msgs {
msg := msgs[i]
if msg.Role == "system" {
system = append(system, msg.Content)
if msgs[i].Role == "system" {
system = append(system, msgs[i].Content)
}
if len(collated) > 0 && collated[len(collated)-1].Role == msg.Role {
collated[len(collated)-1].Content += "\n\n" + msg.Content
// merges consecutive messages of the same role into a single message (except for tool messages)
if len(collated) > 0 && collated[len(collated)-1].Role == msgs[i].Role && msgs[i].Role != "tool" {
collated[len(collated)-1].Content += "\n\n" + msgs[i].Content
} else {
collated = append(collated, &msg)
collated = append(collated, &msgs[i])
}
}


@ -163,10 +163,12 @@ func TestParse(t *testing.T) {
{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
{"{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role", "toolname"}},
{`{{- range .Messages }}
{{- if eq .Role "system" }}SYSTEM:
{{- else if eq .Role "user" }}USER:
{{- else if eq .Role "assistant" }}ASSISTANT:
{{- else if eq .Role "tool" }}TOOL:
{{- end }} {{ .Content }}
{{- end }}`, []string{"content", "messages", "role"}},
{`{{- if .Messages }}
@ -376,3 +378,99 @@ func TestExecuteWithSuffix(t *testing.T) {
})
}
}
func TestCollate(t *testing.T) {
cases := []struct {
name string
msgs []api.Message
expected []*api.Message
system string
}{
{
name: "consecutive user messages are merged",
msgs: []api.Message{
{Role: "user", Content: "Hello"},
{Role: "user", Content: "How are you?"},
},
expected: []*api.Message{
{Role: "user", Content: "Hello\n\nHow are you?"},
},
system: "",
},
{
name: "consecutive tool messages are NOT merged",
msgs: []api.Message{
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
},
expected: []*api.Message{
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
},
system: "",
},
{
name: "tool messages preserve all fields",
msgs: []api.Message{
{Role: "user", Content: "What's the weather?"},
{Role: "tool", Content: "sunny", ToolName: "get_conditions"},
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
},
expected: []*api.Message{
{Role: "user", Content: "What's the weather?"},
{Role: "tool", Content: "sunny", ToolName: "get_conditions"},
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
},
system: "",
},
{
name: "mixed messages with system",
msgs: []api.Message{
{Role: "system", Content: "You are helpful"},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "What's the weather?"},
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
{Role: "user", Content: "Thanks"},
},
expected: []*api.Message{
{Role: "system", Content: "You are helpful"},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "What's the weather?"},
{Role: "tool", Content: "sunny", ToolName: "get_weather"},
{Role: "tool", Content: "72F", ToolName: "get_temperature"},
{Role: "user", Content: "Thanks"},
},
system: "You are helpful",
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
system, collated := collate(tt.msgs)
if diff := cmp.Diff(system, tt.system); diff != "" {
t.Errorf("system mismatch (-got +want):\n%s", diff)
}
// Compare the messages
if len(collated) != len(tt.expected) {
t.Errorf("expected %d messages, got %d", len(tt.expected), len(collated))
return
}
for i := range collated {
if collated[i].Role != tt.expected[i].Role {
t.Errorf("message %d role mismatch: got %q, want %q", i, collated[i].Role, tt.expected[i].Role)
}
if collated[i].Content != tt.expected[i].Content {
t.Errorf("message %d content mismatch: got %q, want %q", i, collated[i].Content, tt.expected[i].Content)
}
if collated[i].ToolName != tt.expected[i].ToolName {
t.Errorf("message %d tool name mismatch: got %q, want %q", i, collated[i].ToolName, tt.expected[i].ToolName)
}
}
})
}
}


@ -134,16 +134,16 @@ func (p *Parser) parseToolCall() *api.ToolCall {
return nil
}
// only look for arguments if the tool has parameters
// only look for arguments after the tool name if the tool has parameters
// TODO (jmorganca): while probably uncommon, this doesn't support
// parsing arguments before the tool name, which may be needed in the future
args := map[string]any{}
if len(tool.Function.Parameters.Properties) > 0 {
if args, i = p.findArguments(*tool); args == nil {
if args, i = findArguments(*tool, p.buffer[end:]); args == nil {
return nil
}
if i > end {
end = i
}
end += i
}
tc := &api.ToolCall{
@ -160,14 +160,14 @@ func (p *Parser) parseToolCall() *api.ToolCall {
}
// findArguments returns the first object that appears to be
// arguments for the provided tool, returning nil
func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
if len(p.buffer) == 0 {
return nil, 0
}
// no arguments to parse
if len(tool.Function.Parameters.Properties) == 0 {
// arguments for the provided tool in the provided buffer,
// returning nil if no arguments are found.
// TODO (jmorganca): this does not support parsing omitted arguments
// objects for functions that have all-optional parameters
// e.g. `{"name": "get_conditions", "arguments": {}}` will work but
// `{"name": "get_conditions"}` will not currently work
func findArguments(tool api.Tool, buffer []byte) (map[string]any, int) {
if len(buffer) == 0 {
return nil, 0
}
@ -177,7 +177,7 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
var object []byte
// find any outer json object
for i, c := range p.buffer {
for i, c := range buffer {
if c == '{' {
braces++
if start == -1 {
@ -190,7 +190,7 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
braces--
if braces == 0 {
end = i + 1
object = p.buffer[start:end]
object = buffer[start:end]
break
}
}
@ -202,8 +202,6 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
}
var data map[string]any
// not valid json
if err := json.Unmarshal(object, &data); err != nil {
return nil, 0
}
@ -212,15 +210,27 @@ func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) {
find = func(obj any) map[string]any {
switch obj := obj.(type) {
case map[string]any:
found := true
valid := true
// check if all keys in the object exist in the tool's parameters
for key := range obj {
if _, exists := tool.Function.Parameters.Properties[key]; !exists {
found = false
valid = false
break
}
}
if found {
// check for required parameters
// TODO (jmorganca): this should error instead of silently failing
if valid {
for _, required := range tool.Function.Parameters.Required {
if _, exists := obj[required]; !exists {
valid = false
break
}
}
}
if valid {
return obj
}
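The check added above boils down to: an arguments object is accepted only when every key is a declared property and every required property is present. A standalone sketch of that rule with simplified types (illustrative, not the parser itself):
```go
package main

import "fmt"

// validArguments reports whether args uses only known properties and
// includes every required property.
func validArguments(args map[string]any, properties map[string]struct{}, required []string) bool {
	for key := range args {
		if _, ok := properties[key]; !ok {
			return false // unknown property
		}
	}
	for _, r := range required {
		if _, ok := args[r]; !ok {
			return false // missing required property
		}
	}
	return true
}

func main() {
	props := map[string]struct{}{"city": {}, "format": {}}
	required := []string{"city"}

	fmt.Println(validArguments(map[string]any{"city": "Tokyo"}, props, required)) // true
	fmt.Println(validArguments(map[string]any{}, props, required))                // false: missing "city"
	fmt.Println(validArguments(map[string]any{"zip": "10001"}, props, required))  // false: unknown "zip"
}
```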


@ -52,7 +52,8 @@ func TestParser(t *testing.T) {
Enum []any `json:"enum,omitempty"`
} `json:"properties"`
}{
Type: "object",
Type: "object",
Required: []string{"city"},
Properties: map[string]struct {
Type api.PropertyType `json:"type"`
Items any `json:"items,omitempty"`
@ -159,8 +160,23 @@ func TestParser(t *testing.T) {
calls: nil,
},
{
name: "missing args",
inputs: []string{`<tool_call>{"name": "get_conditions"}</tool_call>`},
name: "empty args",
inputs: []string{`<tool_call>{"name": "get_conditions", "arguments": {}}</tool_call>`},
content: "",
tmpl: qwen,
calls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Index: 0,
Name: "get_conditions",
Arguments: api.ToolCallFunctionArguments{},
},
},
},
},
{
name: "missing required args",
inputs: []string{`<tool_call>{"name": "get_temperature", "arguments": {}}</tool_call>`},
content: "",
tmpl: qwen,
calls: nil,
@ -259,9 +275,9 @@ func TestParser(t *testing.T) {
},
},
{
name: "qwen two tool calls one with no args",
inputs: []string{`Let me check the weather. <tool_call>{"name": "say_hello"}</tool_call><tool_call>{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`},
content: "Let me check the weather. ",
name: "empty args followed by args",
inputs: []string{`Let me say hello and check the weather. <tool_call>{"name": "say_hello", "arguments": {}}</tool_call><tool_call>{"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}</tool_call>`},
content: "Let me say hello and check the weather. ",
tmpl: qwen,
calls: []api.ToolCall{
{
@ -271,6 +287,31 @@ func TestParser(t *testing.T) {
Arguments: api.ToolCallFunctionArguments{},
},
},
{
Function: api.ToolCallFunction{
Index: 1,
Name: "get_temperature",
Arguments: api.ToolCallFunctionArguments{
"city": "London",
"format": "fahrenheit",
},
},
},
},
},
{
name: "qwen empty followed by args",
inputs: []string{`Let me check the weather. <tool_call>{"name": "get_conditions", "arguments": {}}</tool_call><tool_call>{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`},
content: "Let me check the weather. ",
tmpl: qwen,
calls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Index: 0,
Name: "get_conditions",
Arguments: api.ToolCallFunctionArguments{},
},
},
{
Function: api.ToolCallFunction{
Index: 1,
@ -1035,16 +1076,19 @@ func TestFindArguments(t *testing.T) {
},
tool: tool,
},
{
name: "deepseek",
buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
want: map[string]any{
"location": "Tokyo",
},
tool: tool,
},
}
for _, tt := range tests {
parser := &Parser{
buffer: tt.buffer,
tools: []api.Tool{tool, tool2},
}
t.Run(tt.name, func(t *testing.T) {
got, _ := parser.findArguments(tool)
got, _ := findArguments(tt.tool, tt.buffer)
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff)