Compare commits


1 Commit

Author | SHA1 | Message | Date
Eva Ho | 1dd6202554 | wip | 2025-11-11 17:15:04 -05:00
113 changed files with 20047 additions and 8905 deletions

View File

@@ -104,13 +104,6 @@ jobs:
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: 'rocm'
- os: windows
arch: amd64
preset: Vulkan
install: https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe
flags: ''
runner_dir: 'vulkan'
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -120,14 +113,13 @@ jobs:
run: |
choco install -y --no-progress ccache ninja
ccache -o cache_dir=${{ github.workspace }}\.ccache
- if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ') || startsWith(matrix.preset, 'Vulkan')
- if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ')
id: cache-install
uses: actions/cache/restore@v4
with:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
C:\VulkanSDK
key: ${{ matrix.install }}
- if: startsWith(matrix.preset, 'CUDA ')
name: Install CUDA ${{ matrix.cuda-version }}
@@ -157,18 +149,6 @@ jobs:
echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: matrix.preset == 'Vulkan'
name: Install Vulkan ${{ matrix.rocm-version }}
run: |
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList "-c","--am","--al","in" -NoNewWindow -Wait
}
$vulkanPath = (Resolve-Path "C:\VulkanSDK\*").path
echo "$vulkanPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "VULKAN_SDK=$vulkanPath" >> $env:GITHUB_ENV
- if: matrix.preset == 'CPU'
run: |
echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
@@ -179,7 +159,6 @@ jobs:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
C:\VulkanSDK
key: ${{ matrix.install }}
- uses: actions/checkout@v4
- uses: actions/cache@v4
@@ -192,7 +171,7 @@ jobs:
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} --install-prefix "$((pwd).Path)\dist\${{ matrix.os }}-${{ matrix.arch }}"
cmake --build --parallel ([Environment]::ProcessorCount) --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip
Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
env:
CMAKE_GENERATOR: Ninja
@@ -333,13 +312,13 @@ jobs:
include:
- os: linux
arch: amd64
target: archive
target: archive_novulkan
- os: linux
arch: amd64
target: rocm
- os: linux
arch: arm64
target: archive
target: archive_novulkan
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: setup-environment
@@ -395,12 +374,14 @@ jobs:
include:
- os: linux
arch: arm64
target: novulkan
build-args: |
CGO_CFLAGS
CGO_CXXFLAGS
GOFLAGS
- os: linux
arch: amd64
target: novulkan
build-args: |
CGO_CFLAGS
CGO_CXXFLAGS
@@ -413,6 +394,14 @@ jobs:
CGO_CXXFLAGS
GOFLAGS
FLAVOR=rocm
- os: linux
arch: amd64
suffix: '-vulkan'
target: default
build-args: |
CGO_CFLAGS
CGO_CXXFLAGS
GOFLAGS
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: setup-environment
@@ -430,6 +419,7 @@ jobs:
with:
context: .
platforms: ${{ matrix.os }}/${{ matrix.arch }}
target: ${{ matrix.preset }}
build-args: ${{ matrix.build-args }}
outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest

View File

@@ -172,7 +172,6 @@ jobs:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
C:\VulkanSDK
key: ${{ matrix.install }}
- uses: actions/checkout@v4
- uses: actions/cache@v4

View File

@@ -159,7 +159,32 @@ ARG VULKANVERSION
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=build /bin/ollama /bin/ollama
FROM ubuntu:24.04
# Temporary opt-out stages for Vulkan
FROM --platform=linux/amd64 scratch AS amd64_novulkan
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
FROM arm64 AS arm64_novulkan
FROM ${FLAVOR}_novulkan AS archive_novulkan
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=build /bin/ollama /bin/ollama
FROM ubuntu:24.04 AS novulkan
RUN apt-get update \
&& apt-get install -y ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
COPY --from=archive_novulkan /bin /usr/bin
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
COPY --from=archive_novulkan /lib/ollama /usr/lib/ollama
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENV OLLAMA_HOST=0.0.0.0:11434
EXPOSE 11434
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
FROM ubuntu:24.04 AS default
RUN apt-get update \
&& apt-get install -y ca-certificates libvulkan1 \
&& apt-get clean \

View File

@@ -299,7 +299,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [AI-UI](https://github.com/bajahaw/ai-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
@@ -366,7 +365,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VS Code extension for multi-file/whole-repo coding
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
@@ -398,7 +397,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
- [Ollama Chat WebUI for Docker ](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VS Code extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
- [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
@@ -641,5 +640,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
### Security
## Security
- [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)

View File

@@ -117,14 +117,6 @@ type GenerateRequest struct {
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
// Logprobs specifies whether to return log probabilities of the output tokens.
Logprobs bool `json:"logprobs,omitempty"`
// TopLogprobs is the number of most likely tokens to return at each token position,
// each with an associated log probability. Only applies when Logprobs is true.
// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
TopLogprobs int `json:"top_logprobs,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -167,14 +159,6 @@ type ChatRequest struct {
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
// Logprobs specifies whether to return log probabilities of the output tokens.
Logprobs bool `json:"logprobs,omitempty"`
// TopLogprobs is the number of most likely tokens to return at each token position,
// each with an associated log probability. Only applies when Logprobs is true.
// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
TopLogprobs int `json:"top_logprobs,omitempty"`
}
type Tools []Tool
@@ -359,27 +343,6 @@ func (t *ToolFunction) String() string {
return string(bts)
}
// TokenLogprob represents log probability information for a single token alternative.
type TokenLogprob struct {
// Token is the text representation of the token.
Token string `json:"token"`
// Logprob is the log probability of this token.
Logprob float64 `json:"logprob"`
// Bytes contains the raw byte representation of the token
Bytes []int `json:"bytes,omitempty"`
}
// Logprob contains log probability information for a generated token.
type Logprob struct {
TokenLogprob
// TopLogprobs contains the most likely tokens and their log probabilities
// at this position, if requested via TopLogprobs parameter.
TopLogprobs []TokenLogprob `json:"top_logprobs,omitempty"`
}
// ChatResponse is the response returned by [Client.Chat]. Its fields are
// similar to [GenerateResponse].
type ChatResponse struct {
@@ -406,10 +369,6 @@ type ChatResponse struct {
DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
// Logprobs contains log probability information for the generated tokens,
// if requested via the Logprobs parameter.
Logprobs []Logprob `json:"logprobs,omitempty"`
Metrics
}
@@ -718,10 +677,6 @@ type GenerateResponse struct {
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
// Logprobs contains log probability information for the generated tokens,
// if requested via the Logprobs parameter.
Logprobs []Logprob `json:"logprobs,omitempty"`
}
// ModelDetails provides details about a model.

View File

@@ -213,6 +213,44 @@ export class ChatResponse {
return a;
}
}
export class MessageUpdateRequest {
content: string;
constructor(source: any = {}) {
if ('string' === typeof source) source = JSON.parse(source);
this.content = source["content"];
}
}
export class MessageUpdateResponse {
index: number;
chatId: string;
message: Message;
constructor(source: any = {}) {
if ('string' === typeof source) source = JSON.parse(source);
this.index = source["index"];
this.chatId = source["chatId"];
this.message = this.convertValues(source["message"], Message);
}
convertValues(a: any, classs: any, asMap: boolean = false): any {
if (!a) {
return a;
}
if (Array.isArray(a)) {
return (a as any[]).map(elem => this.convertValues(elem, classs));
} else if ("object" === typeof a) {
if (asMap) {
for (const key of Object.keys(a)) {
a[key] = new classs(a[key]);
}
return a;
}
return new classs(a);
}
return a;
}
}
export class Model {
model: string;
digest?: string;

View File

@@ -0,0 +1,20 @@
{
"$schema": "https://ui.shadcn.com/schema.json",
"style": "default",
"rsc": false,
"tsx": true,
"tailwind": {
"config": "tailwind.config.js",
"css": "src/index.css",
"baseColor": "neutral",
"cssVariables": true,
"prefix": ""
},
"aliases": {
"components": "@/components",
"utils": "@/lib/utils",
"ui": "@/components/ui",
"lib": "@/lib",
"hooks": "@/hooks"
}
}

View File

@@ -26,6 +26,7 @@
"rehype-sanitize": "^6.0.0",
"remark-math": "^6.0.0",
"streamdown": "^1.4.0",
"tailwind-merge": "^3.4.0",
"unist-builder": "^4.0.0",
"unist-util-parents": "^3.0.0"
},
@@ -12110,9 +12111,9 @@
"integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew=="
},
"node_modules/tailwind-merge": {
"version": "3.3.1",
"resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-3.3.1.tgz",
"integrity": "sha512-gBXpgUm/3rp1lMZZrM/w7D8GKqshif0zAymAhbCyIt8KMe+0v9DQ7cdYLR4FHH/cKpdTXb+A/tKKU3eolfsI+g==",
"version": "3.4.0",
"resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-3.4.0.tgz",
"integrity": "sha512-uSaO4gnW+b3Y2aWoWfFpX62vn2sR3skfhbjsEnaBI81WD1wBLlHZe5sWf0AqjksNdYTbGBEd0UasQMT3SNV15g==",
"license": "MIT",
"funding": {
"type": "github",

View File

@@ -35,6 +35,7 @@
"rehype-sanitize": "^6.0.0",
"remark-math": "^6.0.0",
"streamdown": "^1.4.0",
"tailwind-merge": "^3.4.0",
"unist-builder": "^4.0.0",
"unist-util-parents": "^3.0.0"
},

View File

@@ -11,6 +11,7 @@ import {
ChatRequest,
Settings,
User,
Message,
} from "@/gotypes";
import { parseJsonlFromResponse } from "./util/jsonl-parsing";
import { ollamaClient as ollama } from "./lib/ollama-client";
@@ -205,13 +206,6 @@ export async function* sendMessage(
data: uint8ArrayToBase64(att.data),
}));
// Only send think parameter when actually requesting thinking
// Don't send false as it causes issues with some providers
const shouldSendThink =
think !== undefined &&
((typeof think === "boolean" && think) ||
(typeof think === "string" && think !== ""));
const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}`, {
method: "POST",
headers: {
@@ -229,7 +223,7 @@ export async function* sendMessage(
web_search: webSearch ?? false,
file_tools: fileTools ?? false,
...(forceUpdate !== undefined ? { forceUpdate } : {}),
...(shouldSendThink ? { think } : {}),
...(think !== undefined ? { think } : {}),
}),
),
signal,
@@ -307,6 +301,40 @@ export async function deleteChat(chatId: string): Promise<void> {
}
}
export async function updateChatMessage(
chatId: string,
index: number,
content: string,
): Promise<{
index: number;
chatId: string;
message: Message;
}> {
const response = await fetch(
`${API_BASE}/api/v1/chat/${chatId}/messages/${index}`,
{
method: "PATCH",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ content }),
},
);
if (!response.ok) {
const errorMessage = await response.text();
throw new Error(errorMessage || "Failed to update message");
}
const data = await response.json();
return {
index: data.index,
chatId: data.chatId,
message: new Message(data.message),
};
}
// Get upstream information for model staleness checking
export async function getModelUpstreamInfo(
model: Model,

View File

@@ -13,6 +13,7 @@ import {
useChatError,
useShouldShowStaleDisplay,
useDismissStaleModel,
useUpdateChatMessage,
} from "@/hooks/useChats";
import { useHealth } from "@/hooks/useHealth";
import { useMessageAutoscroll } from "@/hooks/useMessageAutoscroll";
@@ -47,6 +48,12 @@ export default function Chat({ chatId }: { chatId: string }) {
index: number;
originalMessage: Message;
} | null>(null);
const [editingAssistantIndex, setEditingAssistantIndex] = useState<
number | null
>(null);
const [assistantEditError, setAssistantEditError] = useState<string | null>(
null,
);
const prevChatIdRef = useRef<string>(chatId);
const chatFormCallbackRef = useRef<
@@ -98,9 +105,14 @@ export default function Chat({ chatId }: { chatId: string }) {
// Clear editing state when navigating to a different chat
useEffect(() => {
setEditingMessage(null);
setEditingAssistantIndex(null);
setAssistantEditError(null);
}, [chatId]);
const sendMessageMutation = useSendMessage(chatId);
const updateAssistantMessageMutation = useUpdateChatMessage(
chatId === "new" ? "" : chatId,
);
const { containerRef, handleNewUserMessage, spacerHeight } =
useMessageAutoscroll({
@@ -186,6 +198,44 @@ export default function Chat({ chatId }: { chatId: string }) {
}
};
const handleAssistantEditStart = (index: number) => {
setAssistantEditError(null);
setEditingAssistantIndex(index);
};
const handleAssistantEditCancel = () => {
if (updateAssistantMessageMutation.isPending) {
return;
}
setAssistantEditError(null);
setEditingAssistantIndex(null);
};
const handleAssistantEditSave = async (index: number, content: string) => {
if (updateAssistantMessageMutation.isPending) {
return;
}
const trimmedContent = content.trim();
if (!trimmedContent) {
setAssistantEditError("Response cannot be empty.");
return;
}
try {
setAssistantEditError(null);
await updateAssistantMessageMutation.mutateAsync({
index,
content: trimmedContent,
});
setEditingAssistantIndex(null);
} catch (error) {
setAssistantEditError(
error instanceof Error ? error.message : "Failed to update message.",
);
}
};
const clearChatError = () => {
queryClient.setQueryData(
["chatError", chatId === "new" ? "" : chatId],
@@ -236,6 +286,12 @@ export default function Chat({ chatId }: { chatId: string }) {
editingMessageIndex={editingMessage?.index}
error={chatError}
browserToolResult={browserToolResult}
onAssistantEditStart={handleAssistantEditStart}
onAssistantEditSave={handleAssistantEditSave}
onAssistantEditCancel={handleAssistantEditCancel}
assistantEditingIndex={editingAssistantIndex}
assistantEditIsSaving={updateAssistantMessageMutation.isPending}
assistantEditError={assistantEditError}
/>
</section>

View File

@@ -3,8 +3,16 @@ import Thinking from "./Thinking";
import StreamingMarkdownContent from "./StreamingMarkdownContent";
import { ImageThumbnail } from "./ImageThumbnail";
import { isImageFile } from "@/utils/imageUtils";
import CopyButton from "./CopyButton";
import React, { useState, useMemo, useRef } from "react";
import React, { useState, useMemo, useRef, useEffect } from "react";
import {
CheckIcon,
PencilSquareIcon,
Square2StackIcon,
} from "@heroicons/react/24/outline";
import {
MessageActions,
MessageAction,
} from "@/components/ai-elements/message";
const Message = React.memo(
({
@@ -15,6 +23,12 @@ const Message = React.memo(
isFaded,
browserToolResult,
lastToolQuery,
onAssistantEditStart,
onAssistantEditSave,
onAssistantEditCancel,
assistantEditingIndex,
assistantEditIsSaving,
assistantEditError,
}: {
message: MessageType;
onEditMessage?: (content: string, index: number) => void;
@@ -24,6 +38,15 @@ const Message = React.memo(
// TODO(drifkin): this type isn't right
browserToolResult?: BrowserToolResult;
lastToolQuery?: string;
onAssistantEditStart?: (index: number) => void;
onAssistantEditSave?: (
index: number,
content: string,
) => void | Promise<void>;
onAssistantEditCancel?: () => void;
assistantEditingIndex?: number | null;
assistantEditIsSaving?: boolean;
assistantEditError?: string | null;
}) => {
if (message.role === "user") {
return (
@@ -42,6 +65,13 @@ const Message = React.memo(
isFaded={isFaded}
browserToolResult={browserToolResult}
lastToolQuery={lastToolQuery}
messageIndex={messageIndex}
onAssistantEditStart={onAssistantEditStart}
onAssistantEditSave={onAssistantEditSave}
onAssistantEditCancel={onAssistantEditCancel}
assistantEditingIndex={assistantEditingIndex}
assistantEditIsSaving={assistantEditIsSaving}
assistantEditError={assistantEditError}
/>
);
}
@@ -53,7 +83,10 @@ const Message = React.memo(
prevProps.messageIndex === nextProps.messageIndex &&
prevProps.isStreaming === nextProps.isStreaming &&
prevProps.isFaded === nextProps.isFaded &&
prevProps.browserToolResult === nextProps.browserToolResult
prevProps.browserToolResult === nextProps.browserToolResult &&
prevProps.assistantEditingIndex === nextProps.assistantEditingIndex &&
prevProps.assistantEditIsSaving === nextProps.assistantEditIsSaving &&
prevProps.assistantEditError === nextProps.assistantEditError
);
},
);
@@ -880,6 +913,13 @@ function OtherRoleMessage({
isFaded,
browserToolResult,
lastToolQuery,
messageIndex,
onAssistantEditStart,
onAssistantEditSave,
onAssistantEditCancel,
assistantEditingIndex,
assistantEditIsSaving,
assistantEditError,
}: {
message: MessageType;
previousMessage?: MessageType;
@@ -888,8 +928,106 @@ function OtherRoleMessage({
// TODO(drifkin): this type isn't right
browserToolResult?: BrowserToolResult;
lastToolQuery?: string;
messageIndex?: number;
onAssistantEditStart?: (index: number) => void;
onAssistantEditSave?: (
index: number,
content: string,
) => void | Promise<void>;
onAssistantEditCancel?: () => void;
assistantEditingIndex?: number | null;
assistantEditIsSaving?: boolean;
assistantEditError?: string | null;
}) {
const messageRef = useRef<HTMLDivElement>(null);
const [draftContent, setDraftContent] = useState(message.content || "");
const [isCopied, setIsCopied] = useState(false);
const copyResetTimeoutRef = useRef<number | undefined>(undefined);
const isAssistantMessage = message.role === "assistant";
const isEditingAssistant =
isAssistantMessage &&
assistantEditingIndex !== null &&
assistantEditingIndex !== undefined &&
messageIndex !== undefined &&
assistantEditingIndex === messageIndex;
useEffect(() => {
if (isEditingAssistant) {
setDraftContent(message.content || "");
}
}, [isEditingAssistant, message.content]);
useEffect(() => {
setIsCopied(false);
}, [message.content]);
useEffect(() => {
return () => {
if (copyResetTimeoutRef.current !== undefined) {
window.clearTimeout(copyResetTimeoutRef.current);
}
};
}, []);
const handleAssistantEditStart = () => {
if (onAssistantEditStart && messageIndex !== undefined) {
onAssistantEditStart(messageIndex);
}
};
const handleAssistantEditSave = async () => {
if (!onAssistantEditSave || messageIndex === undefined) {
return;
}
await onAssistantEditSave(messageIndex, draftContent);
};
const handleAssistantEditCancel = () => {
onAssistantEditCancel?.();
};
const handleCopy = async () => {
const contentToCopy = message.content || "";
if (!contentToCopy) {
return;
}
const scheduleReset = () => {
if (copyResetTimeoutRef.current !== undefined) {
window.clearTimeout(copyResetTimeoutRef.current);
}
copyResetTimeoutRef.current = window.setTimeout(() => {
setIsCopied(false);
}, 2000);
};
try {
if (messageRef.current) {
const cloned = messageRef.current.cloneNode(true) as HTMLElement;
await navigator.clipboard.write([
new ClipboardItem({
"text/html": new Blob([cloned.innerHTML], { type: "text/html" }),
"text/plain": new Blob([contentToCopy], { type: "text/plain" }),
}),
]);
} else {
await navigator.clipboard.writeText(contentToCopy);
}
setIsCopied(true);
scheduleReset();
} catch (error) {
console.error("Clipboard API failed, falling back to plain text", error);
try {
await navigator.clipboard.writeText(contentToCopy);
setIsCopied(true);
scheduleReset();
} catch (fallbackError) {
console.error("Fallback copy also failed:", fallbackError);
}
}
};
return (
<div
@@ -918,17 +1056,19 @@ function OtherRoleMessage({
if (
message.role !== "tool" &&
!isEditingAssistant &&
(!message.content || !message.content.trim())
) {
return null;
}
// Render appropriate content
const refForContent = isEditingAssistant ? null : messageRef;
return (
<div
className="max-w-full prose dark:prose-invert assistant-message-content break-words"
id="message-container"
ref={messageRef}
ref={refForContent}
>
{message.role === "tool" ? (
<ToolRoleContent
@@ -936,6 +1076,42 @@ function OtherRoleMessage({
browserToolResult={browserToolResult}
lastToolQuery={lastToolQuery}
/>
) : isEditingAssistant ? (
<>
<textarea
value={draftContent}
onChange={(event) => setDraftContent(event.target.value)}
disabled={assistantEditIsSaving}
autoFocus
rows={Math.min(draftContent.split("\n").length, 20)}
className="w-full max-h-[500px] overflow-y-auto resize-none rounded-2xl border border-neutral-200 bg-white p-4 text-sm leading-relaxed text-neutral-900 transition-colors dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-100"
/>
<div className="mt-3 flex flex-wrap items-center gap-2">
<button
type="button"
onClick={handleAssistantEditSave}
disabled={assistantEditIsSaving}
tabIndex={0}
className="rounded-2xl px-2 py-1 text-sm font-medium text-black transition-colors hover:text-neutral-700 focus-visible:ring-2 focus-visible:ring-neutral-300 focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60 dark:text-neutral-200 dark:hover:text-neutral-400 dark:focus-visible:ring-neutral-500 cursor-pointer"
>
{assistantEditIsSaving ? "Saving…" : "Save"}
</button>
<button
type="button"
onClick={handleAssistantEditCancel}
disabled={assistantEditIsSaving}
tabIndex={0}
className="rounded-2xl px-2 py-1 text-sm text-neutral-500 transition-colors hover:text-neutral-700 focus-visible:ring-2 focus-visible:ring-neutral-300 focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60 dark:text-neutral-400 dark:hover:text-neutral-200 dark:focus-visible:ring-neutral-500 cursor-pointer"
>
Cancel
</button>
</div>
{assistantEditError && (
<p className="mt-2 text-sm text-red-500 dark:text-red-400">
{assistantEditError}
</p>
)}
</>
) : (
<StreamingMarkdownContent
content={message.content}
@@ -972,18 +1148,28 @@ function OtherRoleMessage({
message.content &&
message.content.trim() &&
(!message.tool_calls || message.tool_calls.length === 0) &&
!message.tool_call && (
<div className="-ml-1">
<CopyButton
content={message.content || ""}
copyRef={messageRef as React.RefObject<HTMLElement>}
removeClasses={["copy-button"]}
size="md"
showLabels={false}
className="copy-button z-10 text-neutral-500 dark:text-neutral-400"
title="Copy"
/>
</div>
!message.tool_call &&
!isEditingAssistant && (
<MessageActions>
<MessageAction
onClick={handleCopy}
title={isCopied ? "Copied" : "Copy"}
aria-label={isCopied ? "Copied" : "Copy"}
>
{isCopied ? (
<CheckIcon className="h-4.5 w-4.5" />
) : (
<Square2StackIcon className="h-4.5 w-4.5" />
)}
</MessageAction>
<MessageAction
onClick={handleAssistantEditStart}
title="Edit"
aria-label="Edit"
>
<PencilSquareIcon className="h-4.5 w-4.5" />
</MessageAction>
</MessageActions>
)}
</div>
);

View File

@@ -14,6 +14,12 @@ export default function MessageList({
editingMessageIndex,
error,
browserToolResult,
onAssistantEditStart,
onAssistantEditSave,
onAssistantEditCancel,
assistantEditingIndex,
assistantEditIsSaving,
assistantEditError,
}: {
messages: MessageType[];
spacerHeight: number;
@@ -24,6 +30,15 @@ export default function MessageList({
editingMessageIndex?: number;
error?: ErrorEvent | null;
browserToolResult?: any;
onAssistantEditStart?: (index: number) => void;
onAssistantEditSave?: (
index: number,
content: string,
) => void | Promise<void>;
onAssistantEditCancel?: () => void;
assistantEditingIndex?: number | null;
assistantEditIsSaving?: boolean;
assistantEditError?: string | null;
}) {
const [showDots, setShowDots] = React.useState(false);
const isDownloadingModel = downloadProgress && !downloadProgress.done;
@@ -101,6 +116,12 @@ export default function MessageList({
}
browserToolResult={browserToolResult}
lastToolQuery={lastToolQuery}
onAssistantEditStart={onAssistantEditStart}
onAssistantEditSave={onAssistantEditSave}
onAssistantEditCancel={onAssistantEditCancel}
assistantEditingIndex={assistantEditingIndex}
assistantEditIsSaving={assistantEditIsSaving}
assistantEditError={assistantEditError}
/>
</div>
);

View File

@@ -0,0 +1,227 @@
import { cn } from "@/lib/utils";
import type { HTMLAttributes, ReactElement } from "react";
import { createContext, memo, useContext, useEffect, useState } from "react";
import { Streamdown } from "streamdown";
export type MessageProps = HTMLAttributes<HTMLDivElement> & {
from: "user" | "assistant" | "system";
};
export const Message = ({ className, from, ...props }: MessageProps) => (
<div
className={cn(
"group flex w-full max-w-[80%] flex-col gap-2",
from === "user" ? "is-user ml-auto justify-end" : "is-assistant",
className,
)}
{...props}
/>
);
export type MessageContentProps = HTMLAttributes<HTMLDivElement>;
export const MessageContent = ({
children,
className,
...props
}: MessageContentProps) => (
<div
className={cn(
"is-user:dark flex w-fit flex-col gap-2 overflow-hidden text-sm",
"group-[.is-user]:ml-auto group-[.is-user]:rounded-lg group-[.is-user]:bg-neutral-100 dark:group-[.is-user]:bg-neutral-800 group-[.is-user]:px-4 group-[.is-user]:py-3 group-[.is-user]:text-neutral-900 dark:group-[.is-user]:text-neutral-100",
"group-[.is-assistant]:text-neutral-900 dark:group-[.is-assistant]:text-neutral-100",
className,
)}
{...props}
>
{children}
</div>
);
export type MessageActionsProps = HTMLAttributes<HTMLDivElement>;
export const MessageActions = ({
className,
children,
...props
}: MessageActionsProps) => (
<div className={cn("flex items-center gap-1", className)} {...props}>
{children}
</div>
);
export type MessageActionProps =
React.ButtonHTMLAttributes<HTMLButtonElement> & {
tooltip?: string;
label?: string;
active?: boolean;
};
export const MessageAction = ({
tooltip,
children,
label,
active = false,
className,
...props
}: MessageActionProps) => {
return (
<button
type="button"
title={tooltip || label}
aria-label={label || tooltip}
className={cn(
"flex h-7 w-7 items-center justify-center rounded-md transition-colors focus-visible:ring-2 focus-visible:ring-neutral-300 focus-visible:ring-offset-2 focus-visible:ring-offset-white dark:focus-visible:ring-neutral-500 dark:focus-visible:ring-offset-neutral-900",
active
? "bg-neutral-900 text-white dark:bg-white dark:text-neutral-900"
: "text-neutral-600 hover:bg-neutral-100 hover:text-neutral-900 dark:text-neutral-600 dark:hover:bg-neutral-800 dark:hover:text-neutral-100",
className,
)}
{...props}
>
{children}
{(label || tooltip) && (
<span className="sr-only">{label || tooltip}</span>
)}
</button>
);
};
type MessageBranchContextType = {
currentBranch: number;
totalBranches: number;
goToPrevious: () => void;
goToNext: () => void;
branches: ReactElement[];
setBranches: (branches: ReactElement[]) => void;
};
const MessageBranchContext = createContext<MessageBranchContextType | null>(
null,
);
const useMessageBranch = () => {
const context = useContext(MessageBranchContext);
if (!context) {
throw new Error(
"MessageBranch components must be used within MessageBranch",
);
}
return context;
};
export type MessageBranchProps = HTMLAttributes<HTMLDivElement> & {
defaultBranch?: number;
onBranchChange?: (branchIndex: number) => void;
};
export const MessageBranch = ({
defaultBranch = 0,
onBranchChange,
className,
...props
}: MessageBranchProps) => {
const [currentBranch, setCurrentBranch] = useState(defaultBranch);
const [branches, setBranches] = useState<ReactElement[]>([]);
const handleBranchChange = (newBranch: number) => {
setCurrentBranch(newBranch);
onBranchChange?.(newBranch);
};
const goToPrevious = () => {
const newBranch =
currentBranch > 0 ? currentBranch - 1 : branches.length - 1;
handleBranchChange(newBranch);
};
const goToNext = () => {
const newBranch =
currentBranch < branches.length - 1 ? currentBranch + 1 : 0;
handleBranchChange(newBranch);
};
const contextValue: MessageBranchContextType = {
currentBranch,
totalBranches: branches.length,
goToPrevious,
goToNext,
branches,
setBranches,
};
return (
<MessageBranchContext.Provider value={contextValue}>
<div
className={cn("grid w-full gap-2 [&>div]:pb-0", className)}
{...props}
/>
</MessageBranchContext.Provider>
);
};
export type MessageBranchContentProps = HTMLAttributes<HTMLDivElement>;
export const MessageBranchContent = ({
children,
...props
}: MessageBranchContentProps) => {
const { currentBranch, setBranches, branches } = useMessageBranch();
const childrenArray = Array.isArray(children) ? children : [children];
useEffect(() => {
if (branches.length !== childrenArray.length) {
setBranches(childrenArray);
}
}, [childrenArray, branches, setBranches]);
return childrenArray.map((branch, index) => (
<div
className={cn(
"grid gap-2 overflow-hidden [&>div]:pb-0",
index === currentBranch ? "block" : "hidden",
)}
key={branch.key}
{...props}
>
{branch}
</div>
));
};
export type MessageResponseProps = React.ComponentProps<typeof Streamdown>;
export const MessageResponse = memo(
({ className, ...props }: MessageResponseProps) => (
<Streamdown
className={cn(
"size-full [&>*:first-child]:mt-0 [&>*:last-child]:mb-0",
className,
)}
{...props}
/>
),
(prevProps, nextProps) => prevProps.children === nextProps.children,
);
MessageResponse.displayName = "MessageResponse";
export type MessageToolbarProps = HTMLAttributes<HTMLDivElement>;
export const MessageToolbar = ({
className,
children,
...props
}: MessageToolbarProps) => (
<div
className={cn(
"mt-4 flex w-full items-center justify-between gap-4",
className,
)}
{...props}
>
{children}
</div>
);

View File

@@ -1,5 +1,11 @@
import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
import { getChats, getChat, sendMessage, type ChatEventUnion } from "../api";
import {
getChats,
getChat,
sendMessage,
type ChatEventUnion,
updateChatMessage,
} from "../api";
import { Chat, ErrorEvent, Model } from "@/gotypes";
import { Message } from "@/gotypes";
import { useSelectedModel } from "./useSelectedModel";
@@ -713,6 +719,42 @@ export const useSendMessage = (chatId: string) => {
});
};
export const useUpdateChatMessage = (chatId: string) => {
const queryClient = useQueryClient();
return useMutation({
mutationKey: ["updateChatMessage", chatId],
mutationFn: ({ index, content }: { index: number; content: string }) =>
updateChatMessage(chatId, index, content),
onSuccess: ({ index, message }) => {
queryClient.setQueryData(
["chat", chatId],
(old: { chat: Chat } | undefined) => {
if (!old?.chat?.messages) {
return old;
}
const updatedMessages = [...old.chat.messages];
if (index < 0 || index >= updatedMessages.length) {
return old;
}
updatedMessages[index] = message;
return {
...old,
chat: new Chat({
...old.chat,
messages: updatedMessages,
}),
};
},
);
queryClient.invalidateQueries({ queryKey: ["chats"] });
},
});
};
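A minimal usage sketch for this hook (illustrative names; Chat.tsx earlier in this diff wires it up together with the editing state):

```typescript
// Hypothetical component snippet: chatId comes from props, index/content from the edit UI.
const updateMessage = useUpdateChatMessage(chatId);

const saveEdit = async (index: number, content: string) => {
  // Resolves after the PATCH succeeds and the ["chat", chatId] cache above is patched.
  await updateMessage.mutateAsync({ index, content });
};
```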
export const useCancelMessage = () => {
const {
abortControllers,

View File

@@ -0,0 +1,6 @@
import { clsx, type ClassValue } from "clsx";
import { twMerge } from "tailwind-merge";
export function cn(...inputs: ClassValue[]) {
return twMerge(clsx(inputs));
}
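For reference, a small illustrative call (isActive is a hypothetical flag); the merged result reflects how clsx drops falsy values and tailwind-merge resolves conflicting utilities:

```typescript
// The later "p-4" wins over "p-2"; the falsy branch is dropped when isActive is false.
cn("p-2", isActive && "bg-neutral-100", "p-4");
// => "bg-neutral-100 p-4" when isActive is true, "p-4" otherwise
```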

View File

@@ -25,6 +25,16 @@ type ChatResponse struct {
Chat store.Chat `json:"chat"`
}
type MessageUpdateRequest struct {
Content string `json:"content"`
}
type MessageUpdateResponse struct {
Index int `json:"index"`
ChatID string `json:"chatId"`
Message store.Message `json:"message"`
}
type Model struct {
Model string `json:"model"`
Digest string `json:"digest,omitempty"`

View File

@@ -162,7 +162,7 @@ func (s *Server) Handler() http.Handler {
// Add CORS headers for dev work
if CORS() {
w.Header().Set("Access-Control-Allow-Origin", "*")
w.Header().Set("Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS")
w.Header().Set("Access-Control-Allow-Methods", "GET, POST, PUT, PATCH, DELETE, OPTIONS")
w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With")
w.Header().Set("Access-Control-Allow-Credentials", "true")
@@ -246,6 +246,9 @@ func (s *Server) Handler() http.Handler {
mux.Handle("OPTIONS /", handle(func(w http.ResponseWriter, r *http.Request) error {
return nil
}))
mux.Handle("OPTIONS /api/v1/chat/{id}/messages/{index}", handle(func(w http.ResponseWriter, r *http.Request) error {
return nil
}))
// API routes - handle first to take precedence
mux.Handle("GET /api/v1/chats", handle(s.listChats))
@@ -254,6 +257,7 @@ func (s *Server) Handler() http.Handler {
mux.Handle("DELETE /api/v1/chat/{id}", handle(s.deleteChat))
mux.Handle("POST /api/v1/create-chat", handle(s.createChat))
mux.Handle("PUT /api/v1/chat/{id}/rename", handle(s.renameChat))
mux.Handle("PATCH /api/v1/chat/{id}/messages/{index}", handle(s.updateChatMessage))
mux.Handle("GET /api/v1/inference-compute", handle(s.getInferenceCompute))
mux.Handle("POST /api/v1/model/upstream", handle(s.modelUpstream))
@@ -1280,6 +1284,57 @@ func (s *Server) renameChat(w http.ResponseWriter, r *http.Request) error {
return nil
}
func (s *Server) updateChatMessage(w http.ResponseWriter, r *http.Request) error {
chatID := r.PathValue("id")
if chatID == "" {
return fmt.Errorf("chat ID is required")
}
indexParam := r.PathValue("index")
msgIndex, err := strconv.Atoi(indexParam)
if err != nil {
return fmt.Errorf("invalid message index: %w", err)
}
var req responses.MessageUpdateRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
return fmt.Errorf("invalid request body: %w", err)
}
if strings.TrimSpace(req.Content) == "" {
return fmt.Errorf("message content cannot be empty")
}
chat, err := s.Store.ChatWithOptions(chatID, true)
if err != nil {
return fmt.Errorf("failed to load chat: %w", err)
}
if msgIndex < 0 || msgIndex >= len(chat.Messages) {
return fmt.Errorf("message index out of range")
}
message := chat.Messages[msgIndex]
if message.Role != "assistant" {
return fmt.Errorf("only assistant messages can be updated")
}
message.Content = req.Content
message.UpdatedAt = time.Now()
chat.Messages[msgIndex] = message
if err := s.Store.SetChat(*chat); err != nil {
return fmt.Errorf("failed to update chat message: %w", err)
}
w.Header().Set("Content-Type", "application/json")
return json.NewEncoder(w).Encode(responses.MessageUpdateResponse{
Index: msgIndex,
ChatID: chatID,
Message: message,
})
}
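For illustration only (not part of the change), the new route can be exercised roughly like this, where API_BASE stands in for the app server's base URL and the index and content values are placeholders:

```shell
# Hypothetical request; mirrors the handler above, which accepts only assistant messages.
curl -X PATCH "$API_BASE/api/v1/chat/<chat-id>/messages/3" \
  -H "Content-Type: application/json" \
  -d '{"content": "Edited assistant response"}'
# Expected reply shape: {"index":3,"chatId":"<chat-id>","message":{...}}
```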
func (s *Server) deleteChat(w http.ResponseWriter, r *http.Request) error {
cid := r.PathValue("id")
if cid == "" {
@@ -1794,14 +1849,13 @@ func (s *Server) buildChatRequest(chat *store.Chat, model string, think any, ava
var thinkValue *api.ThinkValue
if think != nil {
// Only set Think if it's actually requesting thinking
if boolValue, ok := think.(bool); ok {
if boolValue {
thinkValue = &api.ThinkValue{Value: boolValue}
thinkValue = &api.ThinkValue{
Value: boolValue,
}
} else if stringValue, ok := think.(string); ok {
if stringValue != "" && stringValue != "none" {
thinkValue = &api.ThinkValue{Value: stringValue}
thinkValue = &api.ThinkValue{
Value: stringValue,
}
}
}

View File

@@ -94,9 +94,6 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
continue
} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
continue
} else if !envconfig.EnableVulkan() && strings.Contains(filepath.Base(dir), "vulkan") {
slog.Info("experimental Vulkan support disabled. To enable, set OLLAMA_VULKAN=1")
continue
}
dirs = []string{ml.LibOllamaPath, dir}
} else {

View File

@@ -13,23 +13,9 @@ Embeddings turn text into numeric vectors you can store in a vector database, se
## Generate embeddings
Use `/api/embed` with a single string.
<Tabs>
<Tab title="CLI">
Generate embeddings directly from the command line:
```shell
ollama run embeddinggemma "Hello world"
```
You can also pipe text to generate embeddings:
```shell
echo "Hello world" | ollama run embeddinggemma
```
Output is a JSON array.
</Tab>
<Tab title="cURL">
```shell
curl -X POST http://localhost:11434/api/embed \

View File

@@ -68,15 +68,6 @@ To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following c
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
```
## Vulkan Support
Vulkan is bundled into the `ollama/ollama` image.
```shell
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 -e OLLAMA_VULKAN=1 --name ollama ollama/ollama
```
## Run model locally
Now you can run a model:
@@ -88,4 +79,3 @@ docker exec -it ollama ollama run llama3.2
## Try different models
More models can be found on the [Ollama library](https://ollama.com/library).

View File

@@ -63,10 +63,6 @@
{
"source": "/api/openai",
"destination": "/api/openai-compatibility"
},
{
"source": "/api",
"destination": "/api/introduction"
}
],
"navigation": {
@@ -134,7 +130,7 @@
{
"group": "API Reference",
"pages": [
"/api/introduction",
"/api/index",
"/api/authentication",
"/api/streaming",
"/api/usage",

View File

@@ -223,7 +223,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
## How can I use Ollama in Visual Studio Code?
There is already a large collection of plugins available for VS Code as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
## How do I use Ollama with GPU acceleration in Docker?

View File

@@ -52,11 +52,7 @@ sudo modprobe nvidia_uvm`
## AMD Radeon
Ollama supports the following AMD GPUs via the ROCm library:
> [!NOTE]
> Additional AMD GPU support is provided by the Vulkan Library - see below.
Ollama supports the following AMD GPUs:
### Linux Support
@@ -128,39 +124,3 @@ accessing the AMD GPU devices. On the host system you can run
## Metal (Apple GPUs)
Ollama supports GPU acceleration on Apple devices via the Metal API.
## Vulkan GPU Support
> [!NOTE]
> Vulkan is currently an Experimental feature. To enable, you must set OLLAMA_VULKAN=1 for the Ollama server as
described in the [FAQ](faq.md#how-do-i-configure-ollama-server)
Additional GPU support on Windows and Linux is provided via
[Vulkan](https://www.vulkan.org/). On Windows most GPU vendors drivers come
bundled with Vulkan support and require no additional setup steps. Most Linux
distributions require installing additional components, and you may have
multiple options for Vulkan drivers between Mesa and GPU Vendor specific packages
- Linux Intel GPU Instructions - https://dgpu-docs.intel.com/driver/client/overview.html
- Linux AMD GPU Instructions - https://amdgpu-install.readthedocs.io/en/latest/install-script.html#specifying-a-vulkan-implementation
For AMD GPUs on some Linux distributions, you may need to add the `ollama` user to the `render` group.
The Ollama scheduler leverages available VRAM data reported by the GPU libraries to
make optimal scheduling decisions. Vulkan requires additional capabilities or
running as root to expose this available VRAM data. If neither root access or this
capability are granted, Ollama will use approximate sizes of the models
to make best effort scheduling decisions.
```bash
sudo setcap cap_perfmon+ep /usr/local/bin/ollama
```
### GPU Selection
To select specific Vulkan GPU(s), you can set the environment variable
`GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
described in the [FAQ](faq.md#how-do-i-configure-ollama-server). If you
encounter any problems with Vulkan based GPUs, you can disable all Vulkan GPUs
by setting `GGML_VK_VISIBLE_DEVICES=-1`

View File

@@ -4,7 +4,7 @@ title: VS Code
## Install
Install [VS Code](https://code.visualstudio.com/download).
Install [VSCode](https://code.visualstudio.com/download).
## Usage with Ollama
@@ -12,7 +12,7 @@ Install [VS Code](https://code.visualstudio.com/download).
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/vscode-sidebar.png"
alt="VS Code chat Sidebar"
alt="VSCode chat Sidebar"
width="75%"
/>
</div>
@@ -20,7 +20,7 @@ Install [VS Code](https://code.visualstudio.com/download).
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/vscode-models.png"
alt="VS Code model picker"
alt="VSCode model picker"
width="75%"
/>
</div>
@@ -28,7 +28,7 @@ Install [VS Code](https://code.visualstudio.com/download).
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/vscode-model-options.png"
alt="VS Code model options dropdown"
alt="VSCode model options dropdown"
width="75%"
/>
</div>

View File

@@ -2,15 +2,12 @@ openapi: 3.1.0
info:
title: Ollama API
version: 0.1.0
license:
name: MIT
url: https://opensource.org/licenses/MIT
description: |
OpenAPI specification for the Ollama HTTP API
servers:
- url: http://localhost:11434
description: Ollama
security: []
description: Local Ollama instance
components:
securitySchemes:
bearerAuth:
@@ -96,11 +93,8 @@ components:
type: boolean
default: true
think:
oneOf:
- type: boolean
- type: string
enum: [high, medium, low]
description: When true, returns separate thinking output in addition to content. Can be a boolean (true/false) or a string ("high", "medium", "low") for supported models.
type: boolean
description: When true, returns separate thinking output in addition to content
raw:
type: boolean
description: When true, returns the raw response from the model without any prompt templating
@@ -277,11 +271,8 @@ components:
type: boolean
default: true
think:
oneOf:
- type: boolean
- type: string
enum: [high, medium, low]
description: When true, returns separate thinking output in addition to content. Can be a boolean (true/false) or a string ("high", "medium", "low") for supported models.
type: boolean
description: When true, returns separate thinking output in addition to content
keep_alive:
oneOf:
- type: string
@@ -319,6 +310,7 @@ components:
type: array
items:
type: string
nullable: true
description: Optional base64-encoded images in the response
done:
type: boolean
@@ -375,6 +367,7 @@ components:
type: array
items:
type: string
nullable: true
description: Partial base64-encoded images, when present
done:
type: boolean
@@ -550,9 +543,6 @@ components:
license:
type: string
description: The license of the model
modified_at:
type: string
description: Last modified timestamp in ISO 8601 format
details:
type: object
description: High-level model details
@@ -632,9 +622,6 @@ components:
size_vram:
type: integer
description: VRAM usage in bytes
context_length:
type: integer
description: Context length for the running model
PsResponse:
type: object
properties:
@@ -1288,9 +1275,6 @@ paths:
example:
source: gemma3
destination: gemma3-backup
responses:
"200":
description: Model successfully copied
/api/pull:
post:
summary: Pull a model
@@ -1398,7 +1382,16 @@ paths:
model: gemma3
responses:
"200":
description: Model successfully deleted
description: Deletion status updates.
content:
application/json:
schema:
$ref: "#/components/schemas/StatusResponse"
example:
status: "success"
application/x-ndjson:
schema:
$ref: "#/components/schemas/StatusEvent"
/api/version:
get:
summary: Get version

View File

@@ -196,6 +196,8 @@ var (
NoPrune = Bool("OLLAMA_NOPRUNE")
// SchedSpread allows scheduling models across all GPUs.
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
// IntelGPU enables experimental Intel GPU detection.
IntelGPU = Bool("OLLAMA_INTEL_GPU")
// MultiUserCache optimizes prompt caching for multi-user scenarios
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
// Enable the new Ollama engine
@@ -204,8 +206,6 @@ var (
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
// Enable Vulkan backend
EnableVulkan = Bool("OLLAMA_VULKAN")
)
func String(s string) func() string {
@@ -314,7 +314,7 @@ func AsMap() map[string]EnvVar {
ret["GGML_VK_VISIBLE_DEVICES"] = EnvVar{"GGML_VK_VISIBLE_DEVICES", VkVisibleDevices(), "Set which Vulkan devices are visible by numeric ID"}
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_VULKAN"] = EnvVar{"OLLAMA_VULKAN", EnableVulkan(), "Enable experimental Vulkan support"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
}
return ret

View File

@@ -797,6 +797,73 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
return
}
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
if llm.KV().Uint("vision.block_count") == 0 {
return
}
for name, layer := range llm.Tensors().GroupLayers() {
if name == "v" || strings.HasPrefix(name, "v.") {
for _, tensor := range layer {
weights += tensor.Size()
}
}
}
imageSize := uint64(llm.KV().Uint("vision.image_size"))
patchSize := uint64(llm.KV().Uint("vision.patch_size"))
if patchSize == 0 {
slog.Warn("unknown patch size for vision model")
return
}
numChannels := uint64(llm.KV().Uint("vision.num_channels"))
numPatches := (imageSize / patchSize) * (imageSize / patchSize)
if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++
}
headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
switch llm.KV().Architecture() {
case "mllama":
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))
graphSize = 4 * (8 +
imageSize*imageSize*numChannels*maxNumTiles +
embeddingLength*numPatches*maxNumTiles +
9*embeddingLength*numPaddedPatches*maxNumTiles +
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
case "gemma3", "mistral3":
graphSize = 4 * (imageSize*imageSize*numChannels +
embeddingLength*patchSize +
numPatches*numPatches*headCount)
case "qwen25vl":
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
numPatches := maxPixels / (patchSize * patchSize)
graphSize = 4 * (maxPixels*numChannels + // Original image storage
// Normalized pixels
maxPixels*numChannels +
// Patches storage (numPatches * channels * patchSize^2)
numPatches*numChannels*patchSize*patchSize +
// Self-attention calculations
numPatches*numPatches*headCount +
// Additional buffer for processing
embeddingLength*numPatches)
case "llama4":
// vision graph is computed independently in the same schedule
// and is negligible compared to the worst case text graph
}
return weights, graphSize
}
// SupportsKVCacheType checks if the requested cache type is supported
func (f GGML) SupportsKVCacheType(cacheType string) bool {
if cacheType == "" || cacheType == "f16" {

View File

@@ -14,23 +14,6 @@ import (
"github.com/ollama/ollama/api"
)
func assertBytesMatchToken(t *testing.T, label, token string, ints []int) {
t.Helper()
raw := []byte(token)
if len(ints) != len(raw) {
t.Errorf("%s expected %d bytes for token %q, got %d (%v)", label, len(raw), token, len(ints), ints)
return
}
for i, b := range raw {
if ints[i] != int(b) {
t.Errorf("%s byte[%d] mismatch for token %q: got %d want %d", label, i, token, ints[i], int(b))
return
}
}
}
func TestAPIGenerate(t *testing.T) {
initialTimeout := 60 * time.Second
streamTimeout := 30 * time.Second
@@ -398,182 +381,3 @@ func TestAPIShowModel(t *testing.T) {
t.Errorf("%s missing modified_at: %#v", modelName, resp)
}
}
func TestAPIGenerateLogprobs(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, smol); err != nil {
t.Fatalf("pull failed %s", err)
}
enableLogprobs := true
noStream := false
tests := []struct {
name string
logprobs *bool
topLogprobs int
expectCount int
}{
{
name: "no_logprobs",
logprobs: nil,
topLogprobs: 0,
expectCount: 0,
},
{
name: "logprobs_only",
logprobs: &enableLogprobs,
topLogprobs: 0,
expectCount: 1,
},
{
name: "logprobs_with_top_5",
logprobs: &enableLogprobs,
topLogprobs: 5,
expectCount: 1,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
req := api.GenerateRequest{
Model: smol,
Prompt: "Why is the sky blue?",
Stream: &noStream,
Logprobs: test.logprobs != nil && *test.logprobs,
TopLogprobs: test.topLogprobs,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_predict": 10,
},
}
var response api.GenerateResponse
err := client.Generate(ctx, &req, func(resp api.GenerateResponse) error {
if resp.Done {
response = resp
}
return nil
})
if err != nil {
t.Fatalf("generate failed: %s", err)
}
// Check logprobs based on expectation
if test.expectCount == 0 {
if len(response.Logprobs) > 0 {
t.Errorf("expected no logprobs but got %d", len(response.Logprobs))
}
} else {
if len(response.Logprobs) == 0 {
t.Errorf("expected logprobs but got none")
}
// Validate each logprob entry
for i, lp := range response.Logprobs {
if lp.Token == "" {
t.Errorf("logprob[%d] has empty token", i)
}
if lp.Logprob > 0 {
t.Errorf("logprob[%d] has positive logprob %f (should be <= 0)", i, lp.Logprob)
}
assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d]", i), lp.Token, lp.Bytes)
// Check top_logprobs if requested
if test.topLogprobs > 0 {
if len(lp.TopLogprobs) == 0 {
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
}
if len(lp.TopLogprobs) > test.topLogprobs {
t.Errorf("logprob[%d] has %d top_logprobs, expected max %d", i, len(lp.TopLogprobs), test.topLogprobs)
}
// Verify top_logprobs are sorted by probability (descending)
for j := 1; j < len(lp.TopLogprobs); j++ {
if lp.TopLogprobs[j-1].Logprob < lp.TopLogprobs[j].Logprob {
t.Errorf("logprob[%d].top_logprobs not sorted: %f < %f", i, lp.TopLogprobs[j-1].Logprob, lp.TopLogprobs[j].Logprob)
}
}
for j, top := range lp.TopLogprobs {
assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
}
} else if len(lp.TopLogprobs) > 0 {
t.Errorf("logprob[%d] has top_logprobs but none were requested", i)
}
}
}
})
}
}
func TestAPIChatLogprobs(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, smol); err != nil {
t.Fatalf("pull failed %s", err)
}
enableLogprobs := true
noStream := false
req := api.ChatRequest{
Model: smol,
Messages: []api.Message{
{Role: "user", Content: "Say hello in one word"},
},
Stream: &noStream,
Logprobs: enableLogprobs,
TopLogprobs: 3,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_predict": 5,
},
}
var response api.ChatResponse
err := client.Chat(ctx, &req, func(resp api.ChatResponse) error {
if resp.Done {
response = resp
}
return nil
})
if err != nil {
t.Fatalf("chat failed: %s", err)
}
if len(response.Logprobs) == 0 {
t.Fatal("expected logprobs in response but got none")
}
t.Logf("received %d logprobs for chat response", len(response.Logprobs))
for i, lp := range response.Logprobs {
if lp.Token == "" {
t.Errorf("logprob[%d] has empty token", i)
}
if lp.Logprob > 0 {
t.Errorf("logprob[%d] has positive logprob %f", i, lp.Logprob)
}
assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d]", i), lp.Token, lp.Bytes)
if len(lp.TopLogprobs) == 0 {
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
}
if len(lp.TopLogprobs) > 3 {
t.Errorf("logprob[%d] has %d top_logprobs, expected max 3", i, len(lp.TopLogprobs))
}
for j, top := range lp.TopLogprobs {
assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
}
}
}

View File

@@ -63,13 +63,8 @@ func BackendInit() {
C.llama_backend_init()
}
type Devices struct {
ml.DeviceID
LlamaID uint64
}
func EnumerateGPUs() []Devices {
var ids []Devices
func EnumerateGPUs() []ml.DeviceID {
var ids []ml.DeviceID
for i := range C.ggml_backend_dev_count() {
device := C.ggml_backend_dev_get(i)
@@ -79,12 +74,9 @@ func EnumerateGPUs() []Devices {
C.GGML_BACKEND_DEVICE_TYPE_IGPU:
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(device, &props)
ids = append(ids, Devices{
DeviceID: ml.DeviceID{
ID: C.GoString(props.id),
Library: C.GoString(props.library),
},
LlamaID: uint64(i),
ids = append(ids, ml.DeviceID{
ID: C.GoString(props.id),
Library: C.GoString(props.library),
})
}
}
@@ -225,21 +217,7 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
return embeddings
}
// GetLogitsIth gets the logits for the ith token
func (c *Context) GetLogitsIth(i int) []float32 {
logits := unsafe.Pointer(C.llama_get_logits_ith(c.c, C.int32_t(i)))
if logits == nil {
return nil
}
vocabSize := c.Model().NumVocab()
result := make([]float32, vocabSize)
_ = copy(result, unsafe.Slice((*float32)(logits), vocabSize))
return result
}
type ModelParams struct {
Devices []uint64
NumGpuLayers int
MainGpu int
UseMmap bool
@@ -263,21 +241,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.use_mmap = C.bool(params.UseMmap)
cparams.vocab_only = C.bool(params.VocabOnly)
var devices []C.ggml_backend_dev_t
for _, llamaID := range params.Devices {
devices = append(devices, C.ggml_backend_dev_get(C.size_t(llamaID)))
}
if len(devices) > 0 {
devices = append(devices, C.ggml_backend_dev_t(C.NULL))
devicesData := &devices[0]
var devicesPin runtime.Pinner
devicesPin.Pin(devicesData)
defer devicesPin.Unpin()
cparams.devices = devicesData
}
if len(params.TensorSplit) > 0 {
tensorSplitData := &params.TensorSplit[0]
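
The EnumerateGPUs change above replaces the wrapper struct with plain ml.DeviceID values, so callers no longer see the llama.cpp device ordinal. A minimal sketch of the resulting call-site shape, assuming only the ID and Library fields visible in the diff:

// Log every GPU-class device ggml reports, using the trimmed-down identifier.
for _, dev := range llama.EnumerateGPUs() {
	fmt.Printf("ggml device %s (library %s)\n", dev.ID, dev.Library)
}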


@@ -1,32 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 29 Oct 2025 03:53:04 -0500
Subject: [PATCH] vulkan: Call ggml_vk_buffer_write_2d from ggml_vk_buffer_copy
(#16793)
This lets the copy to the destination device use the host-visible
vidmem optimization.
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 221e29509..18b7cbccf 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
ggml_vk_ensure_sync_staging_buffer(src->device, size);
- ggml_vk_ensure_sync_staging_buffer(dst->device, size);
// Copy to src staging buffer
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
- // memcpy to dst staging buffer
- memcpy(dst->device->sync_staging->ptr, src->device->sync_staging->ptr, size);
// Copy to dst buffer
- ggml_vk_buffer_copy(dst, dst_offset, dst->device->sync_staging, 0, size);
+ ggml_vk_buffer_write_2d(dst, dst_offset, src->device->sync_staging->ptr, 0, size, 1);
}
}


@@ -1,657 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 29 Oct 2025 08:44:29 -0500
Subject: [PATCH] vulkan: Update topk_moe fusion to handle gpt's late softmax
(#16656)
* vulkan: Update topk_moe fusion to handle gpt's late softmax
Based on #16649.
* Add ggml_check_edges
* Add sync logging to show fusion effects
* handle clamp added in #16655
* Update ggml/src/ggml-impl.h
Co-authored-by: Diego Devesa <slarengh@gmail.com>
---
ggml/src/ggml-impl.h | 16 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 304 +++++++++++-------
.../ggml-vulkan/vulkan-shaders/topk_moe.comp | 90 ++++--
3 files changed, 272 insertions(+), 138 deletions(-)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 639d551a2..e5c446d1d 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -693,6 +693,7 @@ GGML_API void ggml_dxgi_pdh_release();
#endif
#ifdef __cplusplus
+#include <array>
#include <initializer_list>
#include <vector>
@@ -708,6 +709,21 @@ inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
}
+// Return true if the edges in the graph match expectations.
+inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
+ int start_idx,
+ std::initializer_list<std::array<int, 3>> edges) {
+ for (const auto & edge : edges) {
+ int dst_node = edge[0];
+ int src_idx = edge[1];
+ int src_node = edge[2];
+ if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
+ return false;
+ }
+ }
+ return true;
+}
+
// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 53b57c179..b2855b078 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;
static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
static constexpr uint32_t num_topk_moe_pipelines = 10;
-static constexpr std::array topk_moe_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
- GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
- GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };
-static constexpr std::array topk_moe { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
- GGML_OP_VIEW, GGML_OP_GET_ROWS };
+static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
+ GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
+ GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
+ GGML_OP_RESHAPE };
+static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
+ GGML_OP_VIEW, GGML_OP_GET_ROWS };
+static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW,
+ GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
+ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
+
+//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ]
+//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
+//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
+//node #981 ( VIEW): ffn_moe_topk-15 ( 0K) [Vulka ] use=4: ffn_moe_argsort-15 ( 0K) [Vulka ]
+//node #982 ( GET_ROWS): ffn_moe_weights-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 (re ( 0K) [Vulka ] ffn_moe_topk-15 ( 0K) [Vulka ]
+//node #983 ( RESHAPE): ffn_moe_weights-15 ( ( 0K) [Vulka ] use=2: ffn_moe_weights-15 ( 0K) [Vulka ]
+//node #984 ( SUM_ROWS): ffn_moe_weights_sum- ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ]
+//node #985 ( CLAMP): ffn_moe_weights_sum_ ( 0K) [Vulka ] use=1: ffn_moe_weights_sum- ( 0K) [Vulka ]
+//node #986 ( DIV): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] ffn_moe_weights_sum_ ( 0K) [Vulka ]
+//node #987 ( RESHAPE): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights_norm ( 0K) [Vulka ]
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_norm_edges {
+ { 1, 0, 0 }, // reshape->src[0] == softmax
+ { 2, 0, 0 }, // argsort->src[0] == softmax
+ { 3, 0, 2 }, // view->src[0] == argsort
+ { 4, 0, 1 }, // get_rows->src[0] == reshape
+ { 4, 1, 3 }, // get_rows->src[1] == view
+ { 5, 0, 4 }, // reshape->src[0] == get_rows
+ { 6, 0, 5 }, // sum_rows->src[0] == reshape
+ { 7, 0, 6 }, // clamp->src[0] == sum_rows
+ { 8, 0, 5 }, // div->src[0] == reshape
+ { 8, 1, 7 }, // div->src[1] == clamp
+ { 9, 0, 8 }, // reshape->src[0] == div
+};
+
+// same as early_softmax_norm but ending after the get_rows
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_edges {
+ { 1, 0, 0 }, // reshape->src[0] == softmax
+ { 2, 0, 0 }, // argsort->src[0] == softmax
+ { 3, 0, 2 }, // view->src[0] == argsort
+ { 4, 0, 1 }, // get_rows->src[0] == reshape
+ { 4, 1, 3 }, // get_rows->src[1] == view
+};
+//node #652 ( ARGSORT): ffn_moe_argsort-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 ( 0K) [Vulka ]
+//node #653 ( VIEW): ffn_moe_topk-11 ( 0K) [Vulka ] use=7: ffn_moe_argsort-11 ( 0K) [Vulka ]
+//node #654 ( GET_ROWS): ffn_moe_weights-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 (re ( 0K) [Vulka ] ffn_moe_topk-11 ( 0K) [Vulka ]
+//node #655 ( RESHAPE): ffn_moe_weights-11 ( ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( 0K) [Vulka ]
+//node #656 ( SOFT_MAX): node_656 ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( ( 0K) [Vulka ]
+//node #657 ( RESHAPE): ffn_moe_weights_soft ( 0K) [Vulka ] use=1: node_656 ( 0K) [Vulka ]
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_late_softmax_edges {
+ { 1, 0, 0 }, // view->src[0] == argsort
+ { 2, 1, 1 }, // get_rows->src[1] == view
+ { 3, 0, 2 }, // reshape->src[0] == get_rows
+ { 4, 0, 3 }, // soft_max->src[0] == reshape
+ { 5, 0, 4 }, // reshape->src[0] == soft_max
+};
+
+enum topk_moe_mode {
+ TOPK_MOE_EARLY_SOFTMAX,
+ TOPK_MOE_EARLY_SOFTMAX_NORM,
+ TOPK_MOE_LATE_SOFTMAX,
+ TOPK_MOE_COUNT,
+};
+
+static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
+ topk_moe_mode mode = num == topk_moe_early_softmax_norm.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX_NORM :
+ num == topk_moe_early_softmax.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX :
+ TOPK_MOE_LATE_SOFTMAX;
+ return mode;
+}
struct vk_device_struct {
std::recursive_mutex mutex;
@@ -607,8 +671,7 @@ struct vk_device_struct {
vk_pipeline pipeline_flash_attn_split_k_reduce;
- // [2] is {!norm, norm}
- vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
+ vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
std::vector<vk_pipeline_ref> all_pipelines;
@@ -956,6 +1019,8 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
struct vk_op_topk_moe_push_constants {
uint32_t n_rows;
uint32_t n_expert_used;
+ float clamp_min;
+ float clamp_max;
};
struct vk_op_add_id_push_constants {
@@ -3806,8 +3871,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][0], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0}, 1, true, true);
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][1], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1}, 1, true, true);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true);
}
for (auto &c : compiles) {
@@ -8085,8 +8151,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
if (ctx->num_additional_fused_ops) {
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
GGML_ASSERT(idx < num_topk_moe_pipelines);
- bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
- return ctx->device->pipeline_topk_moe[idx][with_norm];
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
+ return ctx->device->pipeline_topk_moe[idx][mode];
}
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -8141,6 +8207,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return nullptr;
}
case GGML_OP_ARGSORT:
+ if (ctx->num_additional_fused_ops) {
+ uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
+ GGML_ASSERT(idx < num_topk_moe_pipelines);
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
+ return ctx->device->pipeline_topk_moe[idx][mode];
+ }
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
return ctx->device->pipeline_argsort_f32[idx];
@@ -9676,10 +9749,12 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
- bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
- ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
- ggml_tensor * ids = cgraph->nodes[node_idx + 3];
+ ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
+ (mode == TOPK_MOE_EARLY_SOFTMAX) ? cgraph->nodes[node_idx + 4] :
+ cgraph->nodes[node_idx + 5];
+ ggml_tensor * ids = (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3];
GGML_ASSERT(logits->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
@@ -9738,9 +9813,14 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
GGML_ASSERT(d_ids != nullptr);
}
- vk_op_topk_moe_push_constants pc;
+ vk_op_topk_moe_push_constants pc {};
pc.n_rows = n_rows;
pc.n_expert_used = n_expert_used;
+ if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
+ ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
+ pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
+ pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
+ }
GGML_ASSERT(n_expert_used <= n_experts);
@@ -11335,7 +11415,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}
}
+
+#define ENABLE_SYNC_LOGGING 0
+
if (need_sync) {
+#if ENABLE_SYNC_LOGGING
+ std::cerr << "sync" << std::endl;
+#endif
ctx->unsynced_nodes_written.clear();
ctx->unsynced_nodes_read.clear();
ggml_vk_sync_buffers(ctx, compute_ctx);
@@ -11353,6 +11439,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}
}
+#if ENABLE_SYNC_LOGGING
+ if (!dryrun) {
+ for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+ auto *n = cgraph->nodes[node_idx + i];
+ std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
+ if (n->op == GGML_OP_GLU) {
+ std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
+ }
+ std::cerr << std::endl;
+ }
+ }
+#endif
switch (node->op) {
case GGML_OP_REPEAT:
@@ -11531,7 +11629,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_ARGSORT:
- ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+ if (ctx->num_additional_fused_ops) {
+ ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ } else {
+ ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+ }
break;
case GGML_OP_SUM:
@@ -12329,30 +12431,27 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st
}
static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
- int node_idx, bool with_norm) {
+ int node_idx, topk_moe_mode mode) {
- if (with_norm) {
- if (node_idx + (int)topk_moe_norm.size() > cgraph->n_nodes) {
- return false;
- }
- for (size_t i = 0; i < topk_moe_norm.size(); ++i) {
- if (cgraph->nodes[node_idx + i]->op != topk_moe_norm[i]) {
- return false;
- }
- }
- } else {
- if (node_idx + (int)topk_moe.size() > cgraph->n_nodes) {
- return false;
- }
- for (size_t i = 0; i < topk_moe.size(); ++i) {
- if (cgraph->nodes[node_idx + i]->op != topk_moe[i]) {
- return false;
- }
- }
- }
+ const ggml_tensor * softmax;
+ const ggml_tensor * weights;
- const ggml_tensor * softmax = cgraph->nodes[node_idx + 0];
- const ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
+ switch (mode) {
+ case TOPK_MOE_EARLY_SOFTMAX_NORM:
+ softmax = cgraph->nodes[node_idx + 0];
+ weights = cgraph->nodes[node_idx + 9];
+ break;
+ case TOPK_MOE_EARLY_SOFTMAX:
+ softmax = cgraph->nodes[node_idx + 0];
+ weights = cgraph->nodes[node_idx + 4];
+ break;
+ case TOPK_MOE_LATE_SOFTMAX:
+ softmax = cgraph->nodes[node_idx + 4];
+ weights = cgraph->nodes[node_idx + 5];
+ break;
+ default:
+ return false;
+ }
const float * op_params = (const float *)softmax->op_params;
@@ -12378,60 +12477,6 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
return false;
}
- // Check that the nodes don't have any unexpected uses
- const ggml_tensor * reshape1 = cgraph->nodes[node_idx + 1];
- const ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
- const ggml_tensor * view = cgraph->nodes[node_idx + 3];
- const ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
- const ggml_tensor * reshape5 = with_norm ? cgraph->nodes[node_idx + 5] : nullptr;
- const ggml_tensor * sum_rows = with_norm ? cgraph->nodes[node_idx + 6] : nullptr;
- const ggml_tensor * div = with_norm ? cgraph->nodes[node_idx + 7] : nullptr;
- const ggml_tensor * reshape8 = with_norm ? cgraph->nodes[node_idx + 8] : nullptr;
-
- // softmax is used by reshape and argsort
- if (ggml_node_get_use_count(cgraph, node_idx) != 2 ||
- reshape1->src[0] != softmax ||
- argsort->src[0] != softmax) {
- return false;
- }
- // reshape is used by get_rows
- if (ggml_node_get_use_count(cgraph, node_idx + 1) != 1 ||
- get_rows->src[0] != reshape1) {
- return false;
- }
- // argsort is used by view
- if (ggml_node_get_use_count(cgraph, node_idx + 2) != 1 ||
- view->src[0] != argsort) {
- return false;
- }
- // view is written (via argsort), we can skip checking it
-
- if (with_norm) {
- // get_rows is used by reshape
- if (ggml_node_get_use_count(cgraph, node_idx + 4) != 1 ||
- reshape5->src[0] != get_rows) {
- return false;
- }
-
- // reshape is used by sum_rows and div
- if (ggml_node_get_use_count(cgraph, node_idx + 5) != 2 ||
- sum_rows->src[0] != reshape5 ||
- div->src[0] != reshape5) {
- return false;
- }
-
- // sum_rows is used by div
- if (ggml_node_get_use_count(cgraph, node_idx + 6) != 1 ||
- div->src[1] != sum_rows) {
- return false;
- }
-
- // div/reshape are written
- if (reshape8->src[0] != div) {
- return false;
- }
- }
-
if (!ctx->device->subgroup_arithmetic ||
!ctx->device->subgroup_shuffle ||
!ctx->device->subgroup_require_full_support ||
@@ -12517,10 +12562,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->num_additional_fused_ops = num_adds - 1;
} else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
- ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
- ctx->num_additional_fused_ops = topk_moe.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
}
}
ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
@@ -12618,10 +12671,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->num_additional_fused_ops = num_adds - 1;
} else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
- ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
- ctx->num_additional_fused_ops = topk_moe.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
}
}
@@ -12754,25 +12815,44 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
while (first_unused < graph->n_nodes) {
std::vector<int> current_set;
- // Avoid reordering topk_moe_norm
- if (first_unused + (int)topk_moe_norm.size() <= graph->n_nodes) {
- bool is_topk_moe_norm = true;
- for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
- if (graph->nodes[first_unused + j]->op != topk_moe_norm[j] || used[first_unused + j]) {
- is_topk_moe_norm = false;
+ // Check for fusion patterns and avoid reordering them
+ auto const &match_pattern = [&](const std::initializer_list<ggml_op> &pattern, int start) -> bool {
+ if (start + (int)pattern.size() <= graph->n_nodes) {
+ bool is_pattern = true;
+ for (size_t j = 0; j < pattern.size(); ++j) {
+ if (graph->nodes[start + j]->op != pattern.begin()[j] || used[start + j]) {
+ is_pattern = false;
+ }
}
+ return is_pattern;
}
- if (is_topk_moe_norm) {
- for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
+ return false;
+ };
+
+ auto const &keep_pattern = [&](const std::initializer_list<ggml_op> &pattern) -> bool {
+ if (match_pattern(pattern, first_unused)) {
+ for (size_t j = 0; j < pattern.size(); ++j) {
new_order.push_back(graph->nodes[first_unused + j]);
used[first_unused + j] = true;
}
while (first_unused < graph->n_nodes && used[first_unused]) {
first_unused++;
}
- continue;
+ return true;
}
+ return false;
+ };
+
+ if (keep_pattern(topk_moe_early_softmax_norm)) {
+ continue;
+ }
+ if (keep_pattern(topk_moe_early_softmax)) {
+ continue;
}
+ if (keep_pattern(topk_moe_late_softmax)) {
+ continue;
+ }
+
// First, grab the next unused node.
current_set.push_back(first_unused);
@@ -12790,6 +12870,12 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
if (is_empty(graph->nodes[j])) {
continue;
}
+ // Don't pull forward nodes from fusion patterns
+ if (match_pattern(topk_moe_early_softmax_norm, j) ||
+ match_pattern(topk_moe_early_softmax, j) ||
+ match_pattern(topk_moe_late_softmax, j)) {
+ continue;
+ }
bool ok = true;
for (int c = first_unused; c < j; ++c) {
if (!used[c] &&
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
index 9e56d5f8a..bc1c278bf 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
@@ -11,6 +11,8 @@ layout (push_constant) uniform parameter
{
uint n_rows;
uint n_expert_used;
+ float clamp_min;
+ float clamp_max;
};
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
@@ -18,6 +20,7 @@ layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
layout(constant_id = 0) const uint WARP_SIZE = 32;
layout(constant_id = 1) const uint n_experts = 512;
layout(constant_id = 2) const bool with_norm = true;
+layout(constant_id = 3) const bool late_softmax = false;
const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
@@ -25,53 +28,72 @@ layout (binding = 0, std430) readonly buffer Logits {float logits[];};
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};
-void main() {
- const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
- if (row >= n_rows) {
- return;
- }
+const float INFINITY = 1.0 / 0.0;
- const uint logits_offset = n_experts * row;
- const uint weights_offset = n_expert_used * row;
- const uint ids_offset = n_experts * row;
-
- float logits_r[experts_per_thread];
-
- const float INFINITY = 1.0 / 0.0;
+// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
+void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit, const uint lane, const bool use_limit) {
+ float max_val = -INFINITY;
[[unroll]]
- for (uint i = 0; i < n_experts; i += WARP_SIZE) {
- const uint expert = i + gl_LocalInvocationID.x;
- logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[logits_offset + expert] : -INFINITY;
+ for (int i = 0; i < experts_per_thread; i++) {
+ const uint idx = lane + i * WARP_SIZE;
+ const bool is_active = !use_limit || (idx < limit);
+ if (is_active) {
+ max_val = max(max_val, vals[i]);
+ }
}
- float max_val = logits_r[0];
+ max_val = subgroupMax(max_val);
+
+ float sum = 0.f;
[[unroll]]
- for (int i = 1; i < experts_per_thread; i++) {
- const float val = logits_r[i];
- max_val = max(val, max_val);
+ for (int i = 0; i < experts_per_thread; i++) {
+ const uint idx = lane + i * WARP_SIZE;
+ const bool is_active = !use_limit || (idx < limit);
+ if (is_active) {
+ const float val = exp(vals[i] - max_val);
+ vals[i] = val;
+ sum += val;
+ } else {
+ vals[i] = 0.f;
+ }
}
- max_val = subgroupMax(max_val);
+ sum = subgroupAdd(sum);
- float wt[experts_per_thread];
- float tmp = 0.f;
+ const float inv_sum = 1.0f / sum;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
- const float val = logits_r[i];
- wt[i] = exp(val - max_val);
- tmp += wt[i];
+ const uint idx = lane + i * WARP_SIZE;
+ const bool is_active = !use_limit || (idx < limit);
+ if (is_active) {
+ vals[i] *= inv_sum;
+ }
}
+}
- tmp = subgroupAdd(tmp);
+void main() {
+ const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
+ if (row >= n_rows) {
+ return;
+ }
- const float inv_sum = 1.0f / tmp;
+ const uint logits_offset = n_experts * row;
+ const uint weights_offset = n_expert_used * row;
+ const uint ids_offset = n_experts * row;
+
+ float wt[experts_per_thread];
[[unroll]]
- for (int i = 0; i < experts_per_thread; i++) {
- wt[i] = wt[i] * inv_sum;
+ for (uint i = 0; i < n_experts; i += WARP_SIZE) {
+ const uint expert = i + gl_LocalInvocationID.x;
+ wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
+ }
+
+ if (!late_softmax) {
+ softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
}
// at this point, each thread holds a portion of softmax,
@@ -82,6 +104,11 @@ void main() {
float output_weights[experts_per_thread];
+ [[unroll]]
+ for (int i = 0; i < experts_per_thread; i++) {
+ output_weights[i] = 0.f;
+ }
+
for (int k = 0; k < n_expert_used; k++) {
float max_val = wt[0];
uint max_expert = gl_LocalInvocationID.x;
@@ -121,6 +148,7 @@ void main() {
if (with_norm) {
wt_sum = subgroupAdd(wt_sum);
+ wt_sum = clamp(wt_sum, clamp_min, clamp_max);
const float inv_sum = 1.0f / wt_sum;
[[unroll]]
@@ -129,6 +157,10 @@ void main() {
}
}
+ if (late_softmax) {
+ softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
+ }
+
[[unroll]]
for (uint i = 0; i < experts_per_thread; ++i) {
uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;

File diff suppressed because it is too large.


@@ -1,85 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Thu, 30 Oct 2025 01:27:41 -0500
Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 4 ++++
ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp | 16 ++++++++++++----
2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index aaf4334b5..3604ceb04 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {
struct vk_op_argsort_push_constants {
uint32_t ncols;
+ uint32_t nrows;
int32_t order;
};
@@ -8710,6 +8711,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
break;
case GGML_OP_ARGSORT:
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+ elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
break;
case GGML_OP_IM2COL:
{
@@ -9952,9 +9954,11 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
int32_t * op_params = (int32_t *)dst->op_params;
uint32_t ncols = src0->ne[0];
+ uint32_t nrows = ggml_nrows(src0);
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
ncols,
+ nrows,
op_params[0],
}, dryrun);
}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
index c81b84452..c4e68bc02 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
@@ -14,6 +14,7 @@ layout (binding = 1) buffer D {int data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
+ uint nrows;
uint order;
} p;
@@ -26,10 +27,9 @@ void swap(uint idx0, uint idx1) {
dst_row[idx1] = tmp;
}
-void argsort(bool needs_bounds_check) {
+void argsort(bool needs_bounds_check, const uint row) {
// bitonic sort
const int col = int(gl_LocalInvocationID.x);
- const uint row = gl_WorkGroupID.y;
const uint row_offset = row * p.ncols;
@@ -72,8 +72,16 @@ void argsort(bool needs_bounds_check) {
void main() {
if (p.ncols == BLOCK_SIZE) {
- argsort(false);
+ uint row = gl_WorkGroupID.y;
+ while (row < p.nrows) {
+ argsort(false, row);
+ row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+ }
} else {
- argsort(true);
+ uint row = gl_WorkGroupID.y;
+ while (row < p.nrows) {
+ argsort(true, row);
+ row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+ }
}
}


@@ -1,77 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <picard12@live.de>
Date: Fri, 31 Oct 2025 08:14:49 +0100
Subject: [PATCH] vulkan: fix shmem overrun in mmq id shader (#16873)
* vulkan: fix shmem overrun in mmq id shader
* metal : fix mul_mm_id
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
ggml/src/ggml-metal/ggml-metal-device.cpp | 2 +-
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp | 4 ++++
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl | 2 +-
tests/test-backend-ops.cpp | 3 +++
4 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index 758116342..c78082ac3 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -677,7 +677,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_
char name[256];
snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
- snprintf(name, 256, "%s", base);
+ snprintf(name, 256, "%s_ne02=%d", base, ne02);
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
index 8b238ac4b..d955b4fc7 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
@@ -82,9 +82,13 @@ layout (constant_id = 10) const uint WARP = 32;
#include "mul_mmq_shmem_types.glsl"
+#ifdef MUL_MAT_ID
+#define BK_STEP 1
+#else
#ifndef BK_STEP
#define BK_STEP 4
#endif
+#endif
// Shared memory cache
shared block_a_cache buf_a[BM * BK_STEP];
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
index 72fec4404..1c0f5306f 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
@@ -27,7 +27,7 @@ struct block_a_cache {
#elif defined(DATA_A_Q8_0)
#define QUANT_R_MMQ 1
// AMD likes 4, Intel likes 1 and Nvidia likes 2
-#define BK_STEP 1
+// #define BK_STEP 1
struct block_a_cache {
int32_t qs[32/4];
FLOAT_TYPE dm;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 657b6cc2f..1f8dda383 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6722,6 +6722,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
+ // gpt-oss issue with Vulkan mmq_id
+ test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
+
for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {4, 8}) {


@@ -1,80 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Masato Nakasaka <masato.nakasaka@intel.com>
Date: Fri, 31 Oct 2025 16:18:59 +0900
Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
supported (#16796)
* Experimenting crash fix
* added assert for aborting and fixed comment
* changed to check if a pipeline is empty or not
* Moved function in class definition
* replaced with is_empty
* Modified is_empty to check only unaligned pipelines
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3604ceb04..80185d9f0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
struct vk_matmul_pipeline_struct {
vk_pipeline l, m, s;
vk_pipeline a_l, a_m, a_s;
+ // Returns true when all unaligned pipelines are null.
+ // We only check for unaligned variants since one of the unaligned pipelines must exist
+ // while aligned pipelines are optional
+ bool is_empty() const {
+ return l == nullptr && m == nullptr && s == nullptr;
+ }
};
-
typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;
struct vk_matmul_pipeline2 {
@@ -5080,7 +5085,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
if (src1_type == GGML_TYPE_Q8_1) {
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc;
- if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
+ if (pipelines->is_empty()) {
return nullptr;
}
@@ -5229,7 +5234,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
if (src1_type == GGML_TYPE_Q8_1) {
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc;
- if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
+ if (pipelines->is_empty()) {
return nullptr;
}
@@ -5264,16 +5269,17 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
return nullptr;
}
+ vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
// XXX TODO 'prec' is not actually allowed in mul_mat_id.
bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/;
- bool support_fp16acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc != nullptr;
- bool support_fp32acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc != nullptr;
+ bool support_fp16acc = !mmp.f16acc->is_empty();
+ bool support_fp32acc = !mmp.f32acc->is_empty();
if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) {
- return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc;
+ return mmp.f16acc;
} else {
GGML_ASSERT(support_fp32acc);
- return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc;
+ return mmp.f32acc;
}
}

llm/memory.go Normal file

@@ -0,0 +1,516 @@
package llm
import (
"fmt"
"log/slog"
"os"
"slices"
"sort"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
for _, gl := range ml.ByLibrary(gpus) {
sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
		// Note: at present, this favors the GPUs with the most currently available VRAM (sorted descending) and ignores faster GPU speed in mixed setups
sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
return gpuSubset
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
"parallel", numParallel,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
return sgl
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
byLibrary := ml.ByLibrary(gpus)
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range ml.ByLibrary(allGpus) {
var layerCount int
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
return true, estimatedVRAM
}
} else {
if layerCount > 0 && layerCount >= opts.NumGPU {
return true, estimatedVRAM
}
}
}
return false, estimatedVRAM
}
func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool {
estimate := estimateGPULayers(nil, f, projectors, opts, numParallel)
if estimate.TotalSize > systemInfo.FreeMemory {
return false
}
slog.Info("new model will fit in available system memory for CPU inference, loading",
"model", modelPath,
"parallel", numParallel,
"required", format.HumanBytes2(estimate.TotalSize),
)
return true
}
type MemoryEstimate struct {
// How many layers we predict we can load
Layers int
// The size of the graph which occupies the main GPU
Graph uint64
// How much VRAM will be allocated given the number of layers we predict
VRAMSize uint64
// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit []int
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
projectorWeights, projectorGraph uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be from the same Library
func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
// Graph size when all layers are offloaded, applies to all GPUs
var graphFullOffload uint64
// Final graph offload once we know full or partial
var graphOffload uint64
// Projectors loaded into GPU0 only
var llamaEngineProjectorWeights uint64
// Projectors loaded with output layer
var ollamaEngineProjectorWeights uint64
var ollamaEngineProjectorGraph uint64
// Conditional output size on GPU 0
var memoryLayerOutput uint64
// The sizes of a layer
var layerSize uint64
// The sum of all the layer sizes (just for logging)
var memoryWeights uint64
// True if all the layers are loaded
var fullyLoaded bool
// Overflow that didn't fit into the GPU
var overflow uint64
overhead := envconfig.GpuOverhead()
availableList := make([]string, len(gpus))
libraries := []string{}
for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
if !slices.Contains(libraries, gpu.Library) {
libraries = append(libraries, gpu.Library)
}
}
if len(libraries) == 0 {
libraries = []string{"cpu"}
}
slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
}
if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
}
layers := f.Tensors().GroupLayers()
	// add one layer's worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
layerSize = blk0.Size()
} else {
slog.Warn("model missing blk.0 layer size")
}
useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
ml.FlashAttentionSupported(gpus) &&
f.SupportsFlashAttention()
var kvct string
if useFlashAttention {
requested := strings.ToLower(envconfig.KvCacheType())
if f.SupportsKVCacheType(requested) {
kvct = requested
}
}
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention)
if len(kv) > 0 {
layerSize += kv[0]
}
var kvTotal uint64
for _, kvLayer := range kv {
kvTotal += kvLayer
}
if graphPartialOffload == 0 {
headsKV := f.KV().HeadCountKVMin()
if headsKV == 0 {
headsKV = 1
}
gqa := f.KV().HeadCountMax() / headsKV
graphPartialOffload = gqa * kvTotal / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
// on metal there's no partial offload overhead
if len(gpus) > 0 && gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 {
// multigpu should always use the partial graph size
graphFullOffload = graphPartialOffload
}
// Output layer handled at the end if we have space
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.Size()
}
if layer, ok := layers["output"]; ok {
memoryLayerOutput += layer.Size()
} else if layer, ok := layers["token_embd"]; ok {
memoryLayerOutput += layer.Size()
}
gpuZeroOverhead := llamaEngineProjectorWeights
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
tensorSplit := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
g *ml.DeviceInfo
}
gpusWithSpace := []gs{}
for i := range gpus {
var gzo uint64
if len(gpusWithSpace) == 0 {
gzo = gpuZeroOverhead
}
		// Only include GPUs that can fit the graph, the GPU minimum memory, the layer buffer, and at least one more layer
if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers",
"id", gpus[i].ID,
"library", gpus[i].Library,
"compute", gpus[i].Compute(),
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
"name", gpus[i].Name,
"total", format.HumanBytes2(gpus[i].TotalMemory),
"available", format.HumanBytes2(gpus[i].FreeMemory),
"minimum_memory", gpus[i].MinimumMemory,
"layer_size", format.HumanBytes2(layerSize),
"gpu_zer_overhead", format.HumanBytes2(gzo),
"partial_offload", format.HumanBytes2(graphPartialOffload),
"full_offload", format.HumanBytes2(graphFullOffload),
)
continue
}
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full
}
var gpuZeroID int
if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead
} else {
overflow += gpuZeroOverhead
}
// For all the layers, find where they can fit on the GPU(s)
for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size()
layerSize += kv[i]
memoryWeights += blk.Size()
}
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU
overflow += layerSize
continue
}
// distribute the layers across the GPU(s) that have space
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[i%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+layerSize {
gpuAllocations[g.i] += layerSize
tensorSplit[g.i]++
layerCount++
break
} else {
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}
if len(gpusWithSpace) == 0 {
overflow += layerSize
}
}
if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true
}
// Determine if we need to consider output then find where it fits
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
if memoryLastLayer > 0 {
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
tensorSplit[g.i]++
layerCount++
break
}
}
}
if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLastLayer
}
}
// Add the applicable (full or partial) graph allocations
for i := range gpus {
if tensorSplit[i] <= 0 {
continue
}
if fullyLoaded {
gpuAllocations[i] += graphFullOffload
} else {
gpuAllocations[i] += graphPartialOffload
}
}
if fullyLoaded {
graphOffload = graphFullOffload
} else {
graphOffload = graphPartialOffload
}
// Summaries for the log
var memoryRequiredPartial, memoryRequiredTotal uint64
for i := range gpuAllocations {
memoryRequiredPartial += gpuAllocations[i]
}
memoryRequiredTotal = memoryRequiredPartial + overflow
allocationsList := []string{}
for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a))
}
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: strings.Join(libraries, ","),
layersRequested: opts.NumGPU,
layersModel: int(f.KV().BlockCount()) + 1,
availableList: availableList,
kv: kvTotal,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
projectorGraph: ollamaEngineProjectorGraph,
}
if len(gpus) == 0 {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) LogValue() slog.Value {
attrs := []slog.Attr{
slog.String("library", m.inferenceLibrary),
slog.Group(
"layers",
// requested number of layers to offload
"requested", m.layersRequested,
// The number of layers the model has (including output)
"model", m.layersModel,
// estimated number of layers that can be offloaded
"offload", m.Layers,
// multi-gpu split for tensors
"split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available by GPU for offloading
"available", m.availableList,
"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(m.TotalSize),
			// memory required to offload the estimated number of layers
"partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
"kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
"allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
// memory of repeating layers
"repeating", format.HumanBytes2(m.memoryWeights),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
}
if m.projectorWeights > 0 {
attrs = append(attrs, slog.Group(
"projector",
"weights", format.HumanBytes2(m.projectorWeights),
"graph", format.HumanBytes2(m.projectorGraph),
))
}
return slog.GroupValue(attrs...)
}
func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, err := ggml.Decode(file, 1024)
if err != nil {
return 0
}
for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
}
return weights
}
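
estimateGPULayers reports its multi-GPU placement through TensorSplit, one layer count per GPU, alongside the per-GPU byte totals in GPUSizes. A small sketch of turning that split into the comma-separated form a tensor-split style runner flag usually expects (the exact flag is not part of this file and is assumed; strconv and strings are standard library):

// tensorSplitArg renders a per-GPU layer split such as []int{3, 3} as "3,3".
// An empty split means the model stays on a single device or on the CPU.
func tensorSplitArg(split []int) string {
	parts := make([]string, len(split))
	for i, n := range split {
		parts[i] = strconv.Itoa(n)
	}
	return strings.Join(parts, ",")
}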

llm/memory_test.go Normal file

@@ -0,0 +1,130 @@
package llm
import (
"bytes"
"fmt"
"os"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
func TestEstimateGPULayers(t *testing.T) {
t.Setenv("OLLAMA_DEBUG", "1")
t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
t.Setenv("OLLAMA_CONTEXT_LENGTH", "2048")
modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)
require.NoError(t, err)
defer f.Close()
inputLayerCount := 5
tensors := []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}
assert.Len(t, tensors, inputLayerCount+1)
err = ggml.WriteGGUF(f, ggml.KV{
"general.architecture": "llama",
"llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096),
"llama.block_count": uint32(inputLayerCount),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(32),
"tokenizer.ggml.tokens": []string{" "},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, tensors)
require.NoError(t, err)
ggml, err := LoadModel(f.Name(), 0)
if err != nil {
t.Fatal(err)
}
// Simple CPU scenario
gpus := []ml.DeviceInfo{}
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
// derived from the dummy ggml file above
graphPartialOffload := uint64(202377216)
graphFullOffload := uint64(171968512)
layerSize := uint64(33554436)
projectorSize := uint64(0)
memoryLayerOutput := uint64(4)
// Dual CUDA scenario with asymmetry
gpuMinimumMemory := uint64(457 * format.MebiByte)
gpus = []ml.DeviceInfo{
{
DeviceID: ml.DeviceID{
Library: "CUDA",
},
},
{
DeviceID: ml.DeviceID{
Library: "CUDA",
},
},
}
	// Scenarios: GPU0 layer space, GPU1 layer space, expected layers on GPU0, expected layers on GPU1
for i, s := range []struct {
layer0, layer1 uint64
expect0, expect1 int
}{
{1, 1, 1, 1},
{2, 1, 2, 1},
{2, 2, 2, 2},
{1, 2, 1, 2},
{3, 3, 3, 3},
{4, 4, 3, 3},
{6, 6, 3, 3},
{0, 3, 0, 3},
} {
t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
gpus[0].FreeMemory = 0
gpus[1].FreeMemory = 0
gpus[0].FreeMemory += projectorSize
if s.layer0 > 0 {
gpus[0].FreeMemory += memoryLayerOutput
} else {
gpus[1].FreeMemory += memoryLayerOutput
}
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1)
assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64
for _, b := range estimate.GPUSizes {
layerSums += b
}
if estimate.Layers < inputLayerCount+1 {
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
} else {
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
}
})
}
}
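For readers tracing the scenario table above, the following standalone sketch shows how the loop composes each GPU's FreeMemory so that exactly the requested number of repeating layers fit. The constants are the ones derived from the dummy GGUF; `wantLayers` and `holdsOutput` are hypothetical inputs for illustration, not part of the test.

```go
package main

import "fmt"

func main() {
	const (
		mebiByte          = 1024 * 1024
		gpuMinimumMemory  = 457 * mebiByte // per-GPU reserve used by the test
		layerSize         = 33554436       // one repeating layer of the dummy model
		graph             = 202377216      // max(graphFullOffload, graphPartialOffload)
		memoryLayerOutput = 4              // output layer size in the dummy model
	)

	wantLayers := uint64(2) // hypothetical: repeating layers this GPU should hold
	holdsOutput := true     // hypothetical: whether the output layer lands on this GPU

	free := uint64(gpuMinimumMemory) + layerSize + wantLayers*layerSize + 1 + graph
	if holdsOutput {
		free += memoryLayerOutput
	}
	fmt.Println(free) // value to report as this GPU's FreeMemory in a scenario
}
```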

View File

@@ -84,21 +84,25 @@ type LlamaServer interface {
// llmServer is an instance of a runner hosting a single model
type llmServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
modelPath string
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
numParallel int
modelPath string
loadRequest LoadRequest // Parameters used to initialize the runner
mem *ml.BackendMemory // Memory allocations for this model
loadRequest LoadRequest // Parameters used to initialize the runner
// llamaModel is an instance of the cgo llama.cpp model definition
// nil if this server is running the new engine
llamaModel *llama.Model
llamaModelLock *sync.Mutex
// textProcessor handles text encoding/decoding for the model in the Ollama engine
// nil if this server is running the llama.cpp based engine
textProcessor model.TextProcessor
totalLayers uint64
loadStart time.Time // Record how long it took the model to load
loadProgress float32
@@ -109,13 +113,15 @@ type llmServer struct {
type llamaServer struct {
llmServer
ggml *ggml.GGML
ggml *ggml.GGML
gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate
estimate MemoryEstimate
}
type ollamaServer struct {
llmServer
textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
mem *ml.BackendMemory
}
// LoadModel will load a model from disk. The model must be in the GGML format.
@@ -239,6 +245,8 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
loadRequest: loadRequest,
llamaModel: llamaModel,
llamaModelLock: &sync.Mutex{},
textProcessor: textProcessor,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: f.KV().BlockCount() + 1,
loadStart: time.Now(),
@@ -273,7 +281,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}()
if textProcessor != nil {
return &ollamaServer{llmServer: s, textProcessor: textProcessor}, nil
return &ollamaServer{llmServer: s}, nil
} else {
return &llamaServer{llmServer: s, ggml: f}, nil
}
@@ -455,226 +463,169 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
systemTotalMemory := systemInfo.TotalMemory
systemFreeMemory := systemInfo.FreeMemory
systemSwapFreeMemory := systemInfo.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
// Synthesize memory allocation information based on our estimates
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Name: "CPU",
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range s.mem.GPUs {
s.mem.GPUs[i].Name = gpus[i].Name
s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
}
kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize),
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
// Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers()
if blk0, ok := layers["blk.0"]; ok {
for i := range gpus {
gpus[i].FreeMemory -= blk0.Size() + kv[0]
if len(gpus) == 0 || s.options.NumGPU == 0 {
if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
}
} else {
slog.Warn("model missing blk.0 layer size")
}
// Assign all the layers to the CPU for now, they will get reassigned later
for i := range s.ggml.KV().BlockCount() {
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
s.mem.CPU.Weights[i] = blk.Size()
s.mem.CPU.Cache[i] += kv[i]
}
}
// We historically haven't included InputWeights in the model size
var outputWeights uint64
if layer, ok := layers["output_norm"]; ok {
outputWeights += layer.Size()
}
if layer, ok := layers["output"]; ok {
outputWeights += layer.Size()
} else if layer, ok := layers["token_embd"]; ok {
outputWeights += layer.Size()
}
s.mem.CPU.Weights[s.totalLayers-1] = outputWeights
// The vision projector is always loaded on the first GPU if available.
// This can't be assigned by us, so just subtract it from free space
projectorGPU := -1
var projectorWeights uint64
if len(gpus) > 0 {
for _, projector := range s.loadRequest.LoraPath {
projectorWeights += projectorMemoryRequirements(projector)
}
// llama.cpp uses the first discrete GPU if available, otherwise the first iGPU
firstIntegrated := -1
for i := range gpus {
if !gpus[i].Integrated {
projectorGPU = i
break
}
if firstIntegrated == -1 {
firstIntegrated = i
g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
if g == nil {
if !requireFull {
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return nil, ErrLoadRequiredFull
}
}
if projectorGPU == -1 {
projectorGPU = firstIntegrated
gpus = g
}
s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel)
if len(gpus) >= 1 {
switch {
case s.options.NumGPU == 0:
gpus = []ml.DeviceInfo{}
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
gpus = []ml.DeviceInfo{}
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
gpus = []ml.DeviceInfo{}
case s.options.NumGPU < 0 && s.estimate.Layers > 0:
s.options.NumGPU = s.estimate.Layers
}
gpus[projectorGPU].FreeMemory -= projectorWeights
} else {
s.options.NumGPU = 0
}
var kvTotal uint64
for _, kvLayer := range kv {
kvTotal += kvLayer
}
if graphPartialOffload == 0 {
headsKV := s.ggml.KV().HeadCountKVMin()
if headsKV == 0 {
headsKV = 1
}
gqa := s.ggml.KV().HeadCountMax() / headsKV
graphPartialOffload = gqa * kvTotal / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
// On Metal there's no partial offload overhead
if len(gpus) > 0 && gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload
}
// Create a layout based on the memory data that we've built. The compute graph
// for GPUs is iteratively assigned based on the number of GPUs that are required.
var gpuLayers ml.GPULayersList
for {
prevGPULayers := gpuLayers
var err error
gpuLayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, 0)
if err != nil {
return nil, err
}
if len(gpuLayers) > len(prevGPULayers) {
for _, gl := range gpuLayers {
for i := range s.mem.GPUs {
if gl.DeviceID == s.mem.GPUs[i].DeviceID {
s.mem.GPUs[i].Graph = max(graphPartialOffload, graphFullOffload)
break
}
}
}
} else {
break
// On Linux and Windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize
available := systemInfo.FreeMemory + systemInfo.FreeSwap
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
// This maintains the historical assignment of graph sizes, though it isn't fully accurate
graphSize := graphFullOffload
if gpuLayers.Sum() < int(s.totalLayers) {
graphSize = graphPartialOffload
}
slog.Info("offload", "", s.estimate)
// For all layers that we have assigned to GPUs, move them in the memory data so
// that it is reported accurately
for _, gl := range gpuLayers {
for i := range s.mem.GPUs {
if gl.DeviceID == s.mem.GPUs[i].DeviceID {
for _, l := range gl.Layers {
s.mem.GPUs[i].Weights[l] = s.mem.CPU.Weights[l]
s.mem.GPUs[i].Cache[l] = s.mem.CPU.Cache[l]
s.gpus = gpus
s.loadRequest.GPULayers = createGPULayers(s.estimate, s.ggml, gpus, s.options.NumGPU)
s.mem.CPU.Weights[l] = 0
s.mem.CPU.Cache[l] = 0
}
// Mmap is only supported on the llama engine
if s.textProcessor == nil {
s.loadRequest.UseMmap = true
s.mem.GPUs[i].Graph = graphSize
break
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
s.options.UseMMap = new(bool)
*s.options.UseMMap = false
}
}
}
if projectorGPU > 0 && len(s.mem.GPUs[projectorGPU].Weights) > 0 {
s.mem.GPUs[projectorGPU].Weights[s.totalLayers-1] += projectorWeights
}
slog.Debug("memory", "estimate", s.mem)
s.mem.Log(slog.LevelInfo)
// The llama engine uses mmap by default
s.loadRequest.UseMmap = true
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.totalLayers {
s.options.UseMMap = new(bool)
*s.options.UseMMap = false
// Windows CUDA should not use mmap for best performance
// On Linux, if the model is larger than free memory, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
(len(gpus) == 0 && s.options.UseMMap == nil) ||
(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
s.loadRequest.UseMmap = false
}
}
// Windows CUDA should not use mmap for best performance
// On Linux, if the model is larger than free memory, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.TotalSize() && s.options.UseMMap == nil) ||
(len(gpus) == 0 && s.options.UseMMap == nil) ||
(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
s.loadRequest.UseMmap = false
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return nil, err
}
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return nil, err
}
// On the Ollama engine, we can print out a summary of the memory allocations.
// We don't have this for the llama engine but it does something similar itself.
if s.textProcessor != nil {
resp.Memory.Log(slog.LevelInfo)
}
if !resp.Success {
slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
return nil, errors.New("failed to allocate memory for model")
}
// The llama engine does its memory allocations together with model loading, so we
// need to wait until it is done to ensure that we have accurate memory data before
// loading the next model
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
if s.textProcessor == nil {
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
} else {
return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
}
}
func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, err := ggml.Decode(file, 1024)
if err != nil {
return 0
// createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment
// of particular layers onto GPUs
func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList {
if numGPU <= 0 || len(gpus) == 0 {
return nil
}
for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
gpuLayers := make(ml.GPULayersList, len(gpus))
for i := range gpuLayers {
gpuLayers[i].DeviceID = gpus[i].DeviceID
}
return weights
var sum float32
splits := make([]float32, len(estimate.TensorSplit))
// cumulative sum of all splits
for i := range splits {
sum += float32(estimate.TensorSplit[i])
splits[i] = sum
}
if sum <= 0 {
return nil
}
// normalize splits
for i := range splits {
splits[i] /= sum
}
blocks := int(ggml.KV().BlockCount())
gpuRangeStart := max(0, blocks-numGPU)
gpuRangeStop := min(gpuRangeStart+numGPU, blocks+1)
for i := range blocks + 1 {
if i < gpuRangeStart || i >= gpuRangeStop {
continue
}
index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
if index < 0 || index >= len(gpus) {
continue
}
gpuLayers[index].Layers = append(gpuLayers[index].Layers, i)
}
return gpuLayers
}
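To make the cumulative-split arithmetic in createGPULayers concrete, here is a minimal, self-contained sketch with hypothetical numbers (a 1:2 tensor split, a 6-block model, everything offloaded); it mirrors the mapping above rather than calling the real function.

```go
package main

import "fmt"

func main() {
	// Hypothetical inputs: estimate.TensorSplit of 1:2 across two GPUs,
	// a 6-block model, and numGPU large enough to offload everything.
	splits := []float32{1, 2}
	blocks, numGPU := 6, 7 // 6 repeating blocks plus the output layer

	// Cumulative sum, then normalize: [1, 2] -> [1, 3] -> [0.33, 1.0]
	var sum float32
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}
	for i := range splits {
		splits[i] /= sum
	}

	start := max(0, blocks-numGPU)      // 0
	stop := min(start+numGPU, blocks+1) // 7
	assign := make([][]int, len(splits))
	for i := start; i < stop; i++ {
		frac := float32(i-start) / float32(stop-start)
		for g := range splits {
			if frac < splits[g] {
				assign[g] = append(assign[g], i)
				break
			}
		}
	}
	fmt.Println(assign) // [[0 1 2] [3 4 5 6]]
}
```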
// Load finds the optimal layout of layers to offload on GPUs, starting with no initial information about the size of the model
@@ -701,6 +652,23 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
systemTotalMemory := systemInfo.TotalMemory
systemFreeMemory := systemInfo.FreeMemory
systemSwapFreeMemory := systemInfo.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
for _, gpu := range gpus {
available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory()
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() {
available = 0
}
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
"available", format.HumanBytes2(available),
"free", format.HumanBytes2(gpu.FreeMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory()),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
}
pastAllocations := make(map[uint64]struct{})
var backoff float32
@@ -866,22 +834,25 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
// - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph
// - Assigning layers
// - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory
func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
if memory == nil {
memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}}
}
gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff)
if err != nil {
return nil, err
}
err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
if err != nil {
return nil, err
}
return gpuLayers, nil
}
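The steps createLayout performs (listed in its comment above) all work from a per-GPU budget: what remains of free VRAM after the configured overhead, the minimum reserve, and the compute graph are carved out. A rough standalone sketch of that budget follows; it is not the actual implementation, and all numbers are hypothetical except the 457 MiB minimum reserve borrowed from the tests.

```go
package main

import "fmt"

func main() {
	free := uint64(8 << 30)      // reported free VRAM on one GPU (hypothetical)
	overhead := uint64(0)        // OLLAMA_GPU_OVERHEAD, defaults to 0
	minimum := uint64(457 << 20) // per-GPU minimum reserve
	graph := uint64(512 << 20)   // compute graph reserved on this GPU (hypothetical)

	var budget uint64
	if free > overhead+minimum+graph {
		budget = free - overhead - minimum - graph
	}

	layer := uint64(300 << 20) // one repeating layer: weights plus KV cache (hypothetical)
	fmt.Println("layers that fit:", budget/layer)
}
```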
func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64) {
func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) {
gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus)))
@@ -939,11 +910,11 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
gpuLayers = libraryGpuLayers
}
}
return gpuLayers, layers
return gpuLayers, layers, nil
}
// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
// These sizes will only increase as we go through additional iterations and get additional information.
cpuSize := memory.InputWeights + memory.CPU.Graph
var vramSize uint64
@@ -971,13 +942,11 @@ nextLayer:
if requireFull {
if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
return ErrLoadRequiredFull
}
if cpuSize > systemInfo.FreeMemory {
slog.Info("model requires more system memory than is currently available, evicting a model to make space", "required", cpuSize, "free", systemInfo.FreeMemory)
return fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
return ErrLoadRequiredFull
}
}
@@ -1007,13 +976,6 @@ nextLayer:
// assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
// If the user is manually overriding parameters, treat all GPUs equally so they split according to VRAM
if requestedLayers >= 0 || envconfig.SchedSpread() {
for i := range gpus {
gpus[i].Integrated = false
}
}
// If we can't fit everything then prefer offloading layers other than the output layer
for range 2 {
// requestedLayers may be -1 if nothing was requested
@@ -1046,38 +1008,33 @@ func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, reque
// findBestFit binary searches to find the smallest capacity factor that can fit
// the max number of layers. The capacity factor is multiplied by the free space on
// each GPU and a small one will force even balancing. Higher performance GPUs are
// used first.
// each GPU and a small one will force even balancing.
func findBestFit(layers []uint64, gpus []ml.DeviceInfo, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
for _, gl := range ml.ByPerformance(gpus) {
var high float32 = 1
var low float32 = 0
var high float32 = 1
var low float32 = 0
// If we need to fulfill the requested number of layers, pretend we have almost infinite VRAM
if requestedLayers >= 0 && forceRequest {
high = 1000
}
bestAssignments := greedyFit(layers, gl, high, requestedLayers)
maxNumGPU := bestAssignments.Sum()
for high-low > 1e-6 {
mid := (low + high) / 2
assignments := greedyFit(layers, gl, mid, requestedLayers)
if assignments.Sum() == maxNumGPU {
high = mid
bestAssignments = assignments
} else {
low = mid
}
}
layers = layers[:len(layers)-bestAssignments.Sum()]
requestedLayers -= bestAssignments.Sum()
gpuLayers = append(bestAssignments, gpuLayers...)
// If we need to fulfill the requested number of layers, pretend we have almost infinite VRAM
if requestedLayers >= 0 && forceRequest {
high = 1000
}
return gpuLayers
bestAssignments := greedyFit(layers, gpus, high, requestedLayers)
maxNumGPU := bestAssignments.Sum()
if maxNumGPU == 0 {
return bestAssignments
}
for high-low > 1e-6 {
mid := (low + high) / 2
assignments := greedyFit(layers, gpus, mid, requestedLayers)
if assignments.Sum() == maxNumGPU {
high = mid
bestAssignments = assignments
} else {
low = mid
}
}
return bestAssignments
}
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
@@ -1405,12 +1362,6 @@ type CompletionRequest struct {
Grammar string // set before sending the request to the subprocess
Shift bool
Truncate bool
// Logprobs specifies whether to include log probabilities in the response
Logprobs bool
// TopLogprobs specifies the number of most likely alternative tokens to return (0-20)
TopLogprobs int
}
// DoneReason represents the reason why a completion response is done
@@ -1436,18 +1387,6 @@ func (d DoneReason) String() string {
}
}
// TokenLogprob represents log probability information for a single token alternative.
type TokenLogprob struct {
Token string `json:"token"`
Logprob float64 `json:"logprob"`
}
// Logprob contains log probability information for a generated token.
type Logprob struct {
TokenLogprob
TopLogprobs []TokenLogprob `json:"top_logprobs,omitempty"`
}
type CompletionResponse struct {
Content string `json:"content"`
DoneReason DoneReason `json:"done_reason"`
@@ -1456,9 +1395,6 @@ type CompletionResponse struct {
PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
EvalCount int `json:"eval_count"`
EvalDuration time.Duration `json:"eval_duration"`
// Logprobs contains log probability information if requested
Logprobs []Logprob `json:"logprobs,omitempty"`
}
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
@@ -1594,8 +1530,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
Logprobs: c.Logprobs,
Content: c.Content,
})
}
@@ -1688,59 +1623,68 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
return e.Embedding, nil
}
func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
s.llamaModelLock.Lock()
defer s.llamaModelLock.Unlock()
if s.llamaModel == nil {
return nil, fmt.Errorf("no tokenizer configured")
if s.llamaModel != nil {
return s.llamaModel.Tokenize(content, false, true)
}
return s.llamaModel.Tokenize(content, false, true)
if s.textProcessor != nil {
tokens, err := s.textProcessor.Encode(content, false)
if err != nil {
return nil, err
}
toks := make([]int, len(tokens))
for i, t := range tokens {
toks[i] = int(t)
}
return toks, nil
}
// not reached
return nil, fmt.Errorf("no tokenizer configured")
}
func (s *ollamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
tokens, err := s.textProcessor.Encode(content, false)
if err != nil {
return nil, err
}
toks := make([]int, len(tokens))
for i, t := range tokens {
toks[i] = int(t)
}
return toks, nil
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
func (s *llamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
type DetokenizeResponse struct {
Content string `json:"content"`
}
func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
s.llamaModelLock.Lock()
defer s.llamaModelLock.Unlock()
if s.llamaModel == nil {
return "", fmt.Errorf("no tokenizer configured")
if s.llamaModel != nil {
var resp string
for _, token := range tokens {
resp += s.llamaModel.TokenToPiece(token)
}
return resp, nil
}
var resp string
for _, token := range tokens {
resp += s.llamaModel.TokenToPiece(token)
if s.textProcessor != nil {
toks := make([]int32, len(tokens))
for i, t := range tokens {
toks[i] = int32(t)
}
content, err := s.textProcessor.Decode(toks)
if err != nil {
return "", err
}
return content, nil
}
return resp, nil
}
func (s *ollamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
toks := make([]int32, len(tokens))
for i, t := range tokens {
toks[i] = int32(t)
}
content, err := s.textProcessor.Decode(toks)
if err != nil {
return "", err
}
return content, nil
// not reached
return "", fmt.Errorf("no tokenizer configured")
}
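A hypothetical caller-side round trip through the tokenizer methods above, assuming a model is already loaded and that the llm.LlamaServer interface exposes both Tokenize and Detokenize (as the concrete implementations here suggest); this is illustrative only and not part of the diff.

```go
// Package llmexample is an illustrative sketch, not part of this change.
package llmexample

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/llm"
)

// RoundTrip encodes a prompt and decodes it back, assuming s has a loaded model.
func RoundTrip(ctx context.Context, s llm.LlamaServer, prompt string) error {
	tokens, err := s.Tokenize(ctx, prompt)
	if err != nil {
		return err
	}
	text, err := s.Detokenize(ctx, tokens)
	if err != nil {
		return err
	}
	fmt.Printf("%d tokens -> %q\n", len(tokens), text)
	return nil
}
```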
func (s *llmServer) Close() error {
@@ -1768,12 +1712,31 @@ func (s *llmServer) Close() error {
return nil
}
func (s *llamaServer) VRAMSize() uint64 {
return s.estimate.VRAMSize
}
func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
}
}
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *llmServer) VRAMSize() uint64 {
func (s *ollamaServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
}
@@ -1801,7 +1764,7 @@ func (s *llmServer) VRAMSize() uint64 {
return mem
}
func (s *llmServer) TotalSize() uint64 {
func (s *ollamaServer) TotalSize() uint64 {
if s.mem == nil {
return 0
}
@@ -1815,7 +1778,7 @@ func (s *llmServer) TotalSize() uint64 {
return mem
}
func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 {
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}

View File

@@ -14,11 +14,16 @@ import (
)
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
id ml.DeviceID
free int
}
minMemory := 457 * format.MebiByte
tests := []struct {
name string
gpus []ml.DeviceInfo
gpus []gpu
layers []int
numGPU int
requireFull bool
@@ -33,91 +38,91 @@ func TestLLMServerFitGPU(t *testing.T) {
},
{
name: "Full single GPU",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
@@ -125,54 +130,12 @@ func TestLLMServerFitGPU(t *testing.T) {
},
{
name: "requireFull numGPU",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256 * format.MebiByte)}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 4,
requireFull: true,
expectedErr: ErrLoadRequiredFull,
},
{
name: "iGPU",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
},
{
name: "iGPU + dGPU",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "iGPU + dGPU fits on one",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1}}},
},
{
name: "iGPU + dGPU partial",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "iGPU + dGPU numGPU 1",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "iGPU + dGPU numGPU 999",
gpus: []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1, 2, 3}}},
},
}
for _, tt := range tests {
@@ -182,6 +145,12 @@ func TestLLMServerFitGPU(t *testing.T) {
systemInfo.FreeMemory = 512 * format.MebiByte
systemInfo.FreeSwap = 256 * format.MebiByte
gpus := make([]ml.DeviceInfo, len(tt.gpus))
for i := range tt.gpus {
gpus[i].DeviceID = tt.gpus[i].id
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
s := &ollamaServer{
llmServer: llmServer{
totalLayers: uint64(len(tt.layers)),
@@ -196,19 +165,19 @@ func TestLLMServerFitGPU(t *testing.T) {
s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}, GPUs: make([]ml.DeviceMemory, len(tt.gpus))}
}, GPUs: make([]ml.DeviceMemory, len(gpus))}
for i := range tt.layers {
s.mem.CPU.Weights[i] = uint64(tt.layers[i])
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].DeviceID = tt.gpus[i].DeviceID
s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
}
gpuLayers, err := s.createLayout(systemInfo, tt.gpus, s.mem, tt.requireFull, 0)
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
if err != tt.expectedErr {
t.Fatalf("fitGPU returned error: %v", err)
}

16
macapp/.eslintrc.json Normal file
View File

@@ -0,0 +1,16 @@
{
"env": {
"browser": true,
"es6": true,
"node": true
},
"extends": [
"eslint:recommended",
"plugin:@typescript-eslint/eslint-recommended",
"plugin:@typescript-eslint/recommended",
"plugin:import/recommended",
"plugin:import/electron",
"plugin:import/typescript"
],
"parser": "@typescript-eslint/parser"
}

92
macapp/.gitignore vendored Normal file
View File

@@ -0,0 +1,92 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
.DS_Store
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Webpack
.webpack/
# Vite
.vite/
# Electron-Forge
out/

21
macapp/README.md Normal file
View File

@@ -0,0 +1,21 @@
# Desktop
This app builds upon Ollama to provide a desktop experience for running models.
## Developing
First, build the `ollama` binary:
```shell
cd ..
go build .
```
Then run the desktop app with `npm start`:
```shell
cd macapp
npm install
npm start
```

BIN
macapp/assets/icon.icns Normal file

Binary file not shown.

(Eight additional binary image assets follow in this diff, rendered only as "After" previews; their file names are not shown in this view. Sizes: 402 B, 741 B, 440 B, 763 B, 447 B, 891 B, 443 B, 844 B.)

79
macapp/forge.config.ts Normal file
View File

@@ -0,0 +1,79 @@
import type { ForgeConfig } from '@electron-forge/shared-types'
import { MakerSquirrel } from '@electron-forge/maker-squirrel'
import { MakerZIP } from '@electron-forge/maker-zip'
import { PublisherGithub } from '@electron-forge/publisher-github'
import { AutoUnpackNativesPlugin } from '@electron-forge/plugin-auto-unpack-natives'
import { WebpackPlugin } from '@electron-forge/plugin-webpack'
import * as path from 'path'
import * as fs from 'fs'
import { mainConfig } from './webpack.main.config'
import { rendererConfig } from './webpack.renderer.config'
const packageJson = JSON.parse(fs.readFileSync(path.resolve(__dirname, './package.json'), 'utf8'))
const config: ForgeConfig = {
packagerConfig: {
appVersion: process.env.VERSION || packageJson.version,
asar: true,
icon: './assets/icon.icns',
extraResource: [
path.join(__dirname, '../dist/darwin/ollama'),
...fs.readdirSync(path.join(__dirname, '../dist/darwin-amd64/lib/ollama')).map(f => path.join(__dirname, '../dist/darwin-amd64/lib/ollama', f)),
path.join(__dirname, './assets/iconTemplate.png'),
path.join(__dirname, './assets/iconTemplate@2x.png'),
path.join(__dirname, './assets/iconUpdateTemplate.png'),
path.join(__dirname, './assets/iconUpdateTemplate@2x.png'),
path.join(__dirname, './assets/iconDarkTemplate.png'),
path.join(__dirname, './assets/iconDarkTemplate@2x.png'),
path.join(__dirname, './assets/iconDarkUpdateTemplate.png'),
path.join(__dirname, './assets/iconDarkUpdateTemplate@2x.png'),
],
...(process.env.SIGN
? {
osxSign: {
identity: process.env.APPLE_IDENTITY,
},
osxNotarize: {
tool: 'notarytool',
appleId: process.env.APPLE_ID || '',
appleIdPassword: process.env.APPLE_PASSWORD || '',
teamId: process.env.APPLE_TEAM_ID || '',
},
}
: {}),
osxUniversal: {
x64ArchFiles: '*',
},
},
rebuildConfig: {},
makers: [new MakerSquirrel({}), new MakerZIP({}, ['darwin'])],
hooks: {
readPackageJson: async (_, packageJson) => {
return { ...packageJson, version: process.env.VERSION || packageJson.version }
},
},
plugins: [
new AutoUnpackNativesPlugin({}),
new WebpackPlugin({
mainConfig,
devContentSecurityPolicy: `default-src * 'unsafe-eval' 'unsafe-inline'; img-src data: 'self'`,
renderer: {
config: rendererConfig,
nodeIntegration: true,
entryPoints: [
{
html: './src/index.html',
js: './src/renderer.tsx',
name: 'main_window',
preload: {
js: './src/preload.ts',
},
},
],
},
}),
],
}
export default config

16604
macapp/package-lock.json generated Normal file

File diff suppressed because it is too large

80
macapp/package.json Normal file
View File

@@ -0,0 +1,80 @@
{
"name": "ollama",
"productName": "Ollama",
"version": "0.0.0",
"description": "ollama",
"main": ".webpack/main",
"scripts": {
"start": "electron-forge start",
"package": "electron-forge package --arch universal",
"package:sign": "SIGN=1 electron-forge package --arch universal",
"make": "electron-forge make --arch universal",
"make:sign": "SIGN=1 electron-forge make --arch universal",
"publish": "SIGN=1 electron-forge publish",
"lint": "eslint --ext .ts,.tsx ."
},
"keywords": [],
"author": {
"name": "Jeffrey Morgan",
"email": "jmorganca@gmail.com"
},
"license": "MIT",
"devDependencies": {
"@babel/core": "^7.22.5",
"@babel/preset-react": "^7.22.5",
"@electron-forge/cli": "^6.2.1",
"@electron-forge/maker-deb": "^6.2.1",
"@electron-forge/maker-rpm": "^6.2.1",
"@electron-forge/maker-squirrel": "^6.2.1",
"@electron-forge/maker-zip": "^6.2.1",
"@electron-forge/plugin-auto-unpack-natives": "^6.2.1",
"@electron-forge/plugin-webpack": "^6.2.1",
"@electron-forge/publisher-github": "^6.2.1",
"@electron/universal": "^1.4.1",
"@svgr/webpack": "^8.0.1",
"@types/chmodr": "^1.0.0",
"@types/node": "^20.4.0",
"@types/react": "^18.2.14",
"@types/react-dom": "^18.2.6",
"@types/uuid": "^9.0.2",
"@typescript-eslint/eslint-plugin": "^5.60.0",
"@typescript-eslint/parser": "^5.60.0",
"@vercel/webpack-asset-relocator-loader": "^1.7.3",
"babel-loader": "^9.1.2",
"chmodr": "^1.2.0",
"copy-webpack-plugin": "^11.0.0",
"css-loader": "^6.8.1",
"electron": "25.9.2",
"eslint": "^8.43.0",
"eslint-plugin-import": "^2.27.5",
"fork-ts-checker-webpack-plugin": "^7.3.0",
"node-loader": "^2.0.0",
"postcss": "^8.4.24",
"postcss-import": "^15.1.0",
"postcss-loader": "^7.3.3",
"postcss-preset-env": "^8.5.1",
"style-loader": "^3.3.3",
"svg-inline-loader": "^0.8.2",
"tailwindcss": "^3.3.2",
"ts-loader": "^9.4.3",
"ts-node": "^10.9.1",
"typescript": "~4.5.4",
"url-loader": "^4.1.1",
"webpack": "^5.88.0",
"webpack-cli": "^5.1.4",
"webpack-dev-server": "^4.15.1"
},
"dependencies": {
"@electron/remote": "^2.0.10",
"@heroicons/react": "^2.0.18",
"@segment/analytics-node": "^1.0.0",
"copy-to-clipboard": "^3.3.3",
"electron-squirrel-startup": "^1.0.0",
"electron-store": "^8.1.0",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"uuid": "^9.0.0",
"winston": "^3.10.0",
"winston-daily-rotate-file": "^4.7.1"
}
}

7
macapp/postcss.config.js Normal file
View File

@@ -0,0 +1,7 @@
module.exports = {
plugins: {
'postcss-import': {},
tailwindcss: {},
autoprefixer: {},
},
}

34
macapp/src/app.css Normal file
View File

@@ -0,0 +1,34 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
html,
body {
background: transparent;
}
.drag {
-webkit-app-region: drag;
}
.no-drag {
-webkit-app-region: no-drag;
}
.blink {
-webkit-animation: 1s blink step-end infinite;
-moz-animation: 1s blink step-end infinite;
-ms-animation: 1s blink step-end infinite;
-o-animation: 1s blink step-end infinite;
animation: 1s blink step-end infinite;
}
@keyframes blink {
from,
to {
color: transparent;
}
50% {
color: black;
}
}

122
macapp/src/app.tsx Normal file
View File

@@ -0,0 +1,122 @@
import { useState } from 'react'
import copy from 'copy-to-clipboard'
import { CheckIcon, DocumentDuplicateIcon } from '@heroicons/react/24/outline'
import Store from 'electron-store'
import { getCurrentWindow, app } from '@electron/remote'
import { install } from './install'
import OllamaIcon from './ollama.svg'
const store = new Store()
enum Step {
WELCOME = 0,
CLI,
FINISH,
}
export default function () {
const [step, setStep] = useState<Step>(Step.WELCOME)
const [commandCopied, setCommandCopied] = useState<boolean>(false)
const command = 'ollama run llama3.2'
return (
<div className='drag'>
<div className='mx-auto flex min-h-screen w-full flex-col justify-between bg-white px-4 pt-16'>
{step === Step.WELCOME && (
<>
<div className='mx-auto text-center'>
<h1 className='mb-6 mt-4 text-2xl tracking-tight text-gray-900'>Welcome to Ollama</h1>
<p className='mx-auto w-[65%] text-sm text-gray-400'>
Let's get you up and running with your own large language models.
</p>
<button
onClick={() => setStep(Step.CLI)}
className='no-drag rounded-dm mx-auto my-8 w-[40%] rounded-md bg-black px-4 py-2 text-sm text-white hover:brightness-110'
>
Next
</button>
</div>
<div className='mx-auto'>
<OllamaIcon />
</div>
</>
)}
{step === Step.CLI && (
<>
<div className='mx-auto flex flex-col space-y-28 text-center'>
<h1 className='mt-4 text-2xl tracking-tight text-gray-900'>Install the command line</h1>
<pre className='mx-auto text-4xl text-gray-400'>&gt; ollama</pre>
<div className='mx-auto'>
<button
onClick={async () => {
try {
await install()
setStep(Step.FINISH)
} catch (e) {
console.error('could not install: ', e)
} finally {
getCurrentWindow().show()
getCurrentWindow().focus()
}
}}
className='no-drag rounded-dm mx-auto w-[60%] rounded-md bg-black px-4 py-2 text-sm text-white hover:brightness-110'
>
Install
</button>
<p className='mx-auto my-4 w-[70%] text-xs text-gray-400'>
You will be prompted for administrator access
</p>
</div>
</div>
</>
)}
{step === Step.FINISH && (
<>
<div className='mx-auto flex flex-col space-y-20 text-center'>
<h1 className='mt-4 text-2xl tracking-tight text-gray-900'>Run your first model</h1>
<div className='flex flex-col'>
<div className='group relative flex items-center'>
<pre className='language-none text-2xs w-full rounded-md bg-gray-100 px-4 py-3 text-start leading-normal'>
{command}
</pre>
<button
className={`no-drag absolute right-[5px] px-2 py-2 ${
commandCopied
? 'text-gray-900 opacity-100 hover:cursor-auto'
: 'text-gray-200 opacity-50 hover:cursor-pointer'
} hover:font-bold hover:text-gray-900 group-hover:opacity-100`}
onClick={() => {
copy(command)
setCommandCopied(true)
setTimeout(() => setCommandCopied(false), 3000)
}}
>
{commandCopied ? (
<CheckIcon className='h-4 w-4 font-bold text-gray-500' />
) : (
<DocumentDuplicateIcon className='h-4 w-4 text-gray-500' />
)}
</button>
</div>
<p className='mx-auto my-4 w-[70%] text-xs text-gray-400'>
Run this command in your favorite terminal.
</p>
</div>
<button
onClick={() => {
store.set('first-time-run', true)
window.close()
}}
className='no-drag rounded-dm mx-auto w-[60%] rounded-md bg-black px-4 py-2 text-sm text-white hover:brightness-110'
>
Finish
</button>
</div>
</>
)}
</div>
</div>
)
}

4
macapp/src/declarations.d.ts vendored Normal file
View File

@@ -0,0 +1,4 @@
declare module '*.svg' {
const content: string
export default content
}

9
macapp/src/index.html Normal file
View File

@@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8" />
</head>
<body>
<div id="app"></div>
</body>
</html>

302
macapp/src/index.ts Normal file
View File

@@ -0,0 +1,302 @@
import { spawn, ChildProcess } from 'child_process'
import { app, autoUpdater, dialog, Tray, Menu, BrowserWindow, MenuItemConstructorOptions, nativeTheme } from 'electron'
import Store from 'electron-store'
import winston from 'winston'
import 'winston-daily-rotate-file'
import * as path from 'path'
import { v4 as uuidv4 } from 'uuid'
import { installed } from './install'
require('@electron/remote/main').initialize()
if (require('electron-squirrel-startup')) {
app.quit()
}
const store = new Store()
let welcomeWindow: BrowserWindow | null = null
declare const MAIN_WINDOW_WEBPACK_ENTRY: string
const logger = winston.createLogger({
transports: [
new winston.transports.Console(),
new winston.transports.File({
filename: path.join(app.getPath('home'), '.ollama', 'logs', 'server.log'),
maxsize: 1024 * 1024 * 20,
maxFiles: 5,
}),
],
format: winston.format.printf(info => info.message),
})
app.on('ready', () => {
const gotTheLock = app.requestSingleInstanceLock()
if (!gotTheLock) {
app.exit(0)
return
}
app.on('second-instance', () => {
if (app.hasSingleInstanceLock()) {
app.releaseSingleInstanceLock()
}
if (proc) {
proc.off('exit', restart)
proc.kill()
}
app.exit(0)
})
app.focus({ steal: true })
init()
})
function firstRunWindow() {
// Create the browser window.
welcomeWindow = new BrowserWindow({
width: 400,
height: 500,
frame: false,
fullscreenable: false,
resizable: false,
movable: true,
show: false,
webPreferences: {
nodeIntegration: true,
contextIsolation: false,
},
})
require('@electron/remote/main').enable(welcomeWindow.webContents)
welcomeWindow.loadURL(MAIN_WINDOW_WEBPACK_ENTRY)
welcomeWindow.on('ready-to-show', () => welcomeWindow.show())
welcomeWindow.on('closed', () => {
if (process.platform === 'darwin') {
app.dock.hide()
}
})
}
let tray: Tray | null = null
let updateAvailable = false
const assetPath = app.isPackaged ? process.resourcesPath : path.join(__dirname, '..', '..', 'assets')
function trayIconPath() {
return nativeTheme.shouldUseDarkColors
? updateAvailable
? path.join(assetPath, 'iconDarkUpdateTemplate.png')
: path.join(assetPath, 'iconDarkTemplate.png')
: updateAvailable
? path.join(assetPath, 'iconUpdateTemplate.png')
: path.join(assetPath, 'iconTemplate.png')
}
function updateTrayIcon() {
if (tray) {
tray.setImage(trayIconPath())
}
}
function updateTray() {
const updateItems: MenuItemConstructorOptions[] = [
{ label: 'An update is available', enabled: false },
{
label: 'Restart to update',
click: () => autoUpdater.quitAndInstall(),
},
{ type: 'separator' },
]
const menu = Menu.buildFromTemplate([
...(updateAvailable ? updateItems : []),
{ role: 'quit', label: 'Quit Ollama', accelerator: 'Command+Q' },
])
if (!tray) {
tray = new Tray(trayIconPath())
}
tray.setToolTip(updateAvailable ? 'An update is available' : 'Ollama')
tray.setContextMenu(menu)
tray.setImage(trayIconPath())
nativeTheme.off('updated', updateTrayIcon)
nativeTheme.on('updated', updateTrayIcon)
}
let proc: ChildProcess = null
function server() {
const binary = app.isPackaged
? path.join(process.resourcesPath, 'ollama')
: path.resolve(process.cwd(), '..', 'ollama')
proc = spawn(binary, ['serve'])
proc.stdout.on('data', data => {
logger.info(data.toString().trim())
})
proc.stderr.on('data', data => {
logger.error(data.toString().trim())
})
proc.on('exit', restart)
}
function restart() {
setTimeout(server, 1000)
}
app.on('before-quit', () => {
if (proc) {
proc.off('exit', restart)
proc.kill('SIGINT') // send SIGINT signal to the server, which also stops any loaded llms
}
})
const updateURL = `https://ollama.com/api/update?os=${process.platform}&arch=${
process.arch
}&version=${app.getVersion()}&id=${id()}`
let latest = ''
async function isNewReleaseAvailable() {
try {
const response = await fetch(updateURL)
if (!response.ok) {
return false
}
if (response.status === 204) {
return false
}
const data = await response.json()
const url = data?.url
if (!url) {
return false
}
if (latest === url) {
return false
}
latest = url
return true
} catch (error) {
logger.error(`update check failed - ${error}`)
return false
}
}
async function checkUpdate() {
const available = await isNewReleaseAvailable()
if (available) {
logger.info('checking for update')
autoUpdater.checkForUpdates()
}
}
function init() {
if (app.isPackaged) {
checkUpdate()
setInterval(() => {
checkUpdate()
}, 60 * 60 * 1000)
}
updateTray()
if (process.platform === 'darwin') {
if (app.isPackaged) {
if (!app.isInApplicationsFolder()) {
const chosen = dialog.showMessageBoxSync({
type: 'question',
buttons: ['Move to Applications', 'Do Not Move'],
message: 'Ollama works best when run from the Applications directory.',
defaultId: 0,
cancelId: 1,
})
if (chosen === 0) {
try {
app.moveToApplicationsFolder({
conflictHandler: conflictType => {
if (conflictType === 'existsAndRunning') {
dialog.showMessageBoxSync({
type: 'info',
message: 'Cannot move to Applications directory',
detail:
'Another version of Ollama is currently running from your Applications directory. Close it first and try again.',
})
}
return true
},
})
return
} catch (e) {
logger.error(`[Move to Applications] Failed to move to applications folder - ${e.message}`)
}
}
}
}
}
server()
if (store.get('first-time-run') && installed()) {
if (process.platform === 'darwin') {
app.dock.hide()
}
app.setLoginItemSettings({ openAtLogin: app.getLoginItemSettings().openAtLogin })
return
}
// This is the first run or the CLI is no longer installed
app.setLoginItemSettings({ openAtLogin: true })
firstRunWindow()
}
// Quit when all windows are closed, except on macOS. There, it's common
// for applications and their menu bar to stay active until the user quits
// explicitly with Cmd + Q.
app.on('window-all-closed', () => {
if (process.platform !== 'darwin') {
app.quit()
}
})
function id(): string {
const id = store.get('id') as string
if (id) {
return id
}
const uuid = uuidv4()
store.set('id', uuid)
return uuid
}
autoUpdater.setFeedURL({ url: updateURL })
autoUpdater.on('error', e => {
logger.error(`update check failed - ${e.message}`)
console.error(`update check failed - ${e.message}`)
})
autoUpdater.on('update-downloaded', () => {
updateAvailable = true
updateTray()
})

21
macapp/src/install.ts Normal file
View File

@@ -0,0 +1,21 @@
import * as fs from 'fs'
import { exec as cbExec } from 'child_process'
import * as path from 'path'
import { promisify } from 'util'
const app = process && process.type === 'renderer' ? require('@electron/remote').app : require('electron').app
const ollama = app.isPackaged ? path.join(process.resourcesPath, 'ollama') : path.resolve(process.cwd(), '..', 'ollama')
const exec = promisify(cbExec)
const symlinkPath = '/usr/local/bin/ollama'
export function installed() {
return fs.existsSync(symlinkPath) && fs.readlinkSync(symlinkPath) === ollama
}
export async function install() {
const command = `do shell script "mkdir -p ${path.dirname(
symlinkPath
)} && ln -F -s \\"${ollama}\\" \\"${symlinkPath}\\"" with administrator privileges`
await exec(`osascript -e '${command}'`)
}

9
macapp/src/ollama.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

0
macapp/src/preload.ts Normal file
View File

7
macapp/src/renderer.tsx Normal file
View File

@@ -0,0 +1,7 @@
import App from './app'
import './app.css'
import { createRoot } from 'react-dom/client'
const container = document.getElementById('app')
const root = createRoot(container)
root.render(<App />)

View File

@@ -0,0 +1,6 @@
/** @type {import('tailwindcss').Config} */
module.exports = {
content: ['./src/**/*.{js,ts,jsx,tsx,mdx}'],
theme: {},
plugins: [],
}

20
macapp/tsconfig.json Normal file
View File

@@ -0,0 +1,20 @@
{
"compilerOptions": {
"target": "ES6",
"allowJs": true,
"module": "commonjs",
"skipLibCheck": true,
"esModuleInterop": true,
"noImplicitAny": true,
"sourceMap": true,
"baseUrl": ".",
"outDir": "dist",
"moduleResolution": "node",
"resolveJsonModule": true,
"paths": {
"*": ["node_modules/*"]
},
"jsx": "react-jsx"
},
"include": ["src/**/*"]
}

View File

@@ -0,0 +1,20 @@
import type { Configuration } from 'webpack'
import { rules } from './webpack.rules'
import { plugins } from './webpack.plugins'
export const mainConfig: Configuration = {
/**
* This is the main entry point for your application; it's the first file
* that runs in the main process.
*/
entry: './src/index.ts',
// Put your normal webpack config below here
module: {
rules,
},
plugins,
resolve: {
extensions: ['.js', '.ts', '.jsx', '.tsx', '.css', '.json'],
},
}

14
macapp/webpack.plugins.ts Normal file
View File

@@ -0,0 +1,14 @@
import type IForkTsCheckerWebpackPlugin from 'fork-ts-checker-webpack-plugin'
import { DefinePlugin } from 'webpack'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const ForkTsCheckerWebpackPlugin: typeof IForkTsCheckerWebpackPlugin = require('fork-ts-checker-webpack-plugin')
export const plugins = [
new ForkTsCheckerWebpackPlugin({
logger: 'webpack-infrastructure',
}),
new DefinePlugin({
'process.env.TELEMETRY_WRITE_KEY': JSON.stringify(process.env.TELEMETRY_WRITE_KEY),
}),
]

View File

@@ -0,0 +1,19 @@
import type { Configuration } from 'webpack'
import { rules } from './webpack.rules'
import { plugins } from './webpack.plugins'
rules.push({
test: /\.css$/,
use: [{ loader: 'style-loader' }, { loader: 'css-loader' }, { loader: 'postcss-loader' }],
})
export const rendererConfig: Configuration = {
module: {
rules,
},
plugins,
resolve: {
extensions: ['.js', '.ts', '.jsx', '.tsx', '.css'],
},
}

35
macapp/webpack.rules.ts Normal file
View File

@@ -0,0 +1,35 @@
import type { ModuleOptions } from 'webpack'
export const rules: Required<ModuleOptions>['rules'] = [
// Add support for native node modules
{
// We're specifying native_modules in the test because the asset relocator loader generates a
// "fake" .node file which is really a cjs file.
test: /native_modules[/\\].+\.node$/,
use: 'node-loader',
},
{
test: /[/\\]node_modules[/\\].+\.(m?js|node)$/,
parser: { amd: false },
use: {
loader: '@vercel/webpack-asset-relocator-loader',
options: {
outputAssetBase: 'native_modules',
},
},
},
{
test: /\.tsx?$/,
exclude: /(node_modules|\.webpack)/,
use: {
loader: 'ts-loader',
options: {
transpileOnly: true,
},
},
},
{
test: /\.svg$/,
use: ['@svgr/webpack'],
},
]

View File

@@ -198,10 +198,6 @@ type Tensor interface {
Copy(ctx Context, t2 Tensor) Tensor
Duplicate(ctx Context) Tensor
Slice(ctx Context, dim, low, high, step int) Tensor
Chunk(ctx Context, dim int, size int) []Tensor
ChunkSections(ctx Context, dim int, sections ...int) []Tensor
TopK(ctx Context, k int) Tensor
Argsort(ctx Context) Tensor
Mean(ctx Context) Tensor

View File

@@ -1738,66 +1738,3 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
}
}
// Slice returns a view of the tensor sliced along dim from low to high, taking every step-th element.
// Slice panics if the dimension is invalid or the slice parameters are out of range.
// If dim=0 and step>1, the result is a copy rather than a view so that the shape is correct.
func (t *Tensor) Slice(ctx ml.Context, dim int, low, high, step int) ml.Tensor {
if dim < 0 || dim >= C.GGML_MAX_DIMS {
panic("invalid dimension")
} else if low < 0 || high > t.Dim(dim) || low >= high || step < 1 {
panic("invalid slice parameters")
}
if dim == 0 && step > 1 {
// dim=0,step>1 is a special case so handle it here first
return t.View(ctx,
low*t.Stride(0), 1,
step*t.Stride(0), (high-low+1)/step,
t.Stride(1), t.Dim(1),
// preserve dim 3 by merging it into dim 2
t.Stride(2), t.Dim(2)*t.Dim(3),
).Contiguous(ctx, (high-low+1)/step, t.Dim(1), t.Dim(2), t.Dim(3))
}
args := []int{
low * t.Stride(dim), t.Dim(0),
t.Stride(1), t.Dim(1),
t.Stride(2), t.Dim(2),
t.Stride(3), t.Dim(3),
}
if step == 1 {
args[dim*2+1] = high - low
return t.View(ctx, args[0], args[1:]...)
} else {
args[dim*2] = step * t.Stride(dim)
args[dim*2+1] = (high - low + 1) / step
return t.View(ctx, args[0], args[1:]...)
}
}
// Chunk splits the tensor into chunk-sized tensors along dim. Each sub-tensor is a view of
// the original.
func (t *Tensor) Chunk(ctx ml.Context, dim, chunk int) []ml.Tensor {
sections := make([]int, 0, t.Dim(dim)/chunk+1)
for rest := t.Dim(dim); rest > 0; rest -= chunk {
sections = append(sections, min(chunk, rest))
}
return t.ChunkSections(ctx, dim, sections...)
}
// ChunkSections splits the tensor into section-sized tensors along dim. Each sub-tensor is a
// view of the original. The size of dim must equal the sum of sections.
func (t *Tensor) ChunkSections(ctx ml.Context, dim int, sections ...int) []ml.Tensor {
var offset int
s := make([]ml.Tensor, len(sections))
for i, section := range sections {
s[i] = t.Slice(ctx, dim, offset, offset+section, 1)
offset += section
}
if offset != t.Dim(dim) {
panic("sections do not sum to tensor dimension")
}
return s
}
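For reference, a minimal standalone Go sketch (a hypothetical helper, not part of the ml package) of how Chunk derives the section sizes it hands to ChunkSections; the last section shrinks when the dimension is not a multiple of the chunk size.

func chunkSections(dimSize, chunk int) []int {
    // e.g. chunkSections(10, 4) == []int{4, 4, 2}
    sections := make([]int, 0, dimSize/chunk+1)
    for rest := dimSize; rest > 0; rest -= chunk {
        sections = append(sections, min(chunk, rest)) // min builtin, Go 1.21+
    }
    return sections
}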

View File

@@ -693,7 +693,6 @@ GGML_API void ggml_dxgi_pdh_release();
#endif
#ifdef __cplusplus
#include <array>
#include <initializer_list>
#include <vector>
@@ -709,21 +708,6 @@ inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
}
// Return true if the edges in the graph match expectations.
inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
int start_idx,
std::initializer_list<std::array<int, 3>> edges) {
for (const auto & edge : edges) {
int dst_node = edge[0];
int src_idx = edge[1];
int src_node = edge[2];
if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
return false;
}
}
return true;
}
// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
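The ggml_check_edges helper above walks a list of (dst, src_idx, src) triples and verifies that each expected edge exists in the graph. A hypothetical Go sketch of the same check over a toy node type:

// node and edge are illustrative stand-ins for the ggml_cgraph internals.
type node struct{ src []*node }

type edge struct{ dst, srcIdx, src int } // node indices relative to startIdx

func checkEdges(nodes []*node, startIdx int, edges []edge) bool {
    for _, e := range edges {
        if nodes[startIdx+e.dst].src[e.srcIdx] != nodes[startIdx+e.src] {
            return false
        }
    }
    return true
}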

View File

@@ -677,7 +677,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_
char name[256];
snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
snprintf(name, 256, "%s_ne02=%d", base, ne02);
snprintf(name, 256, "%s", base);
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {

File diff suppressed because it is too large

View File

@@ -14,7 +14,6 @@ layout (binding = 1) buffer D {int data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
uint nrows;
uint order;
} p;
@@ -27,9 +26,10 @@ void swap(uint idx0, uint idx1) {
dst_row[idx1] = tmp;
}
void argsort(bool needs_bounds_check, const uint row) {
void argsort(bool needs_bounds_check) {
// bitonic sort
const int col = int(gl_LocalInvocationID.x);
const uint row = gl_WorkGroupID.y;
const uint row_offset = row * p.ncols;
@@ -72,16 +72,8 @@ void argsort(bool needs_bounds_check, const uint row) {
void main() {
if (p.ncols == BLOCK_SIZE) {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
argsort(false, row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
argsort(false);
} else {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
argsort(true, row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
argsort(true);
}
}
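For intuition, a CPU reference in Go for what each row's invocation computes: the permutation of column indices that sorts the row (ascending in this sketch). This hypothetical sketch uses a simple selection sort in place of the shader's bitonic network.

func argsortRow(row []float32) []int {
    idx := make([]int, len(row))
    for i := range idx {
        idx[i] = i
    }
    for i := 0; i < len(idx); i++ {
        best := i
        for j := i + 1; j < len(idx); j++ {
            if row[idx[j]] < row[idx[best]] {
                best = j
            }
        }
        idx[i], idx[best] = idx[best], idx[i]
    }
    return idx // argsortRow([]float32{0.3, -1, 2}) == []int{1, 0, 2}
}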

View File

@@ -437,7 +437,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
#if defined(DATA_A_MXFP4)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_mxfp4[vui & 0xF], kvalues_mxfp4[vui >> 4]) * 0.5;
return vec2(kvalues_mxfp4[vui & 0xF], kvalues_mxfp4[vui >> 4]);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
vec2 v0 = dequantize(ib, iqs, a_offset);
@@ -488,9 +488,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uvec2 qs = uvec2(data_a[a_offset + ib].qs[qsi], data_a[a_offset + ib].qs[qsi + 1]);
const uint scales = data_a[a_offset + ib].scales[scalesi];
const vec2 dm = vec2(data_a[a_offset + ib].dm);
const vec2 d = vec2(data_a[a_offset + ib].d);
return dm.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - dm.y * float(scales >> 4);
return d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
}
vec2 get_dm(uint ib, uint a_offset) {
return vec2(1, 0);
@@ -529,7 +529,7 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint is = 2 * n + b; // 0..7
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
const vec2 loadd = vec2(data_a[a_offset + ib].dm);
const vec2 loadd = vec2(data_a[a_offset + ib].d);
const uint scidx0 = (is < 4) ? is : (is + 4);
const uint scidx1 = (is < 4) ? is : (is - 4);
@@ -567,7 +567,7 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint8_t hm = uint8_t(1 << (iqs / 16));
const vec2 loadd = vec2(data_a[a_offset + ib].dm);
const vec2 loadd = vec2(data_a[a_offset + ib].d);
const uint scidx0 = (is < 4) ? is : (is + 4);
const uint scidx1 = (is < 4) ? is : (is - 4);

View File

@@ -120,7 +120,7 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
const f16vec2 dm = bl.block.dm;
const f16vec2 d = bl.block.d;
const uint idx = coordInBlock[1];
const uint scalesi = (idx & 0xF0) >> 4; // 0..15
@@ -131,7 +131,7 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2
qs = unpack8(qs)[idx & 1];
const uint scales = bl.block.scales[scalesi];
float16_t ret = dm.x * float16_t(scales & 0xF) * float16_t(qs) - dm.y * float16_t(scales >> 4);
float16_t ret = d.x * float16_t(scales & 0xF) * float16_t(qs) - d.y * float16_t(scales >> 4);
return ret;
}
@@ -680,7 +680,7 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
qs &= 0xF;
float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
float16_t ret = float16_t(kvalues_mxfp4[qs] * d);
return ret;
}
#endif

View File

@@ -26,7 +26,7 @@ void main() {
const float d = e8m0_to_fp32(data_a[ib].e);
[[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * 0.5 * float(kvalues_mxfp4[data_a[ib].qs[q_idx + l] & 0xF]));
data_b[b_idx + l + 16] = D_TYPE(d * 0.5 * float(kvalues_mxfp4[data_a[ib].qs[q_idx + l] >> 4]));
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_mxfp4[data_a[ib].qs[q_idx + l] & 0xF]);
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_mxfp4[data_a[ib].qs[q_idx + l] >> 4]);
}
}
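For reference, a hypothetical standalone Go sketch of MXFP4 dequantization, assuming the usual ggml kvalues_mxfp4 table of doubled E2M1 magnitudes (which is what the 0.5 factor compensates for) and an E8M0 block scale interpreted as a power of two with bias 127:

// Assumed lookup table; each entry is twice the real E2M1 value.
var kvaluesMXFP4 = [16]int8{0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12}

// exp2i computes 2^n without importing math.
func exp2i(n int) float32 {
    v := float32(1)
    for ; n > 0; n-- {
        v *= 2
    }
    for ; n < 0; n++ {
        v /= 2
    }
    return v
}

// dequantMXFP4 unpacks one byte holding two 4-bit quants and applies the block scale.
func dequantMXFP4(e uint8, q byte) (lo, hi float32) {
    d := exp2i(int(e)-127) * 0.5
    return d * float32(kvaluesMXFP4[q&0xF]), d * float32(kvaluesMXFP4[q>>4])
}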

View File

@@ -24,8 +24,8 @@ void main() {
const uint ql_idx = 32 * ip + il;
const uint8_t qs = data_a[i].qs[32 * ip + il];
FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].dm.x);
FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].dm.y);
FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
data_b[y_idx + 0] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
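The per-weight arithmetic here is the usual Q2_K form: the low nibble of a scale byte scales the 2-bit quant and the high nibble selects the super-block minimum. A hypothetical one-weight Go sketch:

// dall and dmin are the two halves of the super-block d/dm pair above,
// sc is one scale byte, q is one 2-bit quant (0..3).
func dequantQ2K(dall, dmin float32, sc uint8, q uint8) float32 {
    return dall*float32(sc&0xF)*float32(q) - dmin*float32(sc>>4)
}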

View File

@@ -20,8 +20,8 @@ void main() {
const uint is = 2 * il;
const uint n = 4;
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].dm.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].dm.y);
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
const uint qs_idx = 32*il + n * ir;

View File

@@ -19,8 +19,8 @@ void main() {
const uint ir = tid % 16;
const uint is = 2 * il;
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].dm.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].dm.y);
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
const uint qs_idx = 32*il + 2 * ir;

View File

@@ -41,7 +41,9 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
const FLOAT_TYPE_VEC2 dm = vec2(data_a[ib0 + i].dm);
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
@@ -73,7 +75,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
fma(FLOAT_TYPE(b96[l]), sccache2[csel][ix][6 + 8*v_im],
fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2))))))));
}
temp[j][n] = fma(dm.x, sum1, fma(-dm.y, sum2, temp[j][n]));
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
}
}
}

View File

@@ -14,7 +14,9 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
@@ -79,7 +81,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
temp[j][n] = fma(dm.x, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dm.y, smin, temp[j][n]));
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
}

View File

@@ -14,7 +14,9 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
@@ -111,7 +113,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
temp[j][n] = fma(dm.x, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dm.y, smin, temp[j][n]));
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
}

View File

@@ -120,11 +120,81 @@ shared FLOAT_TYPE_VEC2 buf_b[BN * SHMEM_STRIDE];
#define NUM_WARPS (BLOCK_SIZE / WARP)
#ifdef MUL_MAT_ID
shared u16vec2 row_ids[BN];
uint _ne1;
#ifdef MUL_MAT_ID_USE_SUBGROUPS
shared uvec4 ballots_sh[NUM_WARPS];
void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
_ne1 = 0;
uint num_elements = p.nei1 * p.nei0;
uint nei0shift = findLSB(p.nei0);
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_LocalInvocationIndex + k*BLOCK_SIZE;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
}
}
uint i = j + gl_LocalInvocationIndex;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
ballots_sh[gl_SubgroupID] = ballot;
barrier();
uint subgroup_base = 0;
uint total = 0;
for (uint k = 0; k < gl_NumSubgroups; ++k) {
if (k == gl_SubgroupID) {
subgroup_base = total;
}
total += subgroupBallotBitCount(ballots_sh[k]);
}
barrier();
uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx && _ne1 + idx >= ic * BN && _ne1 + idx < (ic + 1) * BN) {
row_ids[_ne1 + idx - ic * BN] = u16vec2(ii0, ii1);
}
_ne1 += total;
iter &= 15;
if (_ne1 >= (ic + 1) * BN) {
break;
}
}
barrier();
}
#endif // MUL_MAT_ID_USE_SUBGROUPS
#endif // MUL_MAT_ID
#ifdef COOPMAT
shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
#endif
#include "mul_mm_id_funcs.glsl"
#include "mul_mm_funcs.glsl"
void main() {

View File

@@ -134,15 +134,15 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint ib = idx / 128; // 2 values per idx
const uint iqs = idx % 128; // 0..127
const uint qsi = (iqs / 64) * 16 + (iqs % 16); // 0..15
const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
const uint scalesi = iqs / 8; // 0..15
const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
const uvec2 qs = uvec2(unpack8(data_a_packed16[ib].qs[qsi]));
const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
const uint scales = data_a[ib].scales[scalesi];
const vec2 dm = vec2(data_a[ib].dm);
const vec2 d = vec2(data_a[ib].d);
const vec2 v = dm.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - dm.y * float(scales >> 4);
const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy);
#elif defined(DATA_A_Q3_K)
@@ -179,7 +179,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint is = 2 * n + b; // 0..7
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
const vec2 loadd = vec2(data_a[ib].dm);
const vec2 loadd = vec2(data_a[ib].d);
const uint scidx0 = (is < 4) ? is : (is + 4);
const uint scidx1 = (is < 4) ? is : (is - 4);
@@ -215,7 +215,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint8_t hm = uint8_t(1 << (iqs / 16));
const vec2 loadd = vec2(data_a[ib].dm);
const vec2 loadd = vec2(data_a[ib].d);
const uint scidx0 = (is < 4) ? is : (is + 4);
const uint scidx1 = (is < 4) ? is : (is - 4);
@@ -468,7 +468,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint ib = idx / 8;
const uint iqs = (idx & 0x07) * 2;
const float d = e8m0_to_fp32(data_a[ib].e) * 0.5;
const float d = e8m0_to_fp32(data_a[ib].e);
const uint vui = uint(data_a[ib].qs[iqs]);
const uint vui2 = uint(data_a[ib].qs[iqs+1]);

View File

@@ -1,70 +0,0 @@
#ifdef MUL_MAT_ID
shared u16vec2 row_ids[BN];
uint _ne1;
#ifdef MUL_MAT_ID_USE_SUBGROUPS
shared uvec4 ballots_sh[NUM_WARPS];
void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
_ne1 = 0;
uint num_elements = p.nei1 * p.nei0;
uint nei0shift = findLSB(p.nei0);
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_LocalInvocationIndex + k*BLOCK_SIZE;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
}
}
uint i = j + gl_LocalInvocationIndex;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
ballots_sh[gl_SubgroupID] = ballot;
barrier();
uint subgroup_base = 0;
uint total = 0;
for (uint k = 0; k < gl_NumSubgroups; ++k) {
if (k == gl_SubgroupID) {
subgroup_base = total;
}
total += subgroupBallotBitCount(ballots_sh[k]);
}
barrier();
uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx && _ne1 + idx >= ic * BN && _ne1 + idx < (ic + 1) * BN) {
row_ids[_ne1 + idx - ic * BN] = u16vec2(ii0, ii1);
}
_ne1 += total;
iter &= 15;
if (_ne1 >= (ic + 1) * BN) {
break;
}
}
barrier();
}
#endif // MUL_MAT_ID_USE_SUBGROUPS
#endif // MUL_MAT_ID
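load_row_ids gathers, per expert, the coordinates of every entry in the expert-id matrix that routes to that expert; the subgroup ballots only parallelize the counting and the windowing to one workgroup's BN rows. A hypothetical sequential Go reference for the full list it enumerates:

// ids is the nei1 x nei0 expert-id matrix, stored with row stride nbi1.
func gatherRowIDs(ids []uint32, nei0, nei1, nbi1 int, expertIdx uint32) [][2]int {
    var out [][2]int // (ii0, ii1) pairs in row-major order
    for ii1 := 0; ii1 < nei1; ii1++ {
        for ii0 := 0; ii0 < nei0; ii0++ {
            if ids[ii1*nbi1+ii0] == expertIdx {
                out = append(out, [2]int{ii0, ii1})
            }
        }
    }
    return out
}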

View File

@@ -10,9 +10,10 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#endif
#if defined(MUL_MAT_ID_USE_SUBGROUPS)
#ifdef COOPMAT
#extension GL_KHR_cooperative_matrix : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#endif
#ifdef MUL_MAT_ID
@@ -23,10 +24,7 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
#if defined(A_TYPE_PACKED16)
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
#endif
layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
#if defined(A_TYPE_PACKED32)
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
#endif
@@ -78,31 +76,40 @@ layout (constant_id = 10) const uint WARP = 32;
#define BK 32
#define MMQ_SHMEM
#ifdef COOPMAT
#define SHMEM_STRIDE (BK / 4 + 4)
#else
#define SHMEM_STRIDE (BK / 4 + 1)
#endif
#include "mul_mmq_shmem_types.glsl"
shared int32_t buf_a_qs[BM * SHMEM_STRIDE];
#ifndef COOPMAT
#if QUANT_AUXF == 1
shared FLOAT_TYPE buf_a_dm[BM];
#else
shared FLOAT_TYPE_VEC2 buf_a_dm[BM];
#endif
#endif
shared int32_t buf_b_qs[BN * SHMEM_STRIDE];
#ifndef COOPMAT
shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
#endif
#define LOAD_VEC_A (4 * QUANT_R)
#define LOAD_VEC_B 16
#ifdef MUL_MAT_ID
#define BK_STEP 1
#else
#ifndef BK_STEP
#define BK_STEP 4
#endif
#endif
// Shared memory cache
shared block_a_cache buf_a[BM * BK_STEP];
shared block_b_cache buf_b[BN * BK_STEP];
// Register cache
block_a_cache cache_a[WMITER * TM];
block_b_cache cache_b;
#define LOAD_VEC_A (4 * QUANT_R_MMQ)
#define LOAD_VEC_B 16
shared u16vec2 row_ids[4096];
#endif // MUL_MAT_ID
#define NUM_WARPS (BLOCK_SIZE / WARP)
#include "mul_mm_id_funcs.glsl"
#ifdef COOPMAT
shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
#endif
#include "mul_mmq_funcs.glsl"
void main() {
@@ -132,12 +139,26 @@ void main() {
const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
const uint WSUBM = WM / WMITER;
const uint WSUBN = WN / WNITER;
#ifdef COOPMAT
const uint warp_i = gl_SubgroupID;
const uint tiw = gl_SubgroupInvocationID;
const uint cms_per_row = WM / TM;
const uint cms_per_col = WN / TN;
const uint storestride = WARP / TM;
const uint store_r = tiw % TM;
const uint store_c = tiw / TM;
#else
const uint warp_i = gl_LocalInvocationID.x / WARP;
const uint tiw = gl_LocalInvocationID.x % WARP;
const uint tiwr = tiw % (WSUBM / TM);
const uint tiwc = tiw / (WSUBM / TM);
#endif
const uint warp_r = warp_i % (BM / WM);
const uint warp_c = warp_i / (BM / WM);
@@ -151,27 +172,17 @@ void main() {
const uint loadstride_b = BLOCK_SIZE * LOAD_VEC_B / BK;
#ifdef MUL_MAT_ID
#ifdef MUL_MAT_ID_USE_SUBGROUPS
if (bitCount(p.nei0) == 1) {
load_row_ids(expert_idx, true, ic);
} else {
load_row_ids(expert_idx, false, ic);
}
#else
_ne1 = 0;
for (uint ii1 = 0; ii1 < p.nei1 && _ne1 < (ic + 1) * BN; ii1++) {
for (uint ii0 = 0; ii0 < p.nei0 && _ne1 < (ic + 1) * BN; ii0++) {
uint _ne1 = 0;
for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
for (uint ii0 = 0; ii0 < p.nei0; ii0++) {
if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
if (_ne1 >= ic * BN) {
row_ids[_ne1 - ic * BN] = u16vec2(ii0, ii1);
}
row_ids[_ne1] = u16vec2(ii0, ii1);
_ne1++;
}
}
}
barrier();
#endif
// Workgroup has no work
if (ic * BN >= _ne1) return;
@@ -198,70 +209,159 @@ void main() {
uint pos_b_ib = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / BK;
#endif
#ifdef COOPMAT
coopmat<int8_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a;
coopmat<int8_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
coopmat<int32_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_result;
coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> factors[cms_per_row * cms_per_col];
coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
sums[i] = coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0f);
}
#else
int32_t cache_a_qs[WMITER * TM * BK / 4];
int32_t cache_b_qs[TN * BK / 4];
ACC_TYPE sums[WMITER * TM * WNITER * TN];
[[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
sums[i] = ACC_TYPE(0.0f);
}
#endif
for (uint block = start_k; block < end_k; block += BK * BK_STEP) {
#if QUANT_AUXF == 1
FLOAT_TYPE cache_a_dm[WMITER * TM];
#else
FLOAT_TYPE_VEC2 cache_a_dm[WMITER * TM];
#endif
FLOAT_TYPE_VEC2 cache_b_ds[TN];
for (uint block = start_k; block < end_k; block += BK) {
[[unroll]] for (uint l = 0; loadc_a + l < BM; l += loadstride_a) {
const uint buf_ib = loadc_a + l;
const uint ib = pos_a_ib + buf_ib * p.stride_a / BK;
const uint ib = pos_a_ib + (loadc_a + l) * p.stride_a / BK;
const uint iqs = loadr_a;
const uint buf_ib = loadc_a + l;
[[unroll]] for (uint k_step = 0; k_step < BK_STEP; k_step++) {
block_a_to_shmem(k_step * BM + buf_ib, ib + k_step, iqs);
if (iqs == 0) {
#if QUANT_AUXF == 1
buf_a_dm[buf_ib] = get_d(ib);
#else
buf_a_dm[buf_ib] = get_dm(ib);
#endif
}
#if QUANT_R == 1
buf_a_qs[buf_ib * SHMEM_STRIDE + iqs] = repack(ib, iqs);
#else
const i32vec2 vals = repack(ib, iqs);
buf_a_qs[buf_ib * SHMEM_STRIDE + iqs ] = vals.x;
buf_a_qs[buf_ib * SHMEM_STRIDE + iqs + 4] = vals.y;
#endif
}
[[unroll]] for (uint l = 0; loadc_b + l < BN; l += loadstride_b) {
#ifdef MUL_MAT_ID
const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l];
const uint idx = pos_b_ib + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
const uint ib = idx / 8;
const uint iqs = idx & 0x7;
#else
const uint ib = pos_b_ib + (loadc_b + l) * p.stride_b / BK;
const uint ib_outer = ib / 4;
const uint ib_inner = ib % 4;
const uint iqs = loadr_b;
#endif
const uint buf_ib = loadc_b + l;
#ifdef MUL_MAT_ID
const u16vec2 row_idx = row_ids[buf_ib];
const uint ib = pos_b_ib + row_idx.y * p.batch_stride_b / BK + (row_idx.x % p.ne11) * p.stride_b / BK;
#else
const uint ib = pos_b_ib + buf_ib * p.stride_b / BK;
#endif
const uint iqs = loadr_b;
[[unroll]] for (uint k_step = 0; k_step < BK_STEP; k_step++) {
block_b_to_shmem(k_step * BN + buf_ib, ib + k_step, iqs);
if (iqs == 0) {
buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib_outer].ds[ib_inner]);
}
const ivec4 values = data_b[ib_outer].qs[ib_inner * 2 + iqs];
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 ] = values.x;
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 1] = values.y;
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 2] = values.z;
buf_b_qs[buf_ib * SHMEM_STRIDE + iqs * 4 + 3] = values.w;
}
barrier();
pos_a_ib += BK_STEP;
pos_b_ib += BK_STEP;
pos_a_ib += 1;
pos_b_ib += 1;
for (uint k_step = 0; k_step < BK_STEP; k_step++) {
#ifdef COOPMAT
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
const uint ib_a = warp_r * WM + cm_row * TM;
// Load from shared into cache
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
const uint reg_ib = wsir * TM + cr;
const uint buf_ib = warp_r * WM + wsir * WSUBM + tiwr * TM + cr;
coopMatLoad(cache_a, buf_a_qs, ib_a * SHMEM_STRIDE, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);
block_a_to_registers(reg_ib, k_step * BM + buf_ib);
// TODO: only cache values that are actually needed
[[unroll]] for (uint t_idx = 0; t_idx < TM; t_idx++) {
cache_a_dm[t_idx] = buf_a_dm[ib_a + t_idx];
}
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
const uint ib_b = warp_c * WN + cm_col * TN;
coopMatLoad(cache_b, buf_b_qs, ib_b * SHMEM_STRIDE, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);
// TODO: only cache values that are actually needed
[[unroll]] for (uint t_idx = 0; t_idx < TN; t_idx++) {
cache_b_dm[t_idx] = buf_b_d[ib_b + t_idx];
}
cm_result = coopmat<int32_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0);
cm_result = coopMatMulAdd(cache_a, cache_b, cm_result);
[[unroll]] for (uint col = 0; col < TN; col += storestride) {
coopmat_stage[warp_i * TM * TN + (store_c + col) * TM + store_r] = ACC_TYPE(float(cache_a_d[store_r]) * float(cache_b_d[store_c + col]));
}
coopMatLoad(factors, coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
sums[cm_col * cms_per_row + cm_row] += factors * coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(cm_result);
}
}
#else
// Load from shared into cache
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
const uint ib = warp_r * WM + wsir * WSUBM + tiwr * TM + cr;
cache_a_dm[wsir * TM + cr] = buf_a_dm[ib];
[[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
cache_a_qs[(wsir * TM + cr) * (BK / 4) + idx_k] = buf_a_qs[ib * SHMEM_STRIDE + idx_k];
}
}
}
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
const uint ib = warp_c * WN + wsic * WSUBN + tiwc * TN + cc;
cache_b_ds[cc] = buf_b_ds[ib];
[[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
cache_b_qs[cc * (BK / 4) + idx_k] = buf_b_qs[ib * SHMEM_STRIDE + idx_k];
}
}
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
const uint ib = k_step * BN + warp_c * WN + wsic * WSUBN + tiwc * TN + cc;
block_b_to_registers(ib);
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
const uint cache_a_idx = wsir * TM + cr;
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
sums[sums_idx] += mmq_dot_product(cache_a_idx);
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
const uint cache_a_idx = wsir * TM + cr;
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
int32_t q_sum = 0;
[[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
q_sum += dotPacked4x8EXT(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
cache_b_qs[cc * (BK / 4) + idx_k]);
}
sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc], 1);
}
}
}
}
#endif
barrier();
}
@@ -273,6 +373,54 @@ void main() {
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
#endif
#ifdef COOPMAT
#ifdef MUL_MAT_ID
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
[[unroll]] for (uint col = 0; col < BN; col += storestride) {
const uint row_i = dc + cm_col * TN + col + store_c;
if (row_i >= _ne1) break;
const u16vec2 row_idx = row_ids[row_i];
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
}
}
}
#else
const bool is_aligned = p.stride_d % 4 == 0; // Assumption: D_TYPE == float
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N;
if (is_aligned && is_in_bounds) {
// Full coopMat is within bounds and stride_d is aligned with 16B
coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_dtype = coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(sums[cm_col * cms_per_row + cm_row]);
coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor);
} else if (is_in_bounds) {
// Full coopMat is within bounds, but stride_d is not aligned
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
[[unroll]] for (uint col = 0; col < TN; col += storestride) {
data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
}
} else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) {
// Partial coopMat is within bounds
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
[[unroll]] for (uint col = 0; col < TN; col += storestride) {
if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) {
data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
}
}
}
}
}
#endif // MUL_MAT_ID
#else
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
@@ -283,21 +431,19 @@ void main() {
const uint row_i = dc_warp + cc;
if (row_i >= _ne1) break;
const u16vec2 row_idx = row_ids[row_i - ic * BN];
const u16vec2 row_idx = row_ids[row_i];
#endif // MUL_MAT_ID
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
const uint sums_idx = (wsic * TN + cc) * WMITER * TM + wsir * TM + cr;
#ifdef MUL_MAT_ID
if (dr_warp + cr < p.M) {
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[sums_idx].x);
}
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
#else
if (dr_warp + cr < p.M && dc_warp + cc < p.N) {
data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[sums_idx].x);
data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
}
#endif // MUL_MAT_ID
}
}
}
}
#endif // COOPMAT
}

View File

@@ -6,89 +6,41 @@
// Each iqs value maps to a 32-bit integer
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1)
// 2-byte loads for Q4_0 blocks (18 bytes)
// 4-byte loads for Q4_1 blocks (20 bytes)
#if defined(DATA_A_Q4_0)
i32vec2 repack(uint ib, uint iqs) {
#ifdef DATA_A_Q4_0
const u16vec2 quants = u16vec2(data_a_packed16[ib].qs[iqs * 2 ],
data_a_packed16[ib].qs[iqs * 2 + 1]);
// Use 2-byte loads since a q4_0 block (18 bytes) is not divisible by 4
const u16vec2 quants = u16vec2(data_a[ib].qs[iqs * 2 ],
data_a[ib].qs[iqs * 2 + 1]);
const uint32_t vui = pack32(quants);
return i32vec2( vui & 0x0F0F0F0F,
(vui >> 4) & 0x0F0F0F0F);
#else // DATA_A_Q4_1
const uint32_t vui = data_a_packed32[ib].qs[iqs];
return i32vec2( vui & 0x0F0F0F0F,
(vui >> 4) & 0x0F0F0F0F);
#endif
}
#ifdef DATA_A_Q4_0
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(da * (float(q_sum) * dsb.x - (8 / sum_divisor) * dsb.y));
}
#else // DATA_A_Q4_1
#endif
#if defined(DATA_A_Q4_1)
i32vec2 repack(uint ib, uint iqs) {
// Use 4-byte loads since a q4_1 block (20 bytes) is divisible by 4
const uint32_t vui = data_a_packed32[ib].qs[iqs];
return i32vec2( vui & 0x0F0F0F0F,
(vui >> 4) & 0x0F0F0F0F);
}
ACC_TYPE mul_q8_1(const int32_t q_sum, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y / sum_divisor);
}
#endif
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
#ifdef DATA_A_Q4_0
buf_a[buf_ib].qs[iqs] = pack32(u16vec2(data_a_packed16[ib].qs[iqs * 2],
data_a_packed16[ib].qs[iqs * 2 + 1]));
if (iqs == 0) {
buf_a[buf_ib].dm = FLOAT_TYPE(data_a_packed16[ib].d);
}
#else // DATA_A_Q4_1
buf_a[buf_ib].qs[iqs] = data_a_packed32[ib].qs[iqs];
if (iqs == 0) {
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
}
#endif
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].dm = buf_a[buf_ib].dm;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
const uint32_t vui = cache_a[ib_a].qs[iqs];
const i32vec2 qs_a = i32vec2( vui & 0x0F0F0F0F,
(vui >> 4) & 0x0F0F0F0F);
const int32_t qs_b0 = cache_b.qs[iqs];
const int32_t qs_b1 = cache_b.qs[iqs + 4];
q_sum += dotPacked4x8EXT(qs_a.x, qs_b0);
q_sum += dotPacked4x8EXT(qs_a.y, qs_b1);
}
return mul_q8_1(q_sum, cache_a[ib_a].dm, cache_b.ds, 1);
}
#endif // MMQ_SHMEM
#elif defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
// 2-byte loads for Q5_0 blocks (22 bytes)
// 4-byte loads for Q5_1 blocks (24 bytes)
#if defined(DATA_A_Q5_0)
i32vec2 repack(uint ib, uint iqs) {
const u16vec2 quants = u16vec2(data_a_packed16[ib].qs[iqs * 2 ],
data_a_packed16[ib].qs[iqs * 2 + 1]);
// Use 2-byte loads since a q5_0 block (22 bytes) is not divisible by 4
const u16vec2 quants = u16vec2(data_a[ib].qs[iqs * 2 ],
data_a[ib].qs[iqs * 2 + 1]);
const uint32_t vui = pack32(quants);
#ifdef DATA_A_Q5_0
const int32_t qh = int32_t((uint32_t(data_a_packed16[ib].qh[1]) << 16 | data_a_packed16[ib].qh[0]) >> (4 * iqs));
#else // DATA_A_Q5_1
const int32_t qh = int32_t(data_a_packed32[ib].qh >> (4 * iqs));
#endif
const int32_t qh = int32_t((uint32_t(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]) >> (4 * iqs));
const int32_t v0 = int32_t(vui & 0x0F0F0F0F)
| ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)
@@ -98,457 +50,40 @@ i32vec2 repack(uint ib, uint iqs) {
return i32vec2(v0, v1);
}
#ifdef DATA_A_Q5_0
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(da * (float(q_sum) * dsb.x - (16 / sum_divisor) * dsb.y));
}
#else // DATA_A_Q5_1
#endif
#if defined(DATA_A_Q5_1)
i32vec2 repack(uint ib, uint iqs) {
// Use 4-byte loads since a q5_1 block (24 bytes) is divisible by 4
const uint32_t vui = data_a_packed32[ib].qs[iqs];
const int32_t qh = int32_t(data_a_packed32[ib].qh >> (4 * iqs));
const int32_t v0 = int32_t(vui & 0x0F0F0F0F)
| ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)
const int32_t v1 = int32_t((vui >> 4) & 0x0F0F0F0F)
| (((qh >> 16) & 0xF) * 0x02040810) & 0x10101010; // (16,17,18,19) -> (4,12,20,28)
return i32vec2(v0, v1);
}
ACC_TYPE mul_q8_1(const int32_t q_sum, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(float(q_sum) * dma.x * dsb.x + dma.y * dsb.y / sum_divisor);
}
#endif
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
#ifdef DATA_A_Q5_0
buf_a[buf_ib].qs[iqs] = pack32(u16vec2(data_a_packed16[ib].qs[iqs * 2],
data_a_packed16[ib].qs[iqs * 2 + 1]));
if (iqs == 0) {
buf_a[buf_ib].dm = FLOAT_TYPE(data_a_packed16[ib].d);
buf_a[buf_ib].qh = pack32(u16vec2(data_a_packed16[ib].qh[0], data_a_packed16[ib].qh[1]));
}
#else // DATA_A_Q5_1
buf_a[buf_ib].qs[iqs] = data_a_packed32[ib].qs[iqs];
if (iqs == 0) {
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
buf_a[buf_ib].qh = data_a_packed32[ib].qh;
}
#endif
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].dm = buf_a[buf_ib].dm;
cache_a[reg_ib].qh = buf_a[buf_ib].qh;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
const uint32_t vui = cache_a[ib_a].qs[iqs];
const int32_t qh = int32_t(cache_a[ib_a].qh >> (4 * iqs));
const int32_t qs_a0 = int32_t(vui & 0x0F0F0F0F)
| ((qh & 0xF) * 0x02040810) & 0x10101010; // (0,1,2,3) -> (4,12,20,28)
const int32_t qs_a1 = int32_t((vui >> 4) & 0x0F0F0F0F)
| (((qh >> 16) & 0xF) * 0x02040810) & 0x10101010; // (16,17,18,19) -> (4,12,20,28)
const int32_t qs_b0 = cache_b.qs[iqs];
const int32_t qs_b1 = cache_b.qs[iqs + 4];
q_sum += dotPacked4x8EXT(qs_a0, qs_b0);
q_sum += dotPacked4x8EXT(qs_a1, qs_b1);
}
return mul_q8_1(q_sum, cache_a[ib_a].dm, cache_b.ds, 1);
}
#endif // MMQ_SHMEM
#endif
#if defined(DATA_A_Q8_0)
// 2-byte loads for Q8_0 blocks (34 bytes)
int32_t repack(uint ib, uint iqs) {
return pack32(i16vec2(data_a_packed16[ib].qs[iqs * 2 ],
data_a_packed16[ib].qs[iqs * 2 + 1]));
// Use 2-byte loads since a q8_0 block (34 bytes) is not divisible by 4
return pack32(i16vec2(data_a[ib].qs[iqs * 2 ],
data_a[ib].qs[iqs * 2 + 1]));
}
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(float(q_sum) * da * dsb.x);
}
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
buf_a[buf_ib].qs[iqs] = pack32(i16vec2(data_a_packed16[ib].qs[iqs * 2],
data_a_packed16[ib].qs[iqs * 2 + 1]));
if (iqs == 0) {
buf_a[buf_ib].dm = FLOAT_TYPE(data_a_packed16[ib].d);
}
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].dm = buf_a[buf_ib].dm;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
const int32_t qs_a = cache_a[ib_a].qs[iqs];
const int32_t qs_b = cache_b.qs[iqs];
q_sum += dotPacked4x8EXT(qs_a, qs_b);
}
return mul_q8_1(q_sum, cache_a[ib_a].dm, cache_b.ds, 1);
}
#endif // MMQ_SHMEM
#endif
#if defined(DATA_A_MXFP4)
// 1-byte loads for mxfp4 blocks (17 bytes)
i32vec2 repack(uint ib, uint iqs) {
const uint32_t quants = pack32(u8vec4(data_a[ib].qs[iqs * 4 ],
data_a[ib].qs[iqs * 4 + 1],
data_a[ib].qs[iqs * 4 + 2],
data_a[ib].qs[iqs * 4 + 3]));
return i32vec2( quants & 0x0F0F0F0F,
(quants >> 4) & 0x0F0F0F0F);
}
ACC_TYPE mul_q8_1(const int32_t q_sum, const float da, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(da * dsb.x * float(q_sum));
}
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint32_t qs = pack32(u8vec4(data_a[ib].qs[iqs * 4 ],
data_a[ib].qs[iqs * 4 + 1],
data_a[ib].qs[iqs * 4 + 2],
data_a[ib].qs[iqs * 4 + 3]));
const u8vec4 i_a0 = unpack8( qs & 0x0F0F0F0F);
const u8vec4 i_a1 = unpack8((qs >> 4) & 0x0F0F0F0F);
buf_a[buf_ib].qs[iqs ] = pack32(i8vec4(kvalues_mxfp4[i_a0.x], kvalues_mxfp4[i_a0.y], kvalues_mxfp4[i_a0.z], kvalues_mxfp4[i_a0.w]));
buf_a[buf_ib].qs[iqs + 4] = pack32(i8vec4(kvalues_mxfp4[i_a1.x], kvalues_mxfp4[i_a1.y], kvalues_mxfp4[i_a1.z], kvalues_mxfp4[i_a1.w]));
if (iqs == 0) {
buf_a[buf_ib].d = FLOAT_TYPE(e8m0_to_fp32(data_a[ib].e) * 0.5);
}
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].d = buf_a[buf_ib].d;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
const int32_t qs_a = cache_a[ib_a].qs[iqs];
q_sum += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]);
}
return mul_q8_1(q_sum, cache_a[ib_a].d, cache_b.ds, 1);
}
#endif // MMQ_SHMEM
#endif
// For k-quants, ib and iqs still assume 32-wide blocks, but k-quants are 256-wide
// iqs still refers to a 32-bit integer, meaning 0..7 for 32-wide quants
#if defined(DATA_A_Q2_K)
// 4-byte loads for Q2_K blocks (84 bytes)
int32_t repack(uint ib, uint iqs) {
const uint ib_k = ib / 8;
const uint iqs_k = (ib % 8) * 8 + iqs;
const uint qs_idx = (iqs_k / 32) * 8 + (iqs_k % 8);
const uint qs_shift = ((iqs_k % 32) / 8) * 2;
return int32_t((data_a_packed32[ib_k].qs[qs_idx] >> qs_shift) & 0x03030303);
}
uint8_t get_scale(uint ib, uint iqs) {
const uint ib_k = ib / 8;
const uint iqs_k = (ib % 8) * 8 + iqs;
return data_a[ib_k].scales[iqs_k / 4];
}
ACC_TYPE mul_q8_1(const int32_t sum_d, const int32_t sum_m, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(dsb.x * (dma.x * float(sum_d) - dma.y * float(sum_m)));
}
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint ib_k = ib / 8;
const uint iqs_k = (ib % 8) * 8 + iqs * QUANT_R_MMQ;
const uint qs_idx = (iqs_k / 32) * 8 + (iqs_k % 8);
const uint qs_shift = ((iqs_k % 32) / 8) * 2;
// Repack 4x4 quants into one int
const uint32_t vals0 = (data_a_packed32[ib_k].qs[qs_idx ] >> qs_shift) & 0x03030303;
const uint32_t vals1 = (data_a_packed32[ib_k].qs[qs_idx + 1] >> qs_shift) & 0x03030303;
const uint32_t vals2 = (data_a_packed32[ib_k].qs[qs_idx + 2] >> qs_shift) & 0x03030303;
const uint32_t vals3 = (data_a_packed32[ib_k].qs[qs_idx + 3] >> qs_shift) & 0x03030303;
buf_a[buf_ib].qs[iqs] = vals0 | (vals1 << 2) | (vals2 << 4) | (vals3 << 6);
if (iqs == 0) {
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm);
buf_a[buf_ib].scales = unpack8(data_a_packed16[ib_k].scales[iqs_k / 8]);
}
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].dm = buf_a[buf_ib].dm;
cache_a[reg_ib].scales = buf_a[buf_ib].scales;
[[unroll]] for (uint iqs = 0; iqs < 2; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
int32_t sum_d = 0;
int32_t sum_m = 0;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
const uint8_t scale = cache_a[ib_a].scales[iqs / 4];
const int32_t scale_m = int32_t(scale >> 4) * 0x01010101; // Duplicate 8-bit value across 32-bits.
const int32_t qs_a = int32_t((cache_a[ib_a].qs[iqs / 4] >> ((iqs % 4) * 2)) & 0x03030303);
sum_d += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]) * (scale & 0xF);
sum_m += dotPacked4x8EXT(scale_m, cache_b.qs[iqs]);
}
return mul_q8_1(sum_d, sum_m, cache_a[ib_a].dm, cache_b.ds, 1);
}
#endif // MMQ_SHMEM
#endif
#if defined(DATA_A_Q3_K)
// 2-byte loads for Q3_K blocks (110 bytes)
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint ib_k = ib / 8;
const uint hm_idx = iqs * QUANT_R_MMQ;
const uint iqs_k = (ib % 8) * 8 + hm_idx;
const uint qs_idx = (iqs_k / 32) * 8 + (iqs_k % 8);
const uint qs_shift = ((iqs_k % 32) / 8) * 2;
const uint hm_shift = iqs_k / 8;
// Repack 2x4 quants into one int
// Add the 3rd bit instead of subtracting it to allow packing the quants
const i8vec2 vals00 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 ] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[hm_idx * 2 ] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals01 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 1 ] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[hm_idx * 2 + 1] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals10 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 2 ] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[hm_idx * 2 + 2] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals11 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 3 ] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[hm_idx * 2 + 3] >> hm_shift) & uint16_t(0x0101)) << 2));
buf_a[buf_ib].qs[iqs] = pack32(u8vec4(vals00.x, vals00.y, vals01.x, vals01.y)) |
(pack32(u8vec4(vals10.x, vals10.y, vals11.x, vals11.y)) << 4);
if (iqs == 0) {
const uint is = iqs_k / 4;
const i8vec2 scales = i8vec2(unpack8(((data_a_packed16[ib_k].scales[(is % 8 ) / 2] >> (4 * (is / 8))) & 0x0F0F) |
(((data_a_packed16[ib_k].scales[(8 + (is % 4)) / 2] >> (2 * (is / 4))) & 0x0303) << 4)));
buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales - 32);
}
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].d_scales = buf_a[buf_ib].d_scales;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
float result = 0.0;
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
// Subtract 4 from the quants to correct the 3rd bit offset
const int32_t qs_a = pack32(unpack8(int32_t((cache_a[ib_a].qs[iqs / 2] >> ((iqs % 2) * 4)) & 0x0F0F0F0F)) - int8_t(4));
q_sum += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]);
}
result += float(cache_a[ib_a].d_scales[0]) * float(q_sum);
q_sum = 0;
[[unroll]] for (uint iqs = 4; iqs < 8; iqs++) {
const int32_t qs_a = pack32(unpack8(int32_t((cache_a[ib_a].qs[iqs / 2] >> ((iqs % 2) * 4)) & 0x0F0F0F0F)) - int8_t(4));
q_sum += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]);
}
result += float(cache_a[ib_a].d_scales[1]) * float(q_sum);
return ACC_TYPE(cache_b.ds.x * result);
}
#endif // MMQ_SHMEM
#endif
#if defined(DATA_A_Q4_K) || defined(DATA_A_Q5_K)
// 4-byte loads for Q4_K blocks (144 bytes) and Q5_K blocks (176 bytes)
ACC_TYPE mul_q8_1(const int32_t q_sum, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
return ACC_TYPE(dsb.x * dma.x * float(q_sum) - dma.y * dsb.y);
}
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint ib_k = ib / 8;
const uint iqs_k = (ib % 8) * 8 + iqs * QUANT_R_MMQ;
const uint qs_idx = (iqs_k / 16) * 8 + (iqs_k % 8);
const uint qs_shift = ((iqs_k % 16) / 8) * 4;
// Repack 2x4 quants into one int
#if defined(DATA_A_Q4_K)
const uint32_t vals0 = (data_a_packed32[ib_k].qs[qs_idx ] >> qs_shift) & 0x0F0F0F0F;
const uint32_t vals1 = (data_a_packed32[ib_k].qs[qs_idx + 1] >> qs_shift) & 0x0F0F0F0F;
buf_a[buf_ib].qs[iqs] = vals0 | (vals1 << 4);
#else // defined(DATA_A_Q5_K)
const uint qh_idx = iqs * QUANT_R_MMQ;
const uint qh_shift = iqs_k / 8;
buf_a[buf_ib].qs[iqs] = int32_t(((data_a_packed32[ib_k].qs[qs_idx] >> qs_shift) & 0x0F0F0F0F) |
(((data_a_packed32[ib_k].qh[qh_idx] >> qh_shift) & 0x01010101) << 4));
#endif
if (iqs == 0) {
// Scale index
const uint is = iqs_k / 8;
u8vec2 scale_dm;
if (is < 4) {
scale_dm = u8vec2(data_a[ib_k].scales[is] & 0x3F, data_a[ib_k].scales[is + 4] & 0x3F);
} else {
scale_dm = u8vec2((data_a[ib_k].scales[is+4] & 0xF) | ((data_a[ib_k].scales[is-4] & 0xC0) >> 2),
(data_a[ib_k].scales[is+4] >> 4) | ((data_a[ib_k].scales[is ] & 0xC0) >> 2));
}
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm) * FLOAT_TYPE_VEC2(scale_dm);
}
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].dm = buf_a[buf_ib].dm;
[[unroll]] for (uint iqs = 0; iqs < 8 / QUANT_R_MMQ; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
#if defined(DATA_A_Q4_K)
const int32_t qs_a = int32_t((cache_a[ib_a].qs[iqs / 2] >> ((iqs % 2) * 4)) & 0x0F0F0F0F);
#else // defined(DATA_A_Q5_K)
const int32_t qs_a = cache_a[ib_a].qs[iqs];
#endif
q_sum += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]);
}
return mul_q8_1(q_sum, cache_a[ib_a].dm, cache_b.ds, 1);
}
#endif // MMQ_SHMEM
#endif
#ifdef MMQ_SHMEM
void block_b_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint ib_outer = ib / 4;
const uint ib_inner = ib % 4;
if (iqs == 0) {
buf_b[buf_ib].ds = FLOAT_TYPE_VEC2(data_b[ib_outer].ds[ib_inner]);
}
const ivec4 values = data_b[ib_outer].qs[ib_inner * 2 + iqs];
buf_b[buf_ib].qs[iqs * 4 ] = values.x;
buf_b[buf_ib].qs[iqs * 4 + 1] = values.y;
buf_b[buf_ib].qs[iqs * 4 + 2] = values.z;
buf_b[buf_ib].qs[iqs * 4 + 3] = values.w;
}
void block_b_to_registers(const uint ib) {
cache_b.ds = buf_b[ib].ds;
[[unroll]] for (uint iqs = 0; iqs < BK / 4; iqs++) {
cache_b.qs[iqs] = buf_b[ib].qs[iqs];
}
}
#endif
#if defined(DATA_A_Q6_K)
// 2-byte loads for Q6_K blocks (210 bytes)
#ifdef MMQ_SHMEM
void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint ib_k = ib / 8;
const uint iqs_k = (ib % 8) * 8 + iqs;
const uint ql_idx = (iqs_k / 32) * 16 + iqs_k % 16;
const uint ql_shift = ((iqs_k % 32) / 16) * 4;
const uint qh_idx = (iqs_k / 32) * 8 + iqs;
const uint qh_shift = ((iqs_k % 32) / 8) * 2;
const i8vec2 vals00 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 ] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 ] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals01 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 1] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 1] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
buf_a[buf_ib].qs[iqs] = pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y));
if (iqs == 0) {
const uint is = iqs_k / 4;
const i8vec2 scales = unpack8(data_a_packed16[ib_k].scales[is / 2]);
buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales);
}
}
void block_a_to_registers(const uint reg_ib, const uint buf_ib) {
cache_a[reg_ib].d_scales = buf_a[buf_ib].d_scales;
[[unroll]] for (uint iqs = 0; iqs < 8; iqs++) {
cache_a[reg_ib].qs[iqs] = buf_a[buf_ib].qs[iqs];
}
}
ACC_TYPE mmq_dot_product(const uint ib_a) {
float result = 0.0;
int32_t q_sum = 0;
[[unroll]] for (uint iqs = 0; iqs < 4; iqs++) {
const int32_t qs_a = cache_a[ib_a].qs[iqs];
q_sum += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]);
}
result += float(cache_a[ib_a].d_scales[0]) * float(q_sum);
q_sum = 0;
[[unroll]] for (uint iqs = 4; iqs < 8; iqs++) {
const int32_t qs_a = cache_a[ib_a].qs[iqs];
q_sum += dotPacked4x8EXT(qs_a, cache_b.qs[iqs]);
}
result += float(cache_a[ib_a].d_scales[1]) * float(q_sum);
return ACC_TYPE(cache_b.ds.x * result);
}
#endif // MMQ_SHMEM
#endif
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
@@ -568,10 +103,3 @@ FLOAT_TYPE_VEC2 get_dm(uint ib) {
return FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
}
#endif
#if defined(DATA_A_Q2_K)
FLOAT_TYPE_VEC2 get_dm(uint ib) {
const uint ib_k = ib / 8;
return FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm);
}
#endif
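Two of the helpers above lean on arithmetic identities worth spelling out. The ((qh & 0xF) * 0x02040810) & 0x10101010 trick in the Q5 repack copies source bit i to positions i+4, i+11, i+18 and i+25 via the multiply, and the mask keeps one copy per byte, so bits (0,1,2,3) land at (4,12,20,28) as the fifth bit of each packed quant. The Q4_0 mul_q8_1 relies on a Q4_0 weight dequantizing to da*(q-8): assuming the usual Q8_1 layout where dsb = (db, db*sum(qb)), the per-block dot product factors as da*(q_sum*dsb.x - 8*dsb.y). Hypothetical Go sketches of both:

// scatterQH spreads the low 4 bits of qh to bit positions 4, 12, 20 and 28.
func scatterQH(qh uint32) uint32 {
    return ((qh & 0xF) * 0x02040810) & 0x10101010 // scatterQH(0xF) == 0x10101010
}

// mulQ8_1Q4_0 reconstructs the dequantized dot product from the integer q_sum.
// sumDivisor is normally 1; it accounts for a Q8_1 sum shared across blocks.
func mulQ8_1Q4_0(qSum int32, da float32, dsb [2]float32, sumDivisor int32) float32 {
    return da * (float32(qSum)*dsb[0] - float32(8/sumDivisor)*dsb[1])
}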

View File

@@ -1,78 +0,0 @@
#if defined(DATA_A_Q4_0)
#define QUANT_R_MMQ 2
struct block_a_cache {
uint32_t qs[16/4];
FLOAT_TYPE dm;
};
#elif defined(DATA_A_Q4_1)
#define QUANT_R_MMQ 2
struct block_a_cache {
uint32_t qs[16/4];
FLOAT_TYPE_VEC2 dm;
};
#elif defined(DATA_A_Q5_0)
#define QUANT_R_MMQ 2
struct block_a_cache {
uint32_t qs[16/4];
uint32_t qh;
FLOAT_TYPE dm;
};
#elif defined(DATA_A_Q5_1)
#define QUANT_R_MMQ 2
struct block_a_cache {
uint32_t qs[16/4];
uint32_t qh;
FLOAT_TYPE_VEC2 dm;
};
#elif defined(DATA_A_Q8_0)
#define QUANT_R_MMQ 1
// AMD likes 4, Intel likes 1 and Nvidia likes 2
// #define BK_STEP 1
struct block_a_cache {
int32_t qs[32/4];
FLOAT_TYPE dm;
};
#elif defined(DATA_A_MXFP4)
#define QUANT_R_MMQ 2
struct block_a_cache {
int32_t qs[8];
FLOAT_TYPE d;
};
#elif defined(DATA_A_Q2_K)
#define QUANT_R_MMQ 4
struct block_a_cache {
uint32_t qs[2];
u8vec2 scales;
FLOAT_TYPE_VEC2 dm;
};
#elif defined(DATA_A_Q3_K)
#define QUANT_R_MMQ 2
struct block_a_cache {
uint32_t qs[4];
FLOAT_TYPE_VEC2 d_scales;
};
#elif defined(DATA_A_Q4_K)
#define QUANT_R_MMQ 2
struct block_a_cache {
uint32_t qs[4];
FLOAT_TYPE_VEC2 dm;
};
#elif defined(DATA_A_Q5_K)
#define QUANT_R_MMQ 1
struct block_a_cache {
int32_t qs[8];
FLOAT_TYPE_VEC2 dm;
};
#elif defined(DATA_A_Q6_K)
#define QUANT_R_MMQ 1
struct block_a_cache {
int32_t qs[8];
FLOAT_TYPE_VEC2 d_scales;
};
#endif
struct block_b_cache
{
int32_t qs[8];
FLOAT_TYPE_VEC2 ds;
};

View File

@@ -10,7 +10,6 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {int data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
layout (binding = 4) readonly buffer I {uvec2 data_i[];}; // indices for set_rows
layout (push_constant) uniform parameter {
uint ncols;
@@ -28,7 +27,6 @@ layout (push_constant) uniform parameter {
uint s2;
int sections[4];
uint is_back;
uint set_rows_stride;
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {

View File

@@ -16,19 +16,12 @@ void main() {
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
uint idst = row_dst*ne0 + i0/2;
const uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = row_x*ne0 + i0/2;
idst += data_i[channel_x].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
data_d[idst + i0/2 + 0] = D_TYPE(data_a[ix + i0/2 + 0]);
data_d[idst + i0/2 + 1] = D_TYPE(data_a[ix + i0/2 + 1]);
data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
return;
}

View File

@@ -16,19 +16,12 @@ void main() {
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
uint idst = row_dst*ne0 + i0;
const uint idst = row_dst*ne0 + i0;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = row_x*ne0 + i0;
idst += data_i[channel_x].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
data_d[idst + 0] = D_TYPE(data_a[ix + 0]);
data_d[idst + 1] = D_TYPE(data_a[ix + 1]);
data_d[idst + 0] = data_a[ix + 0];
data_d[idst + 1] = data_a[ix + 1];
return;
}
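The set_rows fusion branch in the two rope hunks above redirects each row's output through a per-channel row index from data_i instead of writing at the default row offset; the two variants differ only in using i0/2 versus i0 for the column term. A standalone C++ sketch of that index arithmetic, with entirely hypothetical sizes and indices:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t ne0 = 128, ne1 = 4;       // hypothetical row length / rows per channel
    const uint32_t set_rows_stride = 4096;   // hypothetical stride into the 1D view
    const uint32_t data_i[] = {7, 2, 0};     // hypothetical per-channel row indices

    const uint32_t row_dst   = 5;            // example destination row
    const uint32_t i0        = 10;           // example column pair
    const uint32_t row_x     = row_dst % ne1;
    const uint32_t channel_x = row_dst / ne1;

    uint32_t idst = row_dst * ne0 + i0 / 2;  // default layout
    if (set_rows_stride != 0) {              // fused ROPE + VIEW + SET_ROWS path
        idst = row_x * ne0 + i0 / 2 + data_i[channel_x] * set_rows_stride;
    }
    printf("idst = %u\n", (unsigned) idst);
    return 0;
}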

View File

@@ -11,8 +11,6 @@ layout (push_constant) uniform parameter
{
uint n_rows;
uint n_expert_used;
float clamp_min;
float clamp_max;
};
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
@@ -20,7 +18,6 @@ layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
layout(constant_id = 0) const uint WARP_SIZE = 32;
layout(constant_id = 1) const uint n_experts = 512;
layout(constant_id = 2) const bool with_norm = true;
layout(constant_id = 3) const bool late_softmax = false;
const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
@@ -28,52 +25,6 @@ layout (binding = 0, std430) readonly buffer Logits {float logits[];};
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};
const float INFINITY = 1.0 / 0.0;
// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit, const uint lane, const bool use_limit) {
float max_val = -INFINITY;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
const uint idx = lane + i * WARP_SIZE;
const bool is_active = !use_limit || (idx < limit);
if (is_active) {
max_val = max(max_val, vals[i]);
}
}
max_val = subgroupMax(max_val);
float sum = 0.f;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
const uint idx = lane + i * WARP_SIZE;
const bool is_active = !use_limit || (idx < limit);
if (is_active) {
const float val = exp(vals[i] - max_val);
vals[i] = val;
sum += val;
} else {
vals[i] = 0.f;
}
}
sum = subgroupAdd(sum);
const float inv_sum = 1.0f / sum;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
const uint idx = lane + i * WARP_SIZE;
const bool is_active = !use_limit || (idx < limit);
if (is_active) {
vals[i] *= inv_sum;
}
}
}
void main() {
const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
if (row >= n_rows) {
@@ -84,16 +35,43 @@ void main() {
const uint weights_offset = n_expert_used * row;
const uint ids_offset = n_experts * row;
float wt[experts_per_thread];
float logits_r[experts_per_thread];
const float INFINITY = 1.0 / 0.0;
[[unroll]]
for (uint i = 0; i < n_experts; i += WARP_SIZE) {
const uint expert = i + gl_LocalInvocationID.x;
wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
const uint expert = i + gl_LocalInvocationID.x;
logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[logits_offset + expert] : -INFINITY;
}
if (!late_softmax) {
softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
float max_val = logits_r[0];
[[unroll]]
for (int i = 1; i < experts_per_thread; i++) {
const float val = logits_r[i];
max_val = max(val, max_val);
}
max_val = subgroupMax(max_val);
float wt[experts_per_thread];
float tmp = 0.f;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
const float val = logits_r[i];
wt[i] = exp(val - max_val);
tmp += wt[i];
}
tmp = subgroupAdd(tmp);
const float inv_sum = 1.0f / tmp;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
wt[i] = wt[i] * inv_sum;
}
// at this point, each thread holds a portion of softmax,
@@ -104,11 +82,6 @@ void main() {
float output_weights[experts_per_thread];
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
output_weights[i] = 0.f;
}
for (int k = 0; k < n_expert_used; k++) {
float max_val = wt[0];
uint max_expert = gl_LocalInvocationID.x;
@@ -148,7 +121,6 @@ void main() {
if (with_norm) {
wt_sum = subgroupAdd(wt_sum);
wt_sum = clamp(wt_sum, clamp_min, clamp_max);
const float inv_sum = 1.0f / wt_sum;
[[unroll]]
@@ -157,10 +129,6 @@ void main() {
}
}
if (late_softmax) {
softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
}
[[unroll]]
for (uint i = 0; i < experts_per_thread; ++i) {
uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;
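Both the standalone softmax_warp_inplace helper and the inlined max/exp/normalize sequence in this file follow the same subgroup pattern: each lane reduces its strided slice, subgroupMax/subgroupAdd combine the partial results, and every lane then scales by the shared inverse sum. A rough single-threaded C++ emulation of that pattern (sizes and data are illustrative only, and the reductions are plain loops here):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const unsigned WARP_SIZE = 32, n_experts = 64;
    const unsigned experts_per_thread = n_experts / WARP_SIZE;
    std::vector<float> logits(n_experts, 0.0f);
    logits[3] = 2.0f; logits[40] = 1.0f;

    // pass 1: per-lane partial max over a strided slice, then "subgroupMax"
    float max_val = -INFINITY;
    for (unsigned lane = 0; lane < WARP_SIZE; ++lane)
        for (unsigned i = 0; i < experts_per_thread; ++i)
            max_val = std::max(max_val, logits[lane + i * WARP_SIZE]);

    // pass 2: exponentiate and accumulate the partial sums ("subgroupAdd")
    std::vector<float> wt(n_experts);
    float sum = 0.0f;
    for (unsigned idx = 0; idx < n_experts; ++idx) {
        wt[idx] = std::exp(logits[idx] - max_val);
        sum += wt[idx];
    }

    // pass 3: normalize every lane's values with the shared inverse sum
    const float inv_sum = 1.0f / sum;
    for (float &w : wt) w *= inv_sum;
    printf("wt[3] = %f\n", wt[3]);
    return 0;
}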

View File

@@ -66,7 +66,6 @@ struct block_q4_0_packed16
#define QUANT_AUXF 1
#define A_TYPE block_q4_0
#define A_TYPE_PACKED16 block_q4_0_packed16
#define DATA_A_QUANT_LEGACY
#endif
#define QUANT_K_Q4_1 32
@@ -99,7 +98,6 @@ struct block_q4_1_packed32
#define A_TYPE block_q4_1
#define A_TYPE_PACKED16 block_q4_1_packed16
#define A_TYPE_PACKED32 block_q4_1_packed32
#define DATA_A_QUANT_LEGACY
#endif
#define QUANT_K_Q5_0 32
@@ -125,7 +123,6 @@ struct block_q5_0_packed16
#define QUANT_AUXF 1
#define A_TYPE block_q5_0
#define A_TYPE_PACKED16 block_q5_0_packed16
#define DATA_A_QUANT_LEGACY
#endif
#define QUANT_K_Q5_1 32
@@ -161,7 +158,6 @@ struct block_q5_1_packed32
#define A_TYPE block_q5_1
#define A_TYPE_PACKED16 block_q5_1_packed16
#define A_TYPE_PACKED32 block_q5_1_packed32
#define DATA_A_QUANT_LEGACY
#endif
#define QUANT_K_Q8_0 32
@@ -190,7 +186,6 @@ struct block_q8_0_packed32
#define A_TYPE block_q8_0
#define A_TYPE_PACKED16 block_q8_0_packed16
#define A_TYPE_PACKED32 block_q8_0_packed32
#define DATA_A_QUANT_LEGACY
#endif
#define QUANT_K_Q8_1 32
@@ -231,21 +226,21 @@ struct block_q2_K
{
uint8_t scales[QUANT_K_Q2_K/16];
uint8_t qs[QUANT_K_Q2_K/4];
f16vec2 dm;
f16vec2 d;
};
struct block_q2_K_packed16
{
uint16_t scales[QUANT_K_Q2_K/16/2];
uint16_t qs[QUANT_K_Q2_K/4/2];
f16vec2 dm;
f16vec2 d;
};
struct block_q2_K_packed32
{
uint32_t scales[QUANT_K_Q2_K/16/4];
uint32_t qs[QUANT_K_Q2_K/4/4];
f16vec2 dm;
f16vec2 d;
};
#if defined(DATA_A_Q2_K)
@@ -254,8 +249,6 @@ struct block_q2_K_packed32
#define A_TYPE block_q2_K
#define A_TYPE_PACKED16 block_q2_K_packed16
#define A_TYPE_PACKED32 block_q2_K_packed32
#define SCALES_PER_32 2
#define DATA_A_QUANT_K
#endif
#define QUANT_K_Q3_K 256
@@ -281,28 +274,27 @@ struct block_q3_K_packed16
#define QUANT_R 1
#define A_TYPE block_q3_K
#define A_TYPE_PACKED16 block_q3_K_packed16
#define DATA_A_QUANT_K
#endif
#define QUANT_K_Q4_K 256
struct block_q4_K
{
f16vec2 dm;
f16vec2 d;
uint8_t scales[3*QUANT_K_Q4_K/64];
uint8_t qs[QUANT_K_Q4_K/2];
};
struct block_q4_K_packed16
{
f16vec2 dm;
f16vec2 d;
uint16_t scales[3*QUANT_K_Q4_K/64/2];
uint16_t qs[QUANT_K_Q4_K/2/2];
};
struct block_q4_K_packed32
{
f16vec2 dm;
f16vec2 d;
uint32_t scales[3*QUANT_K_Q4_K/64/4];
uint32_t qs[QUANT_K_Q4_K/2/4];
};
@@ -318,14 +310,13 @@ struct block_q4_K_packed128
#define A_TYPE block_q4_K
#define A_TYPE_PACKED16 block_q4_K_packed16
#define A_TYPE_PACKED32 block_q4_K_packed32
#define DATA_A_QUANT_K
#endif
#define QUANT_K_Q5_K 256
struct block_q5_K
{
f16vec2 dm;
f16vec2 d;
uint8_t scales[12];
uint8_t qh[QUANT_K_Q5_K/8];
uint8_t qs[QUANT_K_Q5_K/2];
@@ -333,20 +324,12 @@ struct block_q5_K
struct block_q5_K_packed16
{
f16vec2 dm;
f16vec2 d;
uint16_t scales[12/2];
uint16_t qh[QUANT_K_Q5_K/8/2];
uint16_t qs[QUANT_K_Q5_K/2/2];
};
struct block_q5_K_packed32
{
f16vec2 dm;
uint32_t scales[12/4];
uint32_t qh[QUANT_K_Q5_K/8/4];
uint32_t qs[QUANT_K_Q5_K/2/4];
};
struct block_q5_K_packed128
{
uvec4 q5k[11];
@@ -357,8 +340,6 @@ struct block_q5_K_packed128
#define QUANT_R 1
#define A_TYPE block_q5_K
#define A_TYPE_PACKED16 block_q5_K_packed16
#define A_TYPE_PACKED32 block_q5_K_packed32
#define DATA_A_QUANT_K
#endif
#define QUANT_K_Q6_K 256
@@ -375,7 +356,7 @@ struct block_q6_K_packed16
{
uint16_t ql[QUANT_K_Q6_K/2/2];
uint16_t qh[QUANT_K_Q6_K/4/2];
int16_t scales[QUANT_K_Q6_K/16/2];
int8_t scales[QUANT_K_Q6_K/16];
float16_t d;
};
@@ -384,7 +365,6 @@ struct block_q6_K_packed16
#define QUANT_R 1
#define A_TYPE block_q6_K
#define A_TYPE_PACKED16 block_q6_K_packed16
#define DATA_A_QUANT_K
#endif
// IQuants
@@ -1383,11 +1363,18 @@ struct block_mxfp4
uint8_t qs[QUANT_K_MXFP4/2];
};
//struct block_mxfp4_packed16
//{
// uint8_t e;
// uint16_t qs[QUANT_K_MXFP4/2/2];
//};
#if defined(DATA_A_MXFP4)
#define QUANT_K QUANT_K_MXFP4
#define QUANT_R QUANT_R_MXFP4
#define QUANT_AUXF 1
#define A_TYPE block_mxfp4
//#define A_TYPE_PACKED16 block_mxfp4_packed16
#endif
#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
@@ -1410,12 +1397,12 @@ void init_iq_shmem(uvec3 wgsize)
#endif
#if defined(DATA_A_MXFP4)
const int8_t kvalues_mxfp4_const[16] = {
int8_t(0), int8_t(1), int8_t(2), int8_t(3), int8_t(4), int8_t(6), int8_t(8), int8_t(12),
int8_t(0), int8_t(-1), int8_t(-2), int8_t(-3), int8_t(-4), int8_t(-6), int8_t(-8), int8_t(-12),
const FLOAT_TYPE kvalues_mxfp4_const[16] = {
FLOAT_TYPE(0.0f), FLOAT_TYPE(0.5f), FLOAT_TYPE(1.0f), FLOAT_TYPE(1.5f), FLOAT_TYPE(2.0f), FLOAT_TYPE(3.0f), FLOAT_TYPE(4.0f), FLOAT_TYPE(6.0f),
FLOAT_TYPE(-0.0f), FLOAT_TYPE(-0.5f), FLOAT_TYPE(-1.0f), FLOAT_TYPE(-1.5f), FLOAT_TYPE(-2.0f), FLOAT_TYPE(-3.0f), FLOAT_TYPE(-4.0f), FLOAT_TYPE(-6.0f)
};
shared int8_t kvalues_mxfp4[16];
shared FLOAT_TYPE kvalues_mxfp4[16];
#define NEEDS_INIT_IQ_SHMEM
void init_iq_shmem(uvec3 wgsize)
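The two MXFP4 lookup tables in the hunk above encode the same E2M1 value set: the int8 table is exactly the FLOAT_TYPE table scaled by 2 (0.5 becomes 1, 6 becomes 12), which presumably lets a factor of 0.5 be folded into the block scale elsewhere; that last point is my assumption. A quick C++ consistency check of the 2x relationship visible in the hunk:

#include <cassert>

int main() {
    const float f[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};
    const int   q[8] = {0, 1, 2, 3, 4, 6, 8, 12};
    for (int i = 0; i < 8; ++i) {
        assert(float(q[i]) == 2.0f * f[i]); // holds for the negated half as well
    }
    return 0;
}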

View File

@@ -566,8 +566,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
}
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
// Integer dot mmq performs better with f32 accumulators
if (!f16acc && !coopmat && !coopmat2 && (is_legacy_quant(tname) || is_k_quant(tname) || tname == "mxfp4")) {
if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && is_legacy_quant(tname)) {
string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
}
#endif
@@ -575,7 +574,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
}
void process_shaders() {
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}};
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
// matmul
for (const MatMulIdType& matmul_id_type : {MatMulIdType::NONE, MatMulIdType::DEFAULT, MatMulIdType::SUBGROUP}) {
@@ -842,14 +841,10 @@ void process_shaders() {
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_norm_f32_f16", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
string_to_spv("rope_norm_f32_f16_rte", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_neox_f32_f16", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
string_to_spv("rope_neox_f32_f16_rte", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});

Some files were not shown because too many files have changed in this diff.