Compare commits
103 Commits
origin/bru...parth/add-
| SHA1 |
|---|
| 5d3eeb43c0 |
| 1c4e85b4df |
| dac4f17fea |
| 56b8fb024c |
| b95693056c |
| c34fc64688 |
| 7cf6f18c1f |
| bbbb6b2a01 |
| 76f88caf43 |
| 2bccf8c624 |
| 0c5e5f6630 |
| d475d1f081 |
| d2f334c1f7 |
| 603ceefaa6 |
| e082d60a24 |
| 5dae738067 |
| 0c78723174 |
| 5a41d69b2a |
| c146a138e3 |
| 31b8c6a214 |
| 9191dfaf05 |
| 1108d8b34e |
| 7837a5bc7e |
| 0a844f8e96 |
| a03223b86f |
| 0cf7794b16 |
| 854d40edc5 |
| 84a2cedf18 |
| 3f30836734 |
| cc9555aff0 |
| 20aee96706 |
| 18b5958d46 |
| 5317202c38 |
| d771043e88 |
| f8f1071818 |
| d3e0a0dee4 |
| 554172759c |
| 5b6a8e6001 |
| 467bbc0dd5 |
| 6d9f9323c5 |
| 0c2489605d |
| 8b1b89a984 |
| 47e272c35a |
| 417a81fda3 |
| dba62ff3a5 |
| d70e935526 |
| 5c1063df7f |
| cb485b2019 |
| b2af50960f |
| eac5b8bfbd |
| 604e43b28d |
| 53985b3c4d |
| b6e02cbbd2 |
| 91935631ac |
| 8de30b568a |
| 485da9fd35 |
| 0796d79d19 |
| 92981ae3f2 |
| 8ed1adf3db |
| 440a3823a6 |
| 718961de68 |
| 330f62a7fa |
| 584e2d646f |
| 1fd4cb87b2 |
| 4aba2e8b72 |
| 2f36d769aa |
| 399eacf486 |
| 231cc878cb |
| aa676b313f |
| dd0ed0ef17 |
| d5649821ae |
| 4cea757e70 |
| a751bc159c |
| 5d31242fbf |
| d7fd72193f |
| 72ff5b9d8c |
| ce29f695b4 |
| 12b174b10e |
| 333203d871 |
| c114987523 |
| b48083f33f |
| 482bec824f |
| 684a9a8c5a |
| 54a76d3773 |
| 8a75d8b015 |
| f206357412 |
| 8224cd9063 |
| 6286d9a3a5 |
| 3a9e8e9fd4 |
| cb1cb06478 |
| 2d5e066c8c |
| 15968714bd |
| 8bf38552de |
| b13fbad0fe |
| f560bd077f |
| 4372d0bfef |
| 31361c4d3c |
| 59241c5bee |
| 2a9b61f099 |
| 6df4208836 |
| 9d615cdaa0 |
| 6a818b8a09 |
| 2aaf29acb5 |
.gitattributes (vendored; 4 changed lines)
@@ -15,8 +15,12 @@ ml/backend/**/*.cu linguist-vendored
ml/backend/**/*.cuh linguist-vendored
ml/backend/**/*.m linguist-vendored
ml/backend/**/*.metal linguist-vendored
ml/backend/**/*.comp linguist-vendored
ml/backend/**/*.glsl linguist-vendored
ml/backend/**/CMakeLists.txt linguist-vendored

app/webview linguist-vendored

llama/build-info.cpp linguist-generated
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
.github/workflows/release.yaml (vendored; 58 changed lines)
@@ -16,13 +16,15 @@ jobs:
outputs:
GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }}
VERSION: ${{ steps.goflags.outputs.VERSION }}
vendorsha: ${{ steps.changes.outputs.vendorsha }}
steps:
- uses: actions/checkout@v4
- name: Set environment
id: goflags
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
echo VERSION="${GITHUB_REF_NAME#v}" >>$GITHUB_OUTPUT
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" | tee -a $GITHUB_OUTPUT
echo VERSION="${GITHUB_REF_NAME#v}" | tee -a $GITHUB_OUTPUT
echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

darwin-build:
runs-on: macos-14-xlarge
@@ -53,6 +55,9 @@ jobs:
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache-dependency-path: |
go.sum
Makefile.sync
- run: |
./scripts/build_darwin.sh
- name: Log build results
@@ -104,6 +109,13 @@ jobs:
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: 'rocm'
- os: windows
arch: amd64
preset: Vulkan
install: https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe
flags: ''
runner_dir: 'vulkan'
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -113,13 +125,14 @@ jobs:
run: |
choco install -y --no-progress ccache ninja
ccache -o cache_dir=${{ github.workspace }}\.ccache
- if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ')
- if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ') || startsWith(matrix.preset, 'Vulkan')
id: cache-install
uses: actions/cache/restore@v4
with:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
C:\VulkanSDK
key: ${{ matrix.install }}
- if: startsWith(matrix.preset, 'CUDA ')
name: Install CUDA ${{ matrix.cuda-version }}
@@ -149,6 +162,18 @@ jobs:
echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: matrix.preset == 'Vulkan'
name: Install Vulkan ${{ matrix.rocm-version }}
run: |
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList "-c","--am","--al","in" -NoNewWindow -Wait
}

$vulkanPath = (Resolve-Path "C:\VulkanSDK\*").path
echo "$vulkanPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "VULKAN_SDK=$vulkanPath" >> $env:GITHUB_ENV
- if: matrix.preset == 'CPU'
run: |
echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
@@ -159,19 +184,20 @@ jobs:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
C:\VulkanSDK
key: ${{ matrix.install }}
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}-${{ needs.setup-environment.outputs.vendorsha }}
- name: Build target "${{ matrix.preset }}"
run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} --install-prefix "$((pwd).Path)\dist\${{ matrix.os }}-${{ matrix.arch }}"
cmake --build --parallel ([Environment]::ProcessorCount) --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip
Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
env:
CMAKE_GENERATOR: Ninja
@@ -228,6 +254,9 @@ jobs:
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache-dependency-path: |
go.sum
Makefile.sync
- name: Verify gcc is actually clang
run: |
$ErrorActionPreference='Continue'
@@ -281,6 +310,9 @@ jobs:
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache-dependency-path: |
go.sum
Makefile.sync
- uses: actions/download-artifact@v4
with:
pattern: depends-windows*
@@ -312,13 +344,13 @@ jobs:
include:
- os: linux
arch: amd64
target: archive_novulkan
target: archive
- os: linux
arch: amd64
target: rocm
- os: linux
arch: arm64
target: archive_novulkan
target: archive
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: setup-environment
@@ -345,6 +377,7 @@ jobs:
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/vulkan*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
@@ -374,14 +407,12 @@ jobs:
include:
- os: linux
arch: arm64
target: novulkan
build-args: |
CGO_CFLAGS
CGO_CXXFLAGS
GOFLAGS
- os: linux
arch: amd64
target: novulkan
build-args: |
CGO_CFLAGS
CGO_CXXFLAGS
@@ -394,14 +425,6 @@ jobs:
CGO_CXXFLAGS
GOFLAGS
FLAVOR=rocm
- os: linux
arch: amd64
suffix: '-vulkan'
target: default
build-args: |
CGO_CFLAGS
CGO_CXXFLAGS
GOFLAGS
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: setup-environment
@@ -419,7 +442,6 @@ jobs:
with:
context: .
platforms: ${{ matrix.os }}/${{ matrix.arch }}
target: ${{ matrix.preset }}
build-args: ${{ matrix.build-args }}
outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
.github/workflows/test.yaml (vendored; 19 changed lines)
@@ -22,6 +22,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
changed: ${{ steps.changes.outputs.changed }}
vendorsha: ${{ steps.changes.outputs.vendorsha }}
steps:
- uses: actions/checkout@v4
with:
@@ -37,6 +38,7 @@ jobs:
}

echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

linux:
needs: [changes]
@@ -83,7 +85,7 @@ jobs:
- uses: actions/cache@v4
with:
path: /github/home/.cache/ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
- run: |
cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
cmake --build --preset ${{ matrix.preset }} --parallel
@@ -172,12 +174,13 @@ jobs:
path: |
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
C:\Program Files\AMD\ROCm
C:\VulkanSDK
key: ${{ matrix.install }}
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
- run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
@@ -205,6 +208,9 @@ jobs:
- uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache-dependency-path: |
go.sum
Makefile.sync
- uses: actions/setup-node@v4
with:
node-version: '20'
@@ -225,12 +231,9 @@ jobs:
if: always()
run: go test -count=1 -benchtime=1x ./...

# TODO(bmizerany): replace this heavy tool with just the
# tools/checks/binaries we want and then make them all run in parallel
# across jobs, not on a single tiny vm on Github Actions.
- uses: golangci/golangci-lint-action@v6
- uses: golangci/golangci-lint-action@v9
with:
args: --timeout 10m0s -v
only-new-issues: true

patches:
runs-on: ubuntu-latest
@@ -239,4 +242,4 @@ jobs:
- name: Verify patches apply cleanly and do not change files
run: |
make -f Makefile.sync clean checkout apply-patches sync
git diff --compact-summary --exit-code
git diff --compact-summary --exit-code
@@ -1,5 +1,4 @@
run:
timeout: 5m
version: "2"
linters:
enable:
- asasalint
@@ -7,35 +6,46 @@ linters:
- bodyclose
- containedctx
- gocheckcompilerdirectives
- gofmt
- gofumpt
- gosimple
- govet
- ineffassign
- intrange
- makezero
- misspell
- nilerr
- nolintlint
- nosprintfhostport
- staticcheck
- unconvert
- usetesting
- wastedassign
- whitespace
disable:
- usestdlibvars
- errcheck
linters-settings:
staticcheck:
checks:
- all
- -SA1019 # omit Deprecated check
- usestdlibvars
settings:
govet:
disable:
- unusedresult
staticcheck:
checks:
- all
- -QF* # disable quick fix suggestions
- -SA1019
- -ST1000 # package comment format
- -ST1003 # underscores in package names
- -ST1005 # error strings should not be capitalized
- -ST1012 # error var naming (ErrFoo)
- -ST1016 # receiver name consistency
- -ST1020 # comment on exported function format
- -ST1021 # comment on exported type format
- -ST1022 # comment on exported var format
- -ST1023 # omit type from declaration
severity:
default-severity: error
default: error
rules:
- linters:
- gofmt
- goimports
- intrange
severity: info
formatters:
enable:
- gofmt
- gofumpt
@@ -16,7 +16,7 @@ See the [development documentation](./docs/development.md) for instructions on h

* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
* Documentation: small updates to fill in or correct missing documentation are helpful, however large documentation additions can be hard to maintain over time.

### Issues that may not be accepted

@@ -43,7 +43,7 @@ Tips for proposals:
* Explain how the change will be tested.

Additionally, for bonus points: Provide draft documentation you would expect to
see if the change were accepted.
see if the changes were accepted.

## Pull requests

@@ -66,7 +66,6 @@ Examples:

llm/backend/mlx: support the llama architecture
CONTRIBUTING: provide clarity on good commit messages, and bad
docs: simplify manual installation with shorter curl commands

Bad Examples:
Dockerfile (41 changed lines)
@@ -39,14 +39,14 @@ ENV CC=clang CXX=clang++
FROM base-${TARGETARCH} AS base
ARG CMAKEVERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
ENV LDFLAGS=-s

FROM base AS cpu
RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
ARG PARALLEL
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CPU' \
&& cmake --build --parallel ${PARALLEL} --preset 'CPU' \
@@ -57,6 +57,8 @@ ARG CUDA11VERSION=11.8
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
ARG PARALLEL
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' \
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
@@ -67,6 +69,8 @@ ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
ARG PARALLEL
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 12' \
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
@@ -78,6 +82,8 @@ ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
ENV PATH=/usr/local/cuda-13/bin:$PATH
ARG PARALLEL
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 13' \
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
@@ -87,6 +93,8 @@ RUN --mount=type=cache,target=/root/.ccache \
FROM base AS rocm-6
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
ARG PARALLEL
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'ROCm 6' \
&& cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
@@ -118,6 +126,8 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --install build --component CUDA --strip --parallel ${PARALLEL}

FROM base AS vulkan
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'Vulkan' \
&& cmake --build --parallel --preset 'Vulkan' \
@@ -159,32 +169,7 @@ ARG VULKANVERSION
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=build /bin/ollama /bin/ollama

# Temporary opt-out stages for Vulkan
FROM --platform=linux/amd64 scratch AS amd64_novulkan
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
FROM arm64 AS arm64_novulkan
FROM ${FLAVOR}_novulkan AS archive_novulkan
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=build /bin/ollama /bin/ollama
FROM ubuntu:24.04 AS novulkan
RUN apt-get update \
&& apt-get install -y ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
COPY --from=archive_novulkan /bin /usr/bin
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
COPY --from=archive_novulkan /lib/ollama /usr/lib/ollama
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENV OLLAMA_HOST=0.0.0.0:11434
EXPOSE 11434
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

FROM ubuntu:24.04 AS default
FROM ubuntu:24.04
RUN apt-get update \
&& apt-get install -y ca-certificates libvulkan1 \
&& apt-get clean \
@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc
FETCH_HEAD=17f7f4baad8b3a716ee139da7bb56ae984e8c0fa

.PHONY: help
help:
@@ -57,7 +57,7 @@ checkout: $(WORKDIR)
$(WORKDIR):
git clone $(UPSTREAM) $(WORKDIR)

.PHONE: format-patches
.PHONY: format-patches
format-patches: llama/patches
git -C $(WORKDIR) format-patch \
--no-signature \
@@ -66,7 +66,11 @@ format-patches: llama/patches
-o $(realpath $<) \
$(FETCH_HEAD)

.PHONE: clean
.PHONY: clean
clean: checkout
@git -C $(WORKDIR) am --abort || true
$(RM) llama/patches/.*.patched

.PHONY: print-base
print-base:
@echo $(FETCH_HEAD)
README.md (17 changed lines)
@@ -299,6 +299,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [AI-UI](https://github.com/bajahaw/ai-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
@@ -365,7 +366,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VS Code extension for multi-file/whole-repo coding
- [Void](https://github.com/voideditor/void) (Open source AI code editor and Cursor alternative)
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
@@ -397,7 +399,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
- [Ollama Chat WebUI for Docker ](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VS Code extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
- [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
@@ -426,6 +428,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
- [KDeps](https://github.com/kdeps/kdeps) (Kdeps is an offline-first AI framework for building Dockerized full-stack AI applications declaratively using Apple PKL and integrates APIs with Ollama on the backend.)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
- [Hillnote](https://hillnote.com) (A Markdown-first workspace designed to supercharge your AI workflow. Create documents ready to integrate with Claude, ChatGPT, Gemini, Cursor, and more - all while keeping your work on your device.)
@@ -552,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
- [Ollama for Swift](https://github.com/mattt/ollama-swift)
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [Swollama for Swift]([https://github.com/marcusziade/Swollama](https://github.com/guitaripod/Swollama) with [DocC]( https://guitaripod.github.io/Swollama/documentation/swollama)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
@@ -615,7 +618,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
- [AI Summmary Helper plugin](https://github.com/philffm/ai-summary-helper)
- [AI Summary Helper plugin](https://github.com/philffm/ai-summary-helper)
- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
- [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
@@ -623,7 +626,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)
@@ -633,12 +636,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.

### Observability
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration to Ollama.
- [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.

## Security
### Security
- [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)
@@ -14,7 +14,7 @@ Please include the following details in your report:

## Security best practices

While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
While the maintainer team does its best to secure Ollama, users are encouraged to implement their own security best practices, such as:

- Regularly updating to the latest version of Ollama
- Securing access to hosted instances of Ollama
@@ -226,7 +226,14 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f

bts := scanner.Bytes()
if err := json.Unmarshal(bts, &errorResponse); err != nil {
return fmt.Errorf("unmarshal: %w", err)
if response.StatusCode >= http.StatusBadRequest {
return StatusError{
StatusCode: response.StatusCode,
Status: response.Status,
ErrorMessage: string(bts),
}
}
return errors.New(string(bts))
}

if response.StatusCode == http.StatusUnauthorized {
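The hunk above changes the client's streaming error path so that non-JSON error bodies (plain text, HTML error pages) no longer fail inside json.Unmarshal but are surfaced as a StatusError carrying the HTTP status. Below is a minimal sketch of how a caller might branch on that, assuming the exported api.StatusError shown in the diff and the ClientFromEnvironment constructor exercised by the tests; the model name is a placeholder and the snippet is illustrative, not part of this change.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"log"
	"net/http"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.ChatRequest{
		Model:    "llama3.2", // placeholder model name
		Messages: []api.Message{{Role: "user", Content: "hello"}},
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		fmt.Print(resp.Message.Content)
		return nil
	})

	// With the change above, a proxy that answers with an HTML or plain-text
	// error body still yields a StatusError with the original status code.
	var statusErr api.StatusError
	if errors.As(err, &statusErr) && statusErr.StatusCode == http.StatusUnauthorized {
		log.Fatal("authentication required: ", statusErr.ErrorMessage)
	}
	if err != nil {
		log.Fatal(err)
	}
}
```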
@@ -55,6 +55,7 @@ func TestClientFromEnvironment(t *testing.T) {
type testError struct {
message string
statusCode int
raw bool // if true, write message as-is instead of JSON encoding
}

func (e testError) Error() string {
@@ -111,6 +112,20 @@ func TestClientStream(t *testing.T) {
},
},
},
{
name: "plain text error response",
responses: []any{
"internal server error",
},
wantErr: "internal server error",
},
{
name: "HTML error page",
responses: []any{
"<html><body>404 Not Found</body></html>",
},
wantErr: "404 Not Found",
},
}

for _, tc := range testCases {
@@ -135,6 +150,12 @@ func TestClientStream(t *testing.T) {
return
}

if str, ok := resp.(string); ok {
fmt.Fprintln(w, str)
flusher.Flush()
continue
}

if err := json.NewEncoder(w).Encode(resp); err != nil {
t.Fatalf("failed to encode response: %v", err)
}
@@ -173,9 +194,10 @@ func TestClientStream(t *testing.T) {

func TestClientDo(t *testing.T) {
testCases := []struct {
name string
response any
wantErr string
name string
response any
wantErr string
wantStatusCode int
}{
{
name: "immediate error response",
@@ -183,7 +205,8 @@ func TestClientDo(t *testing.T) {
message: "test error message",
statusCode: http.StatusBadRequest,
},
wantErr: "test error message",
wantErr: "test error message",
wantStatusCode: http.StatusBadRequest,
},
{
name: "server error response",
@@ -191,7 +214,8 @@ func TestClientDo(t *testing.T) {
message: "internal error",
statusCode: http.StatusInternalServerError,
},
wantErr: "internal error",
wantErr: "internal error",
wantStatusCode: http.StatusInternalServerError,
},
{
name: "successful response",
@@ -203,6 +227,26 @@ func TestClientDo(t *testing.T) {
Success: true,
},
},
{
name: "plain text error response",
response: testError{
message: "internal server error",
statusCode: http.StatusInternalServerError,
raw: true,
},
wantErr: "internal server error",
wantStatusCode: http.StatusInternalServerError,
},
{
name: "HTML error page",
response: testError{
message: "<html><body>404 Not Found</body></html>",
statusCode: http.StatusNotFound,
raw: true,
},
wantErr: "<html><body>404 Not Found</body></html>",
wantStatusCode: http.StatusNotFound,
},
}

for _, tc := range testCases {
@@ -210,11 +254,16 @@ func TestClientDo(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if errResp, ok := tc.response.(testError); ok {
w.WriteHeader(errResp.statusCode)
err := json.NewEncoder(w).Encode(map[string]string{
"error": errResp.message,
})
if err != nil {
t.Fatal("failed to encode error response:", err)
if !errResp.raw {
err := json.NewEncoder(w).Encode(map[string]string{
"error": errResp.message,
})
if err != nil {
t.Fatal("failed to encode error response:", err)
}
} else {
// Write raw message (simulates non-JSON error responses)
fmt.Fprint(w, errResp.message)
}
return
}
@@ -241,6 +290,15 @@ func TestClientDo(t *testing.T) {
if err.Error() != tc.wantErr {
t.Errorf("error message mismatch: got %q, want %q", err.Error(), tc.wantErr)
}
if tc.wantStatusCode != 0 {
if statusErr, ok := err.(StatusError); ok {
if statusErr.StatusCode != tc.wantStatusCode {
t.Errorf("status code mismatch: got %d, want %d", statusErr.StatusCode, tc.wantStatusCode)
}
} else {
t.Errorf("expected StatusError, got %T", err)
}
}
return
}
@@ -15,19 +15,19 @@ func main() {
}

messages := []api.Message{
api.Message{
{
Role: "system",
Content: "Provide very brief, concise responses",
},
api.Message{
{
Role: "user",
Content: "Name some unusual animals",
},
api.Message{
{
Role: "assistant",
Content: "Monotreme, platypus, echidna",
},
api.Message{
{
Role: "user",
Content: "which of these is the most dangerous?",
},
api/types.go (45 changed lines)
@@ -117,6 +117,14 @@ type GenerateRequest struct {
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`

// Logprobs specifies whether to return log probabilities of the output tokens.
Logprobs bool `json:"logprobs,omitempty"`

// TopLogprobs is the number of most likely tokens to return at each token position,
// each with an associated log probability. Only applies when Logprobs is true.
// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
TopLogprobs int `json:"top_logprobs,omitempty"`
}

// ChatRequest describes a request sent by [Client.Chat].
@@ -159,6 +167,14 @@ type ChatRequest struct {
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`

// Logprobs specifies whether to return log probabilities of the output tokens.
Logprobs bool `json:"logprobs,omitempty"`

// TopLogprobs is the number of most likely tokens to return at each token position,
// each with an associated log probability. Only applies when Logprobs is true.
// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
TopLogprobs int `json:"top_logprobs,omitempty"`
}

type Tools []Tool
@@ -343,6 +359,27 @@ func (t *ToolFunction) String() string {
return string(bts)
}

// TokenLogprob represents log probability information for a single token alternative.
type TokenLogprob struct {
// Token is the text representation of the token.
Token string `json:"token"`

// Logprob is the log probability of this token.
Logprob float64 `json:"logprob"`

// Bytes contains the raw byte representation of the token
Bytes []int `json:"bytes,omitempty"`
}

// Logprob contains log probability information for a generated token.
type Logprob struct {
TokenLogprob

// TopLogprobs contains the most likely tokens and their log probabilities
// at this position, if requested via TopLogprobs parameter.
TopLogprobs []TokenLogprob `json:"top_logprobs,omitempty"`
}

// ChatResponse is the response returned by [Client.Chat]. Its fields are
// similar to [GenerateResponse].
type ChatResponse struct {
@@ -369,6 +406,10 @@ type ChatResponse struct {

DebugInfo *DebugInfo `json:"_debug_info,omitempty"`

// Logprobs contains log probability information for the generated tokens,
// if requested via the Logprobs parameter.
Logprobs []Logprob `json:"logprobs,omitempty"`

Metrics
}

@@ -677,6 +718,10 @@ type GenerateResponse struct {
ToolCalls []ToolCall `json:"tool_calls,omitempty"`

DebugInfo *DebugInfo `json:"_debug_info,omitempty"`

// Logprobs contains log probability information for the generated tokens,
// if requested via the Logprobs parameter.
Logprobs []Logprob `json:"logprobs,omitempty"`
}

// ModelDetails provides details about a model.
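Since the Logprobs/TopLogprobs request fields and the Logprob/TokenLogprob response types above are additive, a short sketch of how a Go caller might use them follows. It assumes the field names exactly as declared in the diff plus the existing Model, Messages, and Stream fields of api.ChatRequest; the model name is a placeholder.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	stream := false
	req := &api.ChatRequest{
		Model:    "llama3.2", // placeholder model name
		Messages: []api.Message{{Role: "user", Content: "Say hi"}},
		Stream:   &stream,

		// New in this change: request per-token log probabilities and up to
		// 5 alternatives at each position.
		Logprobs:    true,
		TopLogprobs: 5,
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		// Logprob embeds TokenLogprob, so Token and Logprob are promoted fields.
		for _, lp := range resp.Logprobs {
			fmt.Printf("%q\t%.4f\t(%d alternatives)\n", lp.Token, lp.Logprob, len(lp.TopLogprobs))
		}
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```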
@@ -273,10 +273,6 @@ func main() {
Handler: uiServer.Handler(),
}

if _, err := uiServer.UserData(ctx); err != nil {
slog.Warn("failed to load user data", "error", err)
}

// Start the UI server
slog.Info("starting ui server", "port", port)
go func() {
@@ -320,6 +316,17 @@ func main() {
slog.Debug("no URL scheme request to handle")
}

go func() {
slog.Debug("waiting for ollama server to be ready")
if err := ui.WaitForServer(ctx, 10*time.Second); err != nil {
slog.Warn("ollama server not ready, continuing anyway", "error", err)
}

if _, err := uiServer.UserData(ctx); err != nil {
slog.Warn("failed to load user data", "error", err)
}
}()

osRun(cancel, hasCompletedFirstRun, startHidden)

slog.Info("shutting down desktop server")
@@ -361,7 +368,7 @@ func checkUserLoggedIn(uiServerPort int) bool {
return false
}

resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/api/v1/me", uiServerPort))
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/me", uiServerPort), "application/json", nil)
if err != nil {
slog.Debug("failed to call local auth endpoint", "error", err)
return false
@@ -397,8 +404,8 @@ func checkUserLoggedIn(uiServerPort int) bool {
// handleConnectURLScheme fetches the connect URL and opens it in the browser
func handleConnectURLScheme() {
if checkUserLoggedIn(uiServerPort) {
slog.Info("user is already logged in, opening settings instead")
sendUIRequestMessage("/")
slog.Info("user is already logged in, opening app instead")
showWindow(wv.webview.Window())
return
}
@@ -434,37 +441,30 @@ func openInBrowser(url string) {
}
}

// parseURLScheme parses an ollama:// URL and returns whether it's a connect URL and the UI path
func parseURLScheme(urlSchemeRequest string) (isConnect bool, uiPath string, err error) {
// parseURLScheme parses an ollama:// URL and validates it
// Supports: ollama:// (open app) and ollama://connect (OAuth)
func parseURLScheme(urlSchemeRequest string) (isConnect bool, err error) {
parsedURL, err := url.Parse(urlSchemeRequest)
if err != nil {
return false, "", err
return false, fmt.Errorf("invalid URL: %w", err)
}

// Check if this is a connect URL
if parsedURL.Host == "connect" || strings.TrimPrefix(parsedURL.Path, "/") == "connect" {
return true, "", nil
return true, nil
}

// Extract the UI path
path := "/"
if parsedURL.Path != "" && parsedURL.Path != "/" {
// For URLs like ollama:///settings, use the path directly
path = parsedURL.Path
} else if parsedURL.Host != "" {
// For URLs like ollama://settings (without triple slash),
// the "settings" part is parsed as the host, not the path.
// We need to convert it to a path by prepending "/"
// This also handles ollama://settings/ where Windows adds a trailing slash
path = "/" + parsedURL.Host
// Allow bare ollama:// or ollama:/// to open the app
if (parsedURL.Host == "" && parsedURL.Path == "") || parsedURL.Path == "/" {
return false, nil
}

return false, path, nil
return false, fmt.Errorf("unsupported ollama:// URL path: %s", urlSchemeRequest)
}

// handleURLSchemeInCurrentInstance processes URL scheme requests in the current instance
func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
isConnect, uiPath, err := parseURLScheme(urlSchemeRequest)
isConnect, err := parseURLScheme(urlSchemeRequest)
if err != nil {
slog.Error("failed to parse URL scheme request", "url", urlSchemeRequest, "error", err)
return
@@ -473,6 +473,8 @@ func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
if isConnect {
handleConnectURLScheme()
} else {
sendUIRequestMessage(uiPath)
if wv.webview != nil {
showWindow(wv.webview.Window())
}
}
}
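The narrowed parseURLScheme contract shown above (connect URLs, bare ollama:// to open the app, and an error for anything else) is small enough that a table test captures it. This is a sketch only; it assumes the test file sits in the same package as parseURLScheme in the desktop app, and is not part of the diff.

```go
package main

import "testing"

// Sketch of a table test for the narrowed parseURLScheme contract shown above.
func TestParseURLScheme(t *testing.T) {
	cases := []struct {
		url         string
		wantConnect bool
		wantErr     bool
	}{
		{"ollama://connect", true, false},  // OAuth connect flow (host form)
		{"ollama:///connect", true, false}, // connect as path instead of host
		{"ollama://", false, false},        // bare URL just opens the app
		{"ollama:///", false, false},
		{"ollama://settings", false, true}, // arbitrary paths are no longer routed
	}

	for _, tc := range cases {
		isConnect, err := parseURLScheme(tc.url)
		if isConnect != tc.wantConnect || (err != nil) != tc.wantErr {
			t.Errorf("parseURLScheme(%q) = (%v, %v), want (%v, err=%v)",
				tc.url, isConnect, err, tc.wantConnect, tc.wantErr)
		}
	}
}
```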
@@ -24,27 +24,14 @@ bool firstTimeRun,startHidden; // Set in run before initialization
for (NSURL *url in urls) {
if ([url.scheme isEqualToString:@"ollama"]) {
NSString *path = url.path;
if (!path || [path isEqualToString:@""]) {
// For URLs like ollama://settings (without triple slash),
// the "settings" part is parsed as the host, not the path.
// We need to convert it to a path by prepending "/"
if (url.host && ![url.host isEqualToString:@""]) {
path = [@"/" stringByAppendingString:url.host];
} else {
path = @"/";
}
}

if ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"]) {

if (path && ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"])) {
// Special case: handle connect by opening browser instead of app
handleConnectURL();
} else {
// Set app to be active and visible
[NSApp setActivationPolicy:NSApplicationActivationPolicyRegular];
[NSApp activateIgnoringOtherApps:YES];

// Open the path with the UI
[self uiRequest:path];
}

break;
@@ -260,7 +247,7 @@ bool firstTimeRun,startHidden; // Set in run before initialization
}

- (void)openHelp:(id)sender {
NSURL *url = [NSURL URLWithString:@"https://github.com/ollama/ollama/tree/main/docs"];
NSURL *url = [NSURL URLWithString:@"https://docs.ollama.com/"];
[[NSWorkspace sharedWorkspace] openURL:url];
}
@@ -138,7 +138,7 @@ func (app *appCallbacks) HandleURLScheme(urlScheme string) {

// handleURLSchemeRequest processes URL scheme requests from other instances
func handleURLSchemeRequest(urlScheme string) {
isConnect, uiPath, err := parseURLScheme(urlScheme)
isConnect, err := parseURLScheme(urlScheme)
if err != nil {
slog.Error("failed to parse URL scheme request", "url", urlScheme, "error", err)
return
@@ -147,7 +149,9 @@ func handleURLSchemeRequest(urlScheme string) {
if isConnect {
handleConnectURLScheme()
} else {
sendUIRequestMessage(uiPath)
if wv.webview != nil {
showWindow(wv.webview.Window())
}
}
}
@@ -469,26 +469,24 @@ export class HealthResponse {
}
export class User {
id: string;
name: string;
email: string;
avatarURL: string;
plan: string;
bio: string;
firstName: string;
lastName: string;
overThreshold: boolean;
name: string;
bio?: string;
avatarurl?: string;
firstname?: string;
lastname?: string;
plan?: string;

constructor(source: any = {}) {
if ('string' === typeof source) source = JSON.parse(source);
this.id = source["id"];
this.name = source["name"];
this.email = source["email"];
this.avatarURL = source["avatarURL"];
this.plan = source["plan"];
this.name = source["name"];
this.bio = source["bio"];
this.firstName = source["firstName"];
this.lastName = source["lastName"];
this.overThreshold = source["overThreshold"];
this.avatarurl = source["avatarurl"];
this.firstname = source["firstname"];
this.lastname = source["lastname"];
this.plan = source["plan"];
}
}
export class Attachment {
@@ -15,6 +15,7 @@ import {
import { parseJsonlFromResponse } from "./util/jsonl-parsing";
import { ollamaClient as ollama } from "./lib/ollama-client";
import type { ModelResponse } from "ollama/browser";
import { API_BASE, OLLAMA_DOT_COM } from "./lib/config";

// Extend Model class with utility methods
declare module "@/gotypes" {
@@ -26,9 +27,6 @@ declare module "@/gotypes" {
Model.prototype.isCloud = function (): boolean {
return this.model.endsWith("cloud");
};

const API_BASE = import.meta.env.DEV ? "http://127.0.0.1:3001" : "";

// Helper function to convert Uint8Array to base64
function uint8ArrayToBase64(uint8Array: Uint8Array): string {
const chunkSize = 0x8000; // 32KB chunks to avoid stack overflow
@@ -43,44 +41,50 @@ function uint8ArrayToBase64(uint8Array: Uint8Array): string {
}

export async function fetchUser(): Promise<User | null> {
try {
const response = await fetch(`${API_BASE}/api/v1/me`, {
method: "GET",
headers: {
"Content-Type": "application/json",
},
});

if (response.ok) {
const userData: User = await response.json();
return userData;
}

return null;
} catch (error) {
console.error("Error fetching user:", error);
return null;
}
}

export async function fetchConnectUrl(): Promise<string> {
const response = await fetch(`${API_BASE}/api/v1/connect`, {
method: "GET",
const response = await fetch(`${API_BASE}/api/me`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
});

if (!response.ok) {
throw new Error("Failed to fetch connect URL");
if (response.ok) {
const userData: User = await response.json();

if (userData.avatarurl && !userData.avatarurl.startsWith("http")) {
userData.avatarurl = `${OLLAMA_DOT_COM}${userData.avatarurl}`;
}

return userData;
}

const data = await response.json();
return data.connect_url;
if (response.status === 401 || response.status === 403) {
return null;
}

throw new Error(`Failed to fetch user: ${response.status}`);
}

export async function fetchConnectUrl(): Promise<string> {
const response = await fetch(`${API_BASE}/api/me`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
});

if (response.status === 401) {
const data = await response.json();
if (data.signin_url) {
return data.signin_url;
}
}

throw new Error("Failed to fetch connect URL");
}

export async function disconnectUser(): Promise<void> {
const response = await fetch(`${API_BASE}/api/v1/disconnect`, {
const response = await fetch(`${API_BASE}/api/signout`, {
method: "POST",
headers: {
"Content-Type": "application/json",
@@ -205,6 +209,11 @@ export async function* sendMessage(
data: uint8ArrayToBase64(att.data),
}));

// Send think parameter when it's explicitly set (true, false, or a non-empty string).
const shouldSendThink =
think !== undefined &&
(typeof think === "boolean" || (typeof think === "string" && think !== ""));

const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}`, {
method: "POST",
headers: {
@@ -222,7 +231,7 @@ export async function* sendMessage(
web_search: webSearch ?? false,
file_tools: fileTools ?? false,
...(forceUpdate !== undefined ? { forceUpdate } : {}),
...(think !== undefined ? { think } : {}),
...(shouldSendThink ? { think } : {}),
}),
),
signal,
@@ -385,7 +394,8 @@ export async function getInferenceCompute(): Promise<InferenceCompute[]> {

export async function fetchHealth(): Promise<boolean> {
try {
const response = await fetch(`${API_BASE}/api/v1/health`, {
// Use the /api/version endpoint as a health check
const response = await fetch(`${API_BASE}/api/version`, {
method: "GET",
headers: {
"Content-Type": "application/json",
@@ -394,7 +404,8 @@ export async function fetchHealth(): Promise<boolean> {

if (response.ok) {
const data = await response.json();
return data.healthy || false;
// If we get a version back, the server is healthy
return !!data.version;
}

return false;
@@ -299,9 +299,9 @@ export default function Settings() {
</Button>
</div>
</div>
{user?.avatarURL && (
{user?.avatarurl && (
<img
src={user.avatarURL}
src={user.avatarurl}
alt={user?.name}
className="h-10 w-10 rounded-full bg-neutral-200 dark:bg-neutral-700 flex-shrink-0"
onError={(e) => {
@@ -50,21 +50,33 @@ export default function Thinking({
// Position content to show bottom when collapsed
useEffect(() => {
if (isCollapsed && contentRef.current && wrapperRef.current) {
const contentHeight = contentRef.current.scrollHeight;
const wrapperHeight = wrapperRef.current.clientHeight;
if (contentHeight > wrapperHeight) {
const translateY = -(contentHeight - wrapperHeight);
contentRef.current.style.transform = `translateY(${translateY}px)`;
setHasOverflow(true);
} else {
setHasOverflow(false);
}
requestAnimationFrame(() => {
if (!contentRef.current || !wrapperRef.current) return;

const contentHeight = contentRef.current.scrollHeight;
const wrapperHeight = wrapperRef.current.clientHeight;
if (contentHeight > wrapperHeight) {
const translateY = -(contentHeight - wrapperHeight);
contentRef.current.style.transform = `translateY(${translateY}px)`;
setHasOverflow(true);
} else {
contentRef.current.style.transform = "translateY(0)";
setHasOverflow(false);
}
});
} else if (contentRef.current) {
contentRef.current.style.transform = "translateY(0)";
setHasOverflow(false);
}
}, [thinking, isCollapsed]);

useEffect(() => {
if (activelyThinking && wrapperRef.current && !isCollapsed) {
// When expanded and actively thinking, scroll to bottom
wrapperRef.current.scrollTop = wrapperRef.current.scrollHeight;
}
}, [thinking, activelyThinking, isCollapsed]);

const handleToggle = () => {
setIsCollapsed(!isCollapsed);
setHasUserInteracted(true);
@@ -7,6 +7,7 @@ import { createQueryBatcher } from "./useQueryBatcher";
import { useRefetchModels } from "./useModels";
import { useStreamingContext } from "@/contexts/StreamingContext";
import { useSettings } from "./useSettings";
import { getModelCapabilities } from "@/api";

export const useChats = () => {
return useQuery({
@@ -606,6 +607,24 @@ export const useSendMessage = (chatId: string) => {
queryClient.setQueryData(["staleModels"], newStaleMap);

queryClient.invalidateQueries({ queryKey: ["models"] });

// Fetch fresh capabilities for the downloaded model
getModelCapabilities(selectedModel.model)
.then((capabilities) => {
queryClient.setQueryData(
["modelCapabilities", selectedModel.model],
capabilities,
);
})
.catch((error) => {
console.error(
"Failed to fetch capabilities after download:",
error,
);
queryClient.invalidateQueries({
queryKey: ["modelCapabilities", selectedModel.model],
});
});
}
break;
}
@@ -1,114 +0,0 @@
|
||||
import { useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { useState } from "react";
|
||||
import { pullModel } from "@/api";
|
||||
import { useSelectedModel } from "./useSelectedModel";
|
||||
import { useSettings } from "./useSettings";
|
||||
|
||||
interface DownloadProgress {
|
||||
status: string;
|
||||
digest?: string;
|
||||
total?: number;
|
||||
completed?: number;
|
||||
done?: boolean;
|
||||
}
|
||||
|
||||
export function useDownloadModel(chatId?: string) {
|
||||
const queryClient = useQueryClient();
|
||||
const { selectedModel } = useSelectedModel(chatId);
|
||||
const { setSettings } = useSettings();
|
||||
const [downloadProgress, setDownloadProgress] =
|
||||
useState<DownloadProgress | null>(null);
|
||||
const [abortController, setAbortController] =
|
||||
useState<AbortController | null>(null);
|
||||
const [downloadingChatIds, setDownloadingChatIds] = useState<Set<string>>(
|
||||
new Set(),
|
||||
);
|
||||
|
||||
const mutation = useMutation({
|
||||
mutationFn: async (modelName: string) => {
|
||||
const controller = new AbortController();
|
||||
setAbortController(controller);
|
||||
setDownloadProgress({ status: "Starting download..." });
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => new Set(prev).add(chatId));
|
||||
}
|
||||
|
||||
try {
|
||||
for await (const progress of pullModel(modelName, controller.signal)) {
|
||||
setDownloadProgress(progress);
|
||||
|
||||
if (progress.status === "success") {
|
||||
// Update selected model to indicate it's now available locally
|
||||
if (selectedModel && selectedModel.model === modelName) {
|
||||
setSettings({ SelectedModel: modelName });
|
||||
}
|
||||
// Invalidate models query to refresh the list
|
||||
await queryClient.invalidateQueries({ queryKey: ["models"] });
|
||||
break;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
setAbortController(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}
|
||||
},
|
||||
onSuccess: () => {
|
||||
setDownloadProgress(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
},
|
||||
onError: (error: Error) => {
|
||||
const status =
|
||||
error.name === "AbortError" ? "Download cancelled" : "Download failed";
|
||||
setDownloadProgress({ status, done: true });
|
||||
|
||||
// Clear error message after delay
|
||||
const delay = error.name === "AbortError" ? 1500 : 3000;
|
||||
setTimeout(() => {
|
||||
setDownloadProgress(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}, delay);
|
||||
},
|
||||
});
|
||||
|
||||
const cancelDownload = () => {
|
||||
if (abortController) {
|
||||
abortController.abort();
|
||||
setAbortController(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return {
|
||||
downloadModel: mutation.mutate,
|
||||
isDownloading:
|
||||
mutation.isPending && chatId ? downloadingChatIds.has(chatId) : false,
|
||||
downloadProgress:
|
||||
chatId && downloadingChatIds.has(chatId) ? downloadProgress : null,
|
||||
error: mutation.error,
|
||||
cancelDownload,
|
||||
};
|
||||
}
|
||||
@@ -1,29 +1,20 @@
|
||||
import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { useEffect, useState } from "react";
|
||||
import { fetchUser, fetchConnectUrl, disconnectUser } from "@/api";
|
||||
|
||||
export function useUser() {
|
||||
const queryClient = useQueryClient();
|
||||
const [initialDataLoaded, setInitialDataLoaded] = useState(false);
|
||||
|
||||
// Wait for initial data to be loaded
|
||||
useEffect(() => {
|
||||
const initialPromise = window.__initialUserDataPromise;
|
||||
if (initialPromise) {
|
||||
initialPromise.finally(() => {
|
||||
setInitialDataLoaded(true);
|
||||
});
|
||||
} else {
|
||||
setInitialDataLoaded(true);
|
||||
}
|
||||
}, []);
|
||||
|
||||
const userQuery = useQuery({
|
||||
queryKey: ["user"],
|
||||
queryFn: () => fetchUser(),
|
||||
queryFn: async () => {
|
||||
const result = await fetchUser();
|
||||
return result;
|
||||
},
|
||||
staleTime: 5 * 60 * 1000, // Consider data stale after 5 minutes
|
||||
gcTime: 10 * 60 * 1000, // Keep in cache for 10 minutes
|
||||
initialData: null, // Start with null to prevent flashing
|
||||
retry: 10,
|
||||
retryDelay: (attemptIndex) => Math.min(500 * attemptIndex, 2000),
|
||||
refetchOnMount: true, // Always fetch when component mounts
|
||||
});
|
||||
|
||||
// Mutation to refresh user data
|
||||
@@ -49,14 +40,15 @@ export function useUser() {
|
||||
},
|
||||
});
|
||||
|
||||
const isLoading = userQuery.isLoading || userQuery.isFetching;
|
||||
const isAuthenticated = Boolean(userQuery.data?.name);
|
||||
|
||||
return {
|
||||
user: userQuery.data,
|
||||
isLoading:
|
||||
!initialDataLoaded ||
|
||||
(userQuery.isLoading && userQuery.data === undefined), // Show loading until initial data is loaded
|
||||
isLoading,
|
||||
isError: userQuery.isError,
|
||||
error: userQuery.error,
|
||||
isAuthenticated: Boolean(userQuery.data?.name),
|
||||
isAuthenticated,
|
||||
refreshUser: refreshUser.mutate,
|
||||
isRefreshing: refreshUser.isPending,
|
||||
refetchUser: userQuery.refetch,
|
||||
|
||||
13
app/ui/app/src/lib/config.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
// API configuration
|
||||
const DEV_API_URL = "http://127.0.0.1:3001";
|
||||
|
||||
// Base URL for fetch API calls (can be relative in production)
|
||||
export const API_BASE = import.meta.env.DEV ? DEV_API_URL : "";
|
||||
|
||||
// Full host URL for Ollama client (needs full origin in production)
|
||||
export const OLLAMA_HOST = import.meta.env.DEV
|
||||
? DEV_API_URL
|
||||
: window.location.origin;
|
||||
|
||||
export const OLLAMA_DOT_COM =
|
||||
import.meta.env.VITE_OLLAMA_DOT_COM_URL || "https://ollama.com";
|
||||
@@ -1,4 +1,5 @@
|
||||
import { Ollama } from "ollama/browser";
|
||||
import { OLLAMA_HOST } from "./config";
|
||||
|
||||
let _ollamaClient: Ollama | null = null;
|
||||
|
||||
@@ -6,7 +7,7 @@ export const ollamaClient = new Proxy({} as Ollama, {
|
||||
get(_target, prop) {
|
||||
if (!_ollamaClient) {
|
||||
_ollamaClient = new Ollama({
|
||||
host: window.location.origin,
|
||||
host: OLLAMA_HOST,
|
||||
});
|
||||
}
|
||||
const value = _ollamaClient[prop as keyof Ollama];
|
||||
|
||||
@@ -5,13 +5,6 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
|
||||
import { routeTree } from "./routeTree.gen";
|
||||
import { fetchUser } from "./api";
|
||||
import { StreamingProvider } from "./contexts/StreamingContext";
|
||||
import { User } from "@/gotypes";
|
||||
|
||||
declare global {
|
||||
interface Window {
|
||||
__initialUserDataPromise?: Promise<User | null>;
|
||||
}
|
||||
}
|
||||
|
||||
const queryClient = new QueryClient({
|
||||
defaultOptions: {
|
||||
@@ -24,27 +17,11 @@ const queryClient = new QueryClient({
|
||||
},
|
||||
});
|
||||
|
||||
// Track initial user data fetch
|
||||
let initialUserDataPromise: Promise<User | null> | null = null;
|
||||
|
||||
// Initialize user data on app startup
|
||||
const initializeUserData = async () => {
|
||||
try {
|
||||
const userData = await fetchUser();
|
||||
fetchUser().then((userData) => {
|
||||
if (userData) {
|
||||
queryClient.setQueryData(["user"], userData);
|
||||
return userData;
|
||||
} catch (error) {
|
||||
console.error("Error initializing user data:", error);
|
||||
queryClient.setQueryData(["user"], null);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
// Start initialization immediately and track the promise
|
||||
initialUserDataPromise = initializeUserData();
|
||||
|
||||
// Export the promise so hooks can await it
|
||||
window.__initialUserDataPromise = initialUserDataPromise;
|
||||
});
|
||||
|
||||
const router = createRouter({
|
||||
routeTree,
|
||||
|
||||
@@ -101,15 +101,14 @@ type HealthResponse struct {
|
||||
}
|
||||
|
||||
type User struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Email string `json:"email"`
|
||||
AvatarURL string `json:"avatarURL"`
|
||||
Plan string `json:"plan"`
|
||||
Bio string `json:"bio"`
|
||||
FirstName string `json:"firstName"`
|
||||
LastName string `json:"lastName"`
|
||||
OverThreshold bool `json:"overThreshold"`
|
||||
ID string `json:"id"`
|
||||
Email string `json:"email"`
|
||||
Name string `json:"name"`
|
||||
Bio string `json:"bio,omitempty"`
|
||||
AvatarURL string `json:"avatarurl,omitempty"`
|
||||
FirstName string `json:"firstname,omitempty"`
|
||||
LastName string `json:"lastname,omitempty"`
|
||||
Plan string `json:"plan,omitempty"`
|
||||
}
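The renamed JSON tags are lowercase and mostly `omitempty`, which is why the Settings page above switched from `user?.avatarURL` to `user?.avatarurl`. A minimal, self-contained sketch of the new wire format (the struct is re-declared locally and every value is a made-up example, not real data):

```
package main

import (
	"encoding/json"
	"fmt"
)

// Local copy of the updated User shape, used only to illustrate the wire format.
type User struct {
	ID        string `json:"id"`
	Email     string `json:"email"`
	Name      string `json:"name"`
	Bio       string `json:"bio,omitempty"`
	AvatarURL string `json:"avatarurl,omitempty"`
	FirstName string `json:"firstname,omitempty"`
	LastName  string `json:"lastname,omitempty"`
	Plan      string `json:"plan,omitempty"`
}

func main() {
	u := User{ID: "u_1", Email: "a@example.com", Name: "Alonso", AvatarURL: "https://example.com/a.png"}
	b, err := json.Marshal(u)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
	// {"id":"u_1","email":"a@example.com","name":"Alonso","avatarurl":"https://example.com/a.png"}
}
```

Because the optional fields are omitted when empty, the frontend has to treat bio, avatarurl, firstname, lastname, and plan as possibly absent.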
|
||||
|
||||
type Attachment struct {
|
||||
|
||||
168
app/ui/ui.go
@@ -23,7 +23,6 @@ import (
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/app/auth"
|
||||
"github.com/ollama/ollama/app/server"
|
||||
"github.com/ollama/ollama/app/store"
|
||||
"github.com/ollama/ollama/app/tools"
|
||||
@@ -264,11 +263,10 @@ func (s *Server) Handler() http.Handler {
|
||||
ollamaProxy := s.ollamaProxy()
|
||||
mux.Handle("GET /api/tags", ollamaProxy)
|
||||
mux.Handle("POST /api/show", ollamaProxy)
|
||||
|
||||
mux.Handle("GET /api/v1/me", handle(s.me))
|
||||
mux.Handle("POST /api/v1/disconnect", handle(s.disconnect))
|
||||
mux.Handle("GET /api/v1/connect", handle(s.connectURL))
|
||||
mux.Handle("GET /api/v1/health", handle(s.health))
|
||||
mux.Handle("GET /api/version", ollamaProxy)
|
||||
mux.Handle("HEAD /api/version", ollamaProxy)
|
||||
mux.Handle("POST /api/me", ollamaProxy)
|
||||
mux.Handle("POST /api/signout", ollamaProxy)
|
||||
|
||||
// React app - catch all non-API routes and serve the React app
|
||||
mux.Handle("GET /", s.appHandler())
|
||||
@@ -338,7 +336,7 @@ func (s *Server) doSelfSigned(ctx context.Context, method, path string) (*http.R
|
||||
}
|
||||
|
||||
// UserData fetches user data from ollama.com API for the current ollama key
|
||||
func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
func (s *Server) UserData(ctx context.Context) (*api.UserResponse, error) {
|
||||
resp, err := s.doSelfSigned(ctx, http.MethodPost, "/api/me")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to call ollama.com/api/me: %w", err)
|
||||
@@ -349,7 +347,7 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var user responses.User
|
||||
var user api.UserResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse user response: %w", err)
|
||||
}
|
||||
@@ -368,29 +366,27 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
return &user, nil
|
||||
}
|
||||
|
||||
func waitForServer(ctx context.Context) error {
|
||||
timeout := time.Now().Add(10 * time.Second)
|
||||
// TODO: this avoids an error on first load of the app
|
||||
// however we should either show a loading state or
|
||||
// wait for the Ollama server to be ready before redirecting
|
||||
for {
|
||||
// WaitForServer waits for the Ollama server to be ready
|
||||
func WaitForServer(ctx context.Context, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for time.Now().Before(deadline) {
|
||||
c, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := c.Version(ctx); err == nil {
|
||||
break
|
||||
}
|
||||
if time.Now().After(timeout) {
|
||||
return fmt.Errorf("timeout waiting for Ollama server to be ready")
|
||||
slog.Debug("ollama server is ready")
|
||||
return nil
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
return nil
|
||||
return errors.New("timeout waiting for Ollama server to be ready")
|
||||
}
|
||||
|
||||
func (s *Server) createChat(w http.ResponseWriter, r *http.Request) error {
|
||||
waitForServer(r.Context())
|
||||
if err := WaitForServer(r.Context(), 10*time.Second); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := uuid.NewV7()
|
||||
if err != nil {
|
||||
@@ -1438,129 +1434,6 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) me(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
user, err := s.UserData(r.Context())
|
||||
if err != nil {
|
||||
// If fetching from API fails, try to return cached user data if available
|
||||
if cachedUser, cacheErr := s.Store.User(); cacheErr == nil && cachedUser != nil {
|
||||
s.log().Info("API request failed, returning cached user data", "error", err)
|
||||
responseUser := &responses.User{
|
||||
Name: cachedUser.Name,
|
||||
Email: cachedUser.Email,
|
||||
Plan: cachedUser.Plan,
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(responseUser)
|
||||
}
|
||||
|
||||
s.log().Error("failed to get user data", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to get user data",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(user)
|
||||
}
|
||||
|
||||
func (s *Server) disconnect(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.Store.ClearUser(); err != nil {
|
||||
s.log().Warn("failed to clear cached user data", "error", err)
|
||||
}
|
||||
|
||||
// Get the SSH public key to encode for the delete request
|
||||
pubKey, err := ollamaAuth.GetPublicKey()
|
||||
if err != nil {
|
||||
s.log().Error("failed to get public key", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to get public key",
|
||||
})
|
||||
}
|
||||
|
||||
// Encode the key using base64 URL encoding
|
||||
encodedKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
|
||||
|
||||
// Call the /api/user/keys/{encodedKey} endpoint with DELETE
|
||||
resp, err := s.doSelfSigned(r.Context(), http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey))
|
||||
if err != nil {
|
||||
s.log().Error("failed to call ollama.com/api/user/keys", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to disconnect from ollama.com",
|
||||
})
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
s.log().Error("disconnect request failed", "status", resp.StatusCode)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to disconnect from ollama.com",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(map[string]string{"status": "disconnected"})
|
||||
}
|
||||
|
||||
func (s *Server) connectURL(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
connectURL, err := auth.BuildConnectURL(OllamaDotCom)
|
||||
if err != nil {
|
||||
s.log().Error("failed to build connect URL", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to build connect URL",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(map[string]string{
|
||||
"connect_url": connectURL,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) health(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
healthy := false
|
||||
c, err := api.ClientFromEnvironment()
|
||||
if err == nil {
|
||||
if _, err := c.Version(r.Context()); err == nil {
|
||||
healthy = true
|
||||
}
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(responses.HealthResponse{
|
||||
Healthy: healthy,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) error {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 500*time.Millisecond)
|
||||
defer cancel()
|
||||
@@ -1794,13 +1667,14 @@ func (s *Server) buildChatRequest(chat *store.Chat, model string, think any, ava
|
||||
|
||||
var thinkValue *api.ThinkValue
|
||||
if think != nil {
|
||||
// Only set Think if it's actually requesting thinking
|
||||
if boolValue, ok := think.(bool); ok {
|
||||
thinkValue = &api.ThinkValue{
|
||||
Value: boolValue,
|
||||
if boolValue {
|
||||
thinkValue = &api.ThinkValue{Value: boolValue}
|
||||
}
|
||||
} else if stringValue, ok := think.(string); ok {
|
||||
thinkValue = &api.ThinkValue{
|
||||
Value: stringValue,
|
||||
if stringValue != "" && stringValue != "none" {
|
||||
thinkValue = &api.ThinkValue{Value: stringValue}
|
||||
}
|
||||
}
|
||||
}
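The net effect is that a think value is only forwarded when it actually requests thinking: boolean true, or a non-empty level other than "none". A small sketch of the same predicate as a standalone helper (the helper name is illustrative and not part of this change):

```
// wantsThinking reports whether a client-supplied think value should
// produce a non-nil api.ThinkValue: boolean true, or a non-empty
// level string other than "none".
func wantsThinking(think any) bool {
	switch v := think.(type) {
	case bool:
		return v
	case string:
		return v != "" && v != "none"
	default:
		return false
	}
}

// wantsThinking(true) == true      wantsThinking("high") == true
// wantsThinking(false) == false    wantsThinking("") == false    wantsThinking("none") == false
```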
|
||||
|
||||
115
cmd/bench/README.md
Normal file
@@ -0,0 +1,115 @@
|
||||
Ollama Benchmark Tool
|
||||
---------------------
|
||||
|
||||
A Go-based command-line tool for benchmarking Ollama models with configurable parameters and multiple output formats.
|
||||
|
||||
## Features
|
||||
|
||||
* Benchmark multiple models in a single run
|
||||
* Support for both text and image prompts
|
||||
* Configurable generation parameters (temperature, max tokens, seed, etc.)
|
||||
* Supports markdown, benchstat, and CSV output formats
|
||||
* Detailed performance metrics (prefill, generate, load, total durations)
|
||||
|
||||
## Building from Source
|
||||
|
||||
```
|
||||
go build -o ollama-bench bench.go
|
||||
./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
|
||||
```
|
||||
|
||||
### Using Go Run (without building)
|
||||
|
||||
```
|
||||
go run bench.go -model gpt-oss:20b -epochs 3
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Example
|
||||
|
||||
```
|
||||
./ollama-bench -model gemma3 -epochs 6
|
||||
```
|
||||
|
||||
### Benchmark Multiple Models
|
||||
|
||||
```
|
||||
./ollama-bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
|
||||
benchstat -col /name gemma.bench
|
||||
```
|
||||
|
||||
### With Image Prompt
|
||||
|
||||
```
|
||||
./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
|
||||
```
|
||||
|
||||
### Advanced Example
|
||||
|
||||
```
|
||||
./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
|
||||
```
|
||||
|
||||
## Command Line Options
|
||||
|
||||
| Option | Description | Default |
|----------|-------------|---------|
| -model | Comma-separated list of models to benchmark | (required) |
| -epochs | Number of iterations per model | 6 |
| -max-tokens | Maximum tokens for model response | 200 |
| -temperature | Temperature parameter | 0.0 |
| -seed | Random seed | 0 (random) |
| -timeout | Timeout in seconds | 300 |
| -p | Prompt text | (built-in story prompt) |
| -image | Image file to include in prompt | |
| -k | Keep-alive duration in seconds | 0 |
| -format | Output format (markdown, benchstat, csv) | markdown |
| -output | Output file for results | "" (stdout) |
| -v | Verbose mode | false |
| -debug | Show debug information | false |
|
||||
|
||||
## Output Formats
|
||||
|
||||
### Markdown Format
|
||||
|
||||
The default markdown format is suitable for copying and pasting into a GitHub issue and will look like:
|
||||
```
|
||||
| Model | Step | Count | Duration | nsPerToken | tokensPerSec |
|
||||
|-------|------|-------|----------|------------|--------------|
|
||||
| gpt-oss:20b | prefill | 124 | 30.006458ms | 241987.56 | 4132.44 |
|
||||
| gpt-oss:20b | generate | 200 | 2.646843954s | 13234219.77 | 75.56 |
|
||||
| gpt-oss:20b | load | 1 | 121.674208ms | - | - |
|
||||
| gpt-oss:20b | total | 1 | 2.861047625s | - | - |
|
||||
```
|
||||
|
||||
### Benchstat Format
|
||||
|
||||
Compatible with Go's benchstat tool for statistical analysis:
|
||||
|
||||
```
|
||||
BenchmarkModel/name=gpt-oss:20b/step=prefill 128 78125.00 ns/token 12800.00 token/sec
|
||||
BenchmarkModel/name=gpt-oss:20b/step=generate 512 19531.25 ns/token 51200.00 token/sec
|
||||
BenchmarkModel/name=gpt-oss:20b/step=load 1 1500000000 ns/request
|
||||
```
|
||||
|
||||
### CSV Format
|
||||
|
||||
Machine-readable comma-separated values:
|
||||
|
||||
```
|
||||
NAME,STEP,COUNT,NS_PER_COUNT,TOKEN_PER_SEC
|
||||
gpt-oss:20b,prefill,128,78125.00,12800.00
|
||||
gpt-oss:20b,generate,512,19531.25,51200.00
|
||||
gpt-oss:20b,load,1,1500000000,0
|
||||
```
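
The CSV output is intended for further processing. As an illustration, a minimal Go sketch that reads a file written with `-format csv -output results.csv` (the file name from the Advanced Example above) and prints the tokens/sec column per step:

```
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"strconv"
)

func main() {
	f, err := os.Open("results.csv")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	rows, err := csv.NewReader(f).ReadAll()
	if err != nil {
		log.Fatal(err)
	}

	// rows[0] is the NAME,STEP,COUNT,NS_PER_COUNT,TOKEN_PER_SEC header.
	for _, row := range rows[1:] {
		tokensPerSec, _ := strconv.ParseFloat(row[4], 64)
		fmt.Printf("%-14s %-9s %10.2f tokens/sec\n", row[0], row[1], tokensPerSec)
	}
}
```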
|
||||
|
||||
## Metrics Explained
|
||||
|
||||
The tool reports four types of metrics for each model (see the sketch after this list for how they map onto the API response):
|
||||
|
||||
* prefill: Time spent processing the prompt
|
||||
* generate: Time spent generating the response
|
||||
* load: Model loading time (one-time cost)
|
||||
* total: Total request duration
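
All four come from the api.Metrics attached to the final streamed chat response (the one with Done set): prefill uses PromptEvalCount and PromptEvalDuration, generate uses EvalCount and EvalDuration, and load and total use LoadDuration and TotalDuration. A rough sketch of the rate calculation (helper name is illustrative, not the tool's exact code; assumes the standard time package):

```
// tokensPerSec derives the rate columns for the token-based steps
// (prefill and generate) from a token count and its duration.
func tokensPerSec(count int, d time.Duration) float64 {
	if count == 0 || d == 0 {
		return 0
	}
	return float64(count) / d.Seconds()
}
```

For example, 200 generated tokens over 2.646843954s gives roughly 75.56 tokens/sec, which is the value shown in the markdown sample above.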
|
||||
|
||||
321
cmd/bench/bench.go
Normal file
@@ -0,0 +1,321 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
type flagOptions struct {
|
||||
models *string
|
||||
epochs *int
|
||||
maxTokens *int
|
||||
temperature *float64
|
||||
seed *int
|
||||
timeout *int
|
||||
prompt *string
|
||||
imageFile *string
|
||||
keepAlive *float64
|
||||
format *string
|
||||
outputFile *string
|
||||
debug *bool
|
||||
verbose *bool
|
||||
}
|
||||
|
||||
type Metrics struct {
|
||||
Model string
|
||||
Step string
|
||||
Count int
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
var once sync.Once
|
||||
|
||||
const DefaultPrompt = `Please write a descriptive story about a llama named Alonso who grows up to be President of the Land of Llamas. Include details about Alonso's childhood, adolescent years, and how he grew up to be a political mover and shaker. Write the story with a sense of whimsy.`
|
||||
|
||||
func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool) {
|
||||
switch format {
|
||||
case "benchstat":
|
||||
if verbose {
|
||||
printHeader := func() {
|
||||
fmt.Fprintf(w, "sysname: %s\n", runtime.GOOS)
|
||||
fmt.Fprintf(w, "machine: %s\n", runtime.GOARCH)
|
||||
}
|
||||
once.Do(printHeader)
|
||||
}
|
||||
for _, m := range metrics {
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
if m.Count > 0 {
|
||||
nsPerToken := float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec := float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d %.2f ns/token %.2f token/sec\n",
|
||||
m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
|
||||
} else {
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d 0 ns/token 0 token/sec\n",
|
||||
m.Model, m.Step, m.Count)
|
||||
}
|
||||
} else {
|
||||
var suffix string
|
||||
if m.Step == "load" {
|
||||
suffix = "/step=load"
|
||||
}
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s%s 1 %d ns/request\n",
|
||||
m.Model, suffix, m.Duration.Nanoseconds())
|
||||
}
|
||||
}
|
||||
case "csv":
|
||||
printHeader := func() {
|
||||
headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
|
||||
fmt.Fprintln(w, strings.Join(headings, ","))
|
||||
}
|
||||
once.Do(printHeader)
|
||||
|
||||
for _, m := range metrics {
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
var nsPerToken float64
|
||||
var tokensPerSec float64
|
||||
if m.Count > 0 {
|
||||
nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
}
|
||||
fmt.Fprintf(w, "%s,%s,%d,%.2f,%.2f\n", m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
|
||||
} else {
|
||||
fmt.Fprintf(w, "%s,%s,1,%d,0\n", m.Model, m.Step, m.Duration.Nanoseconds())
|
||||
}
|
||||
}
|
||||
case "markdown":
|
||||
printHeader := func() {
|
||||
fmt.Fprintln(w, "| Model | Step | Count | Duration | nsPerToken | tokensPerSec |")
|
||||
fmt.Fprintln(w, "|-------|------|-------|----------|------------|--------------|")
|
||||
}
|
||||
once.Do(printHeader)
|
||||
|
||||
for _, m := range metrics {
|
||||
var nsPerToken, tokensPerSec float64
|
||||
var nsPerTokenStr, tokensPerSecStr string
|
||||
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
nsPerTokenStr = fmt.Sprintf("%.2f", nsPerToken)
|
||||
tokensPerSecStr = fmt.Sprintf("%.2f", tokensPerSec)
|
||||
} else {
|
||||
nsPerTokenStr = "-"
|
||||
tokensPerSecStr = "-"
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "| %s | %s | %d | %v | %s | %s |\n",
|
||||
m.Model, m.Step, m.Count, m.Duration, nsPerTokenStr, tokensPerSecStr)
|
||||
}
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "Unknown output format '%s'\n", format)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkChat(fOpt flagOptions) error {
|
||||
models := strings.Split(*fOpt.models, ",")
|
||||
|
||||
// todo - add multi-image support
|
||||
var imgData api.ImageData
|
||||
var err error
|
||||
if *fOpt.imageFile != "" {
|
||||
imgData, err = readImage(*fOpt.imageFile)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't read image '%s': %v\n", *fOpt.imageFile, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if *fOpt.debug && imgData != nil {
|
||||
fmt.Fprintf(os.Stderr, "Read file '%s'\n", *fOpt.imageFile)
|
||||
}
|
||||
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't create ollama client: %v\n", err)
|
||||
return err
|
||||
}
|
||||
|
||||
var out io.Writer = os.Stdout
|
||||
if fOpt.outputFile != nil && *fOpt.outputFile != "" {
|
||||
f, err := os.OpenFile(*fOpt.outputFile, os.O_CREATE|os.O_WRONLY, 0o644)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: cannot open output file %s: %v\n", *fOpt.outputFile, err)
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
out = f
|
||||
}
|
||||
|
||||
for _, model := range models {
|
||||
for range *fOpt.epochs {
|
||||
options := make(map[string]interface{})
|
||||
if *fOpt.maxTokens > 0 {
|
||||
options["num_predict"] = *fOpt.maxTokens
|
||||
}
|
||||
options["temperature"] = *fOpt.temperature
|
||||
if fOpt.seed != nil && *fOpt.seed > 0 {
|
||||
options["seed"] = *fOpt.seed
|
||||
}
|
||||
|
||||
var keepAliveDuration *api.Duration
|
||||
if *fOpt.keepAlive > 0 {
|
||||
duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
|
||||
keepAliveDuration = &duration
|
||||
}
|
||||
|
||||
req := &api.ChatRequest{
|
||||
Model: model,
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: *fOpt.prompt,
|
||||
},
|
||||
},
|
||||
Options: options,
|
||||
KeepAlive: keepAliveDuration,
|
||||
}
|
||||
|
||||
if imgData != nil {
|
||||
req.Messages[0].Images = []api.ImageData{imgData}
|
||||
}
|
||||
|
||||
var responseMetrics *api.Metrics
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
err = client.Chat(ctx, req, func(resp api.ChatResponse) error {
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Message.Thinking, resp.Message.Content))
|
||||
}
|
||||
|
||||
if resp.Done {
|
||||
responseMetrics = &resp.Metrics
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintln(os.Stderr)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Chat request timed out with model '%s' after %ds\n", model, *fOpt.timeout)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't chat with model '%s': %v\n", model, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if responseMetrics == nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
|
||||
continue
|
||||
}
|
||||
|
||||
metrics := []Metrics{
|
||||
{
|
||||
Model: model,
|
||||
Step: "prefill",
|
||||
Count: responseMetrics.PromptEvalCount,
|
||||
Duration: responseMetrics.PromptEvalDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "generate",
|
||||
Count: responseMetrics.EvalCount,
|
||||
Duration: responseMetrics.EvalDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "load",
|
||||
Count: 1,
|
||||
Duration: responseMetrics.LoadDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "total",
|
||||
Count: 1,
|
||||
Duration: responseMetrics.TotalDuration,
|
||||
},
|
||||
}
|
||||
|
||||
OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)
|
||||
|
||||
if *fOpt.keepAlive > 0 {
|
||||
time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func readImage(filePath string) (api.ImageData, error) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
data, err := io.ReadAll(file)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return api.ImageData(data), nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
fOpt := flagOptions{
|
||||
models: flag.String("model", "", "Model to benchmark"),
|
||||
epochs: flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
|
||||
maxTokens: flag.Int("max-tokens", 200, "Maximum tokens for model response"),
|
||||
temperature: flag.Float64("temperature", 0, "Temperature parameter"),
|
||||
seed: flag.Int("seed", 0, "Random seed"),
|
||||
timeout: flag.Int("timeout", 60*5, "Timeout in seconds (default 300s)"),
|
||||
prompt: flag.String("p", DefaultPrompt, "Prompt to use"),
|
||||
imageFile: flag.String("image", "", "Filename for an image to include"),
|
||||
keepAlive: flag.Float64("k", 0, "Keep alive duration in seconds"),
|
||||
format: flag.String("format", "markdown", "Output format [markdown|benchstat|csv]"),
|
||||
outputFile: flag.String("output", "", "Output file for results (stdout if empty)"),
|
||||
verbose: flag.Bool("v", false, "Show system information"),
|
||||
debug: flag.Bool("debug", false, "Show debug information"),
|
||||
}
|
||||
|
||||
flag.Usage = func() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS]\n\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, "Description:\n")
|
||||
fmt.Fprintf(os.Stderr, " Model benchmarking tool with configurable parameters\n\n")
|
||||
fmt.Fprintf(os.Stderr, "Options:\n")
|
||||
flag.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nExamples:\n")
|
||||
fmt.Fprintf(os.Stderr, " bench -model gpt-oss:20b -epochs 3 -temperature 0.7\n")
|
||||
}
|
||||
flag.Parse()
|
||||
|
||||
if !slices.Contains([]string{"markdown", "benchstat", "csv"}, *fOpt.format) {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Unknown format '%s'\n", *fOpt.format)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if len(*fOpt.models) == 0 {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: No model(s) specified to benchmark.\n")
|
||||
flag.Usage()
|
||||
return
|
||||
}
|
||||
|
||||
if err := BenchmarkChat(fOpt); err != nil {
os.Exit(1)
}
|
||||
}
|
||||
463
cmd/bench/bench_test.go
Normal file
@@ -0,0 +1,463 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/rand"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func createTestFlagOptions() flagOptions {
|
||||
models := "test-model"
|
||||
format := "benchstat"
|
||||
epochs := 1
|
||||
maxTokens := 100
|
||||
temperature := 0.7
|
||||
seed := 42
|
||||
timeout := 30
|
||||
prompt := "test prompt"
|
||||
imageFile := ""
|
||||
keepAlive := 5.0
|
||||
verbose := false
|
||||
debug := false
|
||||
|
||||
return flagOptions{
|
||||
models: &models,
|
||||
format: &format,
|
||||
epochs: &epochs,
|
||||
maxTokens: &maxTokens,
|
||||
temperature: &temperature,
|
||||
seed: &seed,
|
||||
timeout: &timeout,
|
||||
prompt: &prompt,
|
||||
imageFile: &imageFile,
|
||||
keepAlive: &keepAlive,
|
||||
verbose: &verbose,
|
||||
debug: &debug,
|
||||
}
|
||||
}
|
||||
|
||||
func captureOutput(f func()) string {
|
||||
oldStdout := os.Stdout
|
||||
oldStderr := os.Stderr
|
||||
defer func() {
|
||||
os.Stdout = oldStdout
|
||||
os.Stderr = oldStderr
|
||||
}()
|
||||
|
||||
r, w, _ := os.Pipe()
|
||||
os.Stdout = w
|
||||
os.Stderr = w
|
||||
|
||||
f()
|
||||
|
||||
w.Close()
|
||||
var buf bytes.Buffer
|
||||
io.Copy(&buf, r)
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func createMockOllamaServer(t *testing.T, responses []api.ChatResponse) *httptest.Server {
|
||||
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/api/chat" {
|
||||
t.Errorf("Expected path /api/chat, got %s", r.URL.Path)
|
||||
http.Error(w, "Not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
if r.Method != "POST" {
|
||||
t.Errorf("Expected POST method, got %s", r.Method)
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
|
||||
for _, resp := range responses {
|
||||
jsonData, err := json.Marshal(resp)
|
||||
if err != nil {
|
||||
t.Errorf("Failed to marshal response: %v", err)
|
||||
return
|
||||
}
|
||||
w.Write(jsonData)
|
||||
w.Write([]byte("\n"))
|
||||
if f, ok := w.(http.Flusher); ok {
|
||||
f.Flush()
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond) // Simulate some delay
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_Success(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
mockResponses := []api.ChatResponse{
|
||||
{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response part 1",
|
||||
},
|
||||
Done: false,
|
||||
},
|
||||
{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response part 2",
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
server := createMockOllamaServer(t, mockResponses)
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "BenchmarkModel/name=test-model/step=prefill") {
|
||||
t.Errorf("Expected output to contain prefill metrics, got: %s", output)
|
||||
}
|
||||
if !strings.Contains(output, "BenchmarkModel/name=test-model/step=generate") {
|
||||
t.Errorf("Expected output to contain generate metrics, got: %s", output)
|
||||
}
|
||||
if !strings.Contains(output, "ns/token") {
|
||||
t.Errorf("Expected output to contain ns/token metric, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_ServerError(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "Internal server error", http.StatusInternalServerError)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected error to be handled internally, got returned error: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: Couldn't chat with model") {
|
||||
t.Errorf("Expected error message about chat failure, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_Timeout(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
shortTimeout := 1 // Very short timeout
|
||||
fOpt.timeout = &shortTimeout
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Simulate a long delay that will cause timeout
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
response := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response",
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
jsonData, _ := json.Marshal(response)
|
||||
w.Write(jsonData)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected timeout to be handled internally, got returned error: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: Chat request timed out") {
|
||||
t.Errorf("Expected timeout error message, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_NoMetrics(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
mockResponses := []api.ChatResponse{
|
||||
{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response",
|
||||
},
|
||||
Done: false, // Never sends Done=true
|
||||
},
|
||||
}
|
||||
|
||||
server := createMockOllamaServer(t, mockResponses)
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: No metrics received") {
|
||||
t.Errorf("Expected no metrics error message, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_MultipleModels(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
models := "model1,model2"
|
||||
epochs := 2
|
||||
fOpt.models = &models
|
||||
fOpt.epochs = &epochs
|
||||
|
||||
callCount := 0
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
var req api.ChatRequest
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
json.Unmarshal(body, &req)
|
||||
|
||||
response := api.ChatResponse{
|
||||
Model: req.Model,
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response for " + req.Model,
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
jsonData, _ := json.Marshal(response)
|
||||
w.Write(jsonData)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
// Should be called 4 times (2 models × 2 epochs)
|
||||
if callCount != 4 {
|
||||
t.Errorf("Expected 4 API calls, got %d", callCount)
|
||||
}
|
||||
|
||||
if !strings.Contains(output, "BenchmarkModel/name=model1") || !strings.Contains(output, "BenchmarkModel/name=model2") {
|
||||
t.Errorf("Expected output for both models, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_WithImage(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
tmpfile, err := os.CreateTemp(t.TempDir(), "testimage")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp file: %v", err)
|
||||
}
|
||||
defer os.Remove(tmpfile.Name())
|
||||
|
||||
content := []byte("fake image data")
|
||||
if _, err := tmpfile.Write(content); err != nil {
|
||||
t.Fatalf("Failed to write to temp file: %v", err)
|
||||
}
|
||||
tmpfile.Close()
|
||||
|
||||
tmpfileName := tmpfile.Name()
|
||||
fOpt.imageFile = &tmpfileName
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Verify the request contains image data
|
||||
var req api.ChatRequest
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
json.Unmarshal(body, &req)
|
||||
|
||||
if len(req.Messages) == 0 || len(req.Messages[0].Images) == 0 {
|
||||
t.Error("Expected request to contain images")
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
response := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response with image",
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
jsonData, _ := json.Marshal(response)
|
||||
w.Write(jsonData)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "BenchmarkModel/name=test-model") {
|
||||
t.Errorf("Expected benchmark output, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_ImageError(t *testing.T) {
|
||||
randFileName := func() string {
|
||||
const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
|
||||
const length = 8
|
||||
|
||||
result := make([]byte, length)
|
||||
rand.Read(result) // Fill with random bytes
|
||||
|
||||
for i := range result {
|
||||
result[i] = charset[result[i]%byte(len(charset))]
|
||||
}
|
||||
|
||||
return string(result) + ".txt"
|
||||
}
|
||||
|
||||
fOpt := createTestFlagOptions()
|
||||
imageFile := randFileName()
|
||||
fOpt.imageFile = &imageFile
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err == nil {
|
||||
t.Error("Expected error from image reading, got nil")
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: Couldn't read image") {
|
||||
t.Errorf("Expected image read error message, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadImage_Success(t *testing.T) {
|
||||
tmpfile, err := os.CreateTemp(t.TempDir(), "testimage")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp file: %v", err)
|
||||
}
|
||||
defer os.Remove(tmpfile.Name())
|
||||
|
||||
content := []byte("fake image data")
|
||||
if _, err := tmpfile.Write(content); err != nil {
|
||||
t.Fatalf("Failed to write to temp file: %v", err)
|
||||
}
|
||||
tmpfile.Close()
|
||||
|
||||
imgData, err := readImage(tmpfile.Name())
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if imgData == nil {
|
||||
t.Error("Expected image data, got nil")
|
||||
}
|
||||
|
||||
expected := api.ImageData(content)
|
||||
if string(imgData) != string(expected) {
|
||||
t.Errorf("Expected image data %v, got %v", expected, imgData)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadImage_FileNotFound(t *testing.T) {
|
||||
imgData, err := readImage("nonexistentfile.jpg")
|
||||
if err == nil {
|
||||
t.Error("Expected error for non-existent file, got nil")
|
||||
}
|
||||
if imgData != nil {
|
||||
t.Error("Expected nil image data for non-existent file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestOptionsMapCreation(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
options := make(map[string]interface{})
|
||||
if *fOpt.maxTokens > 0 {
|
||||
options["num_predict"] = *fOpt.maxTokens
|
||||
}
|
||||
options["temperature"] = *fOpt.temperature
|
||||
if fOpt.seed != nil && *fOpt.seed > 0 {
|
||||
options["seed"] = *fOpt.seed
|
||||
}
|
||||
|
||||
if options["num_predict"] != *fOpt.maxTokens {
|
||||
t.Errorf("Expected num_predict %d, got %v", *fOpt.maxTokens, options["num_predict"])
|
||||
}
|
||||
if options["temperature"] != *fOpt.temperature {
|
||||
t.Errorf("Expected temperature %f, got %v", *fOpt.temperature, options["temperature"])
|
||||
}
|
||||
if options["seed"] != *fOpt.seed {
|
||||
t.Errorf("Expected seed %d, got %v", *fOpt.seed, options["seed"])
|
||||
}
|
||||
}
|
||||
@@ -1430,7 +1430,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
latest.Summary()
|
||||
}
|
||||
|
||||
return &api.Message{Role: role, Content: fullResponse.String()}, nil
|
||||
return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
|
||||
}
|
||||
|
||||
func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
|
||||
@@ -202,10 +202,16 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &qwen3VLModel{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
case "NomicBertModel", "NomicBertMoEModel":
|
||||
conv = &nomicbertModel{}
|
||||
case "CohereForCausalLM":
|
||||
conv = &commandrModel{}
|
||||
case "GptOssForCausalLM":
|
||||
conv = &gptossModel{}
|
||||
case "DeepseekOCRForCausalLM":
|
||||
conv = &deepseekocr{}
|
||||
case "DeepseekV3ForCausalLM":
|
||||
conv = &deepseek2Model{}
|
||||
default:
|
||||
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
|
||||
}
|
||||
|
||||
173
convert/convert_deepseek2.go
Normal file
@@ -0,0 +1,173 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type deepseek2Model struct {
|
||||
ModelParameters // architectures, vocab_size
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
QKNopeHeadDim uint32 `json:"qk_nope_head_dim"`
|
||||
QKRopeHeadDim uint32 `json:"qk_rope_head_dim"`
|
||||
KVLoraRank uint32 `json:"kv_lora_rank"`
|
||||
QLoraRank uint32 `json:"q_lora_rank"`
|
||||
VHeadDim uint32 `json:"v_head_dim"`
|
||||
|
||||
ExpertCount uint32 `json:"n_routed_experts"`
|
||||
ExpertSharedCount uint32 `json:"n_shared_experts"`
|
||||
ExpertIntermediateSize uint32 `json:"moe_intermediate_size"`
|
||||
ExpertUsedCount uint32 `json:"num_experts_per_tok"`
|
||||
ExpertWeightsNorm bool `json:"norm_topk_prob"`
|
||||
ExpertWeightsScale float32 `json:"routed_scaling_factor"`
|
||||
|
||||
ScoringFunc string `json:"scoring_func"`
|
||||
LeadingDenseBlockCount uint32 `json:"first_k_dense_replace"`
|
||||
|
||||
RopeScaling struct {
|
||||
Factor float32 `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
Type string `json:"type"`
|
||||
MScaleAllDim float32 `json:"mscale_all_dim"`
|
||||
} `json:"rope_scaling"`
|
||||
|
||||
Architecture string
|
||||
}
|
||||
|
||||
func (p *deepseek2Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "deepseek2"
|
||||
kv["general.type"] = "model"
|
||||
kv["deepseek2.block_count"] = p.HiddenLayers
|
||||
|
||||
numHeads := p.NumAttentionHeads
|
||||
numKVHeads := p.NumKeyValueHeads
|
||||
|
||||
kv["deepseek2.attention.head_count"] = numHeads
|
||||
kv["deepseek2.attention.head_count_kv"] = numKVHeads
|
||||
kv["deepseek2.attention.key_length"] = p.QKNopeHeadDim + p.QKRopeHeadDim
|
||||
kv["deepseek2.attention.kv_lora_rank"] = p.KVLoraRank
|
||||
kv["deepseek2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
||||
kv["deepseek2.attention.q_lora_rank"] = p.QLoraRank
|
||||
kv["deepseek2.attention.value_length"] = p.VHeadDim
|
||||
kv["deepseek2.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["deepseek2.embedding_length"] = p.HiddenSize
|
||||
kv["deepseek2.expert_count"] = p.ExpertCount
|
||||
kv["deepseek2.expert_feed_forward_length"] = p.ExpertIntermediateSize
|
||||
kv["deepseek2.expert_shared_count"] = p.ExpertSharedCount
|
||||
|
||||
var scoringFunc uint32
|
||||
switch p.ScoringFunc {
|
||||
case "softmax":
|
||||
// not currently supported in the model, but needed for Deepseek-OCR
|
||||
scoringFunc = 1
|
||||
case "sigmoid":
|
||||
scoringFunc = 2
|
||||
}
|
||||
kv["deepseek2.expert_gating_func"] = scoringFunc
|
||||
kv["deepseek2.expert_used_count"] = p.ExpertUsedCount
|
||||
kv["deepseek2.expert_weights_norm"] = p.ExpertWeightsNorm
|
||||
kv["deepseek2.expert_weights_scale"] = p.ExpertWeightsScale
|
||||
kv["deepseek2.feed_forward_length"] = p.IntermediateSize
|
||||
kv["deepseek2.leading_dense_block_count"] = p.LeadingDenseBlockCount
|
||||
|
||||
kv["deepseek2.rope.dimension_count"] = p.QKRopeHeadDim
|
||||
kv["deepseek2.rope.freq_base"] = cmp.Or(p.RopeTheta, 10000.0)
|
||||
kv["deepseek2.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
kv["deepseek2.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
|
||||
kv["deepseek2.rope.scaling.type"] = p.RopeScaling.Type
|
||||
kv["deepseek2.rope.scaling.yarn_log_multiplier"] = 0.1 * p.RopeScaling.MScaleAllDim
|
||||
|
||||
kv["tokenizer.ggml.pre"] = "deepseek-v3"
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *deepseek2Model) Replacements() []string {
|
||||
return []string{
|
||||
"lm_head", "output",
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.norm", "output_norm",
|
||||
"language_model.", "",
|
||||
"model.layers", "blk",
|
||||
"input_layernorm", "attn_norm",
|
||||
"self_attn.kv_a_proj_with_mqa", "attn_kv_a_mqa",
|
||||
"self_attn.kv_a_layernorm", "attn_kv_a_norm",
|
||||
"self_attn.kv_b_proj", "attn_kv_b",
|
||||
"self_attn.q_a_proj", "attn_q_a",
|
||||
"self_attn.q_a_layernorm", "attn_q_a_norm",
|
||||
"self_attn.q_b_proj", "attn_q_b",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"mlp.shared_experts.down_proj", "ffn_down_shexp",
|
||||
"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
|
||||
"mlp.shared_experts.up_proj", "ffn_up_shexp",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"mlp.gate.e_score_correction_bias", "exp_probs_b.bias",
|
||||
"mlp.gate", "ffn_gate_inp",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *deepseek2Model) Tensors(s []Tensor) (out []*ggml.Tensor) {
|
||||
merges := make([]merge, p.HiddenLayers*3)
|
||||
for i := range p.HiddenLayers {
|
||||
merges[i*3+0] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
|
||||
}
|
||||
merges[i*3+1] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
|
||||
}
|
||||
merges[i*3+2] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
|
||||
}
|
||||
}
|
||||
|
||||
skipLayer := func(n string, minValue uint32) bool {
|
||||
re := regexp.MustCompile(`^blk\.(\d+)`)
|
||||
matches := re.FindStringSubmatch(n)
|
||||
if matches == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
blkNum, err := strconv.Atoi(matches[1])
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return uint32(blkNum) >= minValue
|
||||
}
|
||||
|
||||
out, s = mergeTensors(s, merges...)
|
||||
for _, t := range s {
|
||||
// skip any additional layers (such as the Multi-Token Prediction layer)
|
||||
if skipLayer(t.Name(), p.HiddenLayers) {
|
||||
slog.Debug("skipping layer", "name", t.Name())
|
||||
continue
|
||||
}
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
136
convert/convert_deepseekocr.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type deepseekocr struct {
|
||||
ModelParameters
|
||||
LanguageConfig struct {
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
NumRoutedExperts uint32 `json:"n_routed_experts"`
|
||||
NumSharedExperts uint32 `json:"n_shared_experts"`
|
||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
||||
FirstKDenseReplace uint32 `json:"first_k_dense_replace"`
|
||||
} `json:"language_config"`
|
||||
|
||||
VisionConfig struct {
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
Width struct {
|
||||
Vision struct {
|
||||
Heads uint32 `json:"heads"`
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
Layers uint32 `json:"layers"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
Width uint32 `json:"width"`
|
||||
} `json:"clip-l-14-224"`
|
||||
Sam struct {
|
||||
GlobalAttentionIndexes []int32 `json:"global_attn_indexes"`
|
||||
Heads uint32 `json:"heads"`
|
||||
Layers uint32 `json:"layers"`
|
||||
Width uint32 `json:"width"`
|
||||
} `json:"sam_vit_b"`
|
||||
}
|
||||
} `json:"vision_config"`
|
||||
}
|
||||
|
||||
func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
|
||||
kv := m.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "deepseekocr"
|
||||
kv["block_count"] = m.LanguageConfig.HiddenLayers
|
||||
kv["context_length"] = m.LanguageConfig.MaxPositionEmbeddings
|
||||
kv["embedding_length"] = m.LanguageConfig.HiddenSize
|
||||
kv["feed_forward_length"] = m.LanguageConfig.IntermediateSize
|
||||
kv["attention.head_count"] = m.LanguageConfig.NumAttentionHeads
|
||||
kv["attention.head_count_kv"] = m.LanguageConfig.NumKeyValueHeads
|
||||
kv["expert_count"] = m.LanguageConfig.NumRoutedExperts
|
||||
kv["expert_used_count"] = m.LanguageConfig.NumExpertsPerToken
|
||||
kv["leading_dense_block_count"] = m.LanguageConfig.FirstKDenseReplace
|
||||
|
||||
kv["vision.block_count"] = m.VisionConfig.Width.Vision.Layers
|
||||
kv["vision.embedding_length"] = m.VisionConfig.Width.Vision.Width
|
||||
kv["vision.head_count"] = m.VisionConfig.Width.Vision.Heads
|
||||
kv["vision.image_size"] = m.VisionConfig.Width.Vision.ImageSize
|
||||
kv["vision.patch_size"] = m.VisionConfig.Width.Vision.PatchSize
|
||||
|
||||
kv["sam.block_count"] = m.VisionConfig.Width.Sam.Layers
|
||||
kv["sam.embedding_length"] = m.VisionConfig.Width.Sam.Width
|
||||
kv["sam.head_count"] = m.VisionConfig.Width.Sam.Heads
|
||||
kv["sam.global_attention_indexes"] = m.VisionConfig.Width.Sam.GlobalAttentionIndexes
|
||||
return kv
|
||||
}
|
||||
|
||||
func (m *deepseekocr) Tensors(s []Tensor) (out []*ggml.Tensor) {
|
||||
merges := make([]merge, m.LanguageConfig.HiddenLayers*3)
|
||||
for i := range m.LanguageConfig.HiddenLayers {
|
||||
merges[i*3+0] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
|
||||
}
|
||||
merges[i*3+1] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
|
||||
}
|
||||
merges[i*3+2] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
|
||||
}
|
||||
}
|
||||
|
||||
out, s = mergeTensors(s, merges...)
|
||||
for _, t := range s {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (m *deepseekocr) Replacements() []string {
|
||||
return []string{
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.layers", "blk",
|
||||
"input_layernorm", "attn_norm",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.gate", "ffn_gate_inp",
|
||||
"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
|
||||
"mlp.shared_experts.up_proj", "ffn_up_shexp",
|
||||
"mlp.shared_experts.down_proj", "ffn_down_shexp",
|
||||
"model.norm", "output_norm",
|
||||
"lm_head", "output",
|
||||
|
||||
"model.vision_model", "v",
|
||||
"embeddings.patch_embedding", "patch_embd",
|
||||
"embeddings.class_embedding", "class_embd",
|
||||
"embeddings.position_embedding", "position_embd",
|
||||
"transformer.layers", "blk",
|
||||
|
||||
"model.projector", "mm",
|
||||
"model.image_newline", "mm.image_newline",
|
||||
//nolint:misspell // this misspelling is upstream. fixing it breaks the model
|
||||
"model.view_seperator", "mm.view_seperator",
|
||||
|
||||
"model.sam_model.patch_embed.proj", "s.patch_embd",
|
||||
"model.sam_model.pos_embed", "s.position_embd",
|
||||
"model.sam_model.blocks", "s.blk",
|
||||
"model.sam_model.neck", "s.neck",
|
||||
"model.sam_model.net_", "s.net_",
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
@@ -26,16 +27,26 @@ type gemma3Model struct {
|
||||
NumChannels uint32 `json:"num_channels"` // num_channels 3
|
||||
PatchSize uint32 `json:"patch_size"` // patch_size 14
|
||||
} `json:"vision_config"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
RopeLocalTheta float32 `json:"rope_local_base_freq"`
|
||||
RopeGlobalTheta float32 `json:"rope_global_base_freq"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
RopeLocalTheta float32 `json:"rope_local_base_freq"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
SlidingWindowPattern *uint32 `json:"sliding_window_pattern"`
|
||||
LayerTypes []string `json:"layer_types"`
|
||||
MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
|
||||
RopeScaling *struct {
|
||||
Type string `json:"rope_type"`
|
||||
Factor float32 `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
ExtrapolationFactor float32 `json:"extrapolation_factor"`
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
} `json:"rope_scaling"`
|
||||
}
|
||||
|
||||
const (
|
||||
@@ -81,9 +92,38 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv["gemma3.attention.key_length"] = p.HeadDim
|
||||
kv["gemma3.attention.value_length"] = p.HeadDim
|
||||
kv["gemma3.attention.sliding_window"] = p.SlidingWindow
|
||||
kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)
|
||||
|
||||
// The sliding window pattern is either provided as the sliding_window_pattern
|
||||
// key (an int) or as the layer_types key (a list of strings).
|
||||
if p.SlidingWindowPattern != nil || len(p.LayerTypes) > 0 {
|
||||
kv["gemma3.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
|
||||
for i := range numBlocks {
|
||||
var isLocal bool
|
||||
if len(p.LayerTypes) > 0 && int(i) < len(p.LayerTypes) {
|
||||
isLocal = p.LayerTypes[i] == "sliding_attention"
|
||||
} else if p.SlidingWindowPattern != nil && *p.SlidingWindowPattern > 0 {
|
||||
isLocal = (i+1)%*p.SlidingWindowPattern != 0
|
||||
}
|
||||
if !yield(isLocal) {
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
if p.FinalLogitSoftcap > 0 {
|
||||
kv["gemma3.final_logit_softcapping"] = p.FinalLogitSoftcap
|
||||
}
|
||||
kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
|
||||
kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
|
||||
kv["gemma3.rope.freq_base"] = cmp.Or(p.RopeTheta, 1000000.0)
|
||||
if p.RopeScaling != nil && p.RopeScaling.Type == "yarn" && p.RopeScaling.Factor > 0 {
|
||||
kv["gemma3.rope.scaling.type"] = "yarn"
|
||||
kv["gemma3.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
kv["gemma3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
|
||||
kv["gemma3.rope.scaling.extrapolation_factor"] = cmp.Or(p.RopeScaling.ExtrapolationFactor, float32(1.0))
|
||||
kv["gemma3.rope.scaling.beta_fast"] = cmp.Or(p.RopeScaling.BetaFast, float32(64.0))
|
||||
kv["gemma3.rope.scaling.beta_slow"] = cmp.Or(p.RopeScaling.BetaSlow, float32(1.0))
|
||||
}
|
||||
|
||||
kv["gemma3.embedding_length"] = p.HiddenSize
|
||||
kv["gemma3.feed_forward_length"] = p.IntermediateSize
|
||||
default:
|
||||
|
||||
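As a quick sanity check of the sliding-window logic above, the sketch below (illustrative only, not part of the patch) shows how a `sliding_window_pattern` of 6 expands to per-layer booleans: five local (sliding) layers followed by one global layer, repeating.

```go
package main

import "fmt"

// localLayers reproduces the fallback branch above: layer i is "local"
// (sliding attention) unless (i+1) is a multiple of the pattern.
func localLayers(numBlocks, pattern uint32) []bool {
	out := make([]bool, numBlocks)
	for i := range out {
		out[i] = (uint32(i)+1)%pattern != 0
	}
	return out
}

func main() {
	// 12 blocks with pattern 6: layers 0-4 and 6-10 are local, 5 and 11 are global.
	fmt.Println(localLayers(12, 6))
	// [true true true true true false true true true true true false]
}
```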
@@ -110,9 +110,12 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
|
||||
for name, mxfp4 := range mxfp4s {
|
||||
dims := mxfp4.blocks.Shape()
|
||||
if !strings.HasSuffix(name, ".weight") {
|
||||
name = name + ".weight"
|
||||
}
|
||||
if strings.Contains(name, "ffn_down_exps") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name + ".weight",
|
||||
Name: name,
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4,
|
||||
@@ -121,12 +124,12 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
// gate_up_exps is interleaved, need to split into gate_exps and up_exps
|
||||
// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "gate_up", "gate", 1) + ".weight",
|
||||
Name: strings.Replace(name, "gate_up", "gate", 1),
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4.slice(1, 0, int(dims[1]), 2),
|
||||
}, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "gate_up", "up", 1) + ".weight",
|
||||
Name: strings.Replace(name, "gate_up", "up", 1),
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4.slice(1, 1, int(dims[1]), 2),
|
||||
|
||||
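The comment above describes de-interleaving the fused `gate_up` expert weights: even-indexed rows along the second dimension become the gate projection and odd-indexed rows the up projection. A one-dimensional illustration of that stride-2 split (illustrative only, not part of the patch):

```go
package main

import "fmt"

// splitInterleaved separates even- and odd-indexed elements, the 1-D
// analogue of gate_up[:, 0::2, ...] and gate_up[:, 1::2, ...].
func splitInterleaved(fused []float32) (gate, up []float32) {
	for i, v := range fused {
		if i%2 == 0 {
			gate = append(gate, v)
		} else {
			up = append(up, v)
		}
	}
	return gate, up
}

func main() {
	fused := []float32{0, 1, 2, 3, 4, 5}
	gate, up := splitInterleaved(fused)
	fmt.Println(gate) // [0 2 4]
	fmt.Println(up)   // [1 3 5]
}
```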
@@ -29,6 +29,15 @@ type mistral3Model struct {
|
||||
SlidingWindow *uint32 `json:"sliding_window"`
|
||||
HiddenAct string `json:"hidden_act"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
RopeParameters struct {
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
Factor float32 `json:"factor"`
|
||||
ScalingBeta float32 `json:"llama_4_scaling_beta"`
|
||||
OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
RopeType string `json:"rope_type"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
} `json:"rope_parameters"`
|
||||
} `json:"text_config"`
|
||||
VisionModel struct {
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
@@ -61,8 +70,13 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv["mistral3.attention.layer_norm_rms_epsilon"] = p.TextModel.RMSNormEPS
|
||||
kv["mistral3.attention.key_length"] = p.TextModel.HeadDim
|
||||
kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
|
||||
kv["mistral3.rope.dimension_count"] = p.TextModel.HiddenSize / p.TextModel.NumHiddenLayers
|
||||
kv["mistral3.rope.freq_base"] = p.TextModel.RopeTheta
|
||||
kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
|
||||
kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
|
||||
|
||||
if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
|
||||
kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
|
||||
kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
|
||||
}
|
||||
|
||||
// Vision configuration
|
||||
kv["mistral3.vision.block_count"] = p.VisionModel.NumHiddenLayers
|
||||
|
||||
213 convert/convert_nomicbert.go Normal file
@@ -0,0 +1,213 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type nomicbertModel struct {
|
||||
ModelParameters
|
||||
NLayers uint32 `json:"n_layers"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
LayerNormEPS float32 `json:"layer_norm_eps"`
|
||||
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
|
||||
RopeFreqBase float32 `json:"rope_theta"`
|
||||
normalizeEmbeddings bool
|
||||
PoolingType uint32
|
||||
|
||||
// MoE parameters (only present in v2 models)
|
||||
NumExperts uint32 `json:"num_local_experts"`
|
||||
NumExpertsUsed uint32 `json:"num_experts_per_tok"`
|
||||
MoEEveryNLayers uint32 `json:"moe_every_n_layers"`
|
||||
}
|
||||
|
||||
var (
|
||||
_ ModelConverter = (*nomicbertModel)(nil)
|
||||
_ moreParser = (*nomicbertModel)(nil)
|
||||
)
|
||||
|
||||
func (p *nomicbertModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "modules.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var modules []struct {
|
||||
Type string `json:"type"`
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &modules); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pooling string
|
||||
for _, m := range modules {
|
||||
switch m.Type {
|
||||
case "sentence_transformers.models.Pooling":
|
||||
pooling = m.Path
|
||||
case "sentence_transformers.models.Normalize":
|
||||
p.normalizeEmbeddings = true
|
||||
}
|
||||
}
|
||||
|
||||
if pooling != "" {
|
||||
bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pc struct {
|
||||
PoolingModeCLSToken bool `json:"pooling_mode_cls_token"`
|
||||
PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &pc); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if pc.PoolingModeMeanTokens {
|
||||
p.PoolingType = 1
|
||||
} else if pc.PoolingModeCLSToken {
|
||||
p.PoolingType = 2
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
|
||||
// Determine architecture based on MoE parameters (following qwen3 pattern)
|
||||
arch := "nomic-bert"
|
||||
if p.MoEEveryNLayers > 0 {
|
||||
arch += "-moe"
|
||||
}
|
||||
|
||||
kv["general.architecture"] = arch
|
||||
kv["attention.causal"] = false
|
||||
kv["pooling_type"] = p.PoolingType
|
||||
kv["normalize_embeddings"] = p.normalizeEmbeddings
|
||||
|
||||
kv["block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers)
|
||||
|
||||
if contextLength := p.MaxPositionEmbeddings; contextLength > 0 {
|
||||
kv["context_length"] = contextLength
|
||||
}
|
||||
|
||||
if embeddingLength := p.HiddenSize; embeddingLength > 0 {
|
||||
kv["embedding_length"] = p.HiddenSize
|
||||
}
|
||||
|
||||
if feedForwardLength := p.IntermediateSize; feedForwardLength > 0 {
|
||||
kv["feed_forward_length"] = p.IntermediateSize
|
||||
}
|
||||
|
||||
if headCount := p.NumAttentionHeads; headCount > 0 {
|
||||
kv["attention.head_count"] = p.NumAttentionHeads
|
||||
}
|
||||
|
||||
if kvHeadCount := p.NumKeyValueHeads; kvHeadCount > 0 {
|
||||
kv["attention.head_count_kv"] = p.NumKeyValueHeads
|
||||
}
|
||||
|
||||
if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon); layerNormEpsilon > 0 {
|
||||
kv["attention.layer_norm_epsilon"] = layerNormEpsilon
|
||||
}
|
||||
|
||||
if p.RopeFreqBase > 0 {
|
||||
kv["rope.freq_base"] = p.RopeFreqBase
|
||||
}
|
||||
|
||||
// MoE specific parameters (only if MoE is enabled)
|
||||
if p.NumExperts > 0 {
|
||||
kv["expert_count"] = p.NumExperts
|
||||
}
|
||||
|
||||
if p.NumExpertsUsed > 0 {
|
||||
kv["expert_used_count"] = p.NumExpertsUsed
|
||||
}
|
||||
|
||||
if p.MoEEveryNLayers > 0 {
|
||||
kv["moe_every_n_layers"] = p.MoEEveryNLayers
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.model"] = "bert"
|
||||
kv["tokenizer.ggml.token_type_count"] = uint32(2)
|
||||
|
||||
// convert to phantom space tokens
|
||||
for i, e := range t.Tokens {
|
||||
switch {
|
||||
case strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]"):
|
||||
// noop - keep special tokens as-is
|
||||
case strings.HasPrefix(e, "##"):
|
||||
t.Tokens[i] = e[2:]
|
||||
default:
|
||||
t.Tokens[i] = "\u2581" + e
|
||||
}
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.tokens"] = t.Tokens
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *nomicbertModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
out := make([]*ggml.Tensor, 0, len(ts))
|
||||
for _, t := range ts {
|
||||
if slices.Contains([]string{
|
||||
"embeddings.position_ids",
|
||||
"pooler.dense.weight",
|
||||
"pooler.dense.bias",
|
||||
}, t.Name()) {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (nomicbertModel) Replacements() []string {
|
||||
return []string{
|
||||
"encoder.layer", "blk",
|
||||
"encoder.layers", "blk",
|
||||
"embeddings.word_embeddings", "token_embd",
|
||||
"embeddings.token_type_embeddings", "token_types",
|
||||
"embeddings.LayerNorm", "token_embd_norm",
|
||||
|
||||
"attention.self.qkv", "attn_qkv",
|
||||
|
||||
"attention.output.dense", "attn_output",
|
||||
"attention.output.LayerNorm", "attn_output_norm",
|
||||
|
||||
"mlp.up", "ffn_up",
|
||||
"mlp.down", "ffn_down",
|
||||
|
||||
"mlp.router", "ffn_gate_inp",
|
||||
"mlp.experts.up", "ffn_up_exps",
|
||||
"mlp.experts.down", "ffn_down_exps",
|
||||
|
||||
"intermediate.dense", "ffn_up",
|
||||
"output.dense", "ffn_down",
|
||||
"output.LayerNorm", "layer_output_norm",
|
||||
}
|
||||
}
|
||||
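The token rewrite in the `KV` method above converts WordPiece-style vocabulary entries to the phantom-space convention: bracketed special tokens are kept as-is, `##` continuation pieces lose their prefix, and everything else gains a leading `▁` (U+2581). A small standalone sketch of that rule (illustrative, not part of the patch):

```go
package main

import (
	"fmt"
	"strings"
)

// toPhantomSpace mirrors the converter's per-token rewrite.
func toPhantomSpace(tok string) string {
	switch {
	case strings.HasPrefix(tok, "[") && strings.HasSuffix(tok, "]"):
		return tok // special tokens such as [CLS] are kept as-is
	case strings.HasPrefix(tok, "##"):
		return tok[2:] // continuation pieces drop the ## marker
	default:
		return "\u2581" + tok // word-initial pieces get a phantom space
	}
}

func main() {
	for _, tok := range []string{"[CLS]", "hello", "##ing", "[SEP]"} {
		fmt.Printf("%q -> %q\n", tok, toPhantomSpace(tok))
	}
	// "[CLS]" -> "[CLS]", "hello" -> "▁hello", "##ing" -> "ing", "[SEP]" -> "[SEP]"
}
```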
@@ -44,7 +44,10 @@ func (t tensorBase) Kind() uint32 {
|
||||
t.name == "v.positional_embedding_vlm" ||
|
||||
t.name == "v.tile_position_embd.weight" ||
|
||||
t.name == "v.pre_tile_position_embd.weight" ||
|
||||
t.name == "v.post_tile_position_embd.weight" {
|
||||
t.name == "v.post_tile_position_embd.weight" ||
|
||||
t.name == "s.position_embd" ||
|
||||
strings.HasSuffix(t.name, "rel_pos_h") ||
|
||||
strings.HasSuffix(t.name, "rel_pos_w") {
|
||||
// these tensors are always F32
|
||||
return tensorKindFP32
|
||||
}
|
||||
|
||||
@@ -37,6 +37,10 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if n <= 0 || n > 100<<20 {
|
||||
return nil, fmt.Errorf("invalid safetensors file %q (header size: %d): file may be corrupted or a Git LFS pointer", p, n)
|
||||
}
|
||||
|
||||
b := bytes.NewBuffer(make([]byte, 0, n))
|
||||
if _, err = io.CopyN(b, f, n); err != nil {
|
||||
return nil, err
|
||||
@@ -96,7 +100,10 @@ type safetensor struct {
|
||||
|
||||
func (st safetensor) Kind() uint32 {
|
||||
kind := st.tensorBase.Kind()
|
||||
if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
|
||||
if st.dtype == "BF16" &&
|
||||
!strings.HasPrefix(st.name, "v.") &&
|
||||
!strings.HasPrefix(st.name, "s.") &&
|
||||
kind != tensorKindFP32 {
|
||||
kind = tensorKindBF16
|
||||
}
|
||||
|
||||
|
||||
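The revised `Kind` check above keeps BF16 safetensors as BF16 only for text-model weights; anything under the vision (`v.`) or SAM (`s.`) prefixes, or already pinned to F32, is converted. A minimal sketch of that predicate (illustrative, not from the patch):

```go
package main

import (
	"fmt"
	"strings"
)

// keepBF16 returns true when a BF16 tensor should stay BF16 rather than
// being converted: it must not belong to the vision ("v.") or SAM ("s.")
// towers and must not already be forced to F32.
func keepBF16(name, dtype string, forcedF32 bool) bool {
	return dtype == "BF16" &&
		!strings.HasPrefix(name, "v.") &&
		!strings.HasPrefix(name, "s.") &&
		!forcedF32
}

func main() {
	fmt.Println(keepBF16("blk.0.attn_q.weight", "BF16", false))   // true: text weight stays BF16
	fmt.Println(keepBF16("v.blk.0.attn_q.weight", "BF16", false)) // false: vision tower is converted
	fmt.Println(keepBF16("s.position_embd", "BF16", true))        // false: SAM tensor, already F32
}
```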
@@ -2,10 +2,12 @@ package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"errors"
|
||||
"io"
|
||||
"iter"
|
||||
"path"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
@@ -94,6 +96,26 @@ func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []
|
||||
return matched
|
||||
})
|
||||
|
||||
slices.SortStableFunc(matched, func(a, b Tensor) int {
|
||||
x := strings.Split(a.Name(), ".")
|
||||
y := strings.Split(b.Name(), ".")
|
||||
if len(x) != len(y) {
|
||||
return cmp.Compare(len(x), len(y))
|
||||
}
|
||||
|
||||
vals := make([]int, len(x))
|
||||
for i := range x {
|
||||
vals[i] = strings.Compare(x[i], y[i])
|
||||
m, err := strconv.ParseInt(x[i], 0, 0)
|
||||
n, err2 := strconv.ParseInt(y[i], 0, 0)
|
||||
if errors.Join(err, err2) == nil {
|
||||
vals[i] = cmp.Compare(m, n)
|
||||
}
|
||||
}
|
||||
|
||||
return cmp.Or(vals...)
|
||||
})
|
||||
|
||||
if len(matched) > 0 {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: merges[i].name,
|
||||
|
||||
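The new comparator above is what keeps expert shards in numeric order before they are merged: dot-separated name segments are compared numerically when both parse as integers, so `layer.2` sorts before `layer.10` even though a plain string compare would reverse them. A small standalone sketch of the same comparison (illustrative names, not from the patch):

```go
package main

import (
	"cmp"
	"errors"
	"fmt"
	"slices"
	"strconv"
	"strings"
)

// compareNames mirrors the sort used before merging: segment by segment,
// numeric when both segments are integers, lexicographic otherwise.
func compareNames(a, b string) int {
	x := strings.Split(a, ".")
	y := strings.Split(b, ".")
	if len(x) != len(y) {
		return cmp.Compare(len(x), len(y))
	}
	vals := make([]int, len(x))
	for i := range x {
		vals[i] = strings.Compare(x[i], y[i])
		m, err := strconv.ParseInt(x[i], 0, 0)
		n, err2 := strconv.ParseInt(y[i], 0, 0)
		if errors.Join(err, err2) == nil {
			vals[i] = cmp.Compare(m, n)
		}
	}
	return cmp.Or(vals...)
}

func main() {
	names := []string{"layer.10.weight", "layer.2.weight", "layer.1.weight"}
	slices.SortStableFunc(names, compareNames)
	fmt.Println(names) // [layer.1.weight layer.2.weight layer.10.weight]
}
```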
@@ -3,8 +3,10 @@ package convert
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"iter"
|
||||
"math/rand/v2"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -951,3 +953,45 @@ func TestMerge(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestMergeOrder(t *testing.T) {
|
||||
for range 8 {
|
||||
t.Run("", func(t *testing.T) {
|
||||
tensors := make([]Tensor, 16)
|
||||
for i := range tensors {
|
||||
tensors[i] = &fakeTensor{
|
||||
name: fmt.Sprintf("layer.%d.weight", i),
|
||||
shape: []uint64{1},
|
||||
data: []float32{float32(i)},
|
||||
}
|
||||
}
|
||||
|
||||
rand.Shuffle(len(tensors), func(i, j int) {
|
||||
tensors[i], tensors[j] = tensors[j], tensors[i]
|
||||
})
|
||||
|
||||
matched, unmatched := mergeTensors(tensors, merge{"layer.*.weight", "layer.weight"})
|
||||
if len(unmatched) != 0 {
|
||||
t.Error("expected no remaining tensors, got", len(unmatched))
|
||||
}
|
||||
|
||||
if len(matched) != 1 {
|
||||
t.Error("expected 1 merged tensor, got", len(matched))
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := matched[0].WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var f32s [16]float32
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.IsSorted(f32s[:]) {
|
||||
t.Errorf("merged tensor data is not in order: %+v", f32s)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package discover
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
@@ -10,12 +11,21 @@ import (
|
||||
"reflect"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
func GetCPUMem() (memInfo, error) {
|
||||
mem, err := getCPUMem()
|
||||
if err != nil {
|
||||
return memInfo{}, err
|
||||
}
|
||||
return getCPUMemByCgroups(mem), nil
|
||||
}
|
||||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
var mem memInfo
|
||||
var total, available, free, buffers, cached, freeSwap uint64
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
@@ -56,6 +66,32 @@ func GetCPUMem() (memInfo, error) {
|
||||
return mem, nil
|
||||
}
|
||||
|
||||
func getCPUMemByCgroups(mem memInfo) memInfo {
|
||||
total, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.max")
|
||||
if err == nil {
|
||||
mem.TotalMemory = total
|
||||
}
|
||||
used, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.current")
|
||||
if err == nil {
|
||||
mem.FreeMemory = mem.TotalMemory - used
|
||||
}
|
||||
return mem
|
||||
}
|
||||
|
||||
func getUint64ValueFromFile(path string) (uint64, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
s := bufio.NewScanner(f)
|
||||
for s.Scan() {
|
||||
line := s.Text()
|
||||
return strconv.ParseUint(line, 10, 64)
|
||||
}
|
||||
return 0, errors.New("empty file content")
|
||||
}
|
||||
|
||||
const CpuInfoFilename = "/proc/cpuinfo"
|
||||
|
||||
type linuxCpuInfo struct {
|
||||
@@ -74,7 +110,41 @@ func GetCPUDetails() []CPU {
|
||||
return nil
|
||||
}
|
||||
defer file.Close()
|
||||
return linuxCPUDetails(file)
|
||||
cpus := linuxCPUDetails(file)
|
||||
return overwriteThreadCountByLinuxCgroups(cpus)
|
||||
}
|
||||
|
||||
func overwriteThreadCountByLinuxCgroups(cpus []CPU) []CPU {
|
||||
file, err := os.Open("/sys/fs/cgroup/cpu.max")
|
||||
if err != nil {
|
||||
return cpus
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if sl := strings.Split(line, " "); len(sl) == 2 {
|
||||
allowdUs, err := strconv.ParseInt(sl[0], 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn("failed to parse CPU allowed micro secs", "error", err)
|
||||
return cpus
|
||||
}
|
||||
unitUs, err := strconv.ParseInt(sl[1], 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn("failed to parse CPU unit micro secs", "error", err)
|
||||
return cpus
|
||||
}
|
||||
|
||||
threads := int(max(allowdUs/unitUs, 1))
|
||||
|
||||
cpu := cpus[0]
|
||||
cpu.CoreCount = threads
|
||||
cpu.ThreadCount = threads
|
||||
return []CPU{cpu}
|
||||
}
|
||||
}
|
||||
return cpus
|
||||
}
|
||||
|
||||
func linuxCPUDetails(file io.Reader) []CPU {
|
||||
|
||||
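The cgroup v2 override above derives a thread budget from `/sys/fs/cgroup/cpu.max`, which holds a quota and a period in microseconds. A quick sketch of the same arithmetic (illustrative, not part of the patch; the "max" unlimited case is handled here for completeness and is not in the diff): a quota of 200000 with a period of 100000 allows the equivalent of 2 CPUs, and the result is clamped to at least 1.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// threadsFromCPUMax reproduces the quota/period calculation: the usable
// thread count is quota divided by period, never less than one.
func threadsFromCPUMax(line string) (int, bool) {
	fields := strings.Split(line, " ")
	if len(fields) != 2 || fields[0] == "max" {
		return 0, false // no cgroup CPU limit applies
	}
	quota, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, false
	}
	period, err := strconv.ParseInt(fields[1], 10, 64)
	if err != nil {
		return 0, false
	}
	return int(max(quota/period, 1)), true
}

func main() {
	for _, line := range []string{"200000 100000", "50000 100000", "max 100000"} {
		if n, ok := threadsFromCPUMax(line); ok {
			fmt.Printf("%q -> %d threads\n", line, n)
		} else {
			fmt.Printf("%q -> no cgroup limit\n", line)
		}
	}
	// "200000 100000" -> 2 threads, "50000 100000" -> 1 thread, "max 100000" -> no limit
}
```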
@@ -65,6 +65,11 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
}
|
||||
|
||||
slog.Info("discovering available GPUs...")
|
||||
detectIncompatibleLibraries()
|
||||
|
||||
// Warn if any user-overrides are set which could lead to incorrect GPU discovery
|
||||
overrideWarnings()
|
||||
|
||||
requested := envconfig.LLMLibrary()
|
||||
jetpack := cudaJetpack()
|
||||
|
||||
@@ -90,10 +95,16 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
var dirs []string
|
||||
if dir != "" {
|
||||
if requested != "" && filepath.Base(dir) != requested {
|
||||
slog.Debug("skipping available library at users request", "requested", requested, "libDir", dir)
|
||||
slog.Debug("skipping available library at user's request", "requested", requested, "libDir", dir)
|
||||
continue
|
||||
} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
|
||||
continue
|
||||
} else if jetpack == "" && strings.Contains(filepath.Base(dir), "cuda_jetpack") {
|
||||
slog.Debug("jetpack not detected (set JETSON_JETPACK or OLLAMA_LLM_LIBRARY to override), skipping", "libDir", dir)
|
||||
continue
|
||||
} else if !envconfig.EnableVulkan() && strings.Contains(filepath.Base(dir), "vulkan") {
|
||||
slog.Info("experimental Vulkan support disabled. To enable, set OLLAMA_VULKAN=1")
|
||||
continue
|
||||
}
|
||||
dirs = []string{ml.LibOllamaPath, dir}
|
||||
} else {
|
||||
@@ -110,7 +121,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
// In the second pass, we more deeply initialize the GPUs to weed out devices that
|
||||
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
|
||||
// Only devices that need verification are included in this pass
|
||||
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
|
||||
slog.Debug("evaluating which, if any, devices to filter out", "initial_count", len(devices))
|
||||
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
var wg sync.WaitGroup
|
||||
@@ -118,15 +129,25 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
supportedMu := sync.Mutex{}
|
||||
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
|
||||
for i := range devices {
|
||||
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
|
||||
if !devices[i].NeedsInitValidation() {
|
||||
// No need to validate, add to the supported map
|
||||
supportedMu.Lock()
|
||||
if _, ok := supported[devices[i].Library]; !ok {
|
||||
supported[devices[i].Library] = make(map[string]map[string]int)
|
||||
}
|
||||
if _, ok := supported[devices[i].Library][libDir]; !ok {
|
||||
supported[devices[i].Library][libDir] = make(map[string]int)
|
||||
}
|
||||
supported[devices[i].Library][libDir][devices[i].ID] = i
|
||||
supportedMu.Unlock()
|
||||
continue
|
||||
}
|
||||
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
|
||||
slog.Debug("verifying device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
|
||||
slog.Debug("verifying if device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1])
|
||||
extraEnvs := ml.GetVisibleDevicesEnv(devices[i:i+1], true)
|
||||
devices[i].AddInitValidation(extraEnvs)
|
||||
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
|
||||
slog.Debug("filtering device which didn't fully initialize",
|
||||
@@ -312,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
defer cancel()
|
||||
|
||||
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
|
||||
devFilter := ml.GetVisibleDevicesEnv(devices)
|
||||
// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
|
||||
devFilter := ml.GetVisibleDevicesEnv(devices, false)
|
||||
|
||||
for dir := range libDirs {
|
||||
updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
|
||||
@@ -446,3 +468,37 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func overrideWarnings() {
|
||||
anyFound := false
|
||||
m := envconfig.AsMap()
|
||||
for _, k := range []string{
|
||||
"CUDA_VISIBLE_DEVICES",
|
||||
"HIP_VISIBLE_DEVICES",
|
||||
"ROCR_VISIBLE_DEVICES",
|
||||
"GGML_VK_VISIBLE_DEVICES",
|
||||
"GPU_DEVICE_ORDINAL",
|
||||
"HSA_OVERRIDE_GFX_VERSION",
|
||||
} {
|
||||
if e, found := m[k]; found && e.Value != "" {
|
||||
anyFound = true
|
||||
slog.Warn("user overrode visible devices", k, e.Value)
|
||||
}
|
||||
}
|
||||
if anyFound {
|
||||
slog.Warn("if GPUs are not correctly discovered, unset and try again")
|
||||
}
|
||||
}
|
||||
|
||||
func detectIncompatibleLibraries() {
|
||||
if runtime.GOOS != "windows" {
|
||||
return
|
||||
}
|
||||
basePath, err := exec.LookPath("ggml-base.dll")
|
||||
if err != nil || basePath == "" {
|
||||
return
|
||||
}
|
||||
if !strings.HasPrefix(basePath, ml.LibOllamaPath) {
|
||||
slog.Warn("potentially incompatible library detected in PATH", "location", basePath)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,9 +13,23 @@ Embeddings turn text into numeric vectors you can store in a vector database, se
|
||||
|
||||
## Generate embeddings
|
||||
|
||||
Use `/api/embed` with a single string.
|
||||
|
||||
<Tabs>
|
||||
<Tab title="CLI">
|
||||
Generate embeddings directly from the command line:
|
||||
|
||||
```shell
|
||||
ollama run embeddinggemma "Hello world"
|
||||
```
|
||||
|
||||
You can also pipe text to generate embeddings:
|
||||
|
||||
```shell
|
||||
echo "Hello world" | ollama run embeddinggemma
|
||||
```
|
||||
|
||||
Output is a JSON array.
|
||||
|
||||
</Tab>
|
||||
<Tab title="cURL">
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/embed \
|
||||
|
||||
@@ -15,7 +15,7 @@ Also known as "single-shot" tool calling.
|
||||
```shell
|
||||
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||
"model": "qwen3",
|
||||
"messages": [{"role": "user", "content": "What's the temperature in New York?"}],
|
||||
"messages": [{"role": "user", "content": "What is the temperature in New York?"}],
|
||||
"stream": false,
|
||||
"tools": [
|
||||
{
|
||||
@@ -41,7 +41,7 @@ Also known as "single-shot" tool calling.
|
||||
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||
"model": "qwen3",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the temperature in New York?"},
|
||||
{"role": "user", "content": "What is the temperature in New York?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"tool_calls": [
|
||||
@@ -90,7 +90,7 @@ Also known as "single-shot" tool calling.
|
||||
}
|
||||
return temperatures.get(city, "Unknown")
|
||||
|
||||
messages = [{"role": "user", "content": "What's the temperature in New York?"}]
|
||||
messages = [{"role": "user", "content": "What is the temperature in New York?"}]
|
||||
|
||||
# pass functions directly as tools in the tools list or as a JSON schema
|
||||
response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True)
|
||||
@@ -146,7 +146,7 @@ Also known as "single-shot" tool calling.
|
||||
},
|
||||
]
|
||||
|
||||
const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
|
||||
const messages = [{ role: 'user', content: "What is the temperature in New York?" }]
|
||||
|
||||
const response = await ollama.chat({
|
||||
model: 'qwen3',
|
||||
@@ -609,7 +609,7 @@ def get_temperature(city: str) -> str:
|
||||
return temperatures.get(city, 'Unknown')
|
||||
|
||||
|
||||
messages = [{'role': 'user', 'content': "What's the temperature in New York?"}]
|
||||
messages = [{'role': 'user', 'content': "What is the temperature in New York?"}]
|
||||
|
||||
while True:
|
||||
stream = chat(
|
||||
@@ -684,7 +684,7 @@ const getTemperatureTool = {
|
||||
}
|
||||
|
||||
async function agentLoop() {
|
||||
const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
|
||||
const messages = [{ role: 'user', content: "What is the temperature in New York?" }]
|
||||
|
||||
while (true) {
|
||||
const stream = await ollama.chat({
|
||||
|
||||
@@ -9,15 +9,9 @@ sidebarTitle: Cloud
|
||||
|
||||
Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer.
|
||||
|
||||
Ollama currently supports the following cloud models, with more coming soon:
|
||||
### Supported models
|
||||
|
||||
- `deepseek-v3.1:671b-cloud`
|
||||
- `gpt-oss:20b-cloud`
|
||||
- `gpt-oss:120b-cloud`
|
||||
- `kimi-k2:1t-cloud`
|
||||
- `qwen3-coder:480b-cloud`
|
||||
- `glm-4.6:cloud`
|
||||
- `minimax-m2:cloud`
|
||||
For a list of supported models, see Ollama's [model library](https://ollama.com/search?c=cloud).
|
||||
|
||||
### Running Cloud models
|
||||
|
||||
|
||||
@@ -49,6 +49,8 @@ Install prerequisites:
|
||||
- [Ninja](https://github.com/ninja-build/ninja/releases)
|
||||
- (Optional) NVIDIA GPU support
|
||||
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
|
||||
- (Optional) VULKAN GPU support
|
||||
- [VULKAN SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
|
||||
|
||||
Then, configure and build the project:
|
||||
|
||||
@@ -57,6 +59,17 @@ cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
> Building for Vulkan requires VULKAN_SDK environment variable:
|
||||
>
|
||||
> PowerShell
|
||||
> ```powershell
|
||||
> $env:VULKAN_SDK="C:\VulkanSDK\<version>"
|
||||
> ```
|
||||
> CMD
|
||||
> ```cmd
|
||||
> set VULKAN_SDK=C:\VulkanSDK\<version>
|
||||
> ```
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Building for ROCm requires additional flags:
|
||||
> ```
|
||||
@@ -65,6 +78,7 @@ cmake --build build --config Release
|
||||
> ```
|
||||
|
||||
|
||||
|
||||
Lastly, run Ollama:
|
||||
|
||||
```shell
|
||||
@@ -84,7 +98,9 @@ Install prerequisites:
|
||||
- [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
|
||||
- (Optional) NVIDIA GPU support
|
||||
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
|
||||
|
||||
- (Optional) VULKAN GPU support
|
||||
- [VULKAN SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
|
||||
- Or install via package manager: `sudo apt install vulkan-sdk` (Ubuntu/Debian) or `sudo dnf install vulkan-sdk` (Fedora/CentOS)
|
||||
> [!IMPORTANT]
|
||||
> Ensure prerequisites are in `PATH` before running CMake.
|
||||
|
||||
|
||||
@@ -68,6 +68,15 @@ To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following c
|
||||
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
|
||||
```
|
||||
|
||||
## Vulkan Support
|
||||
|
||||
Vulkan is bundled into the `ollama/ollama` image.
|
||||
|
||||
```shell
|
||||
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 -e OLLAMA_VULKAN=1 --name ollama ollama/ollama
|
||||
```
|
||||
|
||||
|
||||
## Run model locally
|
||||
|
||||
Now you can run a model:
|
||||
@@ -79,3 +88,4 @@ docker exec -it ollama ollama run llama3.2
|
||||
## Try different models
|
||||
|
||||
More models can be found on the [Ollama library](https://ollama.com/library).
|
||||
|
||||
|
||||
@@ -63,6 +63,10 @@
|
||||
{
|
||||
"source": "/api/openai",
|
||||
"destination": "/api/openai-compatibility"
|
||||
},
|
||||
{
|
||||
"source": "/api",
|
||||
"destination": "/api/introduction"
|
||||
}
|
||||
],
|
||||
"navigation": {
|
||||
@@ -130,7 +134,7 @@
|
||||
{
|
||||
"group": "API Reference",
|
||||
"pages": [
|
||||
"/api/index",
|
||||
"/api/introduction",
|
||||
"/api/authentication",
|
||||
"/api/streaming",
|
||||
"/api/usage",
|
||||
|
||||
13 docs/faq.mdx
@@ -57,8 +57,13 @@ ollama ps
|
||||
```
|
||||
|
||||
<Info>
|
||||
**Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB
|
||||
100% GPU 4 minutes from now ```
|
||||
|
||||
**Output**:
|
||||
|
||||
```
|
||||
NAME ID SIZE PROCESSOR UNTIL
|
||||
llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now
|
||||
```
|
||||
</Info>
|
||||
|
||||
The `Processor` column will show which memory the model was loaded in to:
|
||||
@@ -223,7 +228,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
|
||||
|
||||
## How can I use Ollama in Visual Studio Code?
|
||||
|
||||
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
|
||||
There is already a large collection of plugins available for VS Code as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
|
||||
|
||||
## How do I use Ollama with GPU acceleration in Docker?
|
||||
|
||||
@@ -385,4 +390,4 @@ Ollama for Windows and macOS register as a login item during installation. You
|
||||
- In `Task Manager` go to the `Startup apps` tab, search for `ollama` then click `Disable`
|
||||
|
||||
**MacOS**
|
||||
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
|
||||
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
|
||||
|
||||
44 docs/gpu.mdx
@@ -52,7 +52,11 @@ sudo modprobe nvidia_uvm`
|
||||
|
||||
## AMD Radeon
|
||||
|
||||
Ollama supports the following AMD GPUs:
|
||||
Ollama supports the following AMD GPUs via the ROCm library:
|
||||
|
||||
> [!NOTE]
|
||||
> Additional AMD GPU support is provided by the Vulkan Library - see below.
|
||||
|
||||
|
||||
### Linux Support
|
||||
|
||||
@@ -121,6 +125,42 @@ In some Linux distributions, SELinux can prevent containers from
|
||||
accessing the AMD GPU devices. On the host system you can run
|
||||
`sudo setsebool container_use_devices=1` to allow containers to use devices.
|
||||
|
||||
### Metal (Apple GPUs)
|
||||
## Metal (Apple GPUs)
|
||||
|
||||
Ollama supports GPU acceleration on Apple devices via the Metal API.
|
||||
|
||||
|
||||
## Vulkan GPU Support
|
||||
|
||||
> [!NOTE]
|
||||
> Vulkan is currently an experimental feature. To enable it, set OLLAMA_VULKAN=1 for the Ollama server as
|
||||
> described in the [FAQ](faq.md#how-do-i-configure-ollama-server).
|
||||
|
||||
Additional GPU support on Windows and Linux is provided via
|
||||
[Vulkan](https://www.vulkan.org/). On Windows, most GPU vendors' drivers come
|
||||
bundled with Vulkan support and require no additional setup steps. Most Linux
|
||||
distributions require installing additional components, and you may have
|
||||
multiple options for Vulkan drivers, between Mesa and GPU vendor-specific packages.
|
||||
|
||||
- Linux Intel GPU Instructions - https://dgpu-docs.intel.com/driver/client/overview.html
|
||||
- Linux AMD GPU Instructions - https://amdgpu-install.readthedocs.io/en/latest/install-script.html#specifying-a-vulkan-implementation
|
||||
|
||||
For AMD GPUs on some Linux distributions, you may need to add the `ollama` user to the `render` group.
|
||||
|
||||
The Ollama scheduler leverages available VRAM data reported by the GPU libraries to
|
||||
make optimal scheduling decisions. Vulkan requires additional capabilities or
|
||||
running as root to expose this available VRAM data. If neither root access nor this
|
||||
capability is granted, Ollama will use approximate sizes of the models
|
||||
to make best-effort scheduling decisions.
|
||||
|
||||
```bash
|
||||
sudo setcap cap_perfmon+ep /usr/local/bin/ollama
|
||||
```
|
||||
|
||||
### GPU Selection
|
||||
|
||||
To select specific Vulkan GPU(s), you can set the environment variable
|
||||
`GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
|
||||
described in the [FAQ](faq.md#how-do-i-configure-ollama-server). If you
|
||||
encounter any problems with Vulkan based GPUs, you can disable all Vulkan GPUs
|
||||
by setting `GGML_VK_VISIBLE_DEVICES=-1`
|
||||
@@ -1,34 +1,34 @@
|
||||
---
|
||||
title: VS Code
|
||||
title: VS Code
|
||||
---
|
||||
|
||||
## Install
|
||||
|
||||
Install [VSCode](https://code.visualstudio.com/download).
|
||||
Install [VS Code](https://code.visualstudio.com/download).
|
||||
|
||||
## Usage with Ollama
|
||||
## Usage with Ollama
|
||||
|
||||
1. Open the Copilot sidebar, found at the top right of the window
|
||||
<div style={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<img
|
||||
src="/images/vscode-sidebar.png"
|
||||
alt="VSCode chat Sidebar"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
2. Select the model drowpdown > **Manage models**
|
||||
<div style={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<img
|
||||
src="/images/vscode-models.png"
|
||||
alt="VSCode model picker"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||
<img
|
||||
src="/images/vscode-sidebar.png"
|
||||
alt="VS Code chat Sidebar"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
2. Select the model dropdown > **Manage models**
|
||||
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||
<img
|
||||
src="/images/vscode-models.png"
|
||||
alt="VS Code model picker"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
3. Enter **Ollama** under the **Provider** dropdown and select the desired models (e.g. `qwen3, qwen3-coder:480b-cloud`)
|
||||
<div style={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<img
|
||||
src="/images/vscode-model-options.png"
|
||||
alt="VSCode model options dropdown"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||
<img
|
||||
src="/images/vscode-model-options.png"
|
||||
alt="VS Code model options dropdown"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
|
||||
@@ -149,9 +149,6 @@ PARAMETER <parameter> <parametervalue>
|
||||
|
||||
| Parameter | Description | Value Type | Example Usage |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
|
||||
| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 |
|
||||
| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
|
||||
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
|
||||
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
|
||||
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
|
||||
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
|
||||
|
||||
@@ -2,12 +2,15 @@ openapi: 3.1.0
|
||||
info:
|
||||
title: Ollama API
|
||||
version: 0.1.0
|
||||
license:
|
||||
name: MIT
|
||||
url: https://opensource.org/licenses/MIT
|
||||
description: |
|
||||
OpenAPI specification for the Ollama HTTP API
|
||||
|
||||
servers:
|
||||
- url: http://localhost:11434
|
||||
description: Local Ollama instance
|
||||
description: Ollama
|
||||
security: []
|
||||
components:
|
||||
securitySchemes:
|
||||
bearerAuth:
|
||||
@@ -93,8 +96,11 @@ components:
|
||||
type: boolean
|
||||
default: true
|
||||
think:
|
||||
type: boolean
|
||||
description: When true, returns separate thinking output in addition to content
|
||||
oneOf:
|
||||
- type: boolean
|
||||
- type: string
|
||||
enum: [high, medium, low]
|
||||
description: When true, returns separate thinking output in addition to content. Can be a boolean (true/false) or a string ("high", "medium", "low") for supported models.
|
||||
raw:
|
||||
type: boolean
|
||||
description: When true, returns the raw response from the model without any prompt templating
|
||||
@@ -105,6 +111,12 @@ components:
|
||||
description: Model keep-alive duration (for example `5m` or `0` to unload immediately)
|
||||
options:
|
||||
$ref: "#/components/schemas/ModelOptions"
|
||||
logprobs:
|
||||
type: boolean
|
||||
description: Whether to return log probabilities of the output tokens
|
||||
top_logprobs:
|
||||
type: integer
|
||||
description: Number of most likely tokens to return at each token position when logprobs are enabled
|
||||
GenerateResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -144,6 +156,11 @@ components:
|
||||
eval_duration:
|
||||
type: integer
|
||||
description: Time spent generating tokens in nanoseconds
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Logprob"
|
||||
description: Log probability information for the generated tokens when logprobs are enabled
|
||||
GenerateStreamEvent:
|
||||
type: object
|
||||
properties:
|
||||
@@ -271,13 +288,22 @@ components:
|
||||
type: boolean
|
||||
default: true
|
||||
think:
|
||||
type: boolean
|
||||
description: When true, returns separate thinking output in addition to content
|
||||
oneOf:
|
||||
- type: boolean
|
||||
- type: string
|
||||
enum: [high, medium, low]
|
||||
description: When true, returns separate thinking output in addition to content. Can be a boolean (true/false) or a string ("high", "medium", "low") for supported models.
|
||||
keep_alive:
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: number
|
||||
description: Model keep-alive duration (for example `5m` or `0` to unload immediately)
|
||||
logprobs:
|
||||
type: boolean
|
||||
description: Whether to return log probabilities of the output tokens
|
||||
top_logprobs:
|
||||
type: integer
|
||||
description: Number of most likely tokens to return at each token position when logprobs are enabled
|
||||
ChatResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -310,7 +336,6 @@ components:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
nullable: true
|
||||
description: Optional base64-encoded images in the response
|
||||
done:
|
||||
type: boolean
|
||||
@@ -336,6 +361,11 @@ components:
|
||||
eval_duration:
|
||||
type: integer
|
||||
description: Time spent generating tokens in nanoseconds
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Logprob"
|
||||
description: Log probability information for the generated tokens when logprobs are enabled
|
||||
ChatStreamEvent:
|
||||
type: object
|
||||
properties:
|
||||
@@ -367,7 +397,6 @@ components:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
nullable: true
|
||||
description: Partial base64-encoded images, when present
|
||||
done:
|
||||
type: boolean
|
||||
@@ -543,6 +572,9 @@ components:
|
||||
license:
|
||||
type: string
|
||||
description: The license of the model
|
||||
modified_at:
|
||||
type: string
|
||||
description: Last modified timestamp in ISO 8601 format
|
||||
details:
|
||||
type: object
|
||||
description: High-level model details
|
||||
@@ -622,6 +654,9 @@ components:
|
||||
size_vram:
|
||||
type: integer
|
||||
description: VRAM usage in bytes
|
||||
context_length:
|
||||
type: integer
|
||||
description: Context length for the running model
|
||||
PsResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -693,6 +728,41 @@ components:
|
||||
version:
|
||||
type: string
|
||||
description: Version of Ollama
|
||||
TokenLogprob:
|
||||
type: object
|
||||
description: Log probability information for a single token alternative
|
||||
properties:
|
||||
token:
|
||||
type: string
|
||||
description: The text representation of the token
|
||||
logprob:
|
||||
type: number
|
||||
description: The log probability of this token
|
||||
bytes:
|
||||
type: array
|
||||
items:
|
||||
type: integer
|
||||
description: The raw byte representation of the token
|
||||
Logprob:
|
||||
type: object
|
||||
description: Log probability information for a generated token
|
||||
properties:
|
||||
token:
|
||||
type: string
|
||||
description: The text representation of the token
|
||||
logprob:
|
||||
type: number
|
||||
description: The log probability of this token
|
||||
bytes:
|
||||
type: array
|
||||
items:
|
||||
type: integer
|
||||
description: The raw byte representation of the token
|
||||
top_logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TokenLogprob"
|
||||
description: Most likely tokens and their log probabilities at this position
|
||||
ErrorResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -1275,6 +1345,9 @@ paths:
|
||||
example:
|
||||
source: gemma3
|
||||
destination: gemma3-backup
|
||||
responses:
|
||||
"200":
|
||||
description: Model successfully copied
|
||||
/api/pull:
|
||||
post:
|
||||
summary: Pull a model
|
||||
@@ -1382,16 +1455,7 @@ paths:
|
||||
model: gemma3
|
||||
responses:
|
||||
"200":
|
||||
description: Deletion status updates.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/StatusResponse"
|
||||
example:
|
||||
status: "success"
|
||||
application/x-ndjson:
|
||||
schema:
|
||||
$ref: "#/components/schemas/StatusEvent"
|
||||
description: Model successfully deleted
|
||||
/api/version:
|
||||
get:
|
||||
summary: Get version
|
||||
|
||||
@@ -196,8 +196,6 @@ var (
|
||||
NoPrune = Bool("OLLAMA_NOPRUNE")
|
||||
// SchedSpread allows scheduling models across all GPUs.
|
||||
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
|
||||
// IntelGPU enables experimental Intel GPU detection.
|
||||
IntelGPU = Bool("OLLAMA_INTEL_GPU")
|
||||
// MultiUserCache optimizes prompt caching for multi-user scenarios
|
||||
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
|
||||
// Enable the new Ollama engine
|
||||
@@ -206,6 +204,8 @@ var (
|
||||
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
|
||||
// Auth enables authentication between the Ollama client and server
|
||||
UseAuth = Bool("OLLAMA_AUTH")
|
||||
// Enable Vulkan backend
|
||||
EnableVulkan = Bool("OLLAMA_VULKAN")
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
@@ -314,7 +314,7 @@ func AsMap() map[string]EnvVar {
|
||||
ret["GGML_VK_VISIBLE_DEVICES"] = EnvVar{"GGML_VK_VISIBLE_DEVICES", VkVisibleDevices(), "Set which Vulkan devices are visible by numeric ID"}
|
||||
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
|
||||
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
||||
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
||||
ret["OLLAMA_VULKAN"] = EnvVar{"OLLAMA_VULKAN", EnableVulkan(), "Enable experimental Vulkan support"}
|
||||
}
|
||||
|
||||
return ret
|
||||
|
||||
@@ -249,6 +249,9 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||
"qwen25vl",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
"deepseekocr",
|
||||
"deepseek2",
|
||||
"nomic-bert",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
|
||||
@@ -797,73 +800,6 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
||||
return
|
||||
}
|
||||
|
||||
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
||||
if llm.KV().Uint("vision.block_count") == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for name, layer := range llm.Tensors().GroupLayers() {
|
||||
if name == "v" || strings.HasPrefix(name, "v.") {
|
||||
for _, tensor := range layer {
|
||||
weights += tensor.Size()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
imageSize := uint64(llm.KV().Uint("vision.image_size"))
|
||||
patchSize := uint64(llm.KV().Uint("vision.patch_size"))
|
||||
if patchSize == 0 {
|
||||
slog.Warn("unknown patch size for vision model")
|
||||
return
|
||||
}
|
||||
|
||||
numChannels := uint64(llm.KV().Uint("vision.num_channels"))
|
||||
|
||||
numPatches := (imageSize / patchSize) * (imageSize / patchSize)
|
||||
if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
|
||||
numPatches++
|
||||
}
|
||||
|
||||
headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
|
||||
embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
|
||||
|
||||
switch llm.KV().Architecture() {
|
||||
case "mllama":
|
||||
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
|
||||
|
||||
maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))
|
||||
|
||||
graphSize = 4 * (8 +
|
||||
imageSize*imageSize*numChannels*maxNumTiles +
|
||||
embeddingLength*numPatches*maxNumTiles +
|
||||
9*embeddingLength*numPaddedPatches*maxNumTiles +
|
||||
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
|
||||
case "gemma3", "mistral3":
|
||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||
embeddingLength*patchSize +
|
||||
numPatches*numPatches*headCount)
|
||||
case "qwen25vl":
|
||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
||||
|
||||
numPatches := maxPixels / (patchSize * patchSize)
|
||||
|
||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
||||
// Normalized pixels
|
||||
maxPixels*numChannels +
|
||||
// Patches storage (numPatches * channels * patchSize^2)
|
||||
numPatches*numChannels*patchSize*patchSize +
|
||||
// Self-attention calculations
|
||||
numPatches*numPatches*headCount +
|
||||
// Additional buffer for processing
|
||||
embeddingLength*numPatches)
|
||||
case "llama4":
|
||||
// vision graph is computed independently in the same schedule
|
||||
// and is negligible compared to the worst case text graph
|
||||
}
|
||||
|
||||
return weights, graphSize
|
||||
}
|
||||
|
||||
// SupportsKVCacheType checks if the requested cache type is supported
|
||||
func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
||||
if cacheType == "" || cacheType == "f16" {
|
||||
@@ -895,6 +831,7 @@ func (f GGML) FlashAttention() bool {
|
||||
return slices.Contains([]string{
|
||||
"gemma3",
|
||||
"gptoss", "gpt-oss",
|
||||
"mistral3",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
}, f.KV().String("general.architecture"))
|
||||
|
||||
@@ -305,7 +305,7 @@ func readGGUFV1StringsData(llm *gguf, r io.Reader, a *array[string]) (any, error
|
||||
|
||||
a.values[i] = e
|
||||
} else {
|
||||
discardGGUFString(llm, r)
|
||||
_ = discardGGUFString(llm, r)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -568,7 +568,6 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
||||
// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
|
||||
for _, t := range ts {
|
||||
t := t
|
||||
w := io.NewOffsetWriter(f, offset+int64(t.Offset))
|
||||
g.Go(func() error {
|
||||
_, err := t.WriteTo(w)
|
||||
@@ -598,6 +597,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
||||
|
||||
var err error
|
||||
switch v := v.(type) {
|
||||
case int32:
|
||||
err = writeGGUF(ws, ggufTypeInt32, v)
|
||||
case int64:
|
||||
err = writeGGUF(ws, ggufTypeInt64, v)
|
||||
case uint32, FileType:
|
||||
err = writeGGUF(ws, ggufTypeUint32, v)
|
||||
case uint64:
|
||||
@@ -612,6 +615,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v)
|
||||
case *array[int32]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v.values)
|
||||
case []int64:
|
||||
err = writeGGUFArray(ws, ggufTypeInt64, v)
|
||||
case *array[int64]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt64, v.values)
|
||||
case []uint32:
|
||||
err = writeGGUFArray(ws, ggufTypeUint32, v)
|
||||
case *array[uint32]:
|
||||
|
||||
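The new `int32`/`int64` cases extend the same array-writing path used by the other integer types. As a rough, hedged illustration of the general shape such a writer takes (an element-type tag, a 64-bit count, then the raw little-endian values), here is a standalone sketch; the type id is a placeholder, not the package's actual `ggufTypeInt32` constant, and the exact on-disk layout should be taken from the package itself:

```go
// Sketch of little-endian array serialization in the GGUF style: a type tag,
// a uint64 count, then the raw values. The type id here is a placeholder.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

func writeInt32Array(w *bytes.Buffer, typeID uint32, values []int32) error {
	if err := binary.Write(w, binary.LittleEndian, typeID); err != nil {
		return err
	}
	if err := binary.Write(w, binary.LittleEndian, uint64(len(values))); err != nil {
		return err
	}
	return binary.Write(w, binary.LittleEndian, values)
}

func main() {
	var buf bytes.Buffer
	if err := writeInt32Array(&buf, 5 /* placeholder type id */, []int32{-1, 0, 1}); err != nil {
		panic(err)
	}
	fmt.Printf("% x\n", buf.Bytes()) // 4 + 8 + 3*4 = 24 bytes
}
```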
@@ -42,6 +42,10 @@ func TestWriteGGUF(t *testing.T) {
|
||||
"general.architecture": "test",
|
||||
"general.alignment": uint32(16),
|
||||
"test.key": "value",
|
||||
"test.int32_key": int32(-42),
|
||||
"test.int64_key": int64(-9223372036854775808),
|
||||
"test.int32_array": []int32{-1, 0, 1, 2147483647, -2147483648},
|
||||
"test.int64_array": []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808},
|
||||
"attention.key": "value2",
|
||||
"tokenizer.key": "value3",
|
||||
"adapter.key": "value4",
|
||||
@@ -55,7 +59,7 @@ func TestWriteGGUF(t *testing.T) {
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
ff, err := Decode(r, 0)
|
||||
ff, err := Decode(r, -1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -65,15 +69,19 @@ func TestWriteGGUF(t *testing.T) {
|
||||
"general.alignment": uint32(16),
|
||||
"general.parameter_count": uint64(54),
|
||||
"test.key": "value",
|
||||
"test.int32_key": int32(-42),
|
||||
"test.int64_key": int64(-9223372036854775808),
|
||||
"test.int32_array": &array[int32]{size: 5, values: []int32{-1, 0, 1, 2147483647, -2147483648}},
|
||||
"test.int64_array": &array[int64]{size: 5, values: []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808}},
|
||||
"test.attention.key": "value2",
|
||||
"tokenizer.key": "value3",
|
||||
"adapter.key": "value4",
|
||||
}, ff.KV()); diff != "" {
|
||||
}, ff.KV(), cmp.AllowUnexported(array[int32]{}, array[int64]{})); diff != "" {
|
||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(Tensors{
|
||||
Offset: 800,
|
||||
Offset: 992,
|
||||
items: []*Tensor{
|
||||
{Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
|
||||
{Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
|
||||
|
||||
go.mod (1 changed line)
@@ -17,7 +17,6 @@ require (
|
||||
github.com/x448/float16 v0.8.4
|
||||
golang.org/x/sync v0.12.0
|
||||
golang.org/x/sys v0.36.0
|
||||
|
||||
)
|
||||
|
||||
require (
|
||||
|
||||
@@ -388,9 +388,9 @@ func NewFunctionNameMap() *FunctionNameMap {
|
||||
}
|
||||
}
|
||||
|
||||
// Init initializes the handler with tools and optional last message
|
||||
// Init initializes the handler with tools, optional last message, and think value
|
||||
// Implements the Parser interface
|
||||
func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
|
||||
func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
|
||||
// Initialize the harmony parser
|
||||
if h.HarmonyParser == nil {
|
||||
h.HarmonyParser = &HarmonyParser{
|
||||
|
||||
@@ -14,6 +14,23 @@ import (
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func assertBytesMatchToken(t *testing.T, label, token string, ints []int) {
|
||||
t.Helper()
|
||||
|
||||
raw := []byte(token)
|
||||
if len(ints) != len(raw) {
|
||||
t.Errorf("%s expected %d bytes for token %q, got %d (%v)", label, len(raw), token, len(ints), ints)
|
||||
return
|
||||
}
|
||||
|
||||
for i, b := range raw {
|
||||
if ints[i] != int(b) {
|
||||
t.Errorf("%s byte[%d] mismatch for token %q: got %d want %d", label, i, token, ints[i], int(b))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAPIGenerate(t *testing.T) {
|
||||
initialTimeout := 60 * time.Second
|
||||
streamTimeout := 30 * time.Second
|
||||
@@ -381,3 +398,182 @@ func TestAPIShowModel(t *testing.T) {
|
||||
t.Errorf("%s missing modified_at: %#v", modelName, resp)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAPIGenerateLogprobs(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
if err := PullIfMissing(ctx, client, smol); err != nil {
|
||||
t.Fatalf("pull failed %s", err)
|
||||
}
|
||||
|
||||
enableLogprobs := true
|
||||
noStream := false
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logprobs *bool
|
||||
topLogprobs int
|
||||
expectCount int
|
||||
}{
|
||||
{
|
||||
name: "no_logprobs",
|
||||
logprobs: nil,
|
||||
topLogprobs: 0,
|
||||
expectCount: 0,
|
||||
},
|
||||
{
|
||||
name: "logprobs_only",
|
||||
logprobs: &enableLogprobs,
|
||||
topLogprobs: 0,
|
||||
expectCount: 1,
|
||||
},
|
||||
{
|
||||
name: "logprobs_with_top_5",
|
||||
logprobs: &enableLogprobs,
|
||||
topLogprobs: 5,
|
||||
expectCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
req := api.GenerateRequest{
|
||||
Model: smol,
|
||||
Prompt: "Why is the sky blue?",
|
||||
Stream: &noStream,
|
||||
Logprobs: test.logprobs != nil && *test.logprobs,
|
||||
TopLogprobs: test.topLogprobs,
|
||||
Options: map[string]interface{}{
|
||||
"temperature": 0,
|
||||
"seed": 123,
|
||||
"num_predict": 10,
|
||||
},
|
||||
}
|
||||
|
||||
var response api.GenerateResponse
|
||||
err := client.Generate(ctx, &req, func(resp api.GenerateResponse) error {
|
||||
if resp.Done {
|
||||
response = resp
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("generate failed: %s", err)
|
||||
}
|
||||
|
||||
// Check logprobs based on expectation
|
||||
if test.expectCount == 0 {
|
||||
if len(response.Logprobs) > 0 {
|
||||
t.Errorf("expected no logprobs but got %d", len(response.Logprobs))
|
||||
}
|
||||
} else {
|
||||
if len(response.Logprobs) == 0 {
|
||||
t.Errorf("expected logprobs but got none")
|
||||
}
|
||||
|
||||
// Validate each logprob entry
|
||||
for i, lp := range response.Logprobs {
|
||||
if lp.Token == "" {
|
||||
t.Errorf("logprob[%d] has empty token", i)
|
||||
}
|
||||
if lp.Logprob > 0 {
|
||||
t.Errorf("logprob[%d] has positive logprob %f (should be <= 0)", i, lp.Logprob)
|
||||
}
|
||||
assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d]", i), lp.Token, lp.Bytes)
|
||||
|
||||
// Check top_logprobs if requested
|
||||
if test.topLogprobs > 0 {
|
||||
if len(lp.TopLogprobs) == 0 {
|
||||
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
|
||||
}
|
||||
if len(lp.TopLogprobs) > test.topLogprobs {
|
||||
t.Errorf("logprob[%d] has %d top_logprobs, expected max %d", i, len(lp.TopLogprobs), test.topLogprobs)
|
||||
}
|
||||
|
||||
// Verify top_logprobs are sorted by probability (descending)
|
||||
for j := 1; j < len(lp.TopLogprobs); j++ {
|
||||
if lp.TopLogprobs[j-1].Logprob < lp.TopLogprobs[j].Logprob {
|
||||
t.Errorf("logprob[%d].top_logprobs not sorted: %f < %f", i, lp.TopLogprobs[j-1].Logprob, lp.TopLogprobs[j].Logprob)
|
||||
}
|
||||
}
|
||||
for j, top := range lp.TopLogprobs {
|
||||
assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
|
||||
}
|
||||
} else if len(lp.TopLogprobs) > 0 {
|
||||
t.Errorf("logprob[%d] has top_logprobs but none were requested", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
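For callers, the fields exercised by this test are set directly on the request. A minimal usage sketch against a running server, assuming the `Logprobs`/`TopLogprobs` request fields and the `Logprobs` response field added in this change; the model tag is a placeholder and must already be pulled:

```go
// Minimal sketch: request per-token log probabilities from a local server.
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	stream := false
	req := &api.GenerateRequest{
		Model:       "llama3.2", // placeholder model tag
		Prompt:      "Why is the sky blue?",
		Stream:      &stream,
		Logprobs:    true, // return per-token log probabilities
		TopLogprobs: 3,    // also return the 3 most likely alternatives per token
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		for _, lp := range resp.Logprobs {
			fmt.Printf("%q %.3f (alternatives: %d)\n", lp.Token, lp.Logprob, len(lp.TopLogprobs))
		}
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```

The same two fields appear on `api.ChatRequest`, as the chat test below exercises.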
|
||||
func TestAPIChatLogprobs(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
if err := PullIfMissing(ctx, client, smol); err != nil {
|
||||
t.Fatalf("pull failed %s", err)
|
||||
}
|
||||
|
||||
enableLogprobs := true
|
||||
noStream := false
|
||||
|
||||
req := api.ChatRequest{
|
||||
Model: smol,
|
||||
Messages: []api.Message{
|
||||
{Role: "user", Content: "Say hello in one word"},
|
||||
},
|
||||
Stream: &noStream,
|
||||
Logprobs: enableLogprobs,
|
||||
TopLogprobs: 3,
|
||||
Options: map[string]interface{}{
|
||||
"temperature": 0,
|
||||
"seed": 123,
|
||||
"num_predict": 5,
|
||||
},
|
||||
}
|
||||
|
||||
var response api.ChatResponse
|
||||
err := client.Chat(ctx, &req, func(resp api.ChatResponse) error {
|
||||
if resp.Done {
|
||||
response = resp
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("chat failed: %s", err)
|
||||
}
|
||||
|
||||
if len(response.Logprobs) == 0 {
|
||||
t.Fatal("expected logprobs in response but got none")
|
||||
}
|
||||
|
||||
t.Logf("received %d logprobs for chat response", len(response.Logprobs))
|
||||
|
||||
for i, lp := range response.Logprobs {
|
||||
if lp.Token == "" {
|
||||
t.Errorf("logprob[%d] has empty token", i)
|
||||
}
|
||||
if lp.Logprob > 0 {
|
||||
t.Errorf("logprob[%d] has positive logprob %f", i, lp.Logprob)
|
||||
}
|
||||
assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d]", i), lp.Token, lp.Bytes)
|
||||
if len(lp.TopLogprobs) == 0 {
|
||||
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
|
||||
}
|
||||
if len(lp.TopLogprobs) > 3 {
|
||||
t.Errorf("logprob[%d] has %d top_logprobs, expected max 3", i, len(lp.TopLogprobs))
|
||||
}
|
||||
for j, top := range lp.TopLogprobs {
|
||||
assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,9 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"math"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -204,8 +206,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
||||
t.Fatalf("expected %v, got %v (similarity: %f)", expected[0:5], res.Embeddings[0][0:5], sim)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 6 {
|
||||
t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 8 {
|
||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,8 +253,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
t.Fatalf("expected %v, got %v (similarity: %f)", expected[1][0:5], res.Embeddings[1][0:5], sim)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 12 {
|
||||
t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 16 {
|
||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -275,7 +277,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
request api.EmbedRequest
|
||||
check func(*api.EmbedResponse, error)
|
||||
check func(*testing.T, *api.EmbedResponse, error)
|
||||
}{
|
||||
{
|
||||
name: "target truncation",
|
||||
@@ -283,7 +285,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Model: "all-minilm",
|
||||
Input: "why",
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -300,10 +302,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Input: "why is the sky blue?",
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
|
||||
if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
|
||||
t.Errorf("embedding mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
@@ -317,10 +320,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
|
||||
if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
|
||||
t.Errorf("embedding mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
@@ -334,21 +338,21 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input exceeds maximum context length" {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "the input length exceeds the context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "input after truncate error",
|
||||
name: "input after truncate error with context length of 1",
|
||||
request: api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: "why is the sky blue?",
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 1},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input after truncation exceeds maximum context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
@@ -362,7 +366,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 0},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input after truncation exceeds maximum context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
@@ -375,7 +379,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Input: "why is the sky blue? Why is the sky blue? hi there my",
|
||||
Options: map[string]any{"num_ctx": 16},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -385,7 +389,8 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
|
||||
for _, req := range cases {
|
||||
t.Run(req.name, func(t *testing.T) {
|
||||
req.check(embedTestHelper(ctx, client, t, req.request))
|
||||
resp, err := embedTestHelper(ctx, client, t, req.request)
|
||||
req.check(t, resp, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
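Passing the subtest's `*testing.T` into each case's `check` function, instead of capturing the outer `t`, keeps failures attributed to the right subtest. A generic, self-contained illustration of that table-driven pattern (not the package's actual helpers):

```go
// Sketch of the table-driven pattern: each case carries a check func that
// receives the subtest's *testing.T along with the result and error.
package example

import "testing"

func double(n int) (int, error) { return 2 * n, nil }

func TestDoubleTableDriven(t *testing.T) {
	cases := []struct {
		name  string
		input int
		check func(t *testing.T, got int, err error)
	}{
		{
			name:  "small value",
			input: 2,
			check: func(t *testing.T, got int, err error) {
				if err != nil {
					t.Fatal(err)
				}
				if got != 4 {
					t.Errorf("expected 4, got %d", got)
				}
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := double(tc.input)
			tc.check(t, got, err) // failures are reported against this subtest
		})
	}
}
```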
@@ -409,3 +414,173 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req
|
||||
|
||||
return client.Embed(ctx, &req)
|
||||
}
|
||||
|
||||
func TestEmbedTruncation(t *testing.T) {
|
||||
// Use test deadline if set, otherwise default to 2 minutes
|
||||
timeout := 2 * time.Minute
|
||||
if deadline, ok := t.Deadline(); ok {
|
||||
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
// Check if we're running out of time (reserve 20s for current model)
|
||||
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||
t.Skip("skipping remaining tests to avoid timeout")
|
||||
}
|
||||
|
||||
// Give each model its own budget to account for first-time pulls/loads
|
||||
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
t.Run("truncation batch", func(t *testing.T) {
|
||||
truncTrue := true
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{"short", strings.Repeat("long ", 100), "medium text"},
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 30},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(mctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 3 {
|
||||
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if res.PromptEvalCount > 90 {
|
||||
t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("runner token count accuracy", func(t *testing.T) {
|
||||
baseline := api.EmbedRequest{Model: model, Input: "test"}
|
||||
baseRes, err := embedTestHelper(mctx, client, t, baseline)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
batch := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{"test", "test", "test"},
|
||||
}
|
||||
batchRes, err := embedTestHelper(mctx, client, t, batch)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expectedCount := baseRes.PromptEvalCount * 3
|
||||
if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
|
||||
t.Fatalf("expected ~%d tokens (3 × %d), got %d",
|
||||
expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
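A hedged sketch of the kind of request the batch-truncation case above sends, runnable against a local server once the model is pulled (the model tag is a placeholder):

```go
// Minimal sketch: batch embedding with truncation enabled and a small context.
package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	truncate := true
	req := &api.EmbedRequest{
		Model:    "all-minilm", // placeholder model tag
		Input:    []string{"short", strings.Repeat("long ", 100), "medium text"},
		Truncate: &truncate,
		Options:  map[string]any{"num_ctx": 30},
	}

	res, err := client.Embed(context.Background(), req)
	if err != nil {
		log.Fatal(err)
	}
	// Each over-long input is truncated to num_ctx tokens before embedding.
	fmt.Println(len(res.Embeddings), "embeddings,", res.PromptEvalCount, "prompt tokens")
}
```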
|
||||
// TestEmbedStatusCode tests that errors from the embedding endpoint
|
||||
// properly preserve their HTTP status codes when returned to the client.
|
||||
// This test specifically checks the error handling path in EmbedHandler
|
||||
// where api.StatusError errors should maintain their original status code.
|
||||
func TestEmbedStatusCode(t *testing.T) {
|
||||
// Use test deadline if set, otherwise default to 2 minutes
|
||||
timeout := 2 * time.Minute
|
||||
if deadline, ok := t.Deadline(); ok {
|
||||
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
// Check if we're running out of time (reserve 20s for current model)
|
||||
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||
t.Skip("skipping remaining tests to avoid timeout")
|
||||
}
|
||||
|
||||
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
// Pull the model if needed
|
||||
if err := PullIfMissing(mctx, client, model); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
t.Run("truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
longInput := strings.Repeat("word ", 100)
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: longInput,
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(mctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when truncate=false with long input")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error (likely 400 Bad Request)
|
||||
// not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
|
||||
// Verify the error message is meaningful
|
||||
if !strings.Contains(err.Error(), "context length") {
|
||||
t.Errorf("expected error message to mention context length, got: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("batch truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{
|
||||
"short input",
|
||||
strings.Repeat("very long input ", 100),
|
||||
"another short input",
|
||||
},
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(mctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when one input exceeds context with truncate=false")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error, not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
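Because the status code is preserved, client code can branch on it the same way the test does. A minimal sketch (the error is assumed to come from any api client call; the constructed value in `main` is only there to make the snippet runnable):

```go
// Sketch: distinguishing 4xx client errors, such as the truncation failure
// above, from 5xx server failures via the code that api.StatusError carries.
package main

import (
	"errors"
	"fmt"

	"github.com/ollama/ollama/api"
)

func classify(err error) string {
	var statusErr api.StatusError
	if errors.As(err, &statusErr) {
		switch {
		case statusErr.StatusCode >= 400 && statusErr.StatusCode < 500:
			return "client error: fix the request (e.g. enable truncation or shorten the input)"
		case statusErr.StatusCode >= 500:
			return "server error: safe to retry or report"
		}
	}
	return "non-HTTP error"
}

func main() {
	err := api.StatusError{StatusCode: 400, ErrorMessage: "the input length exceeds the context length"}
	fmt.Println(classify(err))
}
```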
|
||||
@@ -33,6 +33,9 @@ func TestVisionModels(t *testing.T) {
|
||||
// Qwen 3 VL mixture of experts
|
||||
model: "qwen3-vl:30b",
|
||||
},
|
||||
{
|
||||
model: "ministral-3",
|
||||
},
|
||||
}
|
||||
|
||||
for _, v := range testCases {
|
||||
|
||||
@@ -30,6 +30,7 @@ func TestAPIToolCalling(t *testing.T) {
|
||||
"mistral": 6,
|
||||
"qwen2.5": 6,
|
||||
"qwen2": 6,
|
||||
"ministral-3": 20,
|
||||
"mistral-nemo": 9,
|
||||
"mistral-small": 16,
|
||||
"mixtral:8x22b": 80,
|
||||
|
||||
@@ -38,6 +38,7 @@ var (
|
||||
|
||||
// Note: add newer models at the top of the list to test them first
|
||||
ollamaEngineChatModels = []string{
|
||||
"ministral-3",
|
||||
"qwen3-coder:30b",
|
||||
"gpt-oss:20b",
|
||||
"gemma3n:e2b",
|
||||
@@ -167,6 +168,7 @@ var (
|
||||
"medllama2",
|
||||
"megadolphin",
|
||||
"minicpm-v",
|
||||
"ministral-3",
|
||||
"mistral-large",
|
||||
"mistral-nemo",
|
||||
"mistral-openorca",
|
||||
@@ -270,6 +272,7 @@ var (
|
||||
"mistral",
|
||||
"qwen2.5",
|
||||
"qwen2",
|
||||
"ministral-3",
|
||||
"mistral-nemo",
|
||||
"mistral-small",
|
||||
"mixtral:8x22b",
|
||||
|
||||
@@ -3,7 +3,6 @@ package kvcache
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
@@ -40,18 +39,18 @@ type Causal struct {
|
||||
|
||||
// ** current forward pass **
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
// starting location for data storage for this batch
|
||||
curLoc int
|
||||
|
||||
// size of the current batch
|
||||
curBatchSize int
|
||||
|
||||
// locations for data storage for this batch
|
||||
curLoc ml.Tensor
|
||||
|
||||
// mask of the cache as used by this batch
|
||||
curMask ml.Tensor
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
// locations in the cache that are needed for this batch
|
||||
curCellRange cellRange
|
||||
|
||||
@@ -206,45 +205,47 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
|
||||
c.curPositions = batch.Positions
|
||||
c.opts.Except = nil
|
||||
|
||||
var locs []int32
|
||||
if !reserve {
|
||||
c.updateSlidingWindow()
|
||||
|
||||
var err error
|
||||
c.curLoc, err = c.findStartLoc()
|
||||
if errors.Is(err, ErrKvCacheFull) {
|
||||
c.defrag()
|
||||
c.curLoc, err = c.findStartLoc()
|
||||
}
|
||||
locs, err = c.findLocs()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for i, pos := range batch.Positions {
|
||||
seq := batch.Sequences[i]
|
||||
loc := int(locs[i])
|
||||
|
||||
c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
|
||||
c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}
|
||||
|
||||
seqRange, ok := c.cellRanges[seq]
|
||||
if !ok {
|
||||
seqRange = newRange()
|
||||
}
|
||||
|
||||
seqRange.min = min(seqRange.min, c.curLoc+i)
|
||||
c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
|
||||
seqRange.min = min(seqRange.min, loc)
|
||||
c.curCellRange.min = min(c.curCellRange.min, loc)
|
||||
|
||||
seqRange.max = max(seqRange.max, c.curLoc+i)
|
||||
c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
|
||||
seqRange.max = max(seqRange.max, loc)
|
||||
c.curCellRange.max = max(c.curCellRange.max, loc)
|
||||
|
||||
c.cellRanges[seq] = seqRange
|
||||
}
|
||||
} else {
|
||||
// If we are reserving memory, don't update any of the cache metadata but set the size
|
||||
// to the worst case.
|
||||
c.curLoc = 0
|
||||
locs = make([]int32, c.curBatchSize)
|
||||
for i := range locs {
|
||||
locs[i] = int32(i)
|
||||
}
|
||||
c.curCellRange.min = 0
|
||||
c.curCellRange.max = len(c.cells) - 1
|
||||
}
|
||||
|
||||
c.curLoc = ctx.Input().FromInts(locs, len(locs))
|
||||
c.curMask = c.buildMask(ctx)
|
||||
|
||||
return nil
|
||||
@@ -257,22 +258,20 @@ func newRange() cellRange {
|
||||
}
|
||||
}
|
||||
|
||||
// Find the first contiguous block of at least curBatchSize
|
||||
func (c *Causal) findStartLoc() (int, error) {
|
||||
var start, count int
|
||||
// Returns a slice of locations where each token in the batch should be stored
|
||||
func (c *Causal) findLocs() ([]int32, error) {
|
||||
loc := make([]int32, 0, c.curBatchSize)
|
||||
|
||||
for i := range c.cells {
|
||||
if len(c.cells[i].sequences) == 0 {
|
||||
count++
|
||||
if count >= c.curBatchSize {
|
||||
return start, nil
|
||||
loc = append(loc, int32(i))
|
||||
if len(loc) >= c.curBatchSize {
|
||||
return loc, nil
|
||||
}
|
||||
} else {
|
||||
start = i + 1
|
||||
count = 0
|
||||
}
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
|
||||
return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
|
||||
}
|
||||
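The shift from a single contiguous start location to a per-token list of slots means a batch no longer needs a contiguous free block in the cache (which is why the defragmentation path below is removed). A standalone sketch of the same scan over a plain occupancy slice; the types are simplified and are not the package's actual data structures:

```go
// Sketch of the findLocs idea: collect the first batchSize free cells, in
// order, and fail only if the cache is genuinely full.
package main

import (
	"errors"
	"fmt"
)

var errCacheFull = errors.New("kv cache is full")

func findLocs(occupied []bool, batchSize int) ([]int32, error) {
	locs := make([]int32, 0, batchSize)
	for i, used := range occupied {
		if !used {
			locs = append(locs, int32(i))
			if len(locs) == batchSize {
				return locs, nil
			}
		}
	}
	return nil, fmt.Errorf("%w (cache: %d batch: %d)", errCacheFull, len(occupied), batchSize)
}

func main() {
	occupied := []bool{true, false, true, false, false, true}
	locs, err := findLocs(occupied, 3)
	fmt.Println(locs, err) // [1 3 4] <nil> — the free slots need not be contiguous
}
```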
|
||||
func (c *Causal) updateSlidingWindow() {
|
||||
@@ -402,145 +401,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||
return maskTensor
|
||||
}
|
||||
|
||||
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
||||
for i, key := range c.keys {
|
||||
if key == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
kHeadDim := key.Dim(0)
|
||||
numKVHeads := key.Dim(1)
|
||||
rowSize := key.Stride(2)
|
||||
|
||||
kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
|
||||
kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)
|
||||
|
||||
value := c.values[i]
|
||||
var vSrcView, vDstView ml.Tensor
|
||||
if c.config.PermutedV {
|
||||
vHeadDim := value.Dim(1)
|
||||
elemSize := value.Stride(0)
|
||||
|
||||
vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
|
||||
vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
|
||||
} else {
|
||||
vHeadDim := value.Dim(0)
|
||||
rowSize := value.Stride(2)
|
||||
|
||||
vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
|
||||
vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
|
||||
}
|
||||
|
||||
ctx.Forward(
|
||||
kSrcView.Copy(ctx, kDstView),
|
||||
vSrcView.Copy(ctx, vDstView),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Causal) defrag() {
|
||||
slog.Debug("defragmenting kv cache")
|
||||
|
||||
// Defrag strategy:
|
||||
// - Search for empty holes at the beginning of the cache,
|
||||
// filling them with active data starting at the end
|
||||
// - If there are contiguous elements that need to be moved,
|
||||
// combine them into a single operation by holding new moves
|
||||
// until we see that the next one is non-contiguous
|
||||
// - Fill up the context with the maximum number of operations it
|
||||
// can hold then compute that and continue with a new context
|
||||
//
|
||||
// We could try to optimize placement by grouping blocks from
|
||||
// the same sequences together but most likely the next forward
|
||||
// pass will disrupt this anyways, so the real world benefit
|
||||
// seems limited as this time.
|
||||
|
||||
ctx := c.backend.NewContext()
|
||||
|
||||
// For every move, 6 tensors are required per layer (2 views and a
|
||||
// copy for each of k and v). We also need to refer to the original
|
||||
// k and v cache tensors - once per layer, not per move.
|
||||
layers := 0
|
||||
for _, key := range c.keys {
|
||||
if key == nil {
|
||||
continue
|
||||
}
|
||||
layers++
|
||||
}
|
||||
|
||||
maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
|
||||
moves := 0
|
||||
|
||||
var pendingSrc, pendingDst, pendingLen int
|
||||
src := len(c.cells) - 1
|
||||
|
||||
for dst := 0; dst < src; dst++ {
|
||||
if len(c.cells[dst].sequences) == 0 {
|
||||
for ; src > dst; src-- {
|
||||
if len(c.cells[src].sequences) != 0 {
|
||||
c.cells[dst] = c.cells[src]
|
||||
c.cells[src] = cacheCell{}
|
||||
|
||||
if pendingLen > 0 {
|
||||
if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
|
||||
pendingSrc = src
|
||||
pendingLen++
|
||||
break
|
||||
} else {
|
||||
c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
|
||||
moves++
|
||||
}
|
||||
}
|
||||
|
||||
pendingSrc = src
|
||||
pendingDst = dst
|
||||
pendingLen = 1
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if moves >= maxMoves {
|
||||
ctx.Compute()
|
||||
ctx.Close()
|
||||
ctx = c.backend.NewContext()
|
||||
|
||||
moves = 0
|
||||
}
|
||||
}
|
||||
|
||||
if pendingLen > 0 {
|
||||
c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
|
||||
moves++
|
||||
}
|
||||
|
||||
if moves > 0 {
|
||||
ctx.Compute()
|
||||
}
|
||||
ctx.Close()
|
||||
|
||||
// Reset range metadata
|
||||
for seq := range c.cellRanges {
|
||||
seqRange := newRange()
|
||||
|
||||
for i, cell := range c.cells {
|
||||
if slices.Contains(cell.sequences, seq) {
|
||||
if i < seqRange.min {
|
||||
seqRange.min = i
|
||||
}
|
||||
if i > seqRange.max {
|
||||
seqRange.max = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.cellRanges[seq] = seqRange
|
||||
}
|
||||
|
||||
c.updateSlidingWindow()
|
||||
}
|
||||
|
||||
func (c *Causal) SetLayer(layer int) {
|
||||
c.curLayer = layer
|
||||
}
|
||||
@@ -625,18 +485,25 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
}
|
||||
}
|
||||
|
||||
rowSize := c.keys[c.curLayer].Stride(2)
|
||||
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, rowSize*c.curLoc, kHeadDim*numKVHeads*batchSize)))
|
||||
key = key.Reshape(ctx, kHeadDim*numKVHeads, batchSize)
|
||||
keyCache := c.keys[c.curLayer]
|
||||
keyCache = keyCache.Reshape(ctx, kHeadDim*numKVHeads, len(c.cells))
|
||||
ctx.Forward(keyCache.SetRows(ctx, key, c.curLoc))
|
||||
|
||||
if c.config.PermutedV {
|
||||
elemSize := c.values[c.curLayer].Stride(0)
|
||||
value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
|
||||
value = value.Permute(ctx, 2, 0, 1, 3)
|
||||
|
||||
value = value.Permute(ctx, 1, 2, 0, 3)
|
||||
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
|
||||
valueCache := c.values[c.curLayer]
|
||||
valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)
|
||||
|
||||
ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
|
||||
} else {
|
||||
rowSize := c.values[c.curLayer].Stride(2)
|
||||
value = value.Reshape(ctx, vHeadDim*numKVHeads, batchSize)
|
||||
valueCache := c.values[c.curLayer]
|
||||
valueCache = valueCache.Reshape(ctx, vHeadDim*numKVHeads, len(c.cells))
|
||||
|
||||
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, rowSize*c.curLoc, vHeadDim*numKVHeads*batchSize)))
|
||||
ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
|
||||
}
|
||||
}
|
||||
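Conceptually, `SetRows` scatters each row of the batch into the cache tensor at the slot index chosen earlier, which is what lets `Put` handle the non-contiguous locations returned by `findLocs` instead of copying into a single contiguous view. A CPU-side sketch of the same scatter on plain slices (illustrative only; the real code operates on `ml.Tensor` values on the backend):

```go
// Sketch: scatter batch rows into a cache at arbitrary row indices,
// mirroring what SetRows accomplishes on the backend tensors.
package main

import "fmt"

func setRows(cache [][]float32, rows [][]float32, locs []int32) {
	for i, loc := range locs {
		copy(cache[loc], rows[i]) // row i of the batch lands in cache row locs[i]
	}
}

func main() {
	cache := make([][]float32, 6)
	for i := range cache {
		cache[i] = make([]float32, 4)
	}
	rows := [][]float32{{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}}
	setRows(cache, rows, []int32{1, 3, 4}) // the same non-contiguous slots as before
	fmt.Println(cache)
}
```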
|
||||
|
||||
File diff suppressed because it is too large.
llama/build-info.cpp (generated, vendored; 2 changed lines)
@@ -1,4 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = 0;
|
||||
char const *LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
|
||||
char const *LLAMA_COMMIT = "17f7f4baad8b3a716ee139da7bb56ae984e8c0fa";
|
||||
char const *LLAMA_COMPILER = "";
|
||||
char const *LLAMA_BUILD_TARGET = "";
|
||||
|
||||
@@ -22,6 +22,9 @@ include /src/llama.*
|
||||
include /src/llama-*.*
|
||||
include /src/unicode-data.*
|
||||
include /src/unicode.*
|
||||
include /src/models/
|
||||
include /src/models/*.h
|
||||
include /src/models/*.cpp
|
||||
include /vendor/
|
||||
include /vendor/miniaudio/
|
||||
include /vendor/miniaudio/*.h
|
||||
|
||||
llama/llama.cpp/common/common.cpp (vendored; 174 changed lines)
@@ -8,6 +8,7 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
@@ -26,7 +27,6 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
@@ -60,6 +60,14 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||
|
||||
common_time_meas::~common_time_meas() {
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
@@ -355,11 +363,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
||||
}
|
||||
|
||||
void common_init() {
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}, NULL);
|
||||
llama_log_set(common_log_default_callback, NULL);
|
||||
|
||||
#ifdef NDEBUG
|
||||
const char * build_type = "";
|
||||
@@ -690,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||
|
||||
// Validate if a filename is safe to use
|
||||
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
||||
bool fs_validate_filename(const std::string & filename) {
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
||||
if (!filename.length()) {
|
||||
// Empty filename invalid
|
||||
return false;
|
||||
@@ -750,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
||||
|| c == 0xFFFD // Replacement Character (UTF-8)
|
||||
|| c == 0xFEFF // Byte Order Mark (BOM)
|
||||
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|
||||
|| c == ':' || c == '*' // Illegal characters
|
||||
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
||||
return false;
|
||||
}
|
||||
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
||||
// Subdirectories not allowed, reject path separators
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
||||
@@ -778,11 +786,29 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
#include <iostream>
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::wstring utf8_to_wstring(const std::string & str) {
|
||||
if (str.empty()) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
|
||||
|
||||
if (size <= 0) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
std::wstring wstr(size, 0);
|
||||
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
|
||||
|
||||
return wstr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// returns true if successful, false otherwise
|
||||
bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
std::wstring wpath = utf8_to_wstring(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
@@ -855,6 +881,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
bool fs_is_directory(const std::string & path) {
|
||||
std::filesystem::path dir(path);
|
||||
return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
|
||||
}
|
||||
|
||||
std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
@@ -889,6 +920,8 @@ std::string fs_get_cache_directory() {
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#elif defined(__EMSCRIPTEN__)
|
||||
GGML_ABORT("not implemented on this platform");
|
||||
#else
|
||||
# error Unknown architecture
|
||||
#endif
|
||||
@@ -908,11 +941,130 @@ std::string fs_get_cache_file(const std::string & filename) {
|
||||
return cache_directory + filename;
|
||||
}
|
||||
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
|
||||
std::vector<common_file_info> files;
|
||||
if (path.empty()) return files;
|
||||
|
||||
std::filesystem::path dir(path);
|
||||
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
|
||||
return files;
|
||||
}
|
||||
|
||||
for (const auto & entry : std::filesystem::directory_iterator(dir)) {
|
||||
try {
|
||||
// Only include regular files (skip directories)
|
||||
const auto & p = entry.path();
|
||||
if (std::filesystem::is_regular_file(p)) {
|
||||
common_file_info info;
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.is_dir = false;
|
||||
try {
|
||||
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
||||
} catch (const std::filesystem::filesystem_error &) {
|
||||
info.size = 0;
|
||||
}
|
||||
files.push_back(std::move(info));
|
||||
} else if (include_directories && std::filesystem::is_directory(p)) {
|
||||
common_file_info info;
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.size = 0; // Directories have no size
|
||||
info.is_dir = true;
|
||||
files.push_back(std::move(info));
|
||||
}
|
||||
} catch (const std::filesystem::filesystem_error &) {
|
||||
// skip entries we cannot inspect
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return files;
|
||||
}
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
bool tty_can_use_colors() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
static inline void common_init_sampler_from_model(
|
||||
const llama_model * model,
|
||||
common_params_sampling & sparams) {
|
||||
|
||||
const uint64_t config = sparams.user_sampling_config;
|
||||
|
||||
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
|
||||
if (config & user_config) return;
|
||||
|
||||
char buf[64] = {0};
|
||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||
char * end = nullptr;
|
||||
int32_t v = strtol(buf, &end, 10);
|
||||
if (end && end != buf) dst = v;
|
||||
}
|
||||
};
|
||||
|
||||
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
|
||||
if (config & user_config) return;
|
||||
|
||||
char buf[128] = {0};
|
||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||
char * end = nullptr;
|
||||
float v = strtof(buf, &end);
|
||||
if (end && end != buf) dst = v;
|
||||
}
|
||||
};
|
||||
|
||||
// Sampling sequence
|
||||
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
|
||||
char buf[512] = {0};
|
||||
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
|
||||
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
|
||||
if (!sampler_names.empty()) {
|
||||
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
|
||||
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
|
||||
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
|
||||
}
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
@@ -924,6 +1076,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
return iparams;
|
||||
}
|
||||
|
||||
common_init_sampler_from_model(model, params.sampling);
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
llama/llama.cpp/common/common.h (vendored; 73 changed lines)
@@ -2,17 +2,19 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <cmath>
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
#if defined(_WIN32) && !defined(_WIN32_WINNT)
|
||||
#define _WIN32_WINNT 0x0A00
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@@ -28,7 +30,14 @@
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
|
||||
const int64_t t_start_us;
|
||||
|
||||
int64_t & t_acc;
|
||||
};
|
||||
|
||||
struct common_adapter_lora_info {
|
||||
std::string path;
|
||||
@@ -133,6 +142,22 @@ struct common_grammar_trigger {
|
||||
llama_token token = LLAMA_TOKEN_NULL;
|
||||
};
|
||||
|
||||
enum common_params_sampling_config : uint64_t {
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
|
||||
};
|
||||
|
||||
|
||||
// sampling parameters
|
||||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
@@ -165,6 +190,8 @@ struct common_params_sampling {
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
|
||||
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
||||
|
||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||
|
||||
|
||||
@@ -198,6 +225,7 @@ struct common_params_model {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
@@ -344,7 +372,7 @@ struct common_params {
|
||||
|
||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
int32_t verbosity = 0;
|
||||
int32_t verbosity = 3; // LOG_LEVEL_INFO
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
bool offline = false;
|
||||
@@ -406,6 +434,8 @@ struct common_params {
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
int image_min_tokens = -1;
|
||||
int image_max_tokens = -1;
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
@@ -451,14 +481,21 @@ struct common_params {
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
|
||||
bool log_json = false;
|
||||
|
||||
std::string slot_save_path;
|
||||
std::string media_path; // path to directory for loading media files
|
||||
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
bool is_pp_shared = false;
|
||||
bool is_tg_separate = false;
|
||||
|
||||
std::vector<int32_t> n_pp;
|
||||
std::vector<int32_t> n_tg;
|
||||
@@ -505,6 +542,10 @@ struct common_params {
|
||||
// return false from callback to abort model loading or true to continue
|
||||
llama_progress_callback load_progress_callback = NULL;
|
||||
void * load_progress_callback_user_data = NULL;
|
||||
|
||||
bool has_speculative() const {
|
||||
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
|
||||
}
|
||||
};
|
||||
|
||||
// call once at the start of a program if it uses libcommon
|
||||
@@ -599,12 +640,28 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
bool fs_validate_filename(const std::string & filename);
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
||||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
bool fs_is_directory(const std::string & path);
|
||||
|
||||
std::string fs_get_cache_directory();
|
||||
std::string fs_get_cache_file(const std::string & filename);
|
||||
|
||||
struct common_file_info {
|
||||
std::string path;
|
||||
std::string name;
|
||||
size_t size = 0; // in bytes
|
||||
bool is_dir = false;
|
||||
};
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
// Auto-detect if colors can be enabled based on terminal and environment
|
||||
bool tty_can_use_colors();
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
|
||||
}
|
||||
|
||||
std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
|
||||
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
|
||||
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
|
||||
std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
|
||||
std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
||||
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
|
||||
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
|
||||
};
|
||||
|
||||
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
@@ -303,6 +303,8 @@ static std::string format_literal(const std::string & literal) {
|
||||
return "\"" + escaped + "\"";
|
||||
}
|
||||
|
||||
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
|
||||
|
||||
class SchemaConverter {
|
||||
private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
@@ -601,7 +603,10 @@ private:
|
||||
}
|
||||
|
||||
std::string _resolve_ref(const std::string & ref) {
|
||||
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
||||
auto it = ref.find('#');
|
||||
std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
|
||||
static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
|
||||
std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
|
||||
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
||||
_refs_being_resolved.insert(ref);
|
||||
json resolved = _refs[ref];
|
||||
@@ -774,11 +779,24 @@ public:
|
||||
std::vector<std::string> tokens = string_split(pointer, "/");
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
std::string sel = tokens[i];
|
||||
if (target.is_null() || !target.contains(sel)) {
|
||||
if (target.is_object() && target.contains(sel)) {
|
||||
target = target[sel];
|
||||
} else if (target.is_array()) {
|
||||
size_t sel_index;
|
||||
try {
|
||||
sel_index = std::stoul(sel);
|
||||
} catch (const std::invalid_argument & e) {
|
||||
sel_index = target.size();
|
||||
}
|
||||
if (sel_index >= target.size()) {
|
||||
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
|
||||
return;
|
||||
}
|
||||
target = target[sel_index];
|
||||
} else {
|
||||
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
|
||||
return;
|
||||
}
|
||||
target = target[sel];
|
||||
}
|
||||
_refs[ref] = target;
|
||||
}
|
||||
@@ -956,7 +974,7 @@ public:
|
||||
|
||||
void check_errors() {
|
||||
if (!_errors.empty()) {
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
}
|
||||
if (!_warnings.empty()) {
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||
|
||||
@@ -18,4 +18,6 @@ struct common_grammar_options {
|
||||
bool dotall = false;
|
||||
};
|
||||
|
||||
std::string gbnf_format_literal(const std::string & literal);
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
|
||||
|
||||
llama/llama.cpp/common/log.cpp (vendored; 49 changed lines)
@@ -1,3 +1,4 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <chrono>
|
||||
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
// Auto-detect if colors should be enabled based on terminal and environment
|
||||
static bool common_log_should_use_colors_auto() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
static int64_t t_us() {
|
||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
|
||||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [&]() {
|
||||
// Set default to auto-detect colors
|
||||
log.set_colors(common_log_should_use_colors_auto());
|
||||
log.set_colors(tty_can_use_colors());
|
||||
});
|
||||
|
||||
return &log;
|
||||
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {
|
||||
|
||||
void common_log_set_colors(struct common_log * log, log_colors colors) {
|
||||
if (colors == LOG_COLORS_AUTO) {
|
||||
log->set_colors(common_log_should_use_colors_auto());
|
||||
log->set_colors(tty_can_use_colors());
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -442,3 +419,23 @@ void common_log_set_prefix(struct common_log * log, bool prefix) {
|
||||
void common_log_set_timestamps(struct common_log * log, bool timestamps) {
|
||||
log->set_timestamps(timestamps);
|
||||
}
|
||||
|
||||
static int common_get_verbosity(enum ggml_log_level level) {
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
|
||||
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO;
|
||||
case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN;
|
||||
case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
|
||||
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO
|
||||
case GGML_LOG_LEVEL_NONE:
|
||||
default:
|
||||
return LOG_LEVEL_OUTPUT;
|
||||
}
|
||||
}
|
||||
|
||||
void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
auto verbosity = common_get_verbosity(level);
|
||||
if (verbosity <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}
|
||||
|
||||
llama/llama.cpp/common/log.h (vendored; 33 changed lines)
@@ -21,8 +21,14 @@
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
#define LOG_LEVEL_DEBUG 4
|
||||
#define LOG_LEVEL_INFO 3
|
||||
#define LOG_LEVEL_WARN 2
|
||||
#define LOG_LEVEL_ERROR 1
|
||||
#define LOG_LEVEL_OUTPUT 0 // output data from tools
|
||||
|
||||
#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
|
||||
#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO
|
||||
|
||||
enum log_colors {
|
||||
LOG_COLORS_AUTO = -1,
|
||||
@@ -36,6 +42,8 @@ extern int common_log_verbosity_thold;
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||
|
||||
void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// the common_log uses an internal worker thread to print/write log messages
|
||||
// when the worker thread is paused, incoming log messages are discarded
|
||||
struct common_log;
|
||||
@@ -65,10 +73,11 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
|
||||
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
|
||||
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
|
||||
//
|
||||
// I - info (stdout, V = 0)
|
||||
// W - warning (stderr, V = 0)
|
||||
// E - error (stderr, V = 0)
|
||||
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
||||
// I - info (stdout, V = LOG_DEFAULT_INFO)
|
||||
// W - warning (stderr, V = LOG_DEFAULT_WARN)
|
||||
// E - error (stderr, V = LOG_DEFAULT_ERROR)
|
||||
// O - output (stdout, V = LOG_DEFAULT_OUTPUT)
|
||||
//
|
||||
|
||||
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
|
||||
@@ -93,14 +102,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps); // w
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO
|
||||
|
||||
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
|
||||
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
|
||||
|
||||
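The hunk above replaces the magic verbosity numbers in the LOG_* macros with named levels, so LOG_DBG now filters at LOG_LEVEL_DEBUG and LOG_INF/LOG_WRN/LOG_ERR at their matching levels. A minimal standalone sketch of how such a threshold scheme filters and routes messages; the emit() harness and the default threshold shown here are illustrative, not part of the vendored header:

#include <cstdio>

// Named verbosity levels, mirroring the constants introduced above.
enum {
    LOG_LEVEL_OUTPUT = 0, // raw tool output
    LOG_LEVEL_ERROR  = 1,
    LOG_LEVEL_WARN   = 2,
    LOG_LEVEL_INFO   = 3,
    LOG_LEVEL_DEBUG  = 4,
};

static int g_verbosity_thold = LOG_LEVEL_INFO; // messages above this level are dropped

// Stand-in for the LOG_TMPL filtering: print only if the message's level does
// not exceed the current threshold; O/I go to stdout, E/W/D to stderr.
static void emit(int level, const char * msg) {
    if (level > g_verbosity_thold) {
        return;
    }
    std::FILE * dst = (level == LOG_LEVEL_OUTPUT || level == LOG_LEVEL_INFO) ? stdout : stderr;
    std::fprintf(dst, "%s\n", msg);
}

int main() {
    emit(LOG_LEVEL_INFO,  "shown: info is at the default threshold");
    emit(LOG_LEVEL_DEBUG, "hidden: debug is above the default threshold");
    g_verbosity_thold = LOG_LEVEL_DEBUG;
    emit(LOG_LEVEL_DEBUG, "shown once the threshold is raised");
    return 0;
}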
68 llama/llama.cpp/common/sampling.cpp (vendored)
@@ -3,9 +3,10 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
@@ -112,6 +113,13 @@ struct common_sampler {
|
||||
|
||||
llama_token_data_array cur_p;
|
||||
|
||||
void reset() {
|
||||
prev.clear();
|
||||
|
||||
llama_sampler_reset(grmr);
|
||||
llama_sampler_reset(chain);
|
||||
}
|
||||
|
||||
void set_logits(struct llama_context * ctx, int idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
@@ -128,6 +136,12 @@ struct common_sampler {
|
||||
|
||||
cur_p = { cur.data(), cur.size(), -1, false };
|
||||
}
|
||||
|
||||
common_time_meas tm() {
|
||||
return common_time_meas(t_total_us, params.no_perf);
|
||||
}
|
||||
|
||||
mutable int64_t t_total_us = 0;
|
||||
};
|
||||
|
||||
std::string common_params_sampling::print() const {
|
||||
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
||||
}
|
||||
|
||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
llama_sampler_reset(gsmpl->grmr);
|
||||
|
||||
llama_sampler_reset(gsmpl->chain);
|
||||
gsmpl->reset();
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
||||
// TODO: measure grammar performance
|
||||
|
||||
const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
|
||||
|
||||
llama_perf_sampler_data data_smpl;
|
||||
llama_perf_context_data data_ctx;
|
||||
|
||||
memset(&data_smpl, 0, sizeof(data_smpl));
|
||||
memset(&data_ctx, 0, sizeof(data_ctx));
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
auto & data = data_smpl;
|
||||
|
||||
data = llama_perf_sampler(gsmpl->chain);
|
||||
|
||||
// note: the sampling time includes the samplers time + extra time spent in common/sampling
|
||||
LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
|
||||
LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
llama_perf_context_print(ctx);
|
||||
auto & data = data_ctx;
|
||||
|
||||
data = llama_perf_context(ctx);
|
||||
|
||||
const double t_end_ms = 1e-3 * ggml_time_us();
|
||||
|
||||
const double t_total_ms = t_end_ms - data.t_start_ms;
|
||||
const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
|
||||
const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
|
||||
|
||||
LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
|
||||
LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
|
||||
LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
||||
LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
||||
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
|
||||
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
||||
|
||||
llama_memory_breakdown_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
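The reworked common_perf_print above reports an "unaccounted" share of the wall time, i.e. whatever is not attributed to sampling, prompt eval, or eval. A small worked example of that arithmetic; all numbers are made up:

#include <cstdio>

int main() {
    // Hypothetical timings, in milliseconds.
    const double t_total_ms    = 1250.0; // wall time since the context started
    const double t_sampling_ms =   40.0; // time spent in common/sampling
    const double t_p_eval_ms   =  300.0; // prompt processing
    const double t_eval_ms     =  800.0; // token generation

    // Same formula as in the hunk above:
    // unaccounted = total - (sampling + prompt eval + eval)
    const double t_unacc_ms = t_total_ms - (t_sampling_ms + t_p_eval_ms + t_eval_ms);
    const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;

    std::printf("unaccounted time = %.2f ms / %.1f %%\n", t_unacc_ms, t_unacc_pc); // 110.00 ms / 8.8 %
    return 0;
}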
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
auto * res = &gsmpl->cur_p;
|
||||
|
||||
if (do_sort && !res->sorted) {
28 llama/llama.cpp/include/llama.h (vendored)
@@ -83,6 +83,7 @@ extern "C" {
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
|
||||
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
|
||||
LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
|
||||
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
|
||||
};
|
||||
|
||||
@@ -245,6 +246,21 @@ extern "C" {
|
||||
LLAMA_KV_OVERRIDE_TYPE_STR,
|
||||
};
|
||||
|
||||
enum llama_model_meta_key {
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
enum llama_model_kv_override_type tag;
|
||||
|
||||
@@ -460,7 +476,11 @@ extern "C" {
|
||||
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||
LLAMA_API bool llama_supports_rpc (void);
|
||||
|
||||
// NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
|
||||
// In some cases the requested values via llama_context_params may differ from the actual values used by the context
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
|
||||
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
@@ -481,6 +501,7 @@ extern "C" {
|
||||
|
||||
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
||||
@@ -512,6 +533,9 @@ extern "C" {
|
||||
// Get the number of metadata key/value pairs
|
||||
LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
|
||||
|
||||
// Get sampling metadata key name. Returns nullptr if the key is invalid
|
||||
LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
|
||||
|
||||
// Get metadata key name by index
|
||||
LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
@@ -584,7 +608,7 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
@@ -1110,8 +1134,6 @@ extern "C" {
|
||||
// // sample from the logits of the last token in the batch
|
||||
// const llama_token id = llama_sampler_sample(smpl, ctx, -1);
|
||||
//
|
||||
// // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
|
||||
// llama_sampler_accept(smpl, id);
|
||||
// ...
|
||||
// }
|
||||
//
280 llama/llama.cpp/src/llama-arch.cpp (vendored)
@@ -32,6 +32,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
|
||||
{ LLM_ARCH_QWEN3, "qwen3" },
|
||||
{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
|
||||
{ LLM_ARCH_QWEN3NEXT, "qwen3next" },
|
||||
{ LLM_ARCH_QWEN3VL, "qwen3vl" },
|
||||
{ LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PHIMOE, "phimoe" },
|
||||
@@ -89,6 +92,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
|
||||
{ LLM_ARCH_DOTS1, "dots1" },
|
||||
{ LLM_ARCH_ARCEE, "arcee" },
|
||||
{ LLM_ARCH_AFMOE, "afmoe" },
|
||||
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
||||
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
|
||||
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
|
||||
@@ -104,23 +108,40 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_SEED_OSS, "seed_oss" },
|
||||
{ LLM_ARCH_GROVEMOE, "grovemoe" },
|
||||
{ LLM_ARCH_APERTUS, "apertus" },
|
||||
{ LLM_ARCH_MINIMAX_M2, "minimax-m2" },
|
||||
{ LLM_ARCH_COGVLM, "cogvlm" },
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
|
||||
{ LLM_KV_GENERAL_NAME, "general.name" },
|
||||
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
||||
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
||||
{ LLM_KV_GENERAL_URL, "general.url" },
|
||||
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
||||
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
|
||||
{ LLM_KV_GENERAL_NAME, "general.name" },
|
||||
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
||||
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
||||
{ LLM_KV_GENERAL_URL, "general.url" },
|
||||
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
||||
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
|
||||
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
@@ -146,6 +167,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
|
||||
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
@@ -184,6 +206,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
@@ -329,6 +352,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_AFMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_LLAMA4,
|
||||
{
|
||||
@@ -781,6 +834,77 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN3NEXT,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
|
||||
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
|
||||
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
|
||||
{ LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
|
||||
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
|
||||
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
|
||||
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN3VL,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN3VLMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PHI2,
|
||||
{
|
||||
@@ -2168,7 +2292,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
|
||||
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
}
|
||||
},
|
||||
@@ -2190,7 +2314,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
|
||||
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
@@ -2332,6 +2456,110 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MINIMAX_M2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_COGVLM,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
|
||||
{ LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
|
||||
{ LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
|
||||
{ LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
|
||||
{ LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_RND1,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MISTRAL3,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2340,11 +2568,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
},
|
||||
};
|
||||
|
||||
// declare information about the model weight tensors:
|
||||
// - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
|
||||
// - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
|
||||
//
|
||||
// for example, input layers are usually assigned to CPU/host buffer types
|
||||
//
|
||||
// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
|
||||
// assignment of the buffer types and extra overhead during computation
|
||||
// example: https://github.com/ggml-org/llama.cpp/pull/17548
|
||||
//
|
||||
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
@@ -2361,6 +2599,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
@@ -2398,6 +2637,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
@@ -2419,6 +2659,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
|
||||
{LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
|
||||
{LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
|
||||
{LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
|
||||
{LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
@@ -2509,6 +2750,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
|
||||
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
|
||||
// These tensors only exist in the last layer(s) and are treated as output tensors
|
||||
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
@@ -2592,6 +2838,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
|
||||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -2603,6 +2850,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
|
||||
case LLM_ARCH_DREAM:
|
||||
case LLM_ARCH_LLADA:
|
||||
case LLM_ARCH_LLADA_MOE:
|
||||
case LLM_ARCH_RND1:
|
||||
return true;
|
||||
default:
|
||||
return false;
31 llama/llama.cpp/src/llama-arch.h (vendored)
@@ -36,6 +36,9 @@ enum llm_arch {
|
||||
LLM_ARCH_QWEN2VL,
|
||||
LLM_ARCH_QWEN3,
|
||||
LLM_ARCH_QWEN3MOE,
|
||||
LLM_ARCH_QWEN3NEXT,
|
||||
LLM_ARCH_QWEN3VL,
|
||||
LLM_ARCH_QWEN3VLMOE,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PHIMOE,
|
||||
@@ -93,6 +96,7 @@ enum llm_arch {
|
||||
LLM_ARCH_BAILINGMOE2,
|
||||
LLM_ARCH_DOTS1,
|
||||
LLM_ARCH_ARCEE,
|
||||
LLM_ARCH_AFMOE,
|
||||
LLM_ARCH_ERNIE4_5,
|
||||
LLM_ARCH_ERNIE4_5_MOE,
|
||||
LLM_ARCH_HUNYUAN_MOE,
|
||||
@@ -108,6 +112,11 @@ enum llm_arch {
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_GROVEMOE,
|
||||
LLM_ARCH_APERTUS,
|
||||
LLM_ARCH_MINIMAX_M2,
|
||||
LLM_ARCH_COGVLM,
|
||||
LLM_ARCH_RND1,
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -117,6 +126,18 @@ enum llm_kv {
|
||||
LLM_KV_GENERAL_QUANTIZATION_VERSION,
|
||||
LLM_KV_GENERAL_ALIGNMENT,
|
||||
LLM_KV_GENERAL_FILE_TYPE,
|
||||
LLM_KV_GENERAL_SAMPLING_SEQUENCE,
|
||||
LLM_KV_GENERAL_SAMPLING_TOP_K,
|
||||
LLM_KV_GENERAL_SAMPLING_TOP_P,
|
||||
LLM_KV_GENERAL_SAMPLING_MIN_P,
|
||||
LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
|
||||
LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
|
||||
LLM_KV_GENERAL_SAMPLING_TEMP,
|
||||
LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
|
||||
LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
|
||||
LLM_KV_GENERAL_NAME,
|
||||
LLM_KV_GENERAL_AUTHOR,
|
||||
LLM_KV_GENERAL_VERSION,
|
||||
@@ -150,6 +171,7 @@ enum llm_kv {
|
||||
LLM_KV_EXPERTS_PER_GROUP,
|
||||
LLM_KV_MOE_EVERY_N_LAYERS,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_NUM_DEEPSTACK_LAYERS,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
@@ -188,6 +210,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
@@ -308,6 +331,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ATTN_POST_NORM,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_ATTN_SINKS,
|
||||
LLM_TENSOR_ATTN_GATE,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
@@ -357,11 +381,13 @@ enum llm_tensor {
|
||||
LLM_TENSOR_SSM_DT,
|
||||
LLM_TENSOR_SSM_DT_NORM,
|
||||
LLM_TENSOR_SSM_A,
|
||||
LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
|
||||
LLM_TENSOR_SSM_B_NORM,
|
||||
LLM_TENSOR_SSM_C_NORM,
|
||||
LLM_TENSOR_SSM_D,
|
||||
LLM_TENSOR_SSM_NORM,
|
||||
LLM_TENSOR_SSM_OUT,
|
||||
LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
|
||||
LLM_TENSOR_TIME_MIX_W0,
|
||||
LLM_TENSOR_TIME_MIX_W1,
|
||||
LLM_TENSOR_TIME_MIX_W2,
|
||||
@@ -458,6 +484,11 @@ enum llm_tensor {
|
||||
LLM_TENSOR_SHORTCONV_CONV,
|
||||
LLM_TENSOR_SHORTCONV_INPROJ,
|
||||
LLM_TENSOR_SHORTCONV_OUTPROJ,
|
||||
LLM_TENSOR_VISEXP_ATTN_QKV,
|
||||
LLM_TENSOR_VISEXP_ATTN_OUT,
|
||||
LLM_TENSOR_VISEXP_FFN_GATE,
|
||||
LLM_TENSOR_VISEXP_FFN_DOWN,
|
||||
LLM_TENSOR_VISEXP_FFN_UP,
|
||||
LLM_TENSOR_NEXTN_EH_PROJ,
|
||||
LLM_TENSOR_NEXTN_EMBED_TOKENS,
|
||||
LLM_TENSOR_NEXTN_ENORM,
96 llama/llama.cpp/src/llama-batch.cpp (vendored)
@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
|
||||
/*.n_seq_tokens =*/ (uint32_t) 1,
|
||||
/*.n_seqs =*/ (uint32_t) batch.n_tokens,
|
||||
/*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(),
|
||||
/*.n_pos =*/ n_pos_per_embd,
|
||||
/*.token =*/ batch.token,
|
||||
/*.embd =*/ batch.embd,
|
||||
/*.pos =*/ batch.pos,
|
||||
@@ -251,46 +252,72 @@ bool llama_batch_allocr::init(
|
||||
// consistency checks
|
||||
//
|
||||
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
if (n_pos_per_embd > 1) {
|
||||
// M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
|
||||
|
||||
if (p0 >= 0) {
|
||||
bool ok = true;
|
||||
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
|
||||
|
||||
if (batch.token) {
|
||||
if (p0 >= 0 && p0 >= seq_pos_min(s)) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" for M-RoPE, it is required that the position satisfies: X < Y\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// embedding inputs can have overlapping positions
|
||||
if (p0 >= 0 && p0 > seq_pos_min(s)) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" for M-RoPE, it is required that the position satisfies: X <= Y\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
|
||||
|
||||
if (p0 >= 0) {
|
||||
bool ok = true;
|
||||
|
||||
if (seq_pos_min(s) != p0 + 1) {
|
||||
ok = false;
|
||||
}
|
||||
} else {
|
||||
assert(batch.embd);
|
||||
|
||||
// for embeddings (typically used as vision input), we allow them to have repeating positions
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
|
||||
if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
|
||||
ok = false;
|
||||
if (!ok) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" it is required that the sequence positions remain consecutive: Y = X + 1\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" it is required that the sequence positions remain consecutive: Y = X + 1\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (memory) {
|
||||
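The restructured check above applies different rules per case: with M-RoPE (more than one position per embedding), token batches may jump forward as long as the new minimum position is strictly greater than the last cached position, embedding batches may also start at the same position, while the regular single-position case requires strictly consecutive positions. A compact sketch of those rules for hypothetical values; it simplifies the regular case to token batches and is not the vendored code:

#include <cstdio>

// p0:    last position already stored in the KV cache for the sequence (-1 if empty)
// p_min: smallest position of that sequence in the incoming batch
static bool positions_ok(bool mrope, bool has_tokens, int p0, int p_min) {
    if (p0 < 0) return true;                      // nothing cached yet, anything goes
    if (mrope)  return has_tokens ? p_min >  p0   // M-RoPE text: may jump forward, never overlap
                                  : p_min >= p0;  // M-RoPE embeddings: overlap allowed
    return p_min == p0 + 1;                       // regular case (token batch): strictly consecutive
}

int main() {
    std::printf("%d\n", positions_ok(false, true, /*p0=*/9, /*p_min=*/10)); // 1: consecutive
    std::printf("%d\n", positions_ok(false, true, 9, 12));                  // 0: gap not allowed
    std::printf("%d\n", positions_ok(true,  true, 9, 12));                  // 1: M-RoPE text may jump
    std::printf("%d\n", positions_ok(true,  false, 9, 9));                  // 1: embeddings may overlap
    return 0;
}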
@@ -389,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.n_seqs_unq =*/ n_seqs,
|
||||
/*.n_pos =*/ n_pos_per_embd,
|
||||
|
||||
/*.token =*/ udata->token.data(),
|
||||
/*.embd =*/ nullptr,
|
||||
@@ -655,10 +683,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
|
||||
auto udata = std::make_shared<llama_ubatch::data_t>();
|
||||
|
||||
const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
|
||||
|
||||
const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
|
||||
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
|
||||
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
|
||||
|
||||
udata->token .resize(n_tokens);
|
||||
udata->embd .resize(n_embd_all);
|
||||
@@ -680,8 +706,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
|
||||
}
|
||||
|
||||
for (int j = 0; j < n_pos_cur; ++j) {
|
||||
udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
|
||||
for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
|
||||
// if we are using M-RoPE
|
||||
// if the current batch is text, we need to broadcast the same position across all RoPE sections
|
||||
// otherwise, the input batch is image embeddings, we copy the positions as-is
|
||||
// if we are not using M-RoPE, there is only one position per token (this loop runs only once)
|
||||
size_t src_off = batch.token ? 0 : j*batch.n_tokens;
|
||||
udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
|
||||
}
|
||||
|
||||
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
|
||||
@@ -710,6 +741,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
/*.n_seq_tokens =*/ n_tokens/n_seqs,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
|
||||
/*.n_pos =*/ n_pos_per_embd,
|
||||
|
||||
/*.token =*/ batch.token ? udata->token.data() : nullptr,
|
||||
/*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
13 llama/llama.cpp/src/llama-batch.h (vendored)

@@ -17,6 +17,16 @@ struct llama_ubatch {
        return b_equal_seqs != 0;
    }

    // typical for M-RoPE cases:
    //   0 - sequantial position of the tokens/embeddings in the sequence
    //   1 - y position in the image
    //   2 - x position in the image
    //   3 - other
    bool is_pos_2d() const {
        // TODO @ngxson : we may need to check for model arch when more models use >1 positions
        return n_pos >= 3;
    }

    uint32_t b_equal_seqs;  // note: this is a boolean, but we use an int32_t for alignment
                            //       otherwise address sanitizer complains
    // TODO: whole_seqs for embeddings?
@@ -25,6 +35,7 @@ struct llama_ubatch {
    uint32_t n_seq_tokens; // tokens per sequence set
    uint32_t n_seqs;       // sequence sets in the ubatch
    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
    uint32_t n_pos;        // number of position inputs for each token/embedding

    // seq_id_unq: unique sequence ids in the ubatch
    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -33,7 +44,7 @@ struct llama_ubatch {
    //                          // size               | idx | val
    llama_token  *  token;      // [n_tokens]         | i   | id, token
    float        *  embd;       // [n_embd, n_tokens] | i   | embd
    llama_pos    *  pos;        // [n_tokens]         | i   | pos
    llama_pos    *  pos;        // [n_tokens*n_pos]   | i   | pos
    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
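With the layout change above, pos holds n_tokens*n_pos values, and the dimension list in the new comment (0 = sequential, 1 = y, 2 = x, 3 = other) describes one plane per dimension. A small illustration of indexing such a buffer; the get_pos() helper and the concrete values are hypothetical, only the plane-by-plane layout follows the vendored comment:

#include <cstdio>
#include <vector>

// pos is stored as n_pos consecutive planes of n_tokens entries each:
// plane 0 = sequential position, plane 1 = y, plane 2 = x, plane 3 = other.
static int get_pos(const std::vector<int> & pos, int n_tokens, int dim, int i) {
    return pos[dim*n_tokens + i];
}

int main() {
    const int n_tokens = 2;
    const int n_pos    = 4;
    std::vector<int> pos(n_tokens*n_pos);

    // Two image-patch embeddings at sequential positions 5 and 6,
    // located at (y=0,x=0) and (y=0,x=1) in the image grid.
    const int seq[2] = {5, 6}, y[2] = {0, 0}, x[2] = {0, 1};
    for (int i = 0; i < n_tokens; ++i) {
        pos[0*n_tokens + i] = seq[i];
        pos[1*n_tokens + i] = y[i];
        pos[2*n_tokens + i] = x[i];
        pos[3*n_tokens + i] = 0;
    }

    std::printf("token 1: seq=%d y=%d x=%d\n",
        get_pos(pos, n_tokens, 0, 1), get_pos(pos, n_tokens, 1, 1), get_pos(pos, n_tokens, 2, 1));
    return 0;
}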
32 llama/llama.cpp/src/llama-chat.cpp (vendored)
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
|
||||
};
|
||||
|
||||
llm_chat_template llm_chat_template_from_str(const std::string & name) {
|
||||
@@ -213,6 +214,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_SEED_OSS;
|
||||
} else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GROK_2;
|
||||
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
|
||||
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@@ -813,6 +816,35 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "Assistant:";
|
||||
}
|
||||
}else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
|
||||
// [unused9]系统:xxx[unused10]
|
||||
// [unused9]用户:xxx[unused10]
|
||||
// [unused9]助手:xxx[unused10]
|
||||
// ...
|
||||
for (size_t i = 0; i < chat.size(); ++i) {
|
||||
const auto & msg = chat[i];
|
||||
const std::string & role = msg->role;
|
||||
const std::string & content = msg->content;
|
||||
|
||||
if (i == 0 && role != "system") {
|
||||
ss << "[unused9]系统:[unused10]";
|
||||
}
|
||||
|
||||
if (role == "system") {
|
||||
ss << "[unused9]系统:" << content << "[unused10]";
|
||||
} else if (role == "user") {
|
||||
ss << "[unused9]用户:" << content << "[unused10]";
|
||||
} else if (role == "assistant") {
|
||||
ss << "[unused9]助手:" << content << "[unused10]";
|
||||
} else if (role == "tool") {
|
||||
ss << "[unused9]工具:" << content << "[unused10]";
|
||||
} else if (role == "function") {
|
||||
ss << "[unused9]方法:" << content << "[unused10]";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "[unused9]助手:";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
||||
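For reference, the PanGu-Embedded branch above renders a conversation as "[unused9]<role>:<content>[unused10]" segments (系统 = system, 用户 = user, 助手 = assistant), prepends an empty system segment when the first message is not a system message, and ends with "[unused9]助手:" when an assistant turn is requested. A hypothetical single-turn example of the resulting prompt string:

// Input: one user message "你好", no system message, add_ass = true.
// Expected output of the branch above (illustrative, concatenated without newlines):
const char * rendered =
    "[unused9]系统:[unused10]"
    "[unused9]用户:你好[unused10]"
    "[unused9]助手:";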
1 llama/llama.cpp/src/llama-chat.h (vendored)

@@ -53,6 +53,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_SEED_OSS,
    LLM_CHAT_TEMPLATE_GROK_2,
    LLM_CHAT_TEMPLATE_PANGU_EMBED,
    LLM_CHAT_TEMPLATE_UNKNOWN,
};
78 llama/llama.cpp/src/llama-context.cpp (vendored)
@@ -1,5 +1,6 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-io.h"
|
||||
@@ -21,6 +22,8 @@ llama_context::llama_context(
|
||||
llama_context_params params) :
|
||||
model(model),
|
||||
balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
|
||||
// TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
|
||||
// may need to be backend-dependent
|
||||
LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
|
||||
|
||||
t_start_us = model.t_start_us;
|
||||
@@ -112,11 +115,28 @@ llama_context::llama_context(
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
|
||||
|
||||
if (cparams.kv_unified) {
|
||||
cparams.n_ctx_seq = cparams.n_ctx;
|
||||
} else {
|
||||
cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
|
||||
|
||||
if (cparams.n_ctx_seq == 0) {
|
||||
throw std::runtime_error("n_ctx_seq == 0");
|
||||
}
|
||||
|
||||
if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
|
||||
cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
|
||||
LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
|
||||
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
|
||||
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
||||
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
||||
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
|
||||
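The constructor change above pads the requested context size to a multiple of 256 and, for a non-unified KV cache, derives a padded per-sequence context and adjusts n_ctx to keep it divisible by n_seq_max. A worked example with hypothetical numbers; pad_to() is a stand-in for GGML_PAD that just rounds up to a multiple:

#include <cstdio>

static unsigned pad_to(unsigned x, unsigned mult) { return ((x + mult - 1)/mult)*mult; }

int main() {
    unsigned n_ctx     = 10000; // requested context
    unsigned n_seq_max = 3;     // parallel sequences, non-unified KV cache

    n_ctx = pad_to(n_ctx, 256);                         // 10240
    unsigned n_ctx_seq = pad_to(n_ctx/n_seq_max, 256);  // 10240/3 = 3413 -> 3584
    if (n_ctx != n_ctx_seq*n_seq_max) {
        n_ctx = n_ctx_seq*n_seq_max;                    // 10752, logged as a warning in the vendored code
    }

    std::printf("n_ctx=%u n_ctx_seq=%u\n", n_ctx, n_ctx_seq); // n_ctx=10752 n_ctx_seq=3584
    return 0;
}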
@@ -125,14 +145,14 @@ llama_context::llama_context(
|
||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||
|
||||
if (n_ctx_per_seq < hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
if (cparams.n_ctx_seq < hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||
__func__, cparams.n_ctx_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
if (n_ctx_per_seq > hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
if (cparams.n_ctx_seq > hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||
__func__, cparams.n_ctx_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
@@ -228,7 +248,10 @@ llama_context::llama_context(
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes();
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
@@ -268,9 +291,7 @@ llama_context::llama_context(
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
|
||||
}
|
||||
}
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
|
||||
@@ -282,9 +303,6 @@ llama_context::llama_context(
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
n_outputs = n_seqs;
|
||||
|
||||
@@ -343,7 +361,14 @@ llama_context::llama_context(
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
|
||||
gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
}
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
}
|
||||
|
||||
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
|
||||
@@ -448,8 +473,8 @@ uint32_t llama_context::n_ctx() const {
|
||||
return cparams.n_ctx;
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_ctx_per_seq() const {
|
||||
return cparams.n_ctx / cparams.n_seq_max;
|
||||
uint32_t llama_context::n_ctx_seq() const {
|
||||
return cparams.n_ctx_seq;
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_batch() const {
|
||||
@@ -518,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
|
||||
throw std::runtime_error("failed to initialize memory context");
|
||||
}
|
||||
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
@@ -803,7 +828,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
const int64_t n_vocab = model.vocab.n_tokens();
|
||||
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
@@ -972,7 +997,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
|
||||
const bool output_all = false;
|
||||
|
||||
@@ -1223,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
if (!sorted_output && n_outputs > 1) {
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
@@ -1360,7 +1385,10 @@ void llama_context::output_reorder() {
|
||||
// graph
|
||||
//
|
||||
|
||||
uint32_t llama_context::graph_max_nodes() const {
|
||||
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
}
|
||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
}
|
||||
|
||||
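The new graph_max_nodes overload above scales the reserved graph size with the ubatch token count for Qwen3-Next and keeps the tensor-count heuristic otherwise. A hypothetical evaluation of both branches:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_tokens  = 512;   // ubatch size (hypothetical)
    const uint32_t n_tensors = 1200;  // tensors in the model (hypothetical)

    // Qwen3-Next branch: grows with the ubatch, floored by 32 nodes per tensor.
    const uint32_t qwen3next = std::max(n_tokens*40u, 32u*n_tensors); // max(20480, 38400) = 38400

    // Default branch: unchanged heuristic.
    const uint32_t dflt = std::max(1024u, 8u*n_tensors);              // 9600

    std::printf("qwen3next=%u default=%u\n", qwen3next, dflt);
    return 0;
}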
@@ -2129,7 +2157,7 @@ void llama_context::opt_epoch_iter(
|
||||
batch.logits [pos_batch] = true;
|
||||
}
|
||||
|
||||
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
|
||||
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
return;
|
||||
}
|
||||
@@ -2377,6 +2405,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
|
||||
return ctx->n_ctx();
|
||||
}
|
||||
|
||||
uint32_t llama_n_ctx_seq(const llama_context * ctx) {
|
||||
return ctx->n_ctx_seq();
|
||||
}
|
||||
|
||||
uint32_t llama_n_batch(const llama_context * ctx) {
|
||||
return ctx->n_batch();
|
||||
}
12 llama/llama.cpp/src/llama-context.h (vendored)

@@ -43,11 +43,11 @@ struct llama_context {

    ggml_backend_sched_t get_sched() const;

    uint32_t n_ctx()         const;
    uint32_t n_ctx_per_seq() const;
    uint32_t n_batch()       const;
    uint32_t n_ubatch()      const;
    uint32_t n_seq_max()     const;
    uint32_t n_ctx()     const;
    uint32_t n_ctx_seq() const;
    uint32_t n_batch()   const;
    uint32_t n_ubatch()  const;
    uint32_t n_seq_max() const;

    uint32_t n_threads() const;
    uint32_t n_threads_batch() const;
@@ -197,7 +197,7 @@ private:
    //

public:
    uint32_t graph_max_nodes() const;
    uint32_t graph_max_nodes(uint32_t n_tokens) const;

    // can reuse the llm_graph_result instance of the context (for example to update a memory module)
    llm_graph_result * get_gf_res_reserve() const;
1
llama/llama.cpp/src/llama-cparams.h
vendored
@@ -8,6 +8,7 @@

struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_ctx_seq; // context for a single sequence
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
291
llama/llama.cpp/src/llama-grammar.cpp
vendored
@@ -6,8 +6,10 @@

#include <cmath>
#include <algorithm>
#include <cstdint>
#include <stdexcept>

#define MAX_REPETITION_THRESHOLD 2000
//
// helpers
//
@@ -179,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
throw std::runtime_error("unexpected end of input");
}

static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
const char * pos = src;
if (*pos != '<') {
throw std::runtime_error(std::string("expecting '<' at ") + pos);
}
pos++;

// Parse <[id]>
if (*pos == '[') {
pos++;
const char * int_end = parse_int(pos);
uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
pos = int_end;
if (*pos != ']') {
throw std::runtime_error(std::string("expecting ']' at ") + pos);
}
pos++;
if (*pos != '>') {
throw std::runtime_error(std::string("expecting '>' at ") + pos);
}
pos++;
return std::make_pair(token_id, pos);
}

if (vocab == nullptr) {
throw std::runtime_error(std::string("no vocab to parse token at ") + src);
}

// Parse <token> and tokenize to obtain the token id
while (*pos != 0 && *pos != '>') {
pos++;
}
if (*pos != '>') {
throw std::runtime_error(std::string("expecting '>' at ") + pos);
}
pos++;

llama_token tokens[2];
int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
if (n_tokens != 1) {
// must tokenize to exactly 1 token
throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
}
return std::make_pair(tokens[0], pos);
}

static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
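Note: parse_token() above accepts two spellings for a token terminal: a literal id written as <[id]>, and a <token> string that must tokenize to exactly one (possibly special) token; the parse_sequence() hunk further down lets either form be prefixed with ! to invert the match. A hedged sketch of what such rules could look like in a grammar string — the token ids and special-token name below are hypothetical placeholders, not values from any real vocabulary:

    #include <string>

    // GBNF-style rules using the new token terminals:
    //   <[id]>   matches exactly the token with that id
    //   !<[id]>  matches any single token except that id
    //   <|name|> is tokenized and must resolve to exactly one token
    static const std::string k_example_grammar =
        "root ::= <[151644]> line !<[151645]>\n"
        "line ::= [a-zA-Z0-9 ]+\n";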
@@ -210,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
@@ -226,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
case LLAMA_GRETYPE_TOKEN:
fprintf(file, "<[");
fprintf(file, "%u", elem.value);
fprintf(file, "]> ");
break;
case LLAMA_GRETYPE_TOKEN_NOT:
fprintf(file, "!");
fprintf(file, "<[");
fprintf(file, "%u", elem.value);
fprintf(file, "]> ");
break;
}
}
fprintf(file, "\n");
@@ -282,6 +343,17 @@ static void print_rule(
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, ".");
break;
case LLAMA_GRETYPE_TOKEN:
fprintf(file, "<[");
fprintf(file, "%u", elem.value);
fprintf(file, "]> ");
break;
case LLAMA_GRETYPE_TOKEN_NOT:
fprintf(file, "!");
fprintf(file, "<[");
fprintf(file, "%u", elem.value);
fprintf(file, "]> ");
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
@@ -345,8 +417,10 @@ const char * llama_grammar_parser::parse_sequence(
size_t last_sym_start = rule.size();
const char * pos = src;

auto handle_repetitions = [&](int min_times, int max_times) {

// use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
// (though it's technically the same as -1 now)
auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
bool no_max = max_times == UINT64_MAX;
if (last_sym_start == rule.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
@@ -373,20 +447,20 @@ const char * llama_grammar_parser::parse_sequence(
rule.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
for (uint64_t i = 1; i < min_times; i++) {
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
}
}

uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
auto n_opt = no_max ? 1 : max_times - min_times;

llama_grammar_rule rec_rule(prev_rule);
for (int i = 0; i < n_opt; i++) {
for (uint64_t i = 0; i < n_opt; i++) {
rec_rule.resize(prev_rule.size());
uint32_t rec_rule_id = generate_symbol_id( rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
if (i > 0 || no_max) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
@@ -440,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
}
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '<' || *pos == '!') { // token
auto type = LLAMA_GRETYPE_TOKEN;
if (*pos == '!') { // token inverse
type = LLAMA_GRETYPE_TOKEN_NOT;
pos++;
}
auto token_pair = parse_token(vocab, pos);
const char * token_end = token_pair.second;
last_sym_start = rule.size();
rule.push_back({type, token_pair.first});
pos = parse_space(token_end, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
@@ -478,10 +563,10 @@ const char * llama_grammar_parser::parse_sequence(
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);

int max_times = -1;
uint64_t max_times = UINT64_MAX; // default: no max limit

if (*pos == '}') {
max_times = min_times;
@@ -502,6 +587,10 @@ const char * llama_grammar_parser::parse_sequence(
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
bool has_max = max_times != UINT64_MAX;
if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
}
handle_repetitions(min_times, max_times);
} else {
break;
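Note: with the bounds switched to uint64_t, UINT64_MAX now stands in for "no upper limit" and both bounds are checked against MAX_REPETITION_THRESHOLD (2000) before handle_repetitions() expands the rule. A small standalone sketch of just that validation, kept apart from the parser for illustration:

    #include <cstdint>
    #include <stdexcept>

    constexpr uint64_t k_max_repetition = 2000;       // mirrors MAX_REPETITION_THRESHOLD
    constexpr uint64_t k_no_max         = UINT64_MAX; // sentinel for an open-ended {m,}

    // throws if a {m}, {m,} or {m,n} repetition would expand into an unreasonable rule
    static void check_repetition_bounds(uint64_t min_times, uint64_t max_times) {
        const bool has_max = max_times != k_no_max;
        if (min_times > k_max_repetition || (has_max && max_times > k_max_repetition)) {
            throw std::runtime_error("number of repetitions exceeds sane defaults");
        }
    }

    int main() {
        check_repetition_bounds(3, 8);        // ok: {3,8}
        check_repetition_bounds(2, k_no_max); // ok: {2,}
        try {
            check_repetition_bounds(5000, k_no_max); // rejected
        } catch (const std::runtime_error &) {
            // expected
        }
        return 0;
    }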
@@ -683,6 +772,21 @@ static bool llama_grammar_match_partial_char(
return !is_positive_char;
}

// returns true iff token matches the rule at pos (regular or inverse)
// asserts that pos is pointing to a token element
static bool llama_grammar_match_token(
const llama_grammar_element * pos,
const llama_token token) {
GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
if (pos->type == LLAMA_GRETYPE_TOKEN) {
return pos->value == static_cast<uint32_t>(token);
}
if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
return pos->value != static_cast<uint32_t>(token);
}
return false;
}

// transforms a grammar pushdown stack into N possible stacks, all ending
// at a character range (terminal element)
static void llama_grammar_advance_stack(
@@ -730,6 +834,8 @@ static void llama_grammar_advance_stack(
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_ANY:
case LLAMA_GRETYPE_TOKEN:
case LLAMA_GRETYPE_TOKEN_NOT:
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
// only add the stack if it's not a duplicate of one we already have
new_stacks.emplace_back(stack);
@@ -823,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
return grammar->stacks;
}

static void llama_grammar_accept_chr(
struct llama_grammar & grammar,
const llama_grammar_stack & stack,
uint32_t chr,
llama_grammar_stacks & new_stacks) {
if (stack.empty()) {
return;
}

const llama_grammar_element * pos = stack.back();

// ignore if this turns into a token
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
return;
}

auto match = llama_grammar_match_char(pos, chr);
if (match.first) {
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(match.second)) {
new_stack.push_back(match.second);
}
llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
}
}

void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
llama_grammar_stacks stacks_new;
stacks_new.reserve(grammar->stacks.size());

for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
continue;
}

auto match = llama_grammar_match_char(stack.back(), chr);
if (match.first) {
const llama_grammar_element * pos = match.second;

// update top of stack to next element, if any
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos)) {
new_stack.push_back(pos);
}
llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
}
llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
}

grammar->stacks = std::move(stacks_new);
@@ -867,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

const llama_grammar_element * stack_pos = stack.back();

// if the top of the stack is a token rule, then we only need to check the token id
if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
for (const auto & tok : candidates) {
if (*tok.code_points == 0) {
// reached the end of a token consumed by char rules, reject iff it ended
// in a partial response
if (tok.partial_utf8.n_remain != 0) {
rejects.push_back(tok);
}
} else if (!llama_grammar_match_token(stack_pos, tok.id)) {
rejects.push_back(tok);
}
}
return rejects;
}

llama_grammar_candidates next_candidates;
next_candidates.reserve(candidates.size());

@@ -879,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
rejects.push_back(tok);
}
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
} else {
rejects.push_back(tok);
}
@@ -897,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
for (const auto & tok : next_rejects) {
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
}

return rejects;
@@ -966,12 +1100,13 @@ struct llama_grammar * llama_grammar_init_impl(
ollama_vocab,
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
/* .lazy =*/ false,
/* .awaiting_trigger = */ false,
/* .trigger_buffer = */ "",
/* .trigger_tokens = */ {},
/* .trigger_patterns = */ {},
/* .partial_utf8 = */ {},
/* .lazy = */ false,
/* .awaiting_trigger = */ false,
/* .trigger_buffer = */ "",
/* .trigger_buffer_positions = */ {},
/* .trigger_tokens = */ {},
/* .trigger_patterns = */ {},
};
}

@@ -985,7 +1120,7 @@ struct llama_grammar * llama_grammar_init_impl(
size_t num_trigger_patterns,
const llama_token * trigger_tokens,
size_t num_trigger_tokens) {
llama_grammar_parser parser;
llama_grammar_parser parser(vocab);

// if there is a grammar, parse it
// rules will be empty (default) if there are parse errors
@@ -1073,10 +1208,11 @@ struct llama_grammar * llama_grammar_init_impl(
ollama_vocab,
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
/* .lazy = */ lazy,
/* .awaiting_trigger = */ lazy,
/* .trigger_buffer = */ "",
/* .partial_utf8 = */ {},
/* .lazy = */ lazy,
/* .awaiting_trigger = */ lazy,
/* .trigger_buffer = */ "",
/* .trigger_buffer_positions = */ {},
std::move(vec_trigger_tokens),
std::move(vec_trigger_patterns),
};
@@ -1100,6 +1236,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
grammar.lazy,
grammar.awaiting_trigger,
grammar.trigger_buffer,
grammar.trigger_buffer_positions,
grammar.trigger_tokens,
grammar.trigger_patterns,
};
@@ -1156,7 +1293,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
cur_p->data[i].logit = -INFINITY;
} else {
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
}
}

@@ -1176,10 +1313,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
grammar.awaiting_trigger = false;
grammar.trigger_buffer.clear();
llama_grammar_accept_str(grammar, piece);
llama_grammar_accept_token(grammar, token, piece);
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
return;
} else {
auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
grammar.trigger_buffer += piece;

std::smatch match;
@@ -1197,10 +1336,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
if (start == std::string::npos) {
start = match.position(0);
}

// replay tokens that overlap with [start, end)
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
auto [tok_start, tok_end] = tok_pos;
if (tok_end <= start) {
continue;
}

size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
size_t piece_len = tok_end - piece_start;
auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
llama_grammar_accept_token(grammar, tok, tok_piece);
}

auto constrained_str = grammar.trigger_buffer.substr(start);
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
grammar.trigger_buffer.clear();
llama_grammar_accept_str(grammar, constrained_str);
grammar.trigger_buffer_positions.clear();
LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
return;
}
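Note: the hunk above records, for each buffered token, the half-open byte range [start, end) its piece occupies in trigger_buffer, so that once a lazy-grammar trigger fires the buffered tokens can be replayed from the match position onward, with the first overlapping piece possibly truncated. A standalone sketch of just that bookkeeping, using plain std types and made-up token ids and pieces:

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    using token_pos = std::pair<int, std::pair<size_t, size_t>>; // (token id, [start, end) in buffer)

    int main() {
        std::string buffer;
        std::vector<token_pos> positions;

        // buffer three hypothetical token pieces, remembering where each one landed
        for (const auto & [id, piece] : std::vector<std::pair<int, std::string>>{
                 {11, "Hello "}, {12, "<tool"}, {13, "_call>"}}) {
            positions.push_back({id, {buffer.size(), buffer.size() + piece.size()}});
            buffer += piece;
        }

        // pretend a trigger pattern matched starting at "<tool" (byte offset 6)
        const size_t start = buffer.find("<tool");

        // replay only the pieces that overlap [start, buffer.size()), trimming the first one
        for (const auto & [id, pos] : positions) {
            const auto [tok_start, tok_end] = pos;
            if (tok_end <= start) {
                continue; // entirely before the trigger point, skip
            }
            const size_t piece_start = tok_start < start ? start : tok_start;
            printf("replay token %d: '%s'\n", id, buffer.substr(piece_start, tok_end - piece_start).c_str());
        }
        return 0;
    }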
@@ -1220,7 +1372,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
GGML_ABORT("grammar error: end of grammar token received but grammar stack is not empty");
}

llama_grammar_accept_str(grammar, piece);
llama_grammar_accept_token(grammar, token, piece);
}

void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
@@ -1238,6 +1390,61 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
}
}

void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
const auto & code_points = decoded.first;

llama_grammar_stacks stacks_new;
stacks_new.reserve(grammar.stacks.size());

for (const auto & stack : grammar.stacks) {
if (stack.empty()) {
continue;
}

const llama_grammar_element * pos = stack.back();

if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
if (llama_grammar_match_token(pos, token)) {
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
new_stack.push_back(pos + 1);
}
llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
}
} else {
llama_grammar_stacks current_stacks = {stack};

for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
llama_grammar_stacks next_stacks;

for (const auto & cur_stack : current_stacks) {
llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
}

current_stacks = std::move(next_stacks);
if (current_stacks.empty()) {
break;
}
}

for (auto & surviving_stack : current_stacks) {
if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
stacks_new.emplace_back(surviving_stack);
}
}
}
}

grammar.stacks = std::move(stacks_new);
grammar.partial_utf8 = decoded.second;

if (grammar.stacks.empty()) {
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
}
}


const std::string & ollama_vocab::token_to_piece(const uint32_t token) const {
try {
21
llama/llama.cpp/src/llama-grammar.h
vendored
@@ -47,11 +47,17 @@ enum llama_gretype {

// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,

// terminal element: token (<[token-id]>)
LLAMA_GRETYPE_TOKEN = 8,

// inverse token (!<[token-id]>)
LLAMA_GRETYPE_TOKEN_NOT = 9,
};

typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
uint32_t value; // Unicode code point, rule ID, or token ID
} llama_grammar_element;

struct llama_partial_utf8 {
@@ -63,6 +69,7 @@ struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
llama_token id;
};

using llama_grammar_rule = std::vector< llama_grammar_element>;
@@ -88,10 +95,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_candidates & candidates);

struct llama_grammar_parser {
const llama_vocab * vocab;
std::map<std::string, uint32_t> symbol_ids;

llama_grammar_rules rules;

llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}

llama_grammar_stack c_rules() const;

uint32_t get_symbol_id(const char * src, size_t len);
@@ -123,6 +133,9 @@ struct llama_grammar_trigger_pattern {
};

struct llama_grammar {
// maintain a list of llama_tokens and their positions in the trigger_buffer
using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;

// note: allow null vocab for testing (not great)
const llama_vocab * vocab;
const ollama_vocab * o_vocab;
@@ -139,6 +152,7 @@ struct llama_grammar {
bool lazy = false;
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
std::vector<llama_grammar_trigger_pattern>
trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
@@ -185,3 +199,8 @@ void llama_grammar_accept_impl(
void llama_grammar_accept_str(
struct llama_grammar & grammar,
const std::string & piece);

void llama_grammar_accept_token(
struct llama_grammar & grammar,
llama_token token,
const std::string & piece);
27
llama/llama.cpp/src/llama-graph.cpp
vendored
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;

GGML_ASSERT(f_attn_temp_scale != 0.0f);
GGML_ASSERT(n_attn_temp_floor_scale != 0);

std::vector<float> attn_scale_data(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
const float pos = ubatch->pos[i];
@@ -958,25 +961,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// organize experts into n_expert_groups
ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]

ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]

// get top n_group_used expert groups
group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]

ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
cb(expert_groups, "ffn_moe_group_topk", il);

// mask out the other groups
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
cb(selection_probs, "ffn_moe_probs_masked", il);
}

// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
cb(selected_experts->src[0], "ffn_moe_argsort", il);
cb(selected_experts, "ffn_moe_topk", il);

@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
cb(weights_sum, "ffn_moe_weights_sum", il);

if (arch == LLM_ARCH_BAILINGMOE2) {
weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
cb(weights_sum, "ffn_moe_weights_sum_biased", il);
}
// Avoid division by zero, clamp to smallest number representable by F16
weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
cb(weights_sum, "ffn_moe_weights_sum_clamped", il);

weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
cb(weights, "ffn_moe_weights_norm", il);
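Note: the hunks above change how the grouped expert routing is expressed in ggml (argsort-based top-k, and masking unused groups with -INFINITY via ggml_fill), but the selection math stays the same: score each expert group by the sum of its two best router probabilities, keep the best n_group_used groups, pick the top n_expert_used experts overall, then renormalize their weights with a clamp (6.103515625e-5 is the smallest normal f16 value). A scalar sketch of that selection for a single token, with made-up sizes and probabilities:

    #include <algorithm>
    #include <cstdio>
    #include <limits>
    #include <numeric>
    #include <vector>

    // indices of the k largest values of v, in descending order (argsort-style top-k)
    static std::vector<int> top_k(const std::vector<float> & v, int k) {
        std::vector<int> idx(v.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return v[a] > v[b]; });
        idx.resize(k);
        return idx;
    }

    int main() {
        // hypothetical router probabilities: 8 experts split into 4 groups of 2
        const std::vector<float> probs = {0.05f, 0.30f, 0.10f, 0.02f, 0.25f, 0.08f, 0.15f, 0.05f};
        const int n_groups = 4, n_per_group = 2, n_group_used = 2, n_expert_used = 2;

        // score each group by the sum of its top-2 probabilities (here that is the whole group)
        std::vector<float> group_scores(n_groups);
        for (int g = 0; g < n_groups; ++g) {
            std::vector<float> grp(probs.begin() + g*n_per_group, probs.begin() + (g + 1)*n_per_group);
            const auto best = top_k(grp, 2);
            group_scores[g] = grp[best[0]] + grp[best[1]];
        }

        // keep the best n_group_used groups; experts of all other groups are masked out
        std::vector<float> masked(probs.size(), -std::numeric_limits<float>::infinity());
        for (int g : top_k(group_scores, n_group_used)) {
            for (int e = 0; e < n_per_group; ++e) {
                masked[g*n_per_group + e] = probs[g*n_per_group + e];
            }
        }

        // pick the final experts and renormalize their weights (clamp avoids division by zero)
        const auto experts = top_k(masked, n_expert_used);
        float sum = 0.0f;
        for (int e : experts) sum += probs[e];
        sum = std::max(sum, 6.103515625e-5f);
        for (int e : experts) printf("expert %d weight %.3f\n", e, probs[e]/sum);
        return 0;
    }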
@@ -1137,7 +1139,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

// input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd_inp();

auto inp = std::make_unique<llm_graph_input_embd>();

@@ -1274,7 +1276,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
// return cur;
//}

const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
@@ -1587,9 +1589,10 @@ ggml_tensor * llm_graph_context::build_attn(
int il) const {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, k_cur);
ggml_build_forward_expand(gf, v_cur);
ggml_build_forward_expand(gf, k_cur);

const auto * mctx_cur = inp->mctx;

@@ -2030,7 +2033,7 @@ int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buck

if (bidirectional) {
relative_bucket += (relative_position > 0) * n_buckets;
relative_position = abs(relative_position);
relative_position = std::abs(relative_position);
} else {
relative_position = -std::min<int32_t>(relative_position, 0);
}
12
llama/llama.cpp/src/llama-hparams.cpp
vendored
@@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
return n_head/n_head_kv;
}

uint32_t llama_hparams::n_embd_inp() const {
uint32_t n_embd_inp = n_embd;

if (n_deepstack_layers > 0) {
n_embd_inp += n_embd * n_deepstack_layers;
}

return n_embd_inp;
}

uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);

@@ -148,7 +158,7 @@ bool llama_hparams::is_recurrent(uint32_t il) const {
}

uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}

bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
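Note: n_embd_inp() above widens the input embedding size when deepstack layers are present (the qwen3vl deepstack field added in llama-hparams.h below): the main embedding plus one extra n_embd slice per deepstack layer. A one-line worked example with hypothetical sizes:

    #include <cstdint>
    #include <cstdio>

    // mirrors llama_hparams::n_embd_inp(): main embedding + one slice per deepstack layer
    static uint32_t n_embd_inp(uint32_t n_embd, uint32_t n_deepstack_layers) {
        return n_embd + n_embd * n_deepstack_layers;
    }

    int main() {
        // hypothetical model: n_embd = 2048 with 3 deepstack layers -> 8192 input dims
        printf("%u\n", n_embd_inp(2048, 3));
        return 0;
    }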
12
llama/llama.cpp/src/llama-hparams.h
vendored
@@ -6,7 +6,7 @@

// bump if necessary
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
#define LLAMA_MAX_EXPERTS 512 // Qwen3 Next

enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -164,8 +164,8 @@ struct llama_hparams {
// llama4 smallthinker
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
uint32_t n_attn_temp_floor_scale = 8192;
float f_attn_temp_scale = 0.1;
uint32_t n_attn_temp_floor_scale = 0;
float f_attn_temp_scale = 0.0f;

// gemma3n altup
uint32_t n_altup = 4; // altup_num_inputs
@@ -185,6 +185,9 @@ struct llama_hparams {
std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
std::array<float, LLAMA_MAX_LAYERS> xielu_eps;

// qwen3vl deepstack
uint32_t n_deepstack_layers = 0;

// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
@@ -226,6 +229,9 @@ struct llama_hparams {

uint32_t n_gqa(uint32_t il = 0) const;

// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;

// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
6
llama/llama.cpp/src/llama-impl.cpp
vendored
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

time_meas::~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}

void llama_log_set(ggml_log_callback log_callback, void * user_data) {
ggml_log_set(log_callback, user_data);
2
llama/llama.cpp/src/llama-impl.h
vendored
@@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
template <typename T>
struct no_init {
T value;
no_init() { /* do nothing */ }
no_init() = default;
};

struct time_meas {
4
llama/llama.cpp/src/llama-kv-cache-iswa.cpp
vendored
@@ -45,7 +45,9 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(

const uint32_t size_base = kv_size;

uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
// note: the SWA cache is always padded to 256 for performance
// https://github.com/ggml-org/llama.cpp/issues/17037
uint32_t size_swa = GGML_PAD(std::min(size_base, hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch), 256);

// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
if (swa_full) {
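Note: the change above pads the final SWA cache size to a multiple of 256 (previously only the inner term was padded, to n_pad). A tiny sketch of the round-up with GGML_PAD re-expressed as a plain helper (assuming a power-of-two alignment, as ggml does) and hypothetical sizes:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // round x up to a multiple of the (power-of-two) alignment, like GGML_PAD
    static uint32_t pad_to(uint32_t x, uint32_t align) {
        return (x + align - 1) & ~(align - 1);
    }

    int main() {
        // hypothetical values: window 4000, one stream, microbatch 512, base cache 8192
        const uint32_t size_base = 8192, n_swa = 4000, n_ubatch = 512;
        const uint32_t size_swa  = pad_to(std::min(size_base, n_swa*1 + n_ubatch), 256);
        printf("size_swa = %u\n", size_swa); // 4512 rounded up to 4608
        return 0;
    }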
77
llama/llama.cpp/src/llama-kv-cache.cpp
vendored
@@ -8,6 +8,7 @@
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <limits>
#include <map>
#include <stdexcept>
@@ -37,8 +38,15 @@ llama_kv_cache::llama_kv_cache(

const uint32_t n_layer_kv = hparams.n_layer_kv();

// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
}
};
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;

// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
@@ -53,13 +61,12 @@ llama_kv_cache::llama_kv_cache(
return nullptr;
}

ctx_map[buft] = ctx;
ctxs.emplace_back(ctx);
ctx_map.emplace(buft, ctx);

return ctx;
}

return it->second;
return it->second.get();
};

GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
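Note: the map above now owns its ggml contexts (ggml_context_ptr) and orders its keys with a comparator that compares buffer-type names, so iteration order no longer depends on pointer values. A standalone sketch of the same pattern, with a toy key type standing in for ggml_backend_buffer_type_t:

    #include <cstdio>
    #include <cstring>
    #include <map>

    // stand-in for a backend buffer type: identified only by its name
    struct buft { const char * name; };

    // order keys by name so that map iteration is deterministic across runs
    struct buft_name_comparator {
        bool operator()(const buft & lhs, const buft & rhs) const {
            return strcmp(lhs.name, rhs.name) < 0;
        }
    };

    int main() {
        std::map<buft, int, buft_name_comparator> sizes;
        sizes.emplace(buft{"Metal"}, 512);
        sizes.emplace(buft{"CPU"},   128);

        for (const auto & [key, mib] : sizes) {
            printf("%s: %d MiB\n", key.name, mib); // always CPU first, then Metal
        }
        return 0;
    }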
@@ -167,11 +174,8 @@ llama_kv_cache::llama_kv_cache(
}

// allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto it : ctx_map) {
auto * buft = it.first;
auto * ctx = it.second;

ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
for (auto & [buft, ctx] : ctx_map) {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
if (!buf) {
throw std::runtime_error("failed to allocate buffer for kv cache");
}
@@ -179,7 +183,7 @@ llama_kv_cache::llama_kv_cache(
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

ggml_backend_buffer_clear(buf, 0);
bufs.emplace_back(buf);
ctxs_bufs.emplace_back(std::move(ctx), buf);
}

{
@@ -203,7 +207,7 @@ void llama_kv_cache::clear(bool data) {
}

if (data) {
for (auto & buf : bufs) {
for (auto & [_, buf] : ctxs_bufs) {
ggml_backend_buffer_clear(buf.get(), 0);
}
}
@@ -334,6 +338,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
llama_pos pos = v_cells[s0].pos_get(i);
llama_pos shift = v_cells[s0].get_shift(i);

llama_kv_cell_ext ext = v_cells[s0].ext_get(i);

if (shift != 0) {
pos -= shift;
assert(pos >= 0);
@@ -345,6 +351,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
if (shift != 0) {
v_cells[s1].pos_add(i, shift);
}

v_cells[s1].ext_set(i, ext);
}
}

@@ -379,6 +387,7 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) {

void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1");

auto & cells = v_cells[seq_to_stream[seq_id]];
auto & head = v_heads[seq_to_stream[seq_id]];
@@ -423,6 +432,7 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll

void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1");

auto & cells = v_cells[seq_to_stream[seq_id]];

@@ -472,8 +482,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
for (const auto & [_, buf] : ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
}
return ret;
}
@@ -896,6 +906,14 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &

cells.pos_set(idx, ubatch.pos[i]);

if (ubatch.is_pos_2d()) {
llama_kv_cell_ext ext {
/*.x =*/ ubatch.pos[i + ubatch.n_tokens*2],
/*.y =*/ ubatch.pos[i + ubatch.n_tokens],
};
cells.ext_set(idx, ext);
}

for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
cells.seq_add(idx, ubatch.seq_id[i][s]);
}
@@ -957,10 +975,14 @@ bool llama_kv_cache::get_has_shift() const {
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;

// pad the n_kv value so that the graph remains constant across batches and can be reused
// note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
const uint32_t n_pad_cur = std::max(n_pad, 256u);

for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
const auto & cells = v_cells[sinfo.strm[s]];

result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
}

return result;
@@ -1239,6 +1261,11 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u

const llama_pos p1 = ubatch->pos[i];

// for M-RoPE
const bool is_2d = ubatch->is_pos_2d();
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;

const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);

for (uint32_t j = 0; j < n_kv; ++j) {
@@ -1258,6 +1285,14 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
continue;
}

// M-RoPE causal mask
if (causal_attn && is_2d && p0 == p1) {
const auto & p0_ext = cells.ext_get(j);
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
continue;
}
}

// apply SWA if any
if (is_masked_swa(p0, p1)) {
continue;
@@ -1298,7 +1333,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
size_t llama_kv_cache::total_size() const {
size_t size = 0;

for (const auto & buf : bufs) {
for (const auto & [_, buf] : ctxs_bufs) {
size += ggml_backend_buffer_get_size(buf.get());
}

@@ -1340,7 +1375,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
const auto & yarn_beta_slow = cparams.yarn_beta_slow;

const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering
@@ -1551,6 +1586,9 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t
io.write(&pos, sizeof(pos));
io.write(&n_seq_id, sizeof(n_seq_id));

// TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350

for (const auto & seq_id : seq_ids) {
io.write(&seq_id, sizeof(seq_id));
}
@@ -1696,6 +1734,8 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
return false;
}

// TODO: we cannot yet restore llama_kv_cell_ext as the apply_ubatch() does not support it yet
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
apply_ubatch(sinfo, ubatch);

const auto head_cur = sinfo.head();
@@ -2010,8 +2050,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
kv->set_input_pos_bucket(dst, ubatch);
}

uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
// the FA kernels require padding to avoid extra runtime boundary checks
return cparams.flash_attn ? 256u : 32u;
}
6
llama/llama.cpp/src/llama-kv-cache.h
vendored
@@ -19,8 +19,6 @@ struct llama_context;

class llama_kv_cache : public llama_memory_i {
public:
static uint32_t get_padding(const llama_cparams & cparams);

struct stream_copy_info {
bool empty() const {
assert(ssrc.size() == sdst.size());
@@ -217,8 +215,8 @@ private:
// this is the SWA type of the cache - not to be confused with the model SWA type
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
// ggml contexts for the KV cache along with the allocated backend buffers:
std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
46
llama/llama.cpp/src/llama-kv-cells.h
vendored
@@ -5,9 +5,27 @@

#include <bitset>
#include <cassert>
#include <vector>
#include <set>
#include <cstring>
#include <map>
#include <set>
#include <vector>

struct llama_kv_cell_ext {
// 2D spatial positions, typically used for M-RoPE
llama_pos x = 0;
llama_pos y = 0;

// return true if the current 2D spatial position is greater than other
bool is_2d_gt(llama_pos ox, llama_pos oy) const {
return (y > oy) || (y == oy && x > ox);
}

void reset() {
static_assert(std::is_trivially_copyable_v<llama_kv_cell_ext>);

memset(this, 0, sizeof(*this));
}
};

// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
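Note: is_2d_gt() above orders the 2D (M-RoPE) positions row-major: a cell compares greater if its y is larger, or the ys are equal and its x is larger; the set_input_kq_mask() hunk earlier in this diff uses it to keep attention causal between tokens that share the same 1D position. A small standalone check of that ordering with made-up coordinates:

    #include <cassert>

    struct pos2d {
        int x = 0;
        int y = 0;

        // row-major "greater than": later row wins, same row falls back to column
        bool is_2d_gt(int ox, int oy) const {
            return (y > oy) || (y == oy && x > ox);
        }
    };

    int main() {
        const pos2d p{3, 2};
        assert( p.is_2d_gt(5, 1)); // the other position is on an earlier row -> greater
        assert( p.is_2d_gt(2, 2)); // same row, further right -> greater
        assert(!p.is_2d_gt(3, 2)); // identical position -> not greater
        assert(!p.is_2d_gt(0, 4)); // the other position is on a later row -> not greater
        return 0;
    }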
@@ -16,6 +34,7 @@ public:
void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) {
pos[i] = -1;
ext[i].reset();
shift[i] = 0;
seq[i].reset();
}
@@ -43,6 +62,7 @@ public:

void resize(uint32_t n) {
pos.resize(n);
ext.resize(n);
shift.resize(n);
seq.resize(n);

@@ -108,6 +128,7 @@ public:
const auto idx = i + j;

res.pos[j] = pos[idx];
res.ext[j] = ext[idx];
res.seq[j] = seq[idx];

assert(shift[idx] == 0);
@@ -126,6 +147,7 @@ public:
const auto idx = idxs[j];

res.pos[j] = pos[idx];
res.ext[j] = ext[idx];
res.seq[j] = seq[idx];

assert(shift[idx] == 0);
@@ -154,6 +176,7 @@ public:
}

pos[idx] = other.pos[j];
ext[idx] = other.ext[j];
seq[idx] = other.seq[j];

if (pos[idx] != -1) {
@@ -184,6 +207,7 @@ public:
}

pos[idx] = other.pos[j];
ext[idx] = other.ext[j];
seq[idx] = other.seq[j];

if (pos[idx] != -1) {
@@ -203,6 +227,7 @@ public:
seq[i].reset();

pos[i] = -1;
ext[i].reset();
shift[i] = 0;

used.erase(i);
@@ -221,6 +246,7 @@ public:

if (seq[i].none()) {
pos[i] = -1;
ext[i].reset();
shift[i] = 0;

used.erase(i);
@@ -250,6 +276,7 @@ public:
seq[i].reset();

pos[i] = -1;
ext[i].reset();
shift[i] = 0;

used.erase(i);
@@ -340,6 +367,13 @@ public:
return pos[i];
}

const llama_kv_cell_ext & ext_get(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);

return ext[i];
}

// note: call only if the cell is not empty
llama_pos get_shift(uint32_t i) const {
assert(i < pos.size());
@@ -368,6 +402,11 @@ public:
used.insert(i);
}

void ext_set(uint32_t i, llama_kv_cell_ext p) {
assert(i < ext.size());
ext[i] = p;
}

// pos[i] = pos[i] + d
// sets "has_shift" to true
// note: call only if the cell is not empty
@@ -424,6 +463,9 @@ private:

std::vector<llama_pos> pos;

// stores extra info per cell
std::vector<llama_kv_cell_ext> ext;

// this array accumulates any applied shifts to the pos array since the last reset_shift() call
// this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
//
Some files were not shown because too many files have changed in this diff.