Compare commits
100 Commits
v0.12.11-r
...
hoyyeva/re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
42d6a3f075 | ||
|
|
ed553f51f7 | ||
|
|
7d6f0c621f | ||
|
|
de9ecfd01c | ||
|
|
95fdd8d619 | ||
|
|
9f7822851c | ||
|
|
9b2035d194 | ||
|
|
93d45d7a04 | ||
|
|
709f842457 | ||
|
|
2dfb74410d | ||
|
|
1eb5e75972 | ||
|
|
3475d915cb | ||
|
|
48e78e9be1 | ||
|
|
a838421ea3 | ||
|
|
1c4e85b4df | ||
|
|
dac4f17fea | ||
|
|
56b8fb024c | ||
|
|
b95693056c | ||
|
|
c34fc64688 | ||
|
|
7cf6f18c1f | ||
|
|
bbbb6b2a01 | ||
|
|
76f88caf43 | ||
|
|
2bccf8c624 | ||
|
|
0c5e5f6630 | ||
|
|
d475d1f081 | ||
|
|
d2f334c1f7 | ||
|
|
603ceefaa6 | ||
|
|
e082d60a24 | ||
|
|
5dae738067 | ||
|
|
0c78723174 | ||
|
|
5a41d69b2a | ||
|
|
c146a138e3 | ||
|
|
31b8c6a214 | ||
|
|
9191dfaf05 | ||
|
|
1108d8b34e | ||
|
|
7837a5bc7e | ||
|
|
0a844f8e96 | ||
|
|
a03223b86f | ||
|
|
0cf7794b16 | ||
|
|
854d40edc5 | ||
|
|
84a2cedf18 | ||
|
|
3f30836734 | ||
|
|
cc9555aff0 | ||
|
|
20aee96706 | ||
|
|
18b5958d46 | ||
|
|
5317202c38 | ||
|
|
d771043e88 | ||
|
|
f8f1071818 | ||
|
|
d3e0a0dee4 | ||
|
|
554172759c | ||
|
|
5b6a8e6001 | ||
|
|
467bbc0dd5 | ||
|
|
6d9f9323c5 | ||
|
|
0c2489605d | ||
|
|
8b1b89a984 | ||
|
|
47e272c35a | ||
|
|
417a81fda3 | ||
|
|
dba62ff3a5 | ||
|
|
d70e935526 | ||
|
|
5c1063df7f | ||
|
|
cb485b2019 | ||
|
|
b2af50960f | ||
|
|
eac5b8bfbd | ||
|
|
604e43b28d | ||
|
|
53985b3c4d | ||
|
|
b6e02cbbd2 | ||
|
|
91935631ac | ||
|
|
8de30b568a | ||
|
|
485da9fd35 | ||
|
|
0796d79d19 | ||
|
|
92981ae3f2 | ||
|
|
8ed1adf3db | ||
|
|
440a3823a6 | ||
|
|
718961de68 | ||
|
|
330f62a7fa | ||
|
|
584e2d646f | ||
|
|
1fd4cb87b2 | ||
|
|
4aba2e8b72 | ||
|
|
2f36d769aa | ||
|
|
399eacf486 | ||
|
|
231cc878cb | ||
|
|
aa676b313f | ||
|
|
dd0ed0ef17 | ||
|
|
d5649821ae | ||
|
|
4cea757e70 | ||
|
|
a751bc159c | ||
|
|
5d31242fbf | ||
|
|
d7fd72193f | ||
|
|
72ff5b9d8c | ||
|
|
ce29f695b4 | ||
|
|
12b174b10e | ||
|
|
333203d871 | ||
|
|
c114987523 | ||
|
|
b48083f33f | ||
|
|
482bec824f | ||
|
|
684a9a8c5a | ||
|
|
54a76d3773 | ||
|
|
8a75d8b015 | ||
|
|
f206357412 | ||
|
|
8224cd9063 |
4
.gitattributes
vendored
4
.gitattributes
vendored
@@ -15,8 +15,12 @@ ml/backend/**/*.cu linguist-vendored
|
||||
ml/backend/**/*.cuh linguist-vendored
|
||||
ml/backend/**/*.m linguist-vendored
|
||||
ml/backend/**/*.metal linguist-vendored
|
||||
ml/backend/**/*.comp linguist-vendored
|
||||
ml/backend/**/*.glsl linguist-vendored
|
||||
ml/backend/**/CMakeLists.txt linguist-vendored
|
||||
|
||||
app/webview linguist-vendored
|
||||
|
||||
llama/build-info.cpp linguist-generated
|
||||
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
|
||||
|
||||
|
||||
20
.github/workflows/release.yaml
vendored
20
.github/workflows/release.yaml
vendored
@@ -16,13 +16,15 @@ jobs:
|
||||
outputs:
|
||||
GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }}
|
||||
VERSION: ${{ steps.goflags.outputs.VERSION }}
|
||||
vendorsha: ${{ steps.changes.outputs.vendorsha }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set environment
|
||||
id: goflags
|
||||
run: |
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
|
||||
echo VERSION="${GITHUB_REF_NAME#v}" >>$GITHUB_OUTPUT
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" | tee -a $GITHUB_OUTPUT
|
||||
echo VERSION="${GITHUB_REF_NAME#v}" | tee -a $GITHUB_OUTPUT
|
||||
echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT
|
||||
|
||||
darwin-build:
|
||||
runs-on: macos-14-xlarge
|
||||
@@ -53,6 +55,9 @@ jobs:
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- run: |
|
||||
./scripts/build_darwin.sh
|
||||
- name: Log build results
|
||||
@@ -185,14 +190,14 @@ jobs:
|
||||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ${{ github.workspace }}\.ccache
|
||||
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
|
||||
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}-${{ needs.setup-environment.outputs.vendorsha }}
|
||||
- name: Build target "${{ matrix.preset }}"
|
||||
run: |
|
||||
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
|
||||
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
|
||||
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} --install-prefix "$((pwd).Path)\dist\${{ matrix.os }}-${{ matrix.arch }}"
|
||||
cmake --build --parallel ([Environment]::ProcessorCount) --preset "${{ matrix.preset }}"
|
||||
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip
|
||||
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip
|
||||
Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
|
||||
env:
|
||||
CMAKE_GENERATOR: Ninja
|
||||
@@ -249,6 +254,9 @@ jobs:
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- name: Verify gcc is actually clang
|
||||
run: |
|
||||
$ErrorActionPreference='Continue'
|
||||
@@ -302,6 +310,9 @@ jobs:
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: depends-windows*
|
||||
@@ -366,6 +377,7 @@ jobs:
|
||||
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/vulkan*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
|
||||
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
|
||||
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
|
||||
|
||||
18
.github/workflows/test.yaml
vendored
18
.github/workflows/test.yaml
vendored
@@ -22,6 +22,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
changed: ${{ steps.changes.outputs.changed }}
|
||||
vendorsha: ${{ steps.changes.outputs.vendorsha }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -37,6 +38,7 @@ jobs:
|
||||
}
|
||||
|
||||
echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
|
||||
echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT
|
||||
|
||||
linux:
|
||||
needs: [changes]
|
||||
@@ -83,7 +85,7 @@ jobs:
|
||||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: /github/home/.cache/ccache
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
|
||||
- run: |
|
||||
cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
|
||||
cmake --build --preset ${{ matrix.preset }} --parallel
|
||||
@@ -178,7 +180,7 @@ jobs:
|
||||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ${{ github.workspace }}\.ccache
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
|
||||
- run: |
|
||||
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
|
||||
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
|
||||
@@ -206,6 +208,9 @@ jobs:
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: 'go.mod'
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
@@ -226,12 +231,9 @@ jobs:
|
||||
if: always()
|
||||
run: go test -count=1 -benchtime=1x ./...
|
||||
|
||||
# TODO(bmizerany): replace this heavy tool with just the
|
||||
# tools/checks/binaries we want and then make them all run in parallel
|
||||
# across jobs, not on a single tiny vm on Github Actions.
|
||||
- uses: golangci/golangci-lint-action@v6
|
||||
- uses: golangci/golangci-lint-action@v9
|
||||
with:
|
||||
args: --timeout 10m0s -v
|
||||
only-new-issues: true
|
||||
|
||||
patches:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -240,4 +242,4 @@ jobs:
|
||||
- name: Verify patches apply cleanly and do not change files
|
||||
run: |
|
||||
make -f Makefile.sync clean checkout apply-patches sync
|
||||
git diff --compact-summary --exit-code
|
||||
git diff --compact-summary --exit-code
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
run:
|
||||
timeout: 5m
|
||||
version: "2"
|
||||
linters:
|
||||
enable:
|
||||
- asasalint
|
||||
@@ -7,35 +6,46 @@ linters:
|
||||
- bodyclose
|
||||
- containedctx
|
||||
- gocheckcompilerdirectives
|
||||
- gofmt
|
||||
- gofumpt
|
||||
- gosimple
|
||||
- govet
|
||||
- ineffassign
|
||||
- intrange
|
||||
- makezero
|
||||
- misspell
|
||||
- nilerr
|
||||
- nolintlint
|
||||
- nosprintfhostport
|
||||
- staticcheck
|
||||
- unconvert
|
||||
- usetesting
|
||||
- wastedassign
|
||||
- whitespace
|
||||
disable:
|
||||
- usestdlibvars
|
||||
- errcheck
|
||||
linters-settings:
|
||||
staticcheck:
|
||||
checks:
|
||||
- all
|
||||
- -SA1019 # omit Deprecated check
|
||||
- usestdlibvars
|
||||
settings:
|
||||
govet:
|
||||
disable:
|
||||
- unusedresult
|
||||
staticcheck:
|
||||
checks:
|
||||
- all
|
||||
- -QF* # disable quick fix suggestions
|
||||
- -SA1019
|
||||
- -ST1000 # package comment format
|
||||
- -ST1003 # underscores in package names
|
||||
- -ST1005 # error strings should not be capitalized
|
||||
- -ST1012 # error var naming (ErrFoo)
|
||||
- -ST1016 # receiver name consistency
|
||||
- -ST1020 # comment on exported function format
|
||||
- -ST1021 # comment on exported type format
|
||||
- -ST1022 # comment on exported var format
|
||||
- -ST1023 # omit type from declaration
|
||||
severity:
|
||||
default-severity: error
|
||||
default: error
|
||||
rules:
|
||||
- linters:
|
||||
- gofmt
|
||||
- goimports
|
||||
- intrange
|
||||
severity: info
|
||||
formatters:
|
||||
enable:
|
||||
- gofmt
|
||||
- gofumpt
|
||||
|
||||
@@ -16,7 +16,7 @@ See the [development documentation](./docs/development.md) for instructions on h
|
||||
|
||||
* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
|
||||
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
|
||||
* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
|
||||
* Documentation: small updates to fill in or correct missing documentation are helpful, however large documentation additions can be hard to maintain over time.
|
||||
|
||||
### Issues that may not be accepted
|
||||
|
||||
@@ -43,7 +43,7 @@ Tips for proposals:
|
||||
* Explain how the change will be tested.
|
||||
|
||||
Additionally, for bonus points: Provide draft documentation you would expect to
|
||||
see if the change were accepted.
|
||||
see if the changes were accepted.
|
||||
|
||||
## Pull requests
|
||||
|
||||
@@ -66,7 +66,6 @@ Examples:
|
||||
|
||||
llm/backend/mlx: support the llama architecture
|
||||
CONTRIBUTING: provide clarity on good commit messages, and bad
|
||||
docs: simplify manual installation with shorter curl commands
|
||||
|
||||
Bad Examples:
|
||||
|
||||
|
||||
14
Dockerfile
14
Dockerfile
@@ -39,14 +39,14 @@ ENV CC=clang CXX=clang++
|
||||
FROM base-${TARGETARCH} AS base
|
||||
ARG CMAKEVERSION
|
||||
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
ENV LDFLAGS=-s
|
||||
|
||||
FROM base AS cpu
|
||||
RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
|
||||
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
|
||||
ARG PARALLEL
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'CPU' \
|
||||
&& cmake --build --parallel ${PARALLEL} --preset 'CPU' \
|
||||
@@ -57,6 +57,8 @@ ARG CUDA11VERSION=11.8
|
||||
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
|
||||
ENV PATH=/usr/local/cuda-11/bin:$PATH
|
||||
ARG PARALLEL
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'CUDA 11' \
|
||||
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
|
||||
@@ -67,6 +69,8 @@ ARG CUDA12VERSION=12.8
|
||||
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
|
||||
ENV PATH=/usr/local/cuda-12/bin:$PATH
|
||||
ARG PARALLEL
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'CUDA 12' \
|
||||
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
|
||||
@@ -78,6 +82,8 @@ ARG CUDA13VERSION=13.0
|
||||
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
|
||||
ENV PATH=/usr/local/cuda-13/bin:$PATH
|
||||
ARG PARALLEL
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'CUDA 13' \
|
||||
&& cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
|
||||
@@ -87,6 +93,8 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
FROM base AS rocm-6
|
||||
ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
|
||||
ARG PARALLEL
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'ROCm 6' \
|
||||
&& cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
|
||||
@@ -118,6 +126,8 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
&& cmake --install build --component CUDA --strip --parallel ${PARALLEL}
|
||||
|
||||
FROM base AS vulkan
|
||||
COPY CMakeLists.txt CMakePresets.json .
|
||||
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'Vulkan' \
|
||||
&& cmake --build --parallel --preset 'Vulkan' \
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
|
||||
WORKDIR=llama/vendor
|
||||
FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc
|
||||
FETCH_HEAD=17f7f4baad8b3a716ee139da7bb56ae984e8c0fa
|
||||
|
||||
.PHONY: help
|
||||
help:
|
||||
@@ -57,7 +57,7 @@ checkout: $(WORKDIR)
|
||||
$(WORKDIR):
|
||||
git clone $(UPSTREAM) $(WORKDIR)
|
||||
|
||||
.PHONE: format-patches
|
||||
.PHONY: format-patches
|
||||
format-patches: llama/patches
|
||||
git -C $(WORKDIR) format-patch \
|
||||
--no-signature \
|
||||
@@ -66,7 +66,11 @@ format-patches: llama/patches
|
||||
-o $(realpath $<) \
|
||||
$(FETCH_HEAD)
|
||||
|
||||
.PHONE: clean
|
||||
.PHONY: clean
|
||||
clean: checkout
|
||||
@git -C $(WORKDIR) am --abort || true
|
||||
$(RM) llama/patches/.*.patched
|
||||
|
||||
.PHONY: print-base
|
||||
print-base:
|
||||
@echo $(FETCH_HEAD)
|
||||
17
README.md
17
README.md
@@ -299,6 +299,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [LibreChat](https://github.com/danny-avila/LibreChat)
|
||||
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
|
||||
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
|
||||
- [AI-UI](https://github.com/bajahaw/ai-ui)
|
||||
- [Saddle](https://github.com/jikkuatwork/saddle)
|
||||
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
|
||||
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
|
||||
@@ -365,7 +366,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
|
||||
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
|
||||
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
|
||||
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
|
||||
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VS Code extension for multi-file/whole-repo coding
|
||||
- [Void](https://github.com/voideditor/void) (Open source AI code editor and Cursor alternative)
|
||||
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
|
||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
|
||||
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
|
||||
@@ -397,7 +399,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
|
||||
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
|
||||
- [Ollama Chat WebUI for Docker ](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
|
||||
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
|
||||
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VS Code extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
|
||||
- [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
|
||||
- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
|
||||
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
|
||||
@@ -426,6 +428,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
|
||||
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
|
||||
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
|
||||
- [KDeps](https://github.com/kdeps/kdeps) (Kdeps is an offline-first AI framework for building Dockerized full-stack AI applications declaratively using Apple PKL and integrates APIs with Ollama on the backend.)
|
||||
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
|
||||
- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
|
||||
- [Hillnote](https://hillnote.com) (A Markdown-first workspace designed to supercharge your AI workflow. Create documents ready to integrate with Claude, ChatGPT, Gemini, Cursor, and more - all while keeping your work on your device.)
|
||||
@@ -552,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
|
||||
- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
|
||||
- [Ollama for Swift](https://github.com/mattt/ollama-swift)
|
||||
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
|
||||
- [Swollama for Swift](https://github.com/guitaripod/Swollama) with [DocC](https://guitaripod.github.io/Swollama/documentation/swollama)
|
||||
- [GoLamify](https://github.com/prasad89/golamify)
|
||||
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
|
||||
@@ -615,7 +618,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
|
||||
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
|
||||
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
|
||||
- [AI Summmary Helper plugin](https://github.com/philffm/ai-summary-helper)
|
||||
- [AI Summary Helper plugin](https://github.com/philffm/ai-summary-helper)
|
||||
- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
|
||||
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
|
||||
- [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
|
||||
@@ -623,7 +626,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
|
||||
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
|
||||
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
|
||||
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
|
||||
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
|
||||
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
|
||||
- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)
|
||||
- [NOMYO Router](https://github.com/nomyo-ai/nomyo-router) (A transparent Ollama proxy with model deployment aware routing which auto-manages multiple Ollama instances in a given network)
|
||||
@@ -633,12 +636,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.
|
||||
|
||||
### Observability
|
||||
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
|
||||
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration to Ollama.
|
||||
- [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
|
||||
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
|
||||
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
|
||||
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
|
||||
- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
|
||||
|
||||
## Security
|
||||
### Security
|
||||
- [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)
|
||||
|
||||
@@ -14,7 +14,7 @@ Please include the following details in your report:
|
||||
|
||||
## Security best practices
|
||||
|
||||
While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
|
||||
While the maintainer team does its best to secure Ollama, users are encouraged to implement their own security best practices, such as:
|
||||
|
||||
- Regularly updating to the latest version of Ollama
|
||||
- Securing access to hosted instances of Ollama
|
||||
|
||||
@@ -226,7 +226,14 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
|
||||
|
||||
bts := scanner.Bytes()
|
||||
if err := json.Unmarshal(bts, &errorResponse); err != nil {
|
||||
return fmt.Errorf("unmarshal: %w", err)
|
||||
if response.StatusCode >= http.StatusBadRequest {
|
||||
return StatusError{
|
||||
StatusCode: response.StatusCode,
|
||||
Status: response.Status,
|
||||
ErrorMessage: string(bts),
|
||||
}
|
||||
}
|
||||
return errors.New(string(bts))
|
||||
}
|
||||
|
||||
if response.StatusCode == http.StatusUnauthorized {
|
||||
@@ -340,7 +347,7 @@ type CreateProgressFunc func(ProgressResponse) error
|
||||
// Create creates a model from a [Modelfile]. fn is a progress function that
|
||||
// behaves similarly to other methods (see [Client.Pull]).
|
||||
//
|
||||
// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
|
||||
// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.mdx
|
||||
func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
|
||||
return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
|
||||
var resp ProgressResponse
|
||||
|
||||
@@ -55,6 +55,7 @@ func TestClientFromEnvironment(t *testing.T) {
|
||||
type testError struct {
|
||||
message string
|
||||
statusCode int
|
||||
raw bool // if true, write message as-is instead of JSON encoding
|
||||
}
|
||||
|
||||
func (e testError) Error() string {
|
||||
@@ -111,6 +112,20 @@ func TestClientStream(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "plain text error response",
|
||||
responses: []any{
|
||||
"internal server error",
|
||||
},
|
||||
wantErr: "internal server error",
|
||||
},
|
||||
{
|
||||
name: "HTML error page",
|
||||
responses: []any{
|
||||
"<html><body>404 Not Found</body></html>",
|
||||
},
|
||||
wantErr: "404 Not Found",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
@@ -135,6 +150,12 @@ func TestClientStream(t *testing.T) {
|
||||
return
|
||||
}
|
||||
|
||||
if str, ok := resp.(string); ok {
|
||||
fmt.Fprintln(w, str)
|
||||
flusher.Flush()
|
||||
continue
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(w).Encode(resp); err != nil {
|
||||
t.Fatalf("failed to encode response: %v", err)
|
||||
}
|
||||
@@ -173,9 +194,10 @@ func TestClientStream(t *testing.T) {
|
||||
|
||||
func TestClientDo(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
response any
|
||||
wantErr string
|
||||
name string
|
||||
response any
|
||||
wantErr string
|
||||
wantStatusCode int
|
||||
}{
|
||||
{
|
||||
name: "immediate error response",
|
||||
@@ -183,7 +205,8 @@ func TestClientDo(t *testing.T) {
|
||||
message: "test error message",
|
||||
statusCode: http.StatusBadRequest,
|
||||
},
|
||||
wantErr: "test error message",
|
||||
wantErr: "test error message",
|
||||
wantStatusCode: http.StatusBadRequest,
|
||||
},
|
||||
{
|
||||
name: "server error response",
|
||||
@@ -191,7 +214,8 @@ func TestClientDo(t *testing.T) {
|
||||
message: "internal error",
|
||||
statusCode: http.StatusInternalServerError,
|
||||
},
|
||||
wantErr: "internal error",
|
||||
wantErr: "internal error",
|
||||
wantStatusCode: http.StatusInternalServerError,
|
||||
},
|
||||
{
|
||||
name: "successful response",
|
||||
@@ -203,6 +227,26 @@ func TestClientDo(t *testing.T) {
|
||||
Success: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "plain text error response",
|
||||
response: testError{
|
||||
message: "internal server error",
|
||||
statusCode: http.StatusInternalServerError,
|
||||
raw: true,
|
||||
},
|
||||
wantErr: "internal server error",
|
||||
wantStatusCode: http.StatusInternalServerError,
|
||||
},
|
||||
{
|
||||
name: "HTML error page",
|
||||
response: testError{
|
||||
message: "<html><body>404 Not Found</body></html>",
|
||||
statusCode: http.StatusNotFound,
|
||||
raw: true,
|
||||
},
|
||||
wantErr: "<html><body>404 Not Found</body></html>",
|
||||
wantStatusCode: http.StatusNotFound,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
@@ -210,11 +254,16 @@ func TestClientDo(t *testing.T) {
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if errResp, ok := tc.response.(testError); ok {
|
||||
w.WriteHeader(errResp.statusCode)
|
||||
err := json.NewEncoder(w).Encode(map[string]string{
|
||||
"error": errResp.message,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal("failed to encode error response:", err)
|
||||
if !errResp.raw {
|
||||
err := json.NewEncoder(w).Encode(map[string]string{
|
||||
"error": errResp.message,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal("failed to encode error response:", err)
|
||||
}
|
||||
} else {
|
||||
// Write raw message (simulates non-JSON error responses)
|
||||
fmt.Fprint(w, errResp.message)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -241,6 +290,15 @@ func TestClientDo(t *testing.T) {
|
||||
if err.Error() != tc.wantErr {
|
||||
t.Errorf("error message mismatch: got %q, want %q", err.Error(), tc.wantErr)
|
||||
}
|
||||
if tc.wantStatusCode != 0 {
|
||||
if statusErr, ok := err.(StatusError); ok {
|
||||
if statusErr.StatusCode != tc.wantStatusCode {
|
||||
t.Errorf("status code mismatch: got %d, want %d", statusErr.StatusCode, tc.wantStatusCode)
|
||||
}
|
||||
} else {
|
||||
t.Errorf("expected StatusError, got %T", err)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -15,19 +15,19 @@ func main() {
|
||||
}
|
||||
|
||||
messages := []api.Message{
|
||||
api.Message{
|
||||
{
|
||||
Role: "system",
|
||||
Content: "Provide very brief, concise responses",
|
||||
},
|
||||
api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "Name some unusual animals",
|
||||
},
|
||||
api.Message{
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: "Monotreme, platypus, echidna",
|
||||
},
|
||||
api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "which of these is the most dangerous?",
|
||||
},
|
||||
|
||||
@@ -366,6 +366,9 @@ type TokenLogprob struct {
|
||||
|
||||
// Logprob is the log probability of this token.
|
||||
Logprob float64 `json:"logprob"`
|
||||
|
||||
// Bytes contains the raw byte representation of the token
|
||||
Bytes []int `json:"bytes,omitempty"`
|
||||
}
|
||||
|
||||
// Logprob contains log probability information for a generated token.
|
||||
|
||||
@@ -273,10 +273,6 @@ func main() {
|
||||
Handler: uiServer.Handler(),
|
||||
}
|
||||
|
||||
if _, err := uiServer.UserData(ctx); err != nil {
|
||||
slog.Warn("failed to load user data", "error", err)
|
||||
}
|
||||
|
||||
// Start the UI server
|
||||
slog.Info("starting ui server", "port", port)
|
||||
go func() {
|
||||
@@ -309,6 +305,9 @@ func main() {
|
||||
go func() {
|
||||
<-signals
|
||||
slog.Info("received SIGINT or SIGTERM signal, shutting down")
|
||||
if err := st.ClearAllDrafts(); err != nil {
|
||||
slog.Warn("failed to clear drafts on shutdown", "error", err)
|
||||
}
|
||||
quit()
|
||||
}()
|
||||
|
||||
@@ -320,6 +319,17 @@ func main() {
|
||||
slog.Debug("no URL scheme request to handle")
|
||||
}
|
||||
|
||||
go func() {
|
||||
slog.Debug("waiting for ollama server to be ready")
|
||||
if err := ui.WaitForServer(ctx, 10*time.Second); err != nil {
|
||||
slog.Warn("ollama server not ready, continuing anyway", "error", err)
|
||||
}
|
||||
|
||||
if _, err := uiServer.UserData(ctx); err != nil {
|
||||
slog.Warn("failed to load user data", "error", err)
|
||||
}
|
||||
}()
|
||||
|
||||
osRun(cancel, hasCompletedFirstRun, startHidden)
|
||||
|
||||
slog.Info("shutting down desktop server")
|
||||
@@ -361,7 +371,7 @@ func checkUserLoggedIn(uiServerPort int) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/api/v1/me", uiServerPort))
|
||||
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/me", uiServerPort), "application/json", nil)
|
||||
if err != nil {
|
||||
slog.Debug("failed to call local auth endpoint", "error", err)
|
||||
return false
|
||||
@@ -397,8 +407,8 @@ func checkUserLoggedIn(uiServerPort int) bool {
|
||||
// handleConnectURLScheme fetches the connect URL and opens it in the browser
|
||||
func handleConnectURLScheme() {
|
||||
if checkUserLoggedIn(uiServerPort) {
|
||||
slog.Info("user is already logged in, opening settings instead")
|
||||
sendUIRequestMessage("/")
|
||||
slog.Info("user is already logged in, opening app instead")
|
||||
showWindow(wv.webview.Window())
|
||||
return
|
||||
}
|
||||
|
||||
@@ -434,37 +444,30 @@ func openInBrowser(url string) {
|
||||
}
|
||||
}
|
||||
|
||||
// parseURLScheme parses an ollama:// URL and returns whether it's a connect URL and the UI path
|
||||
func parseURLScheme(urlSchemeRequest string) (isConnect bool, uiPath string, err error) {
|
||||
// parseURLScheme parses an ollama:// URL and validates it
|
||||
// Supports: ollama:// (open app) and ollama://connect (OAuth)
|
||||
func parseURLScheme(urlSchemeRequest string) (isConnect bool, err error) {
|
||||
parsedURL, err := url.Parse(urlSchemeRequest)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
return false, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
// Check if this is a connect URL
|
||||
if parsedURL.Host == "connect" || strings.TrimPrefix(parsedURL.Path, "/") == "connect" {
|
||||
return true, "", nil
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Extract the UI path
|
||||
path := "/"
|
||||
if parsedURL.Path != "" && parsedURL.Path != "/" {
|
||||
// For URLs like ollama:///settings, use the path directly
|
||||
path = parsedURL.Path
|
||||
} else if parsedURL.Host != "" {
|
||||
// For URLs like ollama://settings (without triple slash),
|
||||
// the "settings" part is parsed as the host, not the path.
|
||||
// We need to convert it to a path by prepending "/"
|
||||
// This also handles ollama://settings/ where Windows adds a trailing slash
|
||||
path = "/" + parsedURL.Host
|
||||
// Allow bare ollama:// or ollama:/// to open the app
|
||||
if (parsedURL.Host == "" && parsedURL.Path == "") || parsedURL.Path == "/" {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return false, path, nil
|
||||
return false, fmt.Errorf("unsupported ollama:// URL path: %s", urlSchemeRequest)
|
||||
}
|
||||
|
||||
// handleURLSchemeInCurrentInstance processes URL scheme requests in the current instance
|
||||
func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
|
||||
isConnect, uiPath, err := parseURLScheme(urlSchemeRequest)
|
||||
isConnect, err := parseURLScheme(urlSchemeRequest)
|
||||
if err != nil {
|
||||
slog.Error("failed to parse URL scheme request", "url", urlSchemeRequest, "error", err)
|
||||
return
|
||||
@@ -473,6 +476,8 @@ func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
|
||||
if isConnect {
|
||||
handleConnectURLScheme()
|
||||
} else {
|
||||
sendUIRequestMessage(uiPath)
|
||||
if wv.webview != nil {
|
||||
showWindow(wv.webview.Window())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,6 +182,11 @@ func osRun(_ func(), hasCompletedFirstRun, startHidden bool) {
|
||||
}
|
||||
|
||||
func quit() {
|
||||
if wv.Store != nil {
|
||||
if err := wv.Store.ClearAllDrafts(); err != nil {
|
||||
slog.Warn("failed to clear drafts on quit", "error", err)
|
||||
}
|
||||
}
|
||||
C.quit()
|
||||
}
|
||||
|
||||
@@ -191,13 +196,6 @@ func LaunchNewApp() {
|
||||
C.launchApp(appName)
|
||||
}
|
||||
|
||||
// Send a request to the main app thread to load a UI page
|
||||
func sendUIRequestMessage(path string) {
|
||||
p := C.CString(path)
|
||||
defer C.free(unsafe.Pointer(p))
|
||||
C.uiRequest(p)
|
||||
}
|
||||
|
||||
func registerLaunchAgent(hasCompletedFirstRun bool) {
|
||||
// Remove any stale Login Item registrations
|
||||
C.unregisterSelfFromLoginItem()
|
||||
|
||||
@@ -24,27 +24,14 @@ bool firstTimeRun,startHidden; // Set in run before initialization
|
||||
for (NSURL *url in urls) {
|
||||
if ([url.scheme isEqualToString:@"ollama"]) {
|
||||
NSString *path = url.path;
|
||||
if (!path || [path isEqualToString:@""]) {
|
||||
// For URLs like ollama://settings (without triple slash),
|
||||
// the "settings" part is parsed as the host, not the path.
|
||||
// We need to convert it to a path by prepending "/"
|
||||
if (url.host && ![url.host isEqualToString:@""]) {
|
||||
path = [@"/" stringByAppendingString:url.host];
|
||||
} else {
|
||||
path = @"/";
|
||||
}
|
||||
}
|
||||
|
||||
if ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"]) {
|
||||
|
||||
if (path && ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"])) {
|
||||
// Special case: handle connect by opening browser instead of app
|
||||
handleConnectURL();
|
||||
} else {
|
||||
// Set app to be active and visible
|
||||
[NSApp setActivationPolicy:NSApplicationActivationPolicyRegular];
|
||||
[NSApp activateIgnoringOtherApps:YES];
|
||||
|
||||
// Open the path with the UI
|
||||
[self uiRequest:path];
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -260,7 +247,7 @@ bool firstTimeRun,startHidden; // Set in run before initialization
|
||||
}
|
||||
|
||||
- (void)openHelp:(id)sender {
|
||||
NSURL *url = [NSURL URLWithString:@"https://github.com/ollama/ollama/tree/main/docs"];
|
||||
NSURL *url = [NSURL URLWithString:@"https://docs.ollama.com/"];
|
||||
[[NSWorkspace sharedWorkspace] openURL:url];
|
||||
}
|
||||
|
||||
|
||||
@@ -111,6 +111,11 @@ func (*appCallbacks) UIRunning() bool {
|
||||
}
|
||||
|
||||
func (app *appCallbacks) Quit() {
|
||||
if wv.Store != nil {
|
||||
if err := wv.Store.ClearAllDrafts(); err != nil {
|
||||
slog.Warn("failed to clear drafts on quit", "error", err)
|
||||
}
|
||||
}
|
||||
app.t.Quit()
|
||||
wv.Terminate()
|
||||
}
|
||||
@@ -138,7 +143,7 @@ func (app *appCallbacks) HandleURLScheme(urlScheme string) {
|
||||
|
||||
// handleURLSchemeRequest processes URL scheme requests from other instances
|
||||
func handleURLSchemeRequest(urlScheme string) {
|
||||
isConnect, uiPath, err := parseURLScheme(urlScheme)
|
||||
isConnect, err := parseURLScheme(urlScheme)
|
||||
if err != nil {
|
||||
slog.Error("failed to parse URL scheme request", "url", urlScheme, "error", err)
|
||||
return
|
||||
@@ -147,7 +152,9 @@ func handleURLSchemeRequest(urlScheme string) {
|
||||
if isConnect {
|
||||
handleConnectURLScheme()
|
||||
} else {
|
||||
sendUIRequestMessage(uiPath)
|
||||
if wv.webview != nil {
|
||||
showWindow(wv.webview.Window())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,11 +268,6 @@ func createLoginShortcut() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Send a request to the main app thread to load a UI page
|
||||
func sendUIRequestMessage(path string) {
|
||||
wintray.SendUIRequestMessage(path)
|
||||
}
|
||||
|
||||
func LaunchNewApp() {
|
||||
}
|
||||
|
||||
|
||||
@@ -169,37 +169,47 @@ DlgResult fileDlg(FileDlgParams* params) {
|
||||
}
|
||||
|
||||
NSArray* urls = [panel URLs];
|
||||
if(self->params->allowMultiple && [urls count] >= 1) {
|
||||
if([urls count] == 0) {
|
||||
return DLG_CANCEL;
|
||||
}
|
||||
|
||||
if(self->params->allowMultiple) {
|
||||
// For multiple files, we need to return all paths separated by null bytes
|
||||
char* bufPtr = self->params->buf;
|
||||
int remainingBuf = self->params->nbuf;
|
||||
|
||||
// Calculate total required buffer size first
|
||||
int totalSize = 0;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
if(![url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX]) {
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
totalSize += strlen(tempBuf) + 1; // +1 for null terminator
|
||||
}
|
||||
totalSize += 1; // Final null terminator
|
||||
// Calculate total required buffer size first
|
||||
int totalSize = 0;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
if(![url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX]) {
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
totalSize += strlen(tempBuf) + 1; // +1 for null terminator
|
||||
}
|
||||
totalSize += 1; // Final null terminator
|
||||
|
||||
if(totalSize > self->params->nbuf) {
|
||||
// Not enough buffer space
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
if(totalSize > self->params->nbuf) {
|
||||
// Not enough buffer space
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
|
||||
// Now actually copy the paths (we know we have space)
|
||||
bufPtr = self->params->buf;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
[url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX];
|
||||
int pathLen = strlen(tempBuf);
|
||||
strcpy(bufPtr, tempBuf);
|
||||
bufPtr += pathLen + 1;
|
||||
}
|
||||
*bufPtr = '\0'; // Final null terminator
|
||||
// Now actually copy the paths (we know we have space)
|
||||
bufPtr = self->params->buf;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
[url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX];
|
||||
int pathLen = strlen(tempBuf);
|
||||
strcpy(bufPtr, tempBuf);
|
||||
bufPtr += pathLen + 1;
|
||||
}
|
||||
*bufPtr = '\0'; // Final null terminator
|
||||
} else {
|
||||
// Single file/directory selection - write path to buffer
|
||||
NSURL* url = [urls firstObject];
|
||||
if(![url getFileSystemRepresentation:self->params->buf maxLength:self->params->nbuf]) {
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
}
|
||||
|
||||
return DLG_OK;
|
||||
|
||||
@@ -15,7 +15,7 @@ const multiFileBufferSize = w32.MAX_PATH * 10
|
||||
type WinDlgError int
|
||||
|
||||
func (e WinDlgError) Error() string {
|
||||
return fmt.Sprintf("CommDlgExtendedError: %#x", e)
|
||||
return fmt.Sprintf("CommDlgExtendedError: %#x", int(e))
|
||||
}
|
||||
|
||||
func err() error {
|
||||
|
||||
6
app/package-lock.json
generated
Normal file
6
app/package-lock.json
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"name": "app",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {}
|
||||
}
|
||||
@@ -224,9 +224,7 @@ func (s *Server) cmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
if _, err := os.Stat(settings.Models); err == nil {
|
||||
env["OLLAMA_MODELS"] = settings.Models
|
||||
} else {
|
||||
slog.Warn("models path not accessible, clearing models setting", "path", settings.Models, "err", err)
|
||||
settings.Models = ""
|
||||
s.store.SetSettings(settings)
|
||||
slog.Warn("models path not accessible, using default", "path", settings.Models, "err", err)
|
||||
}
|
||||
}
|
||||
if settings.ContextLength > 0 {
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
|
||||
// currentSchemaVersion defines the current database schema version.
|
||||
// Increment this when making schema changes that require migrations.
|
||||
const currentSchemaVersion = 12
|
||||
const currentSchemaVersion = 13
|
||||
|
||||
// database wraps the SQLite connection.
|
||||
// SQLite handles its own locking for concurrent access:
|
||||
@@ -95,7 +95,8 @@ func (db *database) init() error {
|
||||
id TEXT PRIMARY KEY,
|
||||
title TEXT NOT NULL DEFAULT '',
|
||||
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
browser_state TEXT
|
||||
browser_state TEXT,
|
||||
draft TEXT NOT NULL DEFAULT ''
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS messages (
|
||||
@@ -244,6 +245,12 @@ func (db *database) migrate() error {
|
||||
return fmt.Errorf("migrate v11 to v12: %w", err)
|
||||
}
|
||||
version = 12
|
||||
case 12:
|
||||
// add draft column to chats table
|
||||
if err := db.migrateV12ToV13(); err != nil {
|
||||
return fmt.Errorf("migrate v12 to v13: %w", err)
|
||||
}
|
||||
version = 13
|
||||
default:
|
||||
// If we have a version we don't recognize, just set it to current
|
||||
// This might happen during development
|
||||
@@ -452,6 +459,21 @@ func (db *database) migrateV11ToV12() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// migrateV12ToV13 adds the draft column to the chats table
|
||||
func (db *database) migrateV12ToV13() error {
|
||||
_, err := db.conn.Exec(`ALTER TABLE chats ADD COLUMN draft TEXT NOT NULL DEFAULT ''`)
|
||||
if err != nil && !duplicateColumnError(err) {
|
||||
return fmt.Errorf("add draft column: %w", err)
|
||||
}
|
||||
|
||||
_, err = db.conn.Exec(`UPDATE settings SET schema_version = 13`)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update schema version: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// cleanupOrphanedData removes orphaned records that may exist due to the foreign key bug
|
||||
func (db *database) cleanupOrphanedData() error {
|
||||
_, err := db.conn.Exec(`
|
||||
@@ -570,7 +592,7 @@ func (db *database) getAllChats() ([]Chat, error) {
|
||||
|
||||
func (db *database) getChatWithOptions(id string, loadAttachmentData bool) (*Chat, error) {
|
||||
query := `
|
||||
SELECT id, title, created_at, browser_state
|
||||
SELECT id, title, created_at, browser_state, draft
|
||||
FROM chats
|
||||
WHERE id = ?
|
||||
`
|
||||
@@ -578,12 +600,14 @@ func (db *database) getChatWithOptions(id string, loadAttachmentData bool) (*Cha
|
||||
var chat Chat
|
||||
var createdAt time.Time
|
||||
var browserState sql.NullString
|
||||
var draft sql.NullString
|
||||
|
||||
err := db.conn.QueryRow(query, id).Scan(
|
||||
&chat.ID,
|
||||
&chat.Title,
|
||||
&createdAt,
|
||||
&browserState,
|
||||
&draft,
|
||||
)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
@@ -599,6 +623,9 @@ func (db *database) getChatWithOptions(id string, loadAttachmentData bool) (*Cha
|
||||
chat.BrowserState = raw
|
||||
}
|
||||
}
|
||||
if draft.Valid {
|
||||
chat.Draft = draft.String
|
||||
}
|
||||
|
||||
messages, err := db.getMessages(id, loadAttachmentData)
|
||||
if err != nil {
|
||||
@@ -622,11 +649,12 @@ func (db *database) saveChat(chat Chat) error {
|
||||
// UPSERT would overwrite browser_state with NULL, breaking revisit rendering that relies
|
||||
// on the last persisted full tool state.
|
||||
query := `
|
||||
INSERT INTO chats (id, title, created_at, browser_state)
|
||||
VALUES (?, ?, ?, ?)
|
||||
INSERT INTO chats (id, title, created_at, browser_state, draft)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
ON CONFLICT(id) DO UPDATE SET
|
||||
title = excluded.title,
|
||||
browser_state = COALESCE(excluded.browser_state, chats.browser_state)
|
||||
browser_state = COALESCE(excluded.browser_state, chats.browser_state),
|
||||
draft = excluded.draft
|
||||
`
|
||||
|
||||
var browserState sql.NullString
|
||||
@@ -639,6 +667,7 @@ func (db *database) saveChat(chat Chat) error {
|
||||
chat.Title,
|
||||
chat.CreatedAt,
|
||||
browserState,
|
||||
chat.Draft,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("save chat: %w", err)
|
||||
@@ -669,6 +698,23 @@ func (db *database) saveChat(chat Chat) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// updateChatDraft updates only the draft for a chat
|
||||
func (db *database) updateChatDraft(chatID string, draft string) error {
|
||||
_, err := db.conn.Exec(`UPDATE chats SET draft = ? WHERE id = ?`, draft, chatID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update chat draft: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (db *database) clearAllDrafts() error {
|
||||
_, err := db.conn.Exec(`UPDATE chats SET draft = ''`)
|
||||
if err != nil {
|
||||
return fmt.Errorf("clear all drafts: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateChatBrowserState updates only the browser_state for a chat
|
||||
func (db *database) updateChatBrowserState(chatID string, state json.RawMessage) error {
|
||||
_, err := db.conn.Exec(`UPDATE chats SET browser_state = ? WHERE id = ?`, string(state), chatID)
|
||||
|
||||
@@ -109,6 +109,7 @@ type Chat struct {
|
||||
Title string `json:"title"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
BrowserState json.RawMessage `json:"browser_state,omitempty" ts_type:"BrowserStateData"`
|
||||
Draft string `json:"draft,omitempty"`
|
||||
}
|
||||
|
||||
// NewChat creates a new Chat with the ID, with CreatedAt timestamp initialized
|
||||
@@ -451,6 +452,22 @@ func (s *Store) AppendMessage(chatID string, message Message) error {
|
||||
return s.db.appendMessage(chatID, message)
|
||||
}
|
||||
|
||||
func (s *Store) UpdateChatDraft(chatID string, draft string) error {
|
||||
if err := s.ensureDB(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return s.db.updateChatDraft(chatID, draft)
|
||||
}
|
||||
|
||||
func (s *Store) ClearAllDrafts() error {
|
||||
if err := s.ensureDB(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return s.db.clearAllDrafts()
|
||||
}
|
||||
|
||||
func (s *Store) UpdateChatBrowserState(chatID string, state json.RawMessage) error {
|
||||
if err := s.ensureDB(); err != nil {
|
||||
return err
|
||||
|
||||
@@ -159,6 +159,7 @@ export class Chat {
|
||||
title: string;
|
||||
created_at: Time;
|
||||
browser_state?: BrowserStateData;
|
||||
draft?: string;
|
||||
|
||||
constructor(source: any = {}) {
|
||||
if ('string' === typeof source) source = JSON.parse(source);
|
||||
@@ -167,6 +168,7 @@ export class Chat {
|
||||
this.title = source["title"];
|
||||
this.created_at = this.convertValues(source["created_at"], Time);
|
||||
this.browser_state = source["browser_state"];
|
||||
this.draft = source["draft"];
|
||||
}
|
||||
|
||||
convertValues(a: any, classs: any, asMap: boolean = false): any {
|
||||
@@ -469,26 +471,24 @@ export class HealthResponse {
|
||||
}
|
||||
export class User {
|
||||
id: string;
|
||||
name: string;
|
||||
email: string;
|
||||
avatarURL: string;
|
||||
plan: string;
|
||||
bio: string;
|
||||
firstName: string;
|
||||
lastName: string;
|
||||
overThreshold: boolean;
|
||||
name: string;
|
||||
bio?: string;
|
||||
avatarurl?: string;
|
||||
firstname?: string;
|
||||
lastname?: string;
|
||||
plan?: string;
|
||||
|
||||
constructor(source: any = {}) {
|
||||
if ('string' === typeof source) source = JSON.parse(source);
|
||||
this.id = source["id"];
|
||||
this.name = source["name"];
|
||||
this.email = source["email"];
|
||||
this.avatarURL = source["avatarURL"];
|
||||
this.plan = source["plan"];
|
||||
this.name = source["name"];
|
||||
this.bio = source["bio"];
|
||||
this.firstName = source["firstName"];
|
||||
this.lastName = source["lastName"];
|
||||
this.overThreshold = source["overThreshold"];
|
||||
this.avatarurl = source["avatarurl"];
|
||||
this.firstname = source["firstname"];
|
||||
this.lastname = source["lastname"];
|
||||
this.plan = source["plan"];
|
||||
}
|
||||
}
|
||||
export class Attachment {
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
import { parseJsonlFromResponse } from "./util/jsonl-parsing";
|
||||
import { ollamaClient as ollama } from "./lib/ollama-client";
|
||||
import type { ModelResponse } from "ollama/browser";
|
||||
import { API_BASE, OLLAMA_DOT_COM } from "./lib/config";
|
||||
|
||||
// Extend Model class with utility methods
|
||||
declare module "@/gotypes" {
|
||||
@@ -26,9 +27,6 @@ declare module "@/gotypes" {
|
||||
Model.prototype.isCloud = function (): boolean {
|
||||
return this.model.endsWith("cloud");
|
||||
};
|
||||
|
||||
const API_BASE = import.meta.env.DEV ? "http://127.0.0.1:3001" : "";
|
||||
|
||||
// Helper function to convert Uint8Array to base64
|
||||
function uint8ArrayToBase64(uint8Array: Uint8Array): string {
|
||||
const chunkSize = 0x8000; // 32KB chunks to avoid stack overflow
|
||||
@@ -43,44 +41,50 @@ function uint8ArrayToBase64(uint8Array: Uint8Array): string {
|
||||
}
|
||||
|
||||
export async function fetchUser(): Promise<User | null> {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/api/v1/me`, {
|
||||
method: "GET",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (response.ok) {
|
||||
const userData: User = await response.json();
|
||||
return userData;
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
console.error("Error fetching user:", error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export async function fetchConnectUrl(): Promise<string> {
|
||||
const response = await fetch(`${API_BASE}/api/v1/connect`, {
|
||||
method: "GET",
|
||||
const response = await fetch(`${API_BASE}/api/me`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Failed to fetch connect URL");
|
||||
if (response.ok) {
|
||||
const userData: User = await response.json();
|
||||
|
||||
if (userData.avatarurl && !userData.avatarurl.startsWith("http")) {
|
||||
userData.avatarurl = `${OLLAMA_DOT_COM}${userData.avatarurl}`;
|
||||
}
|
||||
|
||||
return userData;
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
return data.connect_url;
|
||||
if (response.status === 401 || response.status === 403) {
|
||||
return null;
|
||||
}
|
||||
|
||||
throw new Error(`Failed to fetch user: ${response.status}`);
|
||||
}
|
||||
|
||||
export async function fetchConnectUrl(): Promise<string> {
|
||||
const response = await fetch(`${API_BASE}/api/me`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (response.status === 401) {
|
||||
const data = await response.json();
|
||||
if (data.signin_url) {
|
||||
return data.signin_url;
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error("Failed to fetch connect URL");
|
||||
}
|
||||
|
||||
export async function disconnectUser(): Promise<void> {
|
||||
const response = await fetch(`${API_BASE}/api/v1/disconnect`, {
|
||||
const response = await fetch(`${API_BASE}/api/signout`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
@@ -205,12 +209,10 @@ export async function* sendMessage(
|
||||
data: uint8ArrayToBase64(att.data),
|
||||
}));
|
||||
|
||||
// Only send think parameter when actually requesting thinking
|
||||
// Don't send false as it causes issues with some providers
|
||||
// Send think parameter when it's explicitly set (true, false, or a non-empty string).
|
||||
const shouldSendThink =
|
||||
think !== undefined &&
|
||||
((typeof think === "boolean" && think) ||
|
||||
(typeof think === "string" && think !== ""));
|
||||
(typeof think === "boolean" || (typeof think === "string" && think !== ""));
|
||||
|
||||
const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}`, {
|
||||
method: "POST",
|
||||
@@ -297,6 +299,20 @@ export async function renameChat(chatId: string, title: string): Promise<void> {
|
||||
}
|
||||
}
|
||||
|
||||
export async function updateChatDraft(chatId: string, draft: string): Promise<void> {
|
||||
const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}/draft`, {
|
||||
method: "PUT",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({ draft }),
|
||||
});
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(error || "Failed to update draft");
|
||||
}
|
||||
}
|
||||
|
||||
export async function deleteChat(chatId: string): Promise<void> {
|
||||
const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}`, {
|
||||
method: "DELETE",
|
||||
@@ -392,7 +408,8 @@ export async function getInferenceCompute(): Promise<InferenceCompute[]> {
|
||||
|
||||
export async function fetchHealth(): Promise<boolean> {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/api/v1/health`, {
|
||||
// Use the /api/version endpoint as a health check
|
||||
const response = await fetch(`${API_BASE}/api/version`, {
|
||||
method: "GET",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
@@ -401,7 +418,8 @@ export async function fetchHealth(): Promise<boolean> {
|
||||
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
return data.healthy || false;
|
||||
// If we get a version back, the server is healthy
|
||||
return !!data.version;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
@@ -282,6 +282,7 @@ export default function Chat({ chatId }: { chatId: string }) {
|
||||
onSubmit={handleChatFormSubmit}
|
||||
chatId={chatId}
|
||||
autoFocus={true}
|
||||
initialDraft={chatQuery?.data?.chat?.draft ?? ""}
|
||||
editingMessage={editingMessage}
|
||||
onCancelEdit={handleCancelEdit}
|
||||
isDisabled={isDisabled}
|
||||
|
||||
@@ -27,6 +27,7 @@ import { ErrorMessage } from "./ErrorMessage";
|
||||
import { processFiles } from "@/utils/fileValidation";
|
||||
import type { ImageData } from "@/types/webview";
|
||||
import { PlusIcon } from "@heroicons/react/24/outline";
|
||||
import { useDraftMessage } from "@/hooks/useDraftMessage";
|
||||
|
||||
export type ThinkingLevel = "low" | "medium" | "high";
|
||||
|
||||
@@ -62,6 +63,7 @@ interface ChatFormProps {
|
||||
chatId?: string;
|
||||
isDownloadingModel?: boolean;
|
||||
isDisabled?: boolean;
|
||||
initialDraft?: string;
|
||||
// Editing props - when provided, ChatForm enters edit mode
|
||||
editingMessage?: {
|
||||
content: string;
|
||||
@@ -84,6 +86,7 @@ function ChatForm({
|
||||
chatId = "new",
|
||||
isDownloadingModel = false,
|
||||
isDisabled = false,
|
||||
initialDraft,
|
||||
editingMessage,
|
||||
onCancelEdit,
|
||||
onFilesReceived,
|
||||
@@ -118,6 +121,8 @@ function ChatForm({
|
||||
null,
|
||||
);
|
||||
|
||||
const { saveDraft, clearDraft } = useDraftMessage(chatId);
|
||||
|
||||
const handleThinkingLevelDropdownToggle = (isOpen: boolean) => {
|
||||
if (
|
||||
isOpen &&
|
||||
@@ -308,10 +313,39 @@ function ChatForm({
|
||||
}
|
||||
}, [editingMessage]);
|
||||
|
||||
// Clear composition and reset textarea height when chatId changes
|
||||
useEffect(() => {
|
||||
resetChatForm();
|
||||
}, [chatId]);
|
||||
if (editingMessage) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (initialDraft && initialDraft.trim()) {
|
||||
setMessage({
|
||||
content: initialDraft,
|
||||
attachments: [],
|
||||
fileErrors: [],
|
||||
});
|
||||
|
||||
// Adjust textarea height after loading draft
|
||||
setTimeout(() => {
|
||||
if (textareaRef.current && initialDraft) {
|
||||
textareaRef.current.style.height = "auto";
|
||||
textareaRef.current.style.height =
|
||||
Math.min(textareaRef.current.scrollHeight, 24 * 8) + "px";
|
||||
}
|
||||
}, 0);
|
||||
} else {
|
||||
resetChatForm();
|
||||
}
|
||||
}, [chatId, initialDraft, editingMessage]);
|
||||
|
||||
// Save draft only when navigating away or on blur
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
if (!editingMessage && message.content.trim()) {
|
||||
saveDraft(message.content);
|
||||
}
|
||||
};
|
||||
}, [message.content, editingMessage, saveDraft]);
|
||||
|
||||
// Auto-focus textarea when autoFocus is true or when streaming completes (but not when editing)
|
||||
useEffect(() => {
|
||||
@@ -511,12 +545,13 @@ function ChatForm({
|
||||
});
|
||||
}
|
||||
|
||||
// Clear composition after successful submission
|
||||
// Clear composition and draft after successful submission
|
||||
setMessage({
|
||||
content: "",
|
||||
attachments: [],
|
||||
fileErrors: [],
|
||||
});
|
||||
clearDraft();
|
||||
|
||||
// Reset textarea height and refocus after submit
|
||||
setTimeout(() => {
|
||||
@@ -621,6 +656,13 @@ function ChatForm({
|
||||
e.target.style.height = Math.min(e.target.scrollHeight, 24 * 8) + "px";
|
||||
};
|
||||
|
||||
// Save draft when textarea loses focus
|
||||
const handleTextareaBlur = () => {
|
||||
if (!editingMessage && message.content.trim()) {
|
||||
saveDraft(message.content);
|
||||
}
|
||||
};
|
||||
|
||||
const handleFilesUpload = async () => {
|
||||
try {
|
||||
setFileUploadError(null);
|
||||
@@ -832,6 +874,7 @@ function ChatForm({
|
||||
ref={textareaRef}
|
||||
value={message.content}
|
||||
onChange={handleTextareaChange}
|
||||
onBlur={handleTextareaBlur}
|
||||
placeholder="Send a message"
|
||||
disabled={isDisabled}
|
||||
className={`allow-context-menu w-full overflow-y-auto text-neutral-700 outline-none resize-none border-none bg-transparent dark:text-white placeholder:text-neutral-400 dark:placeholder:text-neutral-500 min-h-[24px] leading-6 transition-opacity duration-300 ${
|
||||
|
||||
@@ -16,7 +16,6 @@ import {
|
||||
ArrowLeftIcon,
|
||||
} from "@heroicons/react/20/solid";
|
||||
import { Settings as SettingsType } from "@/gotypes";
|
||||
import { useNavigate } from "@tanstack/react-router";
|
||||
import { useUser } from "@/hooks/useUser";
|
||||
import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { getSettings, updateSettings } from "@/api";
|
||||
@@ -52,7 +51,6 @@ export default function Settings() {
|
||||
const [isAwaitingConnection, setIsAwaitingConnection] = useState(false);
|
||||
const [connectionError, setConnectionError] = useState<string | null>(null);
|
||||
const [pollingInterval, setPollingInterval] = useState<number | null>(null);
|
||||
const navigate = useNavigate();
|
||||
|
||||
const {
|
||||
data: settingsData,
|
||||
@@ -216,7 +214,7 @@ export default function Settings() {
|
||||
>
|
||||
{isWindows && (
|
||||
<button
|
||||
onClick={() => navigate({ to: "/" })}
|
||||
onClick={() => window.history.back()}
|
||||
className="hover:bg-neutral-100 mr-3 dark:hover:bg-neutral-800 rounded-full p-1.5"
|
||||
>
|
||||
<ArrowLeftIcon className="w-5 h-5 dark:text-white" />
|
||||
@@ -226,7 +224,7 @@ export default function Settings() {
|
||||
</h1>
|
||||
{!isWindows && (
|
||||
<button
|
||||
onClick={() => navigate({ to: "/" })}
|
||||
onClick={() => window.history.back()}
|
||||
className="p-1 hover:bg-neutral-100 mr-3 dark:hover:bg-neutral-800 rounded-full"
|
||||
>
|
||||
<XMarkIcon className="w-6 h-6 dark:text-white" />
|
||||
@@ -299,9 +297,9 @@ export default function Settings() {
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
{user?.avatarURL && (
|
||||
{user?.avatarurl && (
|
||||
<img
|
||||
src={user.avatarURL}
|
||||
src={user.avatarurl}
|
||||
alt={user?.name}
|
||||
className="h-10 w-10 rounded-full bg-neutral-200 dark:bg-neutral-700 flex-shrink-0"
|
||||
onError={(e) => {
|
||||
|
||||
@@ -50,21 +50,33 @@ export default function Thinking({
|
||||
// Position content to show bottom when collapsed
|
||||
useEffect(() => {
|
||||
if (isCollapsed && contentRef.current && wrapperRef.current) {
|
||||
const contentHeight = contentRef.current.scrollHeight;
|
||||
const wrapperHeight = wrapperRef.current.clientHeight;
|
||||
if (contentHeight > wrapperHeight) {
|
||||
const translateY = -(contentHeight - wrapperHeight);
|
||||
contentRef.current.style.transform = `translateY(${translateY}px)`;
|
||||
setHasOverflow(true);
|
||||
} else {
|
||||
setHasOverflow(false);
|
||||
}
|
||||
requestAnimationFrame(() => {
|
||||
if (!contentRef.current || !wrapperRef.current) return;
|
||||
|
||||
const contentHeight = contentRef.current.scrollHeight;
|
||||
const wrapperHeight = wrapperRef.current.clientHeight;
|
||||
if (contentHeight > wrapperHeight) {
|
||||
const translateY = -(contentHeight - wrapperHeight);
|
||||
contentRef.current.style.transform = `translateY(${translateY}px)`;
|
||||
setHasOverflow(true);
|
||||
} else {
|
||||
contentRef.current.style.transform = "translateY(0)";
|
||||
setHasOverflow(false);
|
||||
}
|
||||
});
|
||||
} else if (contentRef.current) {
|
||||
contentRef.current.style.transform = "translateY(0)";
|
||||
setHasOverflow(false);
|
||||
}
|
||||
}, [thinking, isCollapsed]);
|
||||
|
||||
useEffect(() => {
|
||||
if (activelyThinking && wrapperRef.current && !isCollapsed) {
|
||||
// When expanded and actively thinking, scroll to bottom
|
||||
wrapperRef.current.scrollTop = wrapperRef.current.scrollHeight;
|
||||
}
|
||||
}, [thinking, activelyThinking, isCollapsed]);
|
||||
|
||||
const handleToggle = () => {
|
||||
setIsCollapsed(!isCollapsed);
|
||||
setHasUserInteracted(true);
|
||||
|
||||
@@ -7,6 +7,7 @@ import { createQueryBatcher } from "./useQueryBatcher";
|
||||
import { useRefetchModels } from "./useModels";
|
||||
import { useStreamingContext } from "@/contexts/StreamingContext";
|
||||
import { useSettings } from "./useSettings";
|
||||
import { getModelCapabilities } from "@/api";
|
||||
|
||||
export const useChats = () => {
|
||||
return useQuery({
|
||||
@@ -606,6 +607,24 @@ export const useSendMessage = (chatId: string) => {
|
||||
queryClient.setQueryData(["staleModels"], newStaleMap);
|
||||
|
||||
queryClient.invalidateQueries({ queryKey: ["models"] });
|
||||
|
||||
// Fetch fresh capabilities for the downloaded model
|
||||
getModelCapabilities(selectedModel.model)
|
||||
.then((capabilities) => {
|
||||
queryClient.setQueryData(
|
||||
["modelCapabilities", selectedModel.model],
|
||||
capabilities,
|
||||
);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(
|
||||
"Failed to fetch capabilities after download:",
|
||||
error,
|
||||
);
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: ["modelCapabilities", selectedModel.model],
|
||||
});
|
||||
});
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
import { useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { useState } from "react";
|
||||
import { pullModel } from "@/api";
|
||||
import { useSelectedModel } from "./useSelectedModel";
|
||||
import { useSettings } from "./useSettings";
|
||||
|
||||
interface DownloadProgress {
|
||||
status: string;
|
||||
digest?: string;
|
||||
total?: number;
|
||||
completed?: number;
|
||||
done?: boolean;
|
||||
}
|
||||
|
||||
export function useDownloadModel(chatId?: string) {
|
||||
const queryClient = useQueryClient();
|
||||
const { selectedModel } = useSelectedModel(chatId);
|
||||
const { setSettings } = useSettings();
|
||||
const [downloadProgress, setDownloadProgress] =
|
||||
useState<DownloadProgress | null>(null);
|
||||
const [abortController, setAbortController] =
|
||||
useState<AbortController | null>(null);
|
||||
const [downloadingChatIds, setDownloadingChatIds] = useState<Set<string>>(
|
||||
new Set(),
|
||||
);
|
||||
|
||||
const mutation = useMutation({
|
||||
mutationFn: async (modelName: string) => {
|
||||
const controller = new AbortController();
|
||||
setAbortController(controller);
|
||||
setDownloadProgress({ status: "Starting download..." });
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => new Set(prev).add(chatId));
|
||||
}
|
||||
|
||||
try {
|
||||
for await (const progress of pullModel(modelName, controller.signal)) {
|
||||
setDownloadProgress(progress);
|
||||
|
||||
if (progress.status === "success") {
|
||||
// Update selected model to indicate it's now available locally
|
||||
if (selectedModel && selectedModel.model === modelName) {
|
||||
setSettings({ SelectedModel: modelName });
|
||||
}
|
||||
// Invalidate models query to refresh the list
|
||||
await queryClient.invalidateQueries({ queryKey: ["models"] });
|
||||
break;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
setAbortController(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}
|
||||
},
|
||||
onSuccess: () => {
|
||||
setDownloadProgress(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
},
|
||||
onError: (error: Error) => {
|
||||
const status =
|
||||
error.name === "AbortError" ? "Download cancelled" : "Download failed";
|
||||
setDownloadProgress({ status, done: true });
|
||||
|
||||
// Clear error message after delay
|
||||
const delay = error.name === "AbortError" ? 1500 : 3000;
|
||||
setTimeout(() => {
|
||||
setDownloadProgress(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}, delay);
|
||||
},
|
||||
});
|
||||
|
||||
const cancelDownload = () => {
|
||||
if (abortController) {
|
||||
abortController.abort();
|
||||
setAbortController(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return {
|
||||
downloadModel: mutation.mutate,
|
||||
isDownloading:
|
||||
mutation.isPending && chatId ? downloadingChatIds.has(chatId) : false,
|
||||
downloadProgress:
|
||||
chatId && downloadingChatIds.has(chatId) ? downloadProgress : null,
|
||||
error: mutation.error,
|
||||
cancelDownload,
|
||||
};
|
||||
}
|
||||
34
app/ui/app/src/hooks/useDraftMessage.ts
Normal file
34
app/ui/app/src/hooks/useDraftMessage.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import { useCallback } from "react";
|
||||
import { updateChatDraft } from "@/api";
|
||||
|
||||
export function useDraftMessage(chatId: string) {
|
||||
const saveDraft = useCallback(async (content: string) => {
|
||||
try {
|
||||
if (chatId === "new") {
|
||||
return;
|
||||
}
|
||||
|
||||
await updateChatDraft(chatId, content);
|
||||
} catch (error) {
|
||||
console.error("Error saving draft message:", error);
|
||||
}
|
||||
}, [chatId]);
|
||||
|
||||
const clearDraft = useCallback(async () => {
|
||||
try {
|
||||
if (chatId === "new") {
|
||||
return;
|
||||
}
|
||||
|
||||
await updateChatDraft(chatId, "");
|
||||
} catch (error) {
|
||||
console.error("Error clearing draft message:", error);
|
||||
}
|
||||
}, [chatId]);
|
||||
|
||||
return {
|
||||
saveDraft,
|
||||
clearDraft,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,29 +1,20 @@
|
||||
import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { useEffect, useState } from "react";
|
||||
import { fetchUser, fetchConnectUrl, disconnectUser } from "@/api";
|
||||
|
||||
export function useUser() {
|
||||
const queryClient = useQueryClient();
|
||||
const [initialDataLoaded, setInitialDataLoaded] = useState(false);
|
||||
|
||||
// Wait for initial data to be loaded
|
||||
useEffect(() => {
|
||||
const initialPromise = window.__initialUserDataPromise;
|
||||
if (initialPromise) {
|
||||
initialPromise.finally(() => {
|
||||
setInitialDataLoaded(true);
|
||||
});
|
||||
} else {
|
||||
setInitialDataLoaded(true);
|
||||
}
|
||||
}, []);
|
||||
|
||||
const userQuery = useQuery({
|
||||
queryKey: ["user"],
|
||||
queryFn: () => fetchUser(),
|
||||
queryFn: async () => {
|
||||
const result = await fetchUser();
|
||||
return result;
|
||||
},
|
||||
staleTime: 5 * 60 * 1000, // Consider data stale after 5 minutes
|
||||
gcTime: 10 * 60 * 1000, // Keep in cache for 10 minutes
|
||||
initialData: null, // Start with null to prevent flashing
|
||||
retry: 10,
|
||||
retryDelay: (attemptIndex) => Math.min(500 * attemptIndex, 2000),
|
||||
refetchOnMount: true, // Always fetch when component mounts
|
||||
});
|
||||
|
||||
// Mutation to refresh user data
|
||||
@@ -49,14 +40,15 @@ export function useUser() {
|
||||
},
|
||||
});
|
||||
|
||||
const isLoading = userQuery.isLoading || userQuery.isFetching;
|
||||
const isAuthenticated = Boolean(userQuery.data?.name);
|
||||
|
||||
return {
|
||||
user: userQuery.data,
|
||||
isLoading:
|
||||
!initialDataLoaded ||
|
||||
(userQuery.isLoading && userQuery.data === undefined), // Show loading until initial data is loaded
|
||||
isLoading,
|
||||
isError: userQuery.isError,
|
||||
error: userQuery.error,
|
||||
isAuthenticated: Boolean(userQuery.data?.name),
|
||||
isAuthenticated,
|
||||
refreshUser: refreshUser.mutate,
|
||||
isRefreshing: refreshUser.isPending,
|
||||
refetchUser: userQuery.refetch,
|
||||
|
||||
13
app/ui/app/src/lib/config.ts
Normal file
13
app/ui/app/src/lib/config.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
// API configuration
|
||||
const DEV_API_URL = "http://127.0.0.1:3001";
|
||||
|
||||
// Base URL for fetch API calls (can be relative in production)
|
||||
export const API_BASE = import.meta.env.DEV ? DEV_API_URL : "";
|
||||
|
||||
// Full host URL for Ollama client (needs full origin in production)
|
||||
export const OLLAMA_HOST = import.meta.env.DEV
|
||||
? DEV_API_URL
|
||||
: window.location.origin;
|
||||
|
||||
export const OLLAMA_DOT_COM =
|
||||
import.meta.env.VITE_OLLAMA_DOT_COM_URL || "https://ollama.com";
|
||||
@@ -1,4 +1,5 @@
|
||||
import { Ollama } from "ollama/browser";
|
||||
import { OLLAMA_HOST } from "./config";
|
||||
|
||||
let _ollamaClient: Ollama | null = null;
|
||||
|
||||
@@ -6,7 +7,7 @@ export const ollamaClient = new Proxy({} as Ollama, {
|
||||
get(_target, prop) {
|
||||
if (!_ollamaClient) {
|
||||
_ollamaClient = new Ollama({
|
||||
host: window.location.origin,
|
||||
host: OLLAMA_HOST,
|
||||
});
|
||||
}
|
||||
const value = _ollamaClient[prop as keyof Ollama];
|
||||
|
||||
@@ -5,13 +5,6 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
|
||||
import { routeTree } from "./routeTree.gen";
|
||||
import { fetchUser } from "./api";
|
||||
import { StreamingProvider } from "./contexts/StreamingContext";
|
||||
import { User } from "@/gotypes";
|
||||
|
||||
declare global {
|
||||
interface Window {
|
||||
__initialUserDataPromise?: Promise<User | null>;
|
||||
}
|
||||
}
|
||||
|
||||
const queryClient = new QueryClient({
|
||||
defaultOptions: {
|
||||
@@ -24,27 +17,11 @@ const queryClient = new QueryClient({
|
||||
},
|
||||
});
|
||||
|
||||
// Track initial user data fetch
|
||||
let initialUserDataPromise: Promise<User | null> | null = null;
|
||||
|
||||
// Initialize user data on app startup
|
||||
const initializeUserData = async () => {
|
||||
try {
|
||||
const userData = await fetchUser();
|
||||
fetchUser().then((userData) => {
|
||||
if (userData) {
|
||||
queryClient.setQueryData(["user"], userData);
|
||||
return userData;
|
||||
} catch (error) {
|
||||
console.error("Error initializing user data:", error);
|
||||
queryClient.setQueryData(["user"], null);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
// Start initialization immediately and track the promise
|
||||
initialUserDataPromise = initializeUserData();
|
||||
|
||||
// Export the promise so hooks can await it
|
||||
window.__initialUserDataPromise = initialUserDataPromise;
|
||||
});
|
||||
|
||||
const router = createRouter({
|
||||
routeTree,
|
||||
|
||||
@@ -101,15 +101,14 @@ type HealthResponse struct {
|
||||
}
|
||||
|
||||
type User struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Email string `json:"email"`
|
||||
AvatarURL string `json:"avatarURL"`
|
||||
Plan string `json:"plan"`
|
||||
Bio string `json:"bio"`
|
||||
FirstName string `json:"firstName"`
|
||||
LastName string `json:"lastName"`
|
||||
OverThreshold bool `json:"overThreshold"`
|
||||
ID string `json:"id"`
|
||||
Email string `json:"email"`
|
||||
Name string `json:"name"`
|
||||
Bio string `json:"bio,omitempty"`
|
||||
AvatarURL string `json:"avatarurl,omitempty"`
|
||||
FirstName string `json:"firstname,omitempty"`
|
||||
LastName string `json:"lastname,omitempty"`
|
||||
Plan string `json:"plan,omitempty"`
|
||||
}
|
||||
|
||||
type Attachment struct {
|
||||
|
||||
182
app/ui/ui.go
182
app/ui/ui.go
@@ -23,7 +23,6 @@ import (
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/app/auth"
|
||||
"github.com/ollama/ollama/app/server"
|
||||
"github.com/ollama/ollama/app/store"
|
||||
"github.com/ollama/ollama/app/tools"
|
||||
@@ -254,6 +253,7 @@ func (s *Server) Handler() http.Handler {
|
||||
mux.Handle("DELETE /api/v1/chat/{id}", handle(s.deleteChat))
|
||||
mux.Handle("POST /api/v1/create-chat", handle(s.createChat))
|
||||
mux.Handle("PUT /api/v1/chat/{id}/rename", handle(s.renameChat))
|
||||
mux.Handle("PUT /api/v1/chat/{id}/draft", handle(s.updateDraft))
|
||||
|
||||
mux.Handle("GET /api/v1/inference-compute", handle(s.getInferenceCompute))
|
||||
mux.Handle("POST /api/v1/model/upstream", handle(s.modelUpstream))
|
||||
@@ -264,11 +264,10 @@ func (s *Server) Handler() http.Handler {
|
||||
ollamaProxy := s.ollamaProxy()
|
||||
mux.Handle("GET /api/tags", ollamaProxy)
|
||||
mux.Handle("POST /api/show", ollamaProxy)
|
||||
|
||||
mux.Handle("GET /api/v1/me", handle(s.me))
|
||||
mux.Handle("POST /api/v1/disconnect", handle(s.disconnect))
|
||||
mux.Handle("GET /api/v1/connect", handle(s.connectURL))
|
||||
mux.Handle("GET /api/v1/health", handle(s.health))
|
||||
mux.Handle("GET /api/version", ollamaProxy)
|
||||
mux.Handle("HEAD /api/version", ollamaProxy)
|
||||
mux.Handle("POST /api/me", ollamaProxy)
|
||||
mux.Handle("POST /api/signout", ollamaProxy)
|
||||
|
||||
// React app - catch all non-API routes and serve the React app
|
||||
mux.Handle("GET /", s.appHandler())
|
||||
@@ -338,7 +337,7 @@ func (s *Server) doSelfSigned(ctx context.Context, method, path string) (*http.R
|
||||
}
|
||||
|
||||
// UserData fetches user data from ollama.com API for the current ollama key
|
||||
func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
func (s *Server) UserData(ctx context.Context) (*api.UserResponse, error) {
|
||||
resp, err := s.doSelfSigned(ctx, http.MethodPost, "/api/me")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to call ollama.com/api/me: %w", err)
|
||||
@@ -349,7 +348,7 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var user responses.User
|
||||
var user api.UserResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse user response: %w", err)
|
||||
}
|
||||
@@ -368,29 +367,27 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
return &user, nil
|
||||
}
|
||||
|
||||
func waitForServer(ctx context.Context) error {
|
||||
timeout := time.Now().Add(10 * time.Second)
|
||||
// TODO: this avoids an error on first load of the app
|
||||
// however we should either show a loading state or
|
||||
// wait for the Ollama server to be ready before redirecting
|
||||
for {
|
||||
// WaitForServer waits for the Ollama server to be ready
|
||||
func WaitForServer(ctx context.Context, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for time.Now().Before(deadline) {
|
||||
c, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := c.Version(ctx); err == nil {
|
||||
break
|
||||
}
|
||||
if time.Now().After(timeout) {
|
||||
return fmt.Errorf("timeout waiting for Ollama server to be ready")
|
||||
slog.Debug("ollama server is ready")
|
||||
return nil
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
return nil
|
||||
return errors.New("timeout waiting for Ollama server to be ready")
|
||||
}
|
||||
|
||||
func (s *Server) createChat(w http.ResponseWriter, r *http.Request) error {
|
||||
waitForServer(r.Context())
|
||||
if err := WaitForServer(r.Context(), 10*time.Second); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := uuid.NewV7()
|
||||
if err != nil {
|
||||
@@ -1280,6 +1277,28 @@ func (s *Server) renameChat(w http.ResponseWriter, r *http.Request) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Server) updateDraft(w http.ResponseWriter, r *http.Request) error {
|
||||
cid := r.PathValue("id")
|
||||
if cid == "" {
|
||||
return fmt.Errorf("chat ID is required")
|
||||
}
|
||||
|
||||
var req struct {
|
||||
Draft string `json:"draft"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
return fmt.Errorf("invalid request body: %w", err)
|
||||
}
|
||||
|
||||
if err := s.Store.UpdateChatDraft(cid, req.Draft); err != nil {
|
||||
return fmt.Errorf("failed to update draft: %w", err)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]string{"status": "ok"})
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Server) deleteChat(w http.ResponseWriter, r *http.Request) error {
|
||||
cid := r.PathValue("id")
|
||||
if cid == "" {
|
||||
@@ -1438,129 +1457,6 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) me(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
user, err := s.UserData(r.Context())
|
||||
if err != nil {
|
||||
// If fetching from API fails, try to return cached user data if available
|
||||
if cachedUser, cacheErr := s.Store.User(); cacheErr == nil && cachedUser != nil {
|
||||
s.log().Info("API request failed, returning cached user data", "error", err)
|
||||
responseUser := &responses.User{
|
||||
Name: cachedUser.Name,
|
||||
Email: cachedUser.Email,
|
||||
Plan: cachedUser.Plan,
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(responseUser)
|
||||
}
|
||||
|
||||
s.log().Error("failed to get user data", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to get user data",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(user)
|
||||
}
|
||||
|
||||
func (s *Server) disconnect(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.Store.ClearUser(); err != nil {
|
||||
s.log().Warn("failed to clear cached user data", "error", err)
|
||||
}
|
||||
|
||||
// Get the SSH public key to encode for the delete request
|
||||
pubKey, err := ollamaAuth.GetPublicKey()
|
||||
if err != nil {
|
||||
s.log().Error("failed to get public key", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to get public key",
|
||||
})
|
||||
}
|
||||
|
||||
// Encode the key using base64 URL encoding
|
||||
encodedKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
|
||||
|
||||
// Call the /api/user/keys/{encodedKey} endpoint with DELETE
|
||||
resp, err := s.doSelfSigned(r.Context(), http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey))
|
||||
if err != nil {
|
||||
s.log().Error("failed to call ollama.com/api/user/keys", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to disconnect from ollama.com",
|
||||
})
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
s.log().Error("disconnect request failed", "status", resp.StatusCode)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to disconnect from ollama.com",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(map[string]string{"status": "disconnected"})
|
||||
}
|
||||
|
||||
func (s *Server) connectURL(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
connectURL, err := auth.BuildConnectURL(OllamaDotCom)
|
||||
if err != nil {
|
||||
s.log().Error("failed to build connect URL", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to build connect URL",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(map[string]string{
|
||||
"connect_url": connectURL,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) health(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
healthy := false
|
||||
c, err := api.ClientFromEnvironment()
|
||||
if err == nil {
|
||||
if _, err := c.Version(r.Context()); err == nil {
|
||||
healthy = true
|
||||
}
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(responses.HealthResponse{
|
||||
Healthy: healthy,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) error {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 500*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
@@ -158,16 +158,16 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
|
||||
case uint32(UI_REQUEST_MSG_ID):
|
||||
// Requests for the UI must always come from the main event thread
|
||||
l := int(wParam)
|
||||
path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l)
|
||||
path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l) //nolint:govet,gosec
|
||||
t.app.UIRun(path)
|
||||
case WM_COPYDATA:
|
||||
// Handle URL scheme requests from other instances
|
||||
if lParam != 0 {
|
||||
cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam))
|
||||
if cds.DwData == 1 { // Our identifier for URL scheme messages
|
||||
cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam)) //nolint:govet,gosec
|
||||
if cds.DwData == 1 { // Our identifier for URL scheme messages
|
||||
// Convert the data back to string
|
||||
data := make([]byte, cds.CbData)
|
||||
copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData])
|
||||
copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData]) //nolint:govet,gosec
|
||||
urlScheme := string(data)
|
||||
handleURLSchemeRequest(urlScheme)
|
||||
lResult = 1 // Return non-zero to indicate success
|
||||
|
||||
115
cmd/bench/README.md
Normal file
115
cmd/bench/README.md
Normal file
@@ -0,0 +1,115 @@
|
||||
Ollama Benchmark Tool
|
||||
---------------------
|
||||
|
||||
A Go-based command-line tool for benchmarking Ollama models with configurable parameters and multiple output formats.
|
||||
|
||||
## Features
|
||||
|
||||
* Benchmark multiple models in a single run
|
||||
* Support for both text and image prompts
|
||||
* Configurable generation parameters (temperature, max tokens, seed, etc.)
|
||||
* Supports benchstat and CSV output formats
|
||||
* Detailed performance metrics (prefill, generate, load, total durations)
|
||||
|
||||
## Building from Source
|
||||
|
||||
```
|
||||
go build -o ollama-bench bench.go
|
||||
./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
|
||||
```
|
||||
|
||||
Using Go Run (without building)
|
||||
|
||||
```
|
||||
go run bench.go -model gpt-oss:20b -epochs 3
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Example
|
||||
|
||||
```
|
||||
./ollama-bench -model gemma3 -epochs 6
|
||||
```
|
||||
|
||||
### Benchmark Multiple Models
|
||||
|
||||
```
|
||||
./ollama-bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
|
||||
benchstat -col /name gemma.bench
|
||||
```
|
||||
|
||||
### With Image Prompt
|
||||
|
||||
```
|
||||
./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
|
||||
```
|
||||
|
||||
### Advanced Example
|
||||
|
||||
```
|
||||
./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
|
||||
```
|
||||
|
||||
## Command Line Options
|
||||
|
||||
| Option | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| -model | Comma-separated list of models to benchmark | (required) |
|
||||
| -epochs | Number of iterations per model | 1 |
|
||||
| -max-tokens | Maximum tokens for model response | 0 (unlimited) |
|
||||
| -temperature | Temperature parameter | 0.0 |
|
||||
| -seed | Random seed | 0 (random) |
|
||||
| -timeout | Timeout in seconds | 300 |
|
||||
| -p | Prompt text | "Write a long story." |
|
||||
| -image | Image file to include in prompt | |
|
||||
| -k | Keep-alive duration in seconds | 0 |
|
||||
| -format | Output format (benchstat, csv) | benchstat |
|
||||
| -output | Output file for results | "" (stdout) |
|
||||
| -v | Verbose mode | false |
|
||||
| -debug | Show debug information | false |
|
||||
|
||||
## Output Formats
|
||||
|
||||
### Markdown Format
|
||||
|
||||
The default markdown format is suitable for copying and pasting into a GitHub issue and will look like:
|
||||
```
|
||||
Model | Step | Count | Duration | nsPerToken | tokensPerSec |
|
||||
|-------|------|-------|----------|------------|--------------|
|
||||
| gpt-oss:20b | prefill | 124 | 30.006458ms | 241987.56 | 4132.44 |
|
||||
| gpt-oss:20b | generate | 200 | 2.646843954s | 13234219.77 | 75.56 |
|
||||
| gpt-oss:20b | load | 1 | 121.674208ms | - | - |
|
||||
| gpt-oss:20b | total | 1 | 2.861047625s | - | - |
|
||||
```
|
||||
|
||||
### Benchstat Format
|
||||
|
||||
Compatible with Go's benchstat tool for statistical analysis:
|
||||
|
||||
```
|
||||
BenchmarkModel/name=gpt-oss:20b/step=prefill 128 78125.00 ns/token 12800.00 token/sec
|
||||
BenchmarkModel/name=gpt-oss:20b/step=generate 512 19531.25 ns/token 51200.00 token/sec
|
||||
BenchmarkModel/name=gpt-oss:20b/step=load 1 1500000000 ns/request
|
||||
```
|
||||
|
||||
### CSV Format
|
||||
|
||||
Machine-readable comma-separated values:
|
||||
|
||||
```
|
||||
NAME,STEP,COUNT,NS_PER_COUNT,TOKEN_PER_SEC
|
||||
gpt-oss:20b,prefill,128,78125.00,12800.00
|
||||
gpt-oss:20b,generate,512,19531.25,51200.00
|
||||
gpt-oss:20b,load,1,1500000000,0
|
||||
```
|
||||
|
||||
## Metrics Explained
|
||||
|
||||
The tool reports four types of metrics for each model:
|
||||
|
||||
* prefill: Time spent processing the prompt
|
||||
* generate: Time spent generating the response
|
||||
* load: Model loading time (one-time cost)
|
||||
* total: Total request duration
|
||||
|
||||
321
cmd/bench/bench.go
Normal file
321
cmd/bench/bench.go
Normal file
@@ -0,0 +1,321 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
type flagOptions struct {
|
||||
models *string
|
||||
epochs *int
|
||||
maxTokens *int
|
||||
temperature *float64
|
||||
seed *int
|
||||
timeout *int
|
||||
prompt *string
|
||||
imageFile *string
|
||||
keepAlive *float64
|
||||
format *string
|
||||
outputFile *string
|
||||
debug *bool
|
||||
verbose *bool
|
||||
}
|
||||
|
||||
type Metrics struct {
|
||||
Model string
|
||||
Step string
|
||||
Count int
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
var once sync.Once
|
||||
|
||||
const DefaultPrompt = `Please write a descriptive story about a llama named Alonso who grows up to be President of the Land of Llamas. Include details about Alonso's childhood, adolescent years, and how he grew up to be a political mover and shaker. Write the story with a sense of whimsy.`
|
||||
|
||||
func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool) {
|
||||
switch format {
|
||||
case "benchstat":
|
||||
if verbose {
|
||||
printHeader := func() {
|
||||
fmt.Fprintf(w, "sysname: %s\n", runtime.GOOS)
|
||||
fmt.Fprintf(w, "machine: %s\n", runtime.GOARCH)
|
||||
}
|
||||
once.Do(printHeader)
|
||||
}
|
||||
for _, m := range metrics {
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
if m.Count > 0 {
|
||||
nsPerToken := float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec := float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d %.2f ns/token %.2f token/sec\n",
|
||||
m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
|
||||
} else {
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d 0 ns/token 0 token/sec\n",
|
||||
m.Model, m.Step, m.Count)
|
||||
}
|
||||
} else {
|
||||
var suffix string
|
||||
if m.Step == "load" {
|
||||
suffix = "/step=load"
|
||||
}
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s%s 1 %d ns/request\n",
|
||||
m.Model, suffix, m.Duration.Nanoseconds())
|
||||
}
|
||||
}
|
||||
case "csv":
|
||||
printHeader := func() {
|
||||
headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
|
||||
fmt.Fprintln(w, strings.Join(headings, ","))
|
||||
}
|
||||
once.Do(printHeader)
|
||||
|
||||
for _, m := range metrics {
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
var nsPerToken float64
|
||||
var tokensPerSec float64
|
||||
if m.Count > 0 {
|
||||
nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
}
|
||||
fmt.Fprintf(w, "%s,%s,%d,%.2f,%.2f\n", m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
|
||||
} else {
|
||||
fmt.Fprintf(w, "%s,%s,1,%d,0\n", m.Model, m.Step, m.Duration.Nanoseconds())
|
||||
}
|
||||
}
|
||||
case "markdown":
|
||||
printHeader := func() {
|
||||
fmt.Fprintln(w, "| Model | Step | Count | Duration | nsPerToken | tokensPerSec |")
|
||||
fmt.Fprintln(w, "|-------|------|-------|----------|------------|--------------|")
|
||||
}
|
||||
once.Do(printHeader)
|
||||
|
||||
for _, m := range metrics {
|
||||
var nsPerToken, tokensPerSec float64
|
||||
var nsPerTokenStr, tokensPerSecStr string
|
||||
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
nsPerTokenStr = fmt.Sprintf("%.2f", nsPerToken)
|
||||
tokensPerSecStr = fmt.Sprintf("%.2f", tokensPerSec)
|
||||
} else {
|
||||
nsPerTokenStr = "-"
|
||||
tokensPerSecStr = "-"
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "| %s | %s | %d | %v | %s | %s |\n",
|
||||
m.Model, m.Step, m.Count, m.Duration, nsPerTokenStr, tokensPerSecStr)
|
||||
}
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "Unknown output format '%s'\n", format)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkChat(fOpt flagOptions) error {
|
||||
models := strings.Split(*fOpt.models, ",")
|
||||
|
||||
// todo - add multi-image support
|
||||
var imgData api.ImageData
|
||||
var err error
|
||||
if *fOpt.imageFile != "" {
|
||||
imgData, err = readImage(*fOpt.imageFile)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't read image '%s': %v\n", *fOpt.imageFile, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if *fOpt.debug && imgData != nil {
|
||||
fmt.Fprintf(os.Stderr, "Read file '%s'\n", *fOpt.imageFile)
|
||||
}
|
||||
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't create ollama client: %v\n", err)
|
||||
return err
|
||||
}
|
||||
|
||||
var out io.Writer = os.Stdout
|
||||
if fOpt.outputFile != nil && *fOpt.outputFile != "" {
|
||||
f, err := os.OpenFile(*fOpt.outputFile, os.O_CREATE|os.O_WRONLY, 0o644)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: cannot open output file %s: %v\n", *fOpt.outputFile, err)
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
out = f
|
||||
}
|
||||
|
||||
for _, model := range models {
|
||||
for range *fOpt.epochs {
|
||||
options := make(map[string]interface{})
|
||||
if *fOpt.maxTokens > 0 {
|
||||
options["num_predict"] = *fOpt.maxTokens
|
||||
}
|
||||
options["temperature"] = *fOpt.temperature
|
||||
if fOpt.seed != nil && *fOpt.seed > 0 {
|
||||
options["seed"] = *fOpt.seed
|
||||
}
|
||||
|
||||
var keepAliveDuration *api.Duration
|
||||
if *fOpt.keepAlive > 0 {
|
||||
duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
|
||||
keepAliveDuration = &duration
|
||||
}
|
||||
|
||||
req := &api.ChatRequest{
|
||||
Model: model,
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: *fOpt.prompt,
|
||||
},
|
||||
},
|
||||
Options: options,
|
||||
KeepAlive: keepAliveDuration,
|
||||
}
|
||||
|
||||
if imgData != nil {
|
||||
req.Messages[0].Images = []api.ImageData{imgData}
|
||||
}
|
||||
|
||||
var responseMetrics *api.Metrics
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
err = client.Chat(ctx, req, func(resp api.ChatResponse) error {
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Message.Thinking, resp.Message.Content))
|
||||
}
|
||||
|
||||
if resp.Done {
|
||||
responseMetrics = &resp.Metrics
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintln(os.Stderr)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Chat request timed out with model '%s' after %vs\n", model, 1)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't chat with model '%s': %v\n", model, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if responseMetrics == nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
|
||||
continue
|
||||
}
|
||||
|
||||
metrics := []Metrics{
|
||||
{
|
||||
Model: model,
|
||||
Step: "prefill",
|
||||
Count: responseMetrics.PromptEvalCount,
|
||||
Duration: responseMetrics.PromptEvalDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "generate",
|
||||
Count: responseMetrics.EvalCount,
|
||||
Duration: responseMetrics.EvalDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "load",
|
||||
Count: 1,
|
||||
Duration: responseMetrics.LoadDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "total",
|
||||
Count: 1,
|
||||
Duration: responseMetrics.TotalDuration,
|
||||
},
|
||||
}
|
||||
|
||||
OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)
|
||||
|
||||
if *fOpt.keepAlive > 0 {
|
||||
time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func readImage(filePath string) (api.ImageData, error) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
data, err := io.ReadAll(file)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return api.ImageData(data), nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
fOpt := flagOptions{
|
||||
models: flag.String("model", "", "Model to benchmark"),
|
||||
epochs: flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
|
||||
maxTokens: flag.Int("max-tokens", 200, "Maximum tokens for model response"),
|
||||
temperature: flag.Float64("temperature", 0, "Temperature parameter"),
|
||||
seed: flag.Int("seed", 0, "Random seed"),
|
||||
timeout: flag.Int("timeout", 60*5, "Timeout in seconds (default 300s)"),
|
||||
prompt: flag.String("p", DefaultPrompt, "Prompt to use"),
|
||||
imageFile: flag.String("image", "", "Filename for an image to include"),
|
||||
keepAlive: flag.Float64("k", 0, "Keep alive duration in seconds"),
|
||||
format: flag.String("format", "markdown", "Output format [benchstat|csv] (default benchstat)"),
|
||||
outputFile: flag.String("output", "", "Output file for results (stdout if empty)"),
|
||||
verbose: flag.Bool("v", false, "Show system information"),
|
||||
debug: flag.Bool("debug", false, "Show debug information"),
|
||||
}
|
||||
|
||||
flag.Usage = func() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS]\n\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, "Description:\n")
|
||||
fmt.Fprintf(os.Stderr, " Model benchmarking tool with configurable parameters\n\n")
|
||||
fmt.Fprintf(os.Stderr, "Options:\n")
|
||||
flag.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nExamples:\n")
|
||||
fmt.Fprintf(os.Stderr, " bench -model gpt-oss:20b -epochs 3 -temperature 0.7\n")
|
||||
}
|
||||
flag.Parse()
|
||||
|
||||
if !slices.Contains([]string{"markdown", "benchstat", "csv"}, *fOpt.format) {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Unknown format '%s'\n", *fOpt.format)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if len(*fOpt.models) == 0 {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: No model(s) specified to benchmark.\n")
|
||||
flag.Usage()
|
||||
return
|
||||
}
|
||||
|
||||
BenchmarkChat(fOpt)
|
||||
}
|
||||
463
cmd/bench/bench_test.go
Normal file
463
cmd/bench/bench_test.go
Normal file
@@ -0,0 +1,463 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/rand"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func createTestFlagOptions() flagOptions {
|
||||
models := "test-model"
|
||||
format := "benchstat"
|
||||
epochs := 1
|
||||
maxTokens := 100
|
||||
temperature := 0.7
|
||||
seed := 42
|
||||
timeout := 30
|
||||
prompt := "test prompt"
|
||||
imageFile := ""
|
||||
keepAlive := 5.0
|
||||
verbose := false
|
||||
debug := false
|
||||
|
||||
return flagOptions{
|
||||
models: &models,
|
||||
format: &format,
|
||||
epochs: &epochs,
|
||||
maxTokens: &maxTokens,
|
||||
temperature: &temperature,
|
||||
seed: &seed,
|
||||
timeout: &timeout,
|
||||
prompt: &prompt,
|
||||
imageFile: &imageFile,
|
||||
keepAlive: &keepAlive,
|
||||
verbose: &verbose,
|
||||
debug: &debug,
|
||||
}
|
||||
}
|
||||
|
||||
func captureOutput(f func()) string {
|
||||
oldStdout := os.Stdout
|
||||
oldStderr := os.Stderr
|
||||
defer func() {
|
||||
os.Stdout = oldStdout
|
||||
os.Stderr = oldStderr
|
||||
}()
|
||||
|
||||
r, w, _ := os.Pipe()
|
||||
os.Stdout = w
|
||||
os.Stderr = w
|
||||
|
||||
f()
|
||||
|
||||
w.Close()
|
||||
var buf bytes.Buffer
|
||||
io.Copy(&buf, r)
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func createMockOllamaServer(t *testing.T, responses []api.ChatResponse) *httptest.Server {
|
||||
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/api/chat" {
|
||||
t.Errorf("Expected path /api/chat, got %s", r.URL.Path)
|
||||
http.Error(w, "Not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
if r.Method != "POST" {
|
||||
t.Errorf("Expected POST method, got %s", r.Method)
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
|
||||
for _, resp := range responses {
|
||||
jsonData, err := json.Marshal(resp)
|
||||
if err != nil {
|
||||
t.Errorf("Failed to marshal response: %v", err)
|
||||
return
|
||||
}
|
||||
w.Write(jsonData)
|
||||
w.Write([]byte("\n"))
|
||||
if f, ok := w.(http.Flusher); ok {
|
||||
f.Flush()
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond) // Simulate some delay
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_Success(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
mockResponses := []api.ChatResponse{
|
||||
{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response part 1",
|
||||
},
|
||||
Done: false,
|
||||
},
|
||||
{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response part 2",
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
server := createMockOllamaServer(t, mockResponses)
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "BenchmarkModel/name=test-model/step=prefill") {
|
||||
t.Errorf("Expected output to contain prefill metrics, got: %s", output)
|
||||
}
|
||||
if !strings.Contains(output, "BenchmarkModel/name=test-model/step=generate") {
|
||||
t.Errorf("Expected output to contain generate metrics, got: %s", output)
|
||||
}
|
||||
if !strings.Contains(output, "ns/token") {
|
||||
t.Errorf("Expected output to contain ns/token metric, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_ServerError(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "Internal server error", http.StatusInternalServerError)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected error to be handled internally, got returned error: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: Couldn't chat with model") {
|
||||
t.Errorf("Expected error message about chat failure, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_Timeout(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
shortTimeout := 1 // Very short timeout
|
||||
fOpt.timeout = &shortTimeout
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Simulate a long delay that will cause timeout
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
response := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response",
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
jsonData, _ := json.Marshal(response)
|
||||
w.Write(jsonData)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected timeout to be handled internally, got returned error: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: Chat request timed out") {
|
||||
t.Errorf("Expected timeout error message, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_NoMetrics(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
mockResponses := []api.ChatResponse{
|
||||
{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response",
|
||||
},
|
||||
Done: false, // Never sends Done=true
|
||||
},
|
||||
}
|
||||
|
||||
server := createMockOllamaServer(t, mockResponses)
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: No metrics received") {
|
||||
t.Errorf("Expected no metrics error message, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_MultipleModels(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
models := "model1,model2"
|
||||
epochs := 2
|
||||
fOpt.models = &models
|
||||
fOpt.epochs = &epochs
|
||||
|
||||
callCount := 0
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
var req api.ChatRequest
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
json.Unmarshal(body, &req)
|
||||
|
||||
response := api.ChatResponse{
|
||||
Model: req.Model,
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response for " + req.Model,
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
jsonData, _ := json.Marshal(response)
|
||||
w.Write(jsonData)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
// Should be called 4 times (2 models × 2 epochs)
|
||||
if callCount != 4 {
|
||||
t.Errorf("Expected 4 API calls, got %d", callCount)
|
||||
}
|
||||
|
||||
if !strings.Contains(output, "BenchmarkModel/name=model1") || !strings.Contains(output, "BenchmarkModel/name=model2") {
|
||||
t.Errorf("Expected output for both models, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_WithImage(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
tmpfile, err := os.CreateTemp(t.TempDir(), "testimage")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp file: %v", err)
|
||||
}
|
||||
defer os.Remove(tmpfile.Name())
|
||||
|
||||
content := []byte("fake image data")
|
||||
if _, err := tmpfile.Write(content); err != nil {
|
||||
t.Fatalf("Failed to write to temp file: %v", err)
|
||||
}
|
||||
tmpfile.Close()
|
||||
|
||||
tmpfileName := tmpfile.Name()
|
||||
fOpt.imageFile = &tmpfileName
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Verify the request contains image data
|
||||
var req api.ChatRequest
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
json.Unmarshal(body, &req)
|
||||
|
||||
if len(req.Messages) == 0 || len(req.Messages[0].Images) == 0 {
|
||||
t.Error("Expected request to contain images")
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
response := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "test response with image",
|
||||
},
|
||||
Done: true,
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
PromptEvalDuration: 100 * time.Millisecond,
|
||||
EvalCount: 50,
|
||||
EvalDuration: 500 * time.Millisecond,
|
||||
TotalDuration: 600 * time.Millisecond,
|
||||
LoadDuration: 50 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
jsonData, _ := json.Marshal(response)
|
||||
w.Write(jsonData)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", server.URL)
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "BenchmarkModel/name=test-model") {
|
||||
t.Errorf("Expected benchmark output, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkChat_ImageError(t *testing.T) {
|
||||
randFileName := func() string {
|
||||
const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
|
||||
const length = 8
|
||||
|
||||
result := make([]byte, length)
|
||||
rand.Read(result) // Fill with random bytes
|
||||
|
||||
for i := range result {
|
||||
result[i] = charset[result[i]%byte(len(charset))]
|
||||
}
|
||||
|
||||
return string(result) + ".txt"
|
||||
}
|
||||
|
||||
fOpt := createTestFlagOptions()
|
||||
imageFile := randFileName()
|
||||
fOpt.imageFile = &imageFile
|
||||
|
||||
output := captureOutput(func() {
|
||||
err := BenchmarkChat(fOpt)
|
||||
if err == nil {
|
||||
t.Error("Expected error from image reading, got nil")
|
||||
}
|
||||
})
|
||||
|
||||
if !strings.Contains(output, "ERROR: Couldn't read image") {
|
||||
t.Errorf("Expected image read error message, got: %s", output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadImage_Success(t *testing.T) {
|
||||
tmpfile, err := os.CreateTemp(t.TempDir(), "testimage")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp file: %v", err)
|
||||
}
|
||||
defer os.Remove(tmpfile.Name())
|
||||
|
||||
content := []byte("fake image data")
|
||||
if _, err := tmpfile.Write(content); err != nil {
|
||||
t.Fatalf("Failed to write to temp file: %v", err)
|
||||
}
|
||||
tmpfile.Close()
|
||||
|
||||
imgData, err := readImage(tmpfile.Name())
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if imgData == nil {
|
||||
t.Error("Expected image data, got nil")
|
||||
}
|
||||
|
||||
expected := api.ImageData(content)
|
||||
if string(imgData) != string(expected) {
|
||||
t.Errorf("Expected image data %v, got %v", expected, imgData)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadImage_FileNotFound(t *testing.T) {
|
||||
imgData, err := readImage("nonexistentfile.jpg")
|
||||
if err == nil {
|
||||
t.Error("Expected error for non-existent file, got nil")
|
||||
}
|
||||
if imgData != nil {
|
||||
t.Error("Expected nil image data for non-existent file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestOptionsMapCreation(t *testing.T) {
|
||||
fOpt := createTestFlagOptions()
|
||||
|
||||
options := make(map[string]interface{})
|
||||
if *fOpt.maxTokens > 0 {
|
||||
options["num_predict"] = *fOpt.maxTokens
|
||||
}
|
||||
options["temperature"] = *fOpt.temperature
|
||||
if fOpt.seed != nil && *fOpt.seed > 0 {
|
||||
options["seed"] = *fOpt.seed
|
||||
}
|
||||
|
||||
if options["num_predict"] != *fOpt.maxTokens {
|
||||
t.Errorf("Expected num_predict %d, got %v", *fOpt.maxTokens, options["num_predict"])
|
||||
}
|
||||
if options["temperature"] != *fOpt.temperature {
|
||||
t.Errorf("Expected temperature %f, got %v", *fOpt.temperature, options["temperature"])
|
||||
}
|
||||
if options["seed"] != *fOpt.seed {
|
||||
t.Errorf("Expected seed %d, got %v", *fOpt.seed, options["seed"])
|
||||
}
|
||||
}
|
||||
@@ -1430,7 +1430,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
latest.Summary()
|
||||
}
|
||||
|
||||
return &api.Message{Role: role, Content: fullResponse.String()}, nil
|
||||
return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
|
||||
}
|
||||
|
||||
func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
|
||||
@@ -182,6 +182,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &llama4Model{}
|
||||
case "Mistral3ForConditionalGeneration":
|
||||
conv = &mistral3Model{}
|
||||
case "Ministral3ForCausalLM":
|
||||
conv = &mistral3CausalModel{}
|
||||
case "MixtralForCausalLM":
|
||||
conv = &mixtralModel{}
|
||||
case "GemmaForCausalLM":
|
||||
@@ -202,10 +204,16 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &qwen3VLModel{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
case "NomicBertModel", "NomicBertMoEModel":
|
||||
conv = &nomicbertModel{}
|
||||
case "CohereForCausalLM":
|
||||
conv = &commandrModel{}
|
||||
case "GptOssForCausalLM":
|
||||
conv = &gptossModel{}
|
||||
case "DeepseekOCRForCausalLM":
|
||||
conv = &deepseekocr{}
|
||||
case "DeepseekV3ForCausalLM":
|
||||
conv = &deepseek2Model{}
|
||||
default:
|
||||
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
|
||||
}
|
||||
|
||||
173
convert/convert_deepseek2.go
Normal file
173
convert/convert_deepseek2.go
Normal file
@@ -0,0 +1,173 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type deepseek2Model struct {
|
||||
ModelParameters // architectures, vocab_size
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
QKNopeHeadDim uint32 `json:"qk_nope_head_dim"`
|
||||
QKRopeHeadDim uint32 `json:"qk_rope_head_dim"`
|
||||
KVLoraRank uint32 `json:"kv_lora_rank"`
|
||||
QLoraRank uint32 `json:"q_lora_rank"`
|
||||
VHeadDim uint32 `json:"v_head_dim"`
|
||||
|
||||
ExpertCount uint32 `json:"n_routed_experts"`
|
||||
ExpertSharedCount uint32 `json:"n_shared_experts"`
|
||||
ExpertIntermediateSize uint32 `json:"moe_intermediate_size"`
|
||||
ExpertUsedCount uint32 `json:"num_experts_per_tok"`
|
||||
ExpertWeightsNorm bool `json:"norm_topk_prob"`
|
||||
ExpertWeightsScale float32 `json:"routed_scaling_factor"`
|
||||
|
||||
ScoringFunc string `json:"scoring_func"`
|
||||
LeadingDenseBlockCount uint32 `json:"first_k_dense_replace"`
|
||||
|
||||
RopeScaling struct {
|
||||
Factor float32 `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
Type string `json:"type"`
|
||||
MScaleAllDim float32 `json:"mscale_all_dim"`
|
||||
} `json:"rope_scaling"`
|
||||
|
||||
Architecture string
|
||||
}
|
||||
|
||||
func (p *deepseek2Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "deepseek2"
|
||||
kv["general.type"] = "model"
|
||||
kv["deepseek2.block_count"] = p.HiddenLayers
|
||||
|
||||
numHeads := p.NumAttentionHeads
|
||||
numKVHeads := p.NumKeyValueHeads
|
||||
|
||||
kv["deepseek2.attention.head_count"] = numHeads
|
||||
kv["deepseek2.attention.head_count_kv"] = numKVHeads
|
||||
kv["deepseek2.attention.key_length"] = p.QKNopeHeadDim + p.QKRopeHeadDim
|
||||
kv["deepseek2.attention.kv_lora_rank"] = p.KVLoraRank
|
||||
kv["deepseek2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
||||
kv["deepseek2.attention.q_lora_rank"] = p.QLoraRank
|
||||
kv["deepseek2.attention.value_length"] = p.VHeadDim
|
||||
kv["deepseek2.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["deepseek2.embedding_length"] = p.HiddenSize
|
||||
kv["deepseek2.expert_count"] = p.ExpertCount
|
||||
kv["deepseek2.expert_feed_forward_length"] = p.ExpertIntermediateSize
|
||||
kv["deepseek2.expert_shared_count"] = p.ExpertSharedCount
|
||||
|
||||
var scoringFunc uint32
|
||||
switch p.ScoringFunc {
|
||||
case "softmax":
|
||||
// not currently supported in the model, but needed for Deepseek-OCR
|
||||
scoringFunc = 1
|
||||
case "sigmoid":
|
||||
scoringFunc = 2
|
||||
}
|
||||
kv["deepseek2.expert_gating_func"] = scoringFunc
|
||||
kv["deepseek2.expert_used_count"] = p.ExpertUsedCount
|
||||
kv["deepseek2.expert_weights_norm"] = p.ExpertWeightsNorm
|
||||
kv["deepseek2.expert_weights_scale"] = p.ExpertWeightsScale
|
||||
kv["deepseek2.feed_forward_length"] = p.IntermediateSize
|
||||
kv["deepseek2.leading_dense_block_count"] = p.LeadingDenseBlockCount
|
||||
|
||||
kv["deepseek2.rope.dimension_count"] = p.QKRopeHeadDim
|
||||
kv["deepseek2.rope.freq_base"] = cmp.Or(p.RopeTheta, 10000.0)
|
||||
kv["deepseek2.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
kv["deepseek2.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
|
||||
kv["deepseek2.rope.scaling.type"] = p.RopeScaling.Type
|
||||
kv["deepseek2.rope.scaling.yarn_log_multiplier"] = 0.1 * p.RopeScaling.MScaleAllDim
|
||||
|
||||
kv["tokenizer.ggml.pre"] = "deepseek-v3"
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *deepseek2Model) Replacements() []string {
|
||||
return []string{
|
||||
"lm_head", "output",
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.norm", "output_norm",
|
||||
"language_model.", "",
|
||||
"model.layers", "blk",
|
||||
"input_layernorm", "attn_norm",
|
||||
"self_attn.kv_a_proj_with_mqa", "attn_kv_a_mqa",
|
||||
"self_attn.kv_a_layernorm", "attn_kv_a_norm",
|
||||
"self_attn.kv_b_proj", "attn_kv_b",
|
||||
"self_attn.q_a_proj", "attn_q_a",
|
||||
"self_attn.q_a_layernorm", "attn_q_a_norm",
|
||||
"self_attn.q_b_proj", "attn_q_b",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"mlp.shared_experts.down_proj", "ffn_down_shexp",
|
||||
"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
|
||||
"mlp.shared_experts.up_proj", "ffn_up_shexp",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"mlp.gate.e_score_correction_bias", "exp_probs_b.bias",
|
||||
"mlp.gate", "ffn_gate_inp",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *deepseek2Model) Tensors(s []Tensor) (out []*ggml.Tensor) {
|
||||
merges := make([]merge, p.HiddenLayers*3)
|
||||
for i := range p.HiddenLayers {
|
||||
merges[i*3+0] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
|
||||
}
|
||||
merges[i*3+1] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
|
||||
}
|
||||
merges[i*3+2] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
|
||||
}
|
||||
}
|
||||
|
||||
skipLayer := func(n string, minValue uint32) bool {
|
||||
re := regexp.MustCompile(`^blk\.(\d+)`)
|
||||
matches := re.FindStringSubmatch(n)
|
||||
if matches == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
blkNum, err := strconv.Atoi(matches[1])
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return uint32(blkNum) >= minValue
|
||||
}
|
||||
|
||||
out, s = mergeTensors(s, merges...)
|
||||
for _, t := range s {
|
||||
// skip any additional layers (such as the Multi-Token Prediction layer)
|
||||
if skipLayer(t.Name(), p.HiddenLayers) {
|
||||
slog.Debug("skipping layer", "name", t.Name())
|
||||
continue
|
||||
}
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
136
convert/convert_deepseekocr.go
Normal file
136
convert/convert_deepseekocr.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type deepseekocr struct {
|
||||
ModelParameters
|
||||
LanguageConfig struct {
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
NumRoutedExperts uint32 `json:"n_routed_experts"`
|
||||
NumSharedExperts uint32 `json:"n_shared_experts"`
|
||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
||||
FirstKDenseReplace uint32 `json:"first_k_dense_replace"`
|
||||
} `json:"language_config"`
|
||||
|
||||
VisionConfig struct {
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
Width struct {
|
||||
Vision struct {
|
||||
Heads uint32 `json:"heads"`
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
Layers uint32 `json:"layers"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
Width uint32 `json:"width"`
|
||||
} `json:"clip-l-14-224"`
|
||||
Sam struct {
|
||||
GlobalAttentionIndexes []int32 `json:"global_attn_indexes"`
|
||||
Heads uint32 `json:"heads"`
|
||||
Layers uint32 `json:"layers"`
|
||||
Width uint32 `json:"width"`
|
||||
} `json:"sam_vit_b"`
|
||||
}
|
||||
} `json:"vision_config"`
|
||||
}
|
||||
|
||||
func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
|
||||
kv := m.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "deepseekocr"
|
||||
kv["block_count"] = m.LanguageConfig.HiddenLayers
|
||||
kv["context_length"] = m.LanguageConfig.MaxPositionEmbeddings
|
||||
kv["embedding_length"] = m.LanguageConfig.HiddenSize
|
||||
kv["feed_forward_length"] = m.LanguageConfig.IntermediateSize
|
||||
kv["attention.head_count"] = m.LanguageConfig.NumAttentionHeads
|
||||
kv["attention.head_count_kv"] = m.LanguageConfig.NumKeyValueHeads
|
||||
kv["expert_count"] = m.LanguageConfig.NumRoutedExperts
|
||||
kv["expert_used_count"] = m.LanguageConfig.NumExpertsPerToken
|
||||
kv["leading_dense_block_count"] = m.LanguageConfig.FirstKDenseReplace
|
||||
|
||||
kv["vision.block_count"] = m.VisionConfig.Width.Vision.Layers
|
||||
kv["vision.embedding_length"] = m.VisionConfig.Width.Vision.Width
|
||||
kv["vision.head_count"] = m.VisionConfig.Width.Vision.Heads
|
||||
kv["vision.image_size"] = m.VisionConfig.Width.Vision.ImageSize
|
||||
kv["vision.patch_size"] = m.VisionConfig.Width.Vision.PatchSize
|
||||
|
||||
kv["sam.block_count"] = m.VisionConfig.Width.Sam.Layers
|
||||
kv["sam.embedding_length"] = m.VisionConfig.Width.Sam.Width
|
||||
kv["sam.head_count"] = m.VisionConfig.Width.Sam.Heads
|
||||
kv["sam.global_attention_indexes"] = m.VisionConfig.Width.Sam.GlobalAttentionIndexes
|
||||
return kv
|
||||
}
|
||||
|
||||
func (m *deepseekocr) Tensors(s []Tensor) (out []*ggml.Tensor) {
|
||||
merges := make([]merge, m.LanguageConfig.HiddenLayers*3)
|
||||
for i := range m.LanguageConfig.HiddenLayers {
|
||||
merges[i*3+0] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
|
||||
}
|
||||
merges[i*3+1] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
|
||||
}
|
||||
merges[i*3+2] = merge{
|
||||
fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
|
||||
}
|
||||
}
|
||||
|
||||
out, s = mergeTensors(s, merges...)
|
||||
for _, t := range s {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (m *deepseekocr) Replacements() []string {
|
||||
return []string{
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.layers", "blk",
|
||||
"input_layernorm", "attn_norm",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.gate", "ffn_gate_inp",
|
||||
"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
|
||||
"mlp.shared_experts.up_proj", "ffn_up_shexp",
|
||||
"mlp.shared_experts.down_proj", "ffn_down_shexp",
|
||||
"model.norm", "output_norm",
|
||||
"lm_head", "output",
|
||||
|
||||
"model.vision_model", "v",
|
||||
"embeddings.patch_embedding", "patch_embd",
|
||||
"embeddings.class_embedding", "class_embd",
|
||||
"embeddings.position_embedding", "position_embd",
|
||||
"transformer.layers", "blk",
|
||||
|
||||
"model.projector", "mm",
|
||||
"model.image_newline", "mm.image_newline",
|
||||
//nolint:misspell // this misspelling is upstream. fixing it breaks the model
|
||||
"model.view_seperator", "mm.view_seperator",
|
||||
|
||||
"model.sam_model.patch_embed.proj", "s.patch_embd",
|
||||
"model.sam_model.pos_embed", "s.position_embd",
|
||||
"model.sam_model.blocks", "s.blk",
|
||||
"model.sam_model.neck", "s.neck",
|
||||
"model.sam_model.net_", "s.net_",
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
@@ -26,16 +27,26 @@ type gemma3Model struct {
|
||||
NumChannels uint32 `json:"num_channels"` // num_channels 3
|
||||
PatchSize uint32 `json:"patch_size"` // patch_size 14
|
||||
} `json:"vision_config"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
RopeLocalTheta float32 `json:"rope_local_base_freq"`
|
||||
RopeGlobalTheta float32 `json:"rope_global_base_freq"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
RopeLocalTheta float32 `json:"rope_local_base_freq"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
SlidingWindowPattern *uint32 `json:"sliding_window_pattern"`
|
||||
LayerTypes []string `json:"layer_types"`
|
||||
MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
|
||||
RopeScaling *struct {
|
||||
Type string `json:"rope_type"`
|
||||
Factor float32 `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
ExtrapolationFactor float32 `json:"extrapolation_factor"`
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
} `json:"rope_scaling"`
|
||||
}
|
||||
|
||||
const (
|
||||
@@ -81,9 +92,38 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv["gemma3.attention.key_length"] = p.HeadDim
|
||||
kv["gemma3.attention.value_length"] = p.HeadDim
|
||||
kv["gemma3.attention.sliding_window"] = p.SlidingWindow
|
||||
kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)
|
||||
|
||||
// The sliding window pattern is either provided as the sliding_window_pattern
|
||||
// key (an int) or as the layer_types key (a list of strings).
|
||||
if p.SlidingWindowPattern != nil || len(p.LayerTypes) > 0 {
|
||||
kv["gemma3.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
|
||||
for i := range numBlocks {
|
||||
var isLocal bool
|
||||
if len(p.LayerTypes) > 0 && int(i) < len(p.LayerTypes) {
|
||||
isLocal = p.LayerTypes[i] == "sliding_attention"
|
||||
} else if p.SlidingWindowPattern != nil && *p.SlidingWindowPattern > 0 {
|
||||
isLocal = (i+1)%*p.SlidingWindowPattern != 0
|
||||
}
|
||||
if !yield(isLocal) {
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
if p.FinalLogitSoftcap > 0 {
|
||||
kv["gemma3.final_logit_softcapping"] = p.FinalLogitSoftcap
|
||||
}
|
||||
kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
|
||||
kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
|
||||
kv["gemma3.rope.freq_base"] = cmp.Or(p.RopeTheta, 1000000.0)
|
||||
if p.RopeScaling != nil && p.RopeScaling.Type == "yarn" && p.RopeScaling.Factor > 0 {
|
||||
kv["gemma3.rope.scaling.type"] = "yarn"
|
||||
kv["gemma3.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
kv["gemma3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
|
||||
kv["gemma3.rope.scaling.extrapolation_factor"] = cmp.Or(p.RopeScaling.ExtrapolationFactor, float32(1.0))
|
||||
kv["gemma3.rope.scaling.beta_fast"] = cmp.Or(p.RopeScaling.BetaFast, float32(64.0))
|
||||
kv["gemma3.rope.scaling.beta_slow"] = cmp.Or(p.RopeScaling.BetaSlow, float32(1.0))
|
||||
}
|
||||
|
||||
kv["gemma3.embedding_length"] = p.HiddenSize
|
||||
kv["gemma3.feed_forward_length"] = p.IntermediateSize
|
||||
default:
|
||||
|
||||
@@ -110,9 +110,12 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
|
||||
for name, mxfp4 := range mxfp4s {
|
||||
dims := mxfp4.blocks.Shape()
|
||||
if !strings.HasSuffix(name, ".weight") {
|
||||
name = name + ".weight"
|
||||
}
|
||||
if strings.Contains(name, "ffn_down_exps") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name + ".weight",
|
||||
Name: name,
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4,
|
||||
@@ -121,12 +124,12 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
// gate_up_exps is interleaved, need to split into gate_exps and up_exps
|
||||
// e.g. gate_exps, up_exps = gate_up_exps[:, 0::2, ...], gate_up_exps[:, 1::2, ...]
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "gate_up", "gate", 1) + ".weight",
|
||||
Name: strings.Replace(name, "gate_up", "gate", 1),
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4.slice(1, 0, int(dims[1]), 2),
|
||||
}, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "gate_up", "up", 1) + ".weight",
|
||||
Name: strings.Replace(name, "gate_up", "up", 1),
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1] / 2, dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4.slice(1, 1, int(dims[1]), 2),
|
||||
|
||||
@@ -29,6 +29,17 @@ type mistral3Model struct {
|
||||
SlidingWindow *uint32 `json:"sliding_window"`
|
||||
HiddenAct string `json:"hidden_act"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
RopeParameters struct {
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
Factor float32 `json:"factor"`
|
||||
Llama4ScalingBeta *float32 `json:"llama_4_scaling_beta"`
|
||||
OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
RopeType string `json:"rope_type"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
Mscale *float32 `json:"mscale"`
|
||||
MscaleAllDim *float32 `json:"mscale_all_dim"`
|
||||
} `json:"rope_parameters"`
|
||||
} `json:"text_config"`
|
||||
VisionModel struct {
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
@@ -41,6 +52,9 @@ type mistral3Model struct {
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
HiddenAct string `json:"hidden_act"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeParameters struct {
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
} `json:"rope_parameters"`
|
||||
} `json:"vision_config"`
|
||||
MultiModalProjectorBias bool `json:"multimodal_projector_bias"`
|
||||
ProjectorHiddenAct string `json:"projector_hidden_act"`
|
||||
@@ -61,8 +75,25 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv["mistral3.attention.layer_norm_rms_epsilon"] = p.TextModel.RMSNormEPS
|
||||
kv["mistral3.attention.key_length"] = p.TextModel.HeadDim
|
||||
kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
|
||||
kv["mistral3.rope.dimension_count"] = p.TextModel.HiddenSize / p.TextModel.NumHiddenLayers
|
||||
kv["mistral3.rope.freq_base"] = p.TextModel.RopeTheta
|
||||
kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
|
||||
kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
|
||||
kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
|
||||
kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
|
||||
kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
|
||||
kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
|
||||
|
||||
if p.TextModel.RopeParameters.Mscale != nil {
|
||||
kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
|
||||
}
|
||||
if p.TextModel.RopeParameters.MscaleAllDim != nil {
|
||||
kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
|
||||
}
|
||||
if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
|
||||
kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
|
||||
}
|
||||
if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
|
||||
kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
|
||||
}
|
||||
|
||||
// Vision configuration
|
||||
kv["mistral3.vision.block_count"] = p.VisionModel.NumHiddenLayers
|
||||
@@ -74,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
|
||||
kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
|
||||
// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
|
||||
kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
|
||||
kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)
|
||||
|
||||
// Multimodal configuration
|
||||
kv["mistral3.image_token_index"] = p.ImageTokenIndex
|
||||
|
||||
181
convert/convert_mistral_causal.go
Normal file
181
convert/convert_mistral_causal.go
Normal file
@@ -0,0 +1,181 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type mistral3CausalModel struct {
|
||||
ModelParameters
|
||||
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
SlidingWindow *uint32 `json:"sliding_window"`
|
||||
HiddenAct string `json:"hidden_act"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
RopeParameters struct {
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
Factor float32 `json:"factor"`
|
||||
Llama4ScalingBeta *float32 `json:"llama_4_scaling_beta"`
|
||||
OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
RopeType string `json:"rope_type"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
Mscale *float32 `json:"mscale"`
|
||||
MscaleAllDim *float32 `json:"mscale_all_dim"`
|
||||
} `json:"rope_parameters"`
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "mistral3"
|
||||
kv["mistral3.vocab_size"] = p.VocabSize
|
||||
|
||||
// Text configuration
|
||||
kv["mistral3.block_count"] = p.NumHiddenLayers
|
||||
kv["mistral3.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["mistral3.embedding_length"] = p.HiddenSize
|
||||
kv["mistral3.feed_forward_length"] = p.IntermediateSize
|
||||
kv["mistral3.attention.head_count"] = p.NumAttentionHeads
|
||||
kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
|
||||
kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
||||
kv["mistral3.attention.key_length"] = p.HeadDim
|
||||
kv["mistral3.attention.value_length"] = p.HeadDim
|
||||
kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
|
||||
kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
|
||||
kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
|
||||
kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
|
||||
kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
|
||||
kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
|
||||
|
||||
if p.RopeParameters.Mscale != nil {
|
||||
kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
|
||||
}
|
||||
|
||||
if p.RopeParameters.MscaleAllDim != nil {
|
||||
kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
|
||||
}
|
||||
|
||||
if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
|
||||
kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
|
||||
kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
|
||||
}
|
||||
|
||||
if p.RopeParameters.Llama4ScalingBeta != nil {
|
||||
kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
|
||||
}
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
for _, t := range ts {
|
||||
if !strings.HasPrefix(t.Name(), "v.") {
|
||||
if strings.HasSuffix(t.Name(), ".attn_q.weight") ||
|
||||
strings.HasSuffix(t.Name(), ".attn_k.weight") {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) Replacements() []string {
|
||||
return []string{
|
||||
"model.norm", "output_norm",
|
||||
"model.", "",
|
||||
"layers", "blk",
|
||||
"transformer.layers", "blk",
|
||||
"vision_tower", "v",
|
||||
"ln_pre", "encoder_norm",
|
||||
"input_layernorm", "attn_norm",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"embed_tokens", "token_embd",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"attention.q_proj", "attn_q",
|
||||
"attention.k_proj", "attn_k",
|
||||
"attention.v_proj", "attn_v",
|
||||
"attention.o_proj", "attn_output",
|
||||
"attention_norm", "attn_norm",
|
||||
"feed_forward.gate_proj", "ffn_gate",
|
||||
"feed_forward.down_proj", "ffn_down",
|
||||
"feed_forward.up_proj", "ffn_up",
|
||||
"multi_modal_projector", "mm",
|
||||
"ffn_norm", "ffn_norm",
|
||||
"lm_head", "output",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
var dims []int
|
||||
for _, dim := range shape {
|
||||
dims = append(dims, int(dim))
|
||||
}
|
||||
|
||||
var heads uint32
|
||||
if strings.HasSuffix(name, ".attn_q.weight") {
|
||||
heads = p.NumAttentionHeads
|
||||
} else if strings.HasSuffix(name, ".attn_k.weight") {
|
||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
} else {
|
||||
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
||||
}
|
||||
|
||||
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.T(0, 2, 1, 3); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts, err := native.SelectF32(n, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var f32s []float32
|
||||
for _, t := range ts {
|
||||
f32s = append(f32s, t...)
|
||||
}
|
||||
|
||||
return f32s, nil
|
||||
}
|
||||
213
convert/convert_nomicbert.go
Normal file
213
convert/convert_nomicbert.go
Normal file
@@ -0,0 +1,213 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type nomicbertModel struct {
|
||||
ModelParameters
|
||||
NLayers uint32 `json:"n_layers"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
LayerNormEPS float32 `json:"layer_norm_eps"`
|
||||
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
|
||||
RopeFreqBase float32 `json:"rope_theta"`
|
||||
normalizeEmbeddings bool
|
||||
PoolingType uint32
|
||||
|
||||
// MoE parameters (only present in v2 models)
|
||||
NumExperts uint32 `json:"num_local_experts"`
|
||||
NumExpertsUsed uint32 `json:"num_experts_per_tok"`
|
||||
MoEEveryNLayers uint32 `json:"moe_every_n_layers"`
|
||||
}
|
||||
|
||||
var (
|
||||
_ ModelConverter = (*nomicbertModel)(nil)
|
||||
_ moreParser = (*nomicbertModel)(nil)
|
||||
)
|
||||
|
||||
func (p *nomicbertModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "modules.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var modules []struct {
|
||||
Type string `json:"type"`
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &modules); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pooling string
|
||||
for _, m := range modules {
|
||||
switch m.Type {
|
||||
case "sentence_transformers.models.Pooling":
|
||||
pooling = m.Path
|
||||
case "sentence_transformers.models.Normalize":
|
||||
p.normalizeEmbeddings = true
|
||||
}
|
||||
}
|
||||
|
||||
if pooling != "" {
|
||||
bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pc struct {
|
||||
PoolingModeCLSToken bool `json:"pooling_mode_cls_token"`
|
||||
PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &pc); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if pc.PoolingModeMeanTokens {
|
||||
p.PoolingType = 1
|
||||
} else if pc.PoolingModeCLSToken {
|
||||
p.PoolingType = 2
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
|
||||
// Determine architecture based on MoE parameters (following qwen3 pattern)
|
||||
arch := "nomic-bert"
|
||||
if p.MoEEveryNLayers > 0 {
|
||||
arch += "-moe"
|
||||
}
|
||||
|
||||
kv["general.architecture"] = arch
|
||||
kv["attention.causal"] = false
|
||||
kv["pooling_type"] = p.PoolingType
|
||||
kv["normalize_embeddings"] = p.normalizeEmbeddings
|
||||
|
||||
kv["block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers)
|
||||
|
||||
if contextLength := p.MaxPositionEmbeddings; contextLength > 0 {
|
||||
kv["context_length"] = contextLength
|
||||
}
|
||||
|
||||
if embeddingLength := p.HiddenSize; embeddingLength > 0 {
|
||||
kv["embedding_length"] = p.HiddenSize
|
||||
}
|
||||
|
||||
if feedForwardLength := p.IntermediateSize; feedForwardLength > 0 {
|
||||
kv["feed_forward_length"] = p.IntermediateSize
|
||||
}
|
||||
|
||||
if headCount := p.NumAttentionHeads; headCount > 0 {
|
||||
kv["attention.head_count"] = p.NumAttentionHeads
|
||||
}
|
||||
|
||||
if kvHeadCount := p.NumKeyValueHeads; kvHeadCount > 0 {
|
||||
kv["attention.head_count_kv"] = p.NumKeyValueHeads
|
||||
}
|
||||
|
||||
if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon); layerNormEpsilon > 0 {
|
||||
kv["attention.layer_norm_epsilon"] = layerNormEpsilon
|
||||
}
|
||||
|
||||
if p.RopeFreqBase > 0 {
|
||||
kv["rope.freq_base"] = p.RopeFreqBase
|
||||
}
|
||||
|
||||
// MoE specific parameters (only if MoE is enabled)
|
||||
if p.NumExperts > 0 {
|
||||
kv["expert_count"] = p.NumExperts
|
||||
}
|
||||
|
||||
if p.NumExpertsUsed > 0 {
|
||||
kv["expert_used_count"] = p.NumExpertsUsed
|
||||
}
|
||||
|
||||
if p.MoEEveryNLayers > 0 {
|
||||
kv["moe_every_n_layers"] = p.MoEEveryNLayers
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.model"] = "bert"
|
||||
kv["tokenizer.ggml.token_type_count"] = uint32(2)
|
||||
|
||||
// convert to phantom space tokens
|
||||
for i, e := range t.Tokens {
|
||||
switch {
|
||||
case strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]"):
|
||||
// noop - keep special tokens as-is
|
||||
case strings.HasPrefix(e, "##"):
|
||||
t.Tokens[i] = e[2:]
|
||||
default:
|
||||
t.Tokens[i] = "\u2581" + e
|
||||
}
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.tokens"] = t.Tokens
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *nomicbertModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
out := make([]*ggml.Tensor, 0, len(ts))
|
||||
for _, t := range ts {
|
||||
if slices.Contains([]string{
|
||||
"embeddings.position_ids",
|
||||
"pooler.dense.weight",
|
||||
"pooler.dense.bias",
|
||||
}, t.Name()) {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (nomicbertModel) Replacements() []string {
|
||||
return []string{
|
||||
"encoder.layer", "blk",
|
||||
"encoder.layers", "blk",
|
||||
"embeddings.word_embeddings", "token_embd",
|
||||
"embeddings.token_type_embeddings", "token_types",
|
||||
"embeddings.LayerNorm", "token_embd_norm",
|
||||
|
||||
"attention.self.qkv", "attn_qkv",
|
||||
|
||||
"attention.output.dense", "attn_output",
|
||||
"attention.output.LayerNorm", "attn_output_norm",
|
||||
|
||||
"mlp.up", "ffn_up",
|
||||
"mlp.down", "ffn_down",
|
||||
|
||||
"mlp.router", "ffn_gate_inp",
|
||||
"mlp.experts.up", "ffn_up_exps",
|
||||
"mlp.experts.down", "ffn_down_exps",
|
||||
|
||||
"intermediate.dense", "ffn_up",
|
||||
"output.dense", "ffn_down",
|
||||
"output.LayerNorm", "layer_output_norm",
|
||||
}
|
||||
}
|
||||
@@ -44,7 +44,10 @@ func (t tensorBase) Kind() uint32 {
|
||||
t.name == "v.positional_embedding_vlm" ||
|
||||
t.name == "v.tile_position_embd.weight" ||
|
||||
t.name == "v.pre_tile_position_embd.weight" ||
|
||||
t.name == "v.post_tile_position_embd.weight" {
|
||||
t.name == "v.post_tile_position_embd.weight" ||
|
||||
t.name == "s.position_embd" ||
|
||||
strings.HasSuffix(t.name, "rel_pos_h") ||
|
||||
strings.HasSuffix(t.name, "rel_pos_w") {
|
||||
// these tensors are always F32
|
||||
return tensorKindFP32
|
||||
}
|
||||
|
||||
@@ -96,7 +96,10 @@ type safetensor struct {
|
||||
|
||||
func (st safetensor) Kind() uint32 {
|
||||
kind := st.tensorBase.Kind()
|
||||
if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
|
||||
if st.dtype == "BF16" &&
|
||||
!strings.HasPrefix(st.name, "v.") &&
|
||||
!strings.HasPrefix(st.name, "s.") &&
|
||||
kind != tensorKindFP32 {
|
||||
kind = tensorKindBF16
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,12 @@ package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"errors"
|
||||
"io"
|
||||
"iter"
|
||||
"path"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
@@ -94,6 +96,26 @@ func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []
|
||||
return matched
|
||||
})
|
||||
|
||||
slices.SortStableFunc(matched, func(a, b Tensor) int {
|
||||
x := strings.Split(a.Name(), ".")
|
||||
y := strings.Split(b.Name(), ".")
|
||||
if len(x) != len(y) {
|
||||
return cmp.Compare(len(x), len(y))
|
||||
}
|
||||
|
||||
vals := make([]int, len(x))
|
||||
for i := range x {
|
||||
vals[i] = strings.Compare(x[i], y[i])
|
||||
m, err := strconv.ParseInt(x[i], 0, 0)
|
||||
n, err2 := strconv.ParseInt(y[i], 0, 0)
|
||||
if errors.Join(err, err2) == nil {
|
||||
vals[i] = cmp.Compare(m, n)
|
||||
}
|
||||
}
|
||||
|
||||
return cmp.Or(vals...)
|
||||
})
|
||||
|
||||
if len(matched) > 0 {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: merges[i].name,
|
||||
|
||||
@@ -3,8 +3,10 @@ package convert
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"iter"
|
||||
"math/rand/v2"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -951,3 +953,45 @@ func TestMerge(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestMergeOrder(t *testing.T) {
|
||||
for range 8 {
|
||||
t.Run("", func(t *testing.T) {
|
||||
tensors := make([]Tensor, 16)
|
||||
for i := range tensors {
|
||||
tensors[i] = &fakeTensor{
|
||||
name: fmt.Sprintf("layer.%d.weight", i),
|
||||
shape: []uint64{1},
|
||||
data: []float32{float32(i)},
|
||||
}
|
||||
}
|
||||
|
||||
rand.Shuffle(len(tensors), func(i, j int) {
|
||||
tensors[i], tensors[j] = tensors[j], tensors[i]
|
||||
})
|
||||
|
||||
matched, unmatched := mergeTensors(tensors, merge{"layer.*.weight", "layer.weight"})
|
||||
if len(unmatched) != 0 {
|
||||
t.Error("expected no remaining tensors, got", len(unmatched))
|
||||
}
|
||||
|
||||
if len(matched) != 1 {
|
||||
t.Error("expected 1 merged tensor, got", len(matched))
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := matched[0].WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var f32s [16]float32
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.IsSorted(f32s[:]) {
|
||||
t.Errorf("merged tensor data is not in order: %+v", f32s)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package discover
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
@@ -10,12 +11,21 @@ import (
|
||||
"reflect"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
func GetCPUMem() (memInfo, error) {
|
||||
mem, err := getCPUMem()
|
||||
if err != nil {
|
||||
return memInfo{}, err
|
||||
}
|
||||
return getCPUMemByCgroups(mem), nil
|
||||
}
|
||||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
var mem memInfo
|
||||
var total, available, free, buffers, cached, freeSwap uint64
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
@@ -56,6 +66,32 @@ func GetCPUMem() (memInfo, error) {
|
||||
return mem, nil
|
||||
}
|
||||
|
||||
func getCPUMemByCgroups(mem memInfo) memInfo {
|
||||
total, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.max")
|
||||
if err == nil {
|
||||
mem.TotalMemory = total
|
||||
}
|
||||
used, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.current")
|
||||
if err == nil {
|
||||
mem.FreeMemory = mem.TotalMemory - used
|
||||
}
|
||||
return mem
|
||||
}
|
||||
|
||||
func getUint64ValueFromFile(path string) (uint64, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
s := bufio.NewScanner(f)
|
||||
for s.Scan() {
|
||||
line := s.Text()
|
||||
return strconv.ParseUint(line, 10, 64)
|
||||
}
|
||||
return 0, errors.New("empty file content")
|
||||
}
|
||||
|
||||
const CpuInfoFilename = "/proc/cpuinfo"
|
||||
|
||||
type linuxCpuInfo struct {
|
||||
@@ -74,7 +110,41 @@ func GetCPUDetails() []CPU {
|
||||
return nil
|
||||
}
|
||||
defer file.Close()
|
||||
return linuxCPUDetails(file)
|
||||
cpus := linuxCPUDetails(file)
|
||||
return overwriteThreadCountByLinuxCgroups(cpus)
|
||||
}
|
||||
|
||||
func overwriteThreadCountByLinuxCgroups(cpus []CPU) []CPU {
|
||||
file, err := os.Open("/sys/fs/cgroup/cpu.max")
|
||||
if err != nil {
|
||||
return cpus
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if sl := strings.Split(line, " "); len(sl) == 2 {
|
||||
allowdUs, err := strconv.ParseInt(sl[0], 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn("failed to parse CPU allowed micro secs", "error", err)
|
||||
return cpus
|
||||
}
|
||||
unitUs, err := strconv.ParseInt(sl[1], 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn("failed to parse CPU unit micro secs", "error", err)
|
||||
return cpus
|
||||
}
|
||||
|
||||
threads := int(max(allowdUs/unitUs, 1))
|
||||
|
||||
cpu := cpus[0]
|
||||
cpu.CoreCount = threads
|
||||
cpu.ThreadCount = threads
|
||||
return []CPU{cpu}
|
||||
}
|
||||
}
|
||||
return cpus
|
||||
}
|
||||
|
||||
func linuxCPUDetails(file io.Reader) []CPU {
|
||||
|
||||
@@ -65,6 +65,11 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
}
|
||||
|
||||
slog.Info("discovering available GPUs...")
|
||||
detectIncompatibleLibraries()
|
||||
|
||||
// Warn if any user-overrides are set which could lead to incorrect GPU discovery
|
||||
overrideWarnings()
|
||||
|
||||
requested := envconfig.LLMLibrary()
|
||||
jetpack := cudaJetpack()
|
||||
|
||||
@@ -90,10 +95,13 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
var dirs []string
|
||||
if dir != "" {
|
||||
if requested != "" && filepath.Base(dir) != requested {
|
||||
slog.Debug("skipping available library at users request", "requested", requested, "libDir", dir)
|
||||
slog.Debug("skipping available library at user's request", "requested", requested, "libDir", dir)
|
||||
continue
|
||||
} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
|
||||
continue
|
||||
} else if jetpack == "" && strings.Contains(filepath.Base(dir), "cuda_jetpack") {
|
||||
slog.Debug("jetpack not detected (set JETSON_JETPACK or OLLAMA_LLM_LIBRARY to override), skipping", "libDir", dir)
|
||||
continue
|
||||
} else if !envconfig.EnableVulkan() && strings.Contains(filepath.Base(dir), "vulkan") {
|
||||
slog.Info("experimental Vulkan support disabled. To enable, set OLLAMA_VULKAN=1")
|
||||
continue
|
||||
@@ -113,7 +121,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
// In the second pass, we more deeply initialize the GPUs to weed out devices that
|
||||
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
|
||||
// Only devices that need verification are included in this pass
|
||||
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
|
||||
slog.Debug("evaluating which, if any, devices to filter out", "initial_count", len(devices))
|
||||
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
var wg sync.WaitGroup
|
||||
@@ -121,15 +129,25 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
supportedMu := sync.Mutex{}
|
||||
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
|
||||
for i := range devices {
|
||||
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
|
||||
if !devices[i].NeedsInitValidation() {
|
||||
// No need to validate, add to the supported map
|
||||
supportedMu.Lock()
|
||||
if _, ok := supported[devices[i].Library]; !ok {
|
||||
supported[devices[i].Library] = make(map[string]map[string]int)
|
||||
}
|
||||
if _, ok := supported[devices[i].Library][libDir]; !ok {
|
||||
supported[devices[i].Library][libDir] = make(map[string]int)
|
||||
}
|
||||
supported[devices[i].Library][libDir][devices[i].ID] = i
|
||||
supportedMu.Unlock()
|
||||
continue
|
||||
}
|
||||
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
|
||||
slog.Debug("verifying device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
|
||||
slog.Debug("verifying if device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1])
|
||||
extraEnvs := ml.GetVisibleDevicesEnv(devices[i:i+1], true)
|
||||
devices[i].AddInitValidation(extraEnvs)
|
||||
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
|
||||
slog.Debug("filtering device which didn't fully initialize",
|
||||
@@ -315,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
||||
defer cancel()
|
||||
|
||||
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
|
||||
devFilter := ml.GetVisibleDevicesEnv(devices)
|
||||
// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
|
||||
devFilter := ml.GetVisibleDevicesEnv(devices, false)
|
||||
|
||||
for dir := range libDirs {
|
||||
updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
|
||||
@@ -449,3 +468,37 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func overrideWarnings() {
|
||||
anyFound := false
|
||||
m := envconfig.AsMap()
|
||||
for _, k := range []string{
|
||||
"CUDA_VISIBLE_DEVICES",
|
||||
"HIP_VISIBLE_DEVICES",
|
||||
"ROCR_VISIBLE_DEVICES",
|
||||
"GGML_VK_VISIBLE_DEVICES",
|
||||
"GPU_DEVICE_ORDINAL",
|
||||
"HSA_OVERRIDE_GFX_VERSION",
|
||||
} {
|
||||
if e, found := m[k]; found && e.Value != "" {
|
||||
anyFound = true
|
||||
slog.Warn("user overrode visible devices", k, e.Value)
|
||||
}
|
||||
}
|
||||
if anyFound {
|
||||
slog.Warn("if GPUs are not correctly discovered, unset and try again")
|
||||
}
|
||||
}
|
||||
|
||||
func detectIncompatibleLibraries() {
|
||||
if runtime.GOOS != "windows" {
|
||||
return
|
||||
}
|
||||
basePath, err := exec.LookPath("ggml-base.dll")
|
||||
if err != nil || basePath == "" {
|
||||
return
|
||||
}
|
||||
if !strings.HasPrefix(basePath, ml.LibOllamaPath) {
|
||||
slog.Warn("potentially incompatible library detected in PATH", "location", basePath)
|
||||
}
|
||||
}
|
||||
|
||||
10
docs/api.md
10
docs/api.md
@@ -50,7 +50,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
||||
Advanced parameters (optional):
|
||||
|
||||
- `format`: the format to return a response in. Format can be `json` or a JSON schema
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `system`: system message to (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
@@ -507,7 +507,7 @@ The `message` object has the following fields:
|
||||
Advanced parameters (optional):
|
||||
|
||||
- `format`: the format to return a response in. Format can be `json` or a JSON schema.
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
@@ -1189,7 +1189,7 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
|
||||
- `template`: (optional) the prompt template for the model
|
||||
- `license`: (optional) a string or list of strings containing the license or licenses for the model
|
||||
- `system`: (optional) a string containing the system prompt for the model
|
||||
- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.md#valid-parameters-and-values) for a list of parameters)
|
||||
- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.mdx#valid-parameters-and-values) for a list of parameters)
|
||||
- `messages`: (optional) a list of message objects used to create a conversation
|
||||
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `quantize` (optional): quantize a non-quantized (e.g. float16) model
|
||||
@@ -1698,7 +1698,7 @@ Generate embeddings from a model
|
||||
Advanced parameters:
|
||||
|
||||
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
- `dimensions`: number of dimensions for the embedding
|
||||
|
||||
@@ -1817,7 +1817,7 @@ Generate embeddings from a model
|
||||
|
||||
Advanced parameters:
|
||||
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
### Examples
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -13,9 +13,23 @@ Embeddings turn text into numeric vectors you can store in a vector database, se
|
||||
|
||||
## Generate embeddings
|
||||
|
||||
Use `/api/embed` with a single string.
|
||||
|
||||
<Tabs>
|
||||
<Tab title="CLI">
|
||||
Generate embeddings directly from the command line:
|
||||
|
||||
```shell
|
||||
ollama run embeddinggemma "Hello world"
|
||||
```
|
||||
|
||||
You can also pipe text to generate embeddings:
|
||||
|
||||
```shell
|
||||
echo "Hello world" | ollama run embeddinggemma
|
||||
```
|
||||
|
||||
Output is a JSON array.
|
||||
|
||||
</Tab>
|
||||
<Tab title="cURL">
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/embed \
|
||||
|
||||
@@ -15,7 +15,7 @@ Also known as "single-shot" tool calling.
|
||||
```shell
|
||||
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||
"model": "qwen3",
|
||||
"messages": [{"role": "user", "content": "What's the temperature in New York?"}],
|
||||
"messages": [{"role": "user", "content": "What is the temperature in New York?"}],
|
||||
"stream": false,
|
||||
"tools": [
|
||||
{
|
||||
@@ -41,7 +41,7 @@ Also known as "single-shot" tool calling.
|
||||
curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
|
||||
"model": "qwen3",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the temperature in New York?"},
|
||||
{"role": "user", "content": "What is the temperature in New York?"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"tool_calls": [
|
||||
@@ -90,7 +90,7 @@ Also known as "single-shot" tool calling.
|
||||
}
|
||||
return temperatures.get(city, "Unknown")
|
||||
|
||||
messages = [{"role": "user", "content": "What's the temperature in New York?"}]
|
||||
messages = [{"role": "user", "content": "What is the temperature in New York?"}]
|
||||
|
||||
# pass functions directly as tools in the tools list or as a JSON schema
|
||||
response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True)
|
||||
@@ -146,7 +146,7 @@ Also known as "single-shot" tool calling.
|
||||
},
|
||||
]
|
||||
|
||||
const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
|
||||
const messages = [{ role: 'user', content: "What is the temperature in New York?" }]
|
||||
|
||||
const response = await ollama.chat({
|
||||
model: 'qwen3',
|
||||
@@ -609,7 +609,7 @@ def get_temperature(city: str) -> str:
|
||||
return temperatures.get(city, 'Unknown')
|
||||
|
||||
|
||||
messages = [{'role': 'user', 'content': "What's the temperature in New York?"}]
|
||||
messages = [{'role': 'user', 'content': "What is the temperature in New York?"}]
|
||||
|
||||
while True:
|
||||
stream = chat(
|
||||
@@ -684,7 +684,7 @@ const getTemperatureTool = {
|
||||
}
|
||||
|
||||
async function agentLoop() {
|
||||
const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
|
||||
const messages = [{ role: 'user', content: "What is the temperature in New York?" }]
|
||||
|
||||
while (true) {
|
||||
const stream = await ollama.chat({
|
||||
|
||||
@@ -9,15 +9,9 @@ sidebarTitle: Cloud
|
||||
|
||||
Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer.
|
||||
|
||||
Ollama currently supports the following cloud models, with more coming soon:
|
||||
### Supported models
|
||||
|
||||
- `deepseek-v3.1:671b-cloud`
|
||||
- `gpt-oss:20b-cloud`
|
||||
- `gpt-oss:120b-cloud`
|
||||
- `kimi-k2:1t-cloud`
|
||||
- `qwen3-coder:480b-cloud`
|
||||
- `glm-4.6:cloud`
|
||||
- `minimax-m2:cloud`
|
||||
For a list of supported models, see Ollama's [model library](https://ollama.com/search?c=cloud).
|
||||
|
||||
### Running Cloud models
|
||||
|
||||
|
||||
@@ -49,6 +49,8 @@ Install prerequisites:
|
||||
- [Ninja](https://github.com/ninja-build/ninja/releases)
|
||||
- (Optional) NVIDIA GPU support
|
||||
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
|
||||
- (Optional) VULKAN GPU support
|
||||
- [VULKAN SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
|
||||
|
||||
Then, configure and build the project:
|
||||
|
||||
@@ -57,6 +59,17 @@ cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
> Building for Vulkan requires VULKAN_SDK environment variable:
|
||||
>
|
||||
> PowerShell
|
||||
> ```powershell
|
||||
> $env:VULKAN_SDK="C:\VulkanSDK\<version>"
|
||||
> ```
|
||||
> CMD
|
||||
> ```cmd
|
||||
> set VULKAN_SDK=C:\VulkanSDK\<version>
|
||||
> ```
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Building for ROCm requires additional flags:
|
||||
> ```
|
||||
@@ -65,6 +78,7 @@ cmake --build build --config Release
|
||||
> ```
|
||||
|
||||
|
||||
|
||||
Lastly, run Ollama:
|
||||
|
||||
```shell
|
||||
@@ -84,7 +98,9 @@ Install prerequisites:
|
||||
- [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
|
||||
- (Optional) NVIDIA GPU support
|
||||
- [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
|
||||
|
||||
- (Optional) VULKAN GPU support
|
||||
- [VULKAN SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
|
||||
- Or install via package manager: `sudo apt install vulkan-sdk` (Ubuntu/Debian) or `sudo dnf install vulkan-sdk` (Fedora/CentOS)
|
||||
> [!IMPORTANT]
|
||||
> Ensure prerequisites are in `PATH` before running CMake.
|
||||
|
||||
|
||||
13
docs/faq.mdx
13
docs/faq.mdx
@@ -57,8 +57,13 @@ ollama ps
|
||||
```
|
||||
|
||||
<Info>
|
||||
**Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB
|
||||
100% GPU 4 minutes from now ```
|
||||
|
||||
**Output**:
|
||||
|
||||
```
|
||||
NAME ID SIZE PROCESSOR UNTIL
|
||||
llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now
|
||||
```
|
||||
</Info>
|
||||
|
||||
The `Processor` column will show which memory the model was loaded in to:
|
||||
@@ -223,7 +228,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
|
||||
|
||||
## How can I use Ollama in Visual Studio Code?
|
||||
|
||||
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
|
||||
There is already a large collection of plugins available for VS Code as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
|
||||
|
||||
## How do I use Ollama with GPU acceleration in Docker?
|
||||
|
||||
@@ -385,4 +390,4 @@ Ollama for Windows and macOS register as a login item during installation. You
|
||||
- In `Task Manager` go to the `Startup apps` tab, search for `ollama` then click `Disable`
|
||||
|
||||
**MacOS**
|
||||
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
|
||||
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
|
||||
|
||||
@@ -1,34 +1,34 @@
|
||||
---
|
||||
title: VS Code
|
||||
title: VS Code
|
||||
---
|
||||
|
||||
## Install
|
||||
|
||||
Install [VSCode](https://code.visualstudio.com/download).
|
||||
Install [VS Code](https://code.visualstudio.com/download).
|
||||
|
||||
## Usage with Ollama
|
||||
## Usage with Ollama
|
||||
|
||||
1. Open Copilot side bar found in top right window
|
||||
<div style={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<img
|
||||
src="/images/vscode-sidebar.png"
|
||||
alt="VSCode chat Sidebar"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
2. Select the model drowpdown > **Manage models**
|
||||
<div style={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<img
|
||||
src="/images/vscode-models.png"
|
||||
alt="VSCode model picker"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||
<img
|
||||
src="/images/vscode-sidebar.png"
|
||||
alt="VS Code chat Sidebar"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
2. Select the model dropdown > **Manage models**
|
||||
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||
<img
|
||||
src="/images/vscode-models.png"
|
||||
alt="VS Code model picker"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
3. Enter **Ollama** under **Provider Dropdown** and select desired models (e.g `qwen3, qwen3-coder:480b-cloud`)
|
||||
<div style={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<img
|
||||
src="/images/vscode-model-options.png"
|
||||
alt="VSCode model options dropdown"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||
<img
|
||||
src="/images/vscode-model-options.png"
|
||||
alt="VS Code model options dropdown"
|
||||
width="75%"
|
||||
/>
|
||||
</div>
|
||||
|
||||
@@ -149,9 +149,6 @@ PARAMETER <parameter> <parametervalue>
|
||||
|
||||
| Parameter | Description | Value Type | Example Usage |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
|
||||
| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 |
|
||||
| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
|
||||
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
|
||||
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
|
||||
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
|
||||
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
|
||||
|
||||
@@ -111,6 +111,12 @@ components:
|
||||
description: Model keep-alive duration (for example `5m` or `0` to unload immediately)
|
||||
options:
|
||||
$ref: "#/components/schemas/ModelOptions"
|
||||
logprobs:
|
||||
type: boolean
|
||||
description: Whether to return log probabilities of the output tokens
|
||||
top_logprobs:
|
||||
type: integer
|
||||
description: Number of most likely tokens to return at each token position when logprobs are enabled
|
||||
GenerateResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -150,6 +156,11 @@ components:
|
||||
eval_duration:
|
||||
type: integer
|
||||
description: Time spent generating tokens in nanoseconds
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Logprob"
|
||||
description: Log probability information for the generated tokens when logprobs are enabled
|
||||
GenerateStreamEvent:
|
||||
type: object
|
||||
properties:
|
||||
@@ -287,6 +298,12 @@ components:
|
||||
- type: string
|
||||
- type: number
|
||||
description: Model keep-alive duration (for example `5m` or `0` to unload immediately)
|
||||
logprobs:
|
||||
type: boolean
|
||||
description: Whether to return log probabilities of the output tokens
|
||||
top_logprobs:
|
||||
type: integer
|
||||
description: Number of most likely tokens to return at each token position when logprobs are enabled
|
||||
ChatResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -344,6 +361,11 @@ components:
|
||||
eval_duration:
|
||||
type: integer
|
||||
description: Time spent generating tokens in nanoseconds
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Logprob"
|
||||
description: Log probability information for the generated tokens when logprobs are enabled
|
||||
ChatStreamEvent:
|
||||
type: object
|
||||
properties:
|
||||
@@ -706,6 +728,41 @@ components:
|
||||
version:
|
||||
type: string
|
||||
description: Version of Ollama
|
||||
TokenLogprob:
|
||||
type: object
|
||||
description: Log probability information for a single token alternative
|
||||
properties:
|
||||
token:
|
||||
type: string
|
||||
description: The text representation of the token
|
||||
logprob:
|
||||
type: number
|
||||
description: The log probability of this token
|
||||
bytes:
|
||||
type: array
|
||||
items:
|
||||
type: integer
|
||||
description: The raw byte representation of the token
|
||||
Logprob:
|
||||
type: object
|
||||
description: Log probability information for a generated token
|
||||
properties:
|
||||
token:
|
||||
type: string
|
||||
description: The text representation of the token
|
||||
logprob:
|
||||
type: number
|
||||
description: The log probability of this token
|
||||
bytes:
|
||||
type: array
|
||||
items:
|
||||
type: integer
|
||||
description: The raw byte representation of the token
|
||||
top_logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TokenLogprob"
|
||||
description: Most likely tokens and their log probabilities at this position
|
||||
ErrorResponse:
|
||||
type: object
|
||||
properties:
|
||||
|
||||
46
docs/tools/extract-examples/README.md
Normal file
46
docs/tools/extract-examples/README.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# extract-examples
|
||||
|
||||
Extracts code examples from MDX files to a temp directory so you can run them.
|
||||
|
||||
## Usage
|
||||
|
||||
```shell
|
||||
go run docs/tools/extract-examples/main.go <mdx-file>
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
```shell
|
||||
go run docs/tools/extract-examples/main.go docs/api/openai-compatibility.mdx
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```
|
||||
Extracting code examples to: /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
|
||||
|
||||
- 01_basic.py
|
||||
- 01_basic.js
|
||||
- 01_basic.sh
|
||||
- 02_responses.py
|
||||
- 02_responses.js
|
||||
- 02_responses.sh
|
||||
- 03_vision.py
|
||||
- 03_vision.js
|
||||
- 03_vision.sh
|
||||
|
||||
Extracted 9 file(s) to /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
|
||||
|
||||
To run examples:
|
||||
|
||||
cd /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
|
||||
npm install # for JS examples
|
||||
|
||||
then run individual files with `node file.js`, `python file.py`, `bash file.sh`
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
- Parses MDX files looking for fenced code blocks with filenames (e.g., ` ```python basic.py `)
|
||||
- Groups examples by their `<CodeGroup>` and prefixes filenames with `01_`, `02_`, etc.
|
||||
- Writes all extracted files to a temp directory
|
||||
137
docs/tools/extract-examples/main.go
Normal file
137
docs/tools/extract-examples/main.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
fmt.Fprintln(os.Stderr, "Usage: go run extract-examples.go <mdx-file>")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
mdxFile := os.Args[1]
|
||||
|
||||
f, err := os.Open(mdxFile)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Create temp directory
|
||||
tempDir, err := os.MkdirTemp("", "mdx-examples-*")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error creating temp dir: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Printf("Extracting code examples to: %s\n\n", tempDir)
|
||||
|
||||
// Patterns
|
||||
codeBlockStart := regexp.MustCompile("^```([a-zA-Z0-9_-]+)\\s+([^\\s]+)$")
|
||||
codeGroupStart := regexp.MustCompile("^<CodeGroup")
|
||||
codeGroupEnd := regexp.MustCompile("^</CodeGroup>")
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
inCodeBlock := false
|
||||
inCodeGroup := false
|
||||
var currentFile string
|
||||
var content strings.Builder
|
||||
count := 0
|
||||
codeGroupNum := 0
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// Track CodeGroup boundaries
|
||||
if codeGroupStart.MatchString(line) {
|
||||
inCodeGroup = true
|
||||
codeGroupNum++
|
||||
continue
|
||||
}
|
||||
if codeGroupEnd.MatchString(line) {
|
||||
inCodeGroup = false
|
||||
continue
|
||||
}
|
||||
|
||||
if inCodeBlock {
|
||||
if line == "```" {
|
||||
// End of code block - write file
|
||||
if currentFile != "" {
|
||||
outPath := filepath.Join(tempDir, currentFile)
|
||||
if err := os.WriteFile(outPath, []byte(content.String()), 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing %s: %v\n", currentFile, err)
|
||||
} else {
|
||||
fmt.Printf(" - %s\n", currentFile)
|
||||
count++
|
||||
}
|
||||
}
|
||||
inCodeBlock = false
|
||||
currentFile = ""
|
||||
content.Reset()
|
||||
} else {
|
||||
content.WriteString(line)
|
||||
content.WriteString("\n")
|
||||
}
|
||||
} else {
|
||||
if matches := codeBlockStart.FindStringSubmatch(line); matches != nil {
|
||||
inCodeBlock = true
|
||||
filename := matches[2]
|
||||
// Prefix with CodeGroup number if inside a CodeGroup
|
||||
if inCodeGroup {
|
||||
currentFile = fmt.Sprintf("%02d_%s", codeGroupNum, filename)
|
||||
} else {
|
||||
currentFile = filename
|
||||
}
|
||||
content.Reset()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Write package.json for JavaScript dependencies
|
||||
packageJSON := `{
|
||||
"name": "mdx-examples",
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"openai": "^4",
|
||||
"ollama": "^0.5"
|
||||
}
|
||||
}
|
||||
`
|
||||
if err := os.WriteFile(filepath.Join(tempDir, "package.json"), []byte(packageJSON), 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing package.json: %v\n", err)
|
||||
}
|
||||
|
||||
// Write pyproject.toml for Python dependencies
|
||||
pyprojectTOML := `[project]
|
||||
name = "mdx-examples"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"openai",
|
||||
"ollama",
|
||||
]
|
||||
`
|
||||
if err := os.WriteFile(filepath.Join(tempDir, "pyproject.toml"), []byte(pyprojectTOML), 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing pyproject.toml: %v\n", err)
|
||||
}
|
||||
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf("Extracted %d file(s) to %s\n", count, tempDir)
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf("To run examples:\n")
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf(" cd %s\n npm install # for JS examples\n", tempDir)
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf("then run individual files with `node file.js`, `python file.py`, `bash file.sh`\n")
|
||||
}
|
||||
@@ -249,6 +249,9 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||
"qwen25vl",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
"deepseekocr",
|
||||
"deepseek2",
|
||||
"nomic-bert",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
|
||||
@@ -828,6 +831,7 @@ func (f GGML) FlashAttention() bool {
|
||||
return slices.Contains([]string{
|
||||
"gemma3",
|
||||
"gptoss", "gpt-oss",
|
||||
"mistral3",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
}, f.KV().String("general.architecture"))
|
||||
|
||||
@@ -305,7 +305,7 @@ func readGGUFV1StringsData(llm *gguf, r io.Reader, a *array[string]) (any, error
|
||||
|
||||
a.values[i] = e
|
||||
} else {
|
||||
discardGGUFString(llm, r)
|
||||
_ = discardGGUFString(llm, r)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -568,7 +568,6 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
||||
// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
|
||||
for _, t := range ts {
|
||||
t := t
|
||||
w := io.NewOffsetWriter(f, offset+int64(t.Offset))
|
||||
g.Go(func() error {
|
||||
_, err := t.WriteTo(w)
|
||||
@@ -598,6 +597,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
||||
|
||||
var err error
|
||||
switch v := v.(type) {
|
||||
case int32:
|
||||
err = writeGGUF(ws, ggufTypeInt32, v)
|
||||
case int64:
|
||||
err = writeGGUF(ws, ggufTypeInt64, v)
|
||||
case uint32, FileType:
|
||||
err = writeGGUF(ws, ggufTypeUint32, v)
|
||||
case uint64:
|
||||
@@ -612,6 +615,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v)
|
||||
case *array[int32]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v.values)
|
||||
case []int64:
|
||||
err = writeGGUFArray(ws, ggufTypeInt64, v)
|
||||
case *array[int64]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt64, v.values)
|
||||
case []uint32:
|
||||
err = writeGGUFArray(ws, ggufTypeUint32, v)
|
||||
case *array[uint32]:
|
||||
|
||||
@@ -42,6 +42,10 @@ func TestWriteGGUF(t *testing.T) {
|
||||
"general.architecture": "test",
|
||||
"general.alignment": uint32(16),
|
||||
"test.key": "value",
|
||||
"test.int32_key": int32(-42),
|
||||
"test.int64_key": int64(-9223372036854775808),
|
||||
"test.int32_array": []int32{-1, 0, 1, 2147483647, -2147483648},
|
||||
"test.int64_array": []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808},
|
||||
"attention.key": "value2",
|
||||
"tokenizer.key": "value3",
|
||||
"adapter.key": "value4",
|
||||
@@ -55,7 +59,7 @@ func TestWriteGGUF(t *testing.T) {
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
ff, err := Decode(r, 0)
|
||||
ff, err := Decode(r, -1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -65,15 +69,19 @@ func TestWriteGGUF(t *testing.T) {
|
||||
"general.alignment": uint32(16),
|
||||
"general.parameter_count": uint64(54),
|
||||
"test.key": "value",
|
||||
"test.int32_key": int32(-42),
|
||||
"test.int64_key": int64(-9223372036854775808),
|
||||
"test.int32_array": &array[int32]{size: 5, values: []int32{-1, 0, 1, 2147483647, -2147483648}},
|
||||
"test.int64_array": &array[int64]{size: 5, values: []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808}},
|
||||
"test.attention.key": "value2",
|
||||
"tokenizer.key": "value3",
|
||||
"adapter.key": "value4",
|
||||
}, ff.KV()); diff != "" {
|
||||
}, ff.KV(), cmp.AllowUnexported(array[int32]{}, array[int64]{})); diff != "" {
|
||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(Tensors{
|
||||
Offset: 800,
|
||||
Offset: 992,
|
||||
items: []*Tensor{
|
||||
{Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
|
||||
{Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
|
||||
|
||||
1
go.mod
1
go.mod
@@ -17,7 +17,6 @@ require (
|
||||
github.com/x448/float16 v0.8.4
|
||||
golang.org/x/sync v0.12.0
|
||||
golang.org/x/sys v0.36.0
|
||||
|
||||
)
|
||||
|
||||
require (
|
||||
|
||||
@@ -388,9 +388,9 @@ func NewFunctionNameMap() *FunctionNameMap {
|
||||
}
|
||||
}
|
||||
|
||||
// Init initializes the handler with tools and optional last message
|
||||
// Init initializes the handler with tools, optional last message, and think value
|
||||
// Implements the Parser interface
|
||||
func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
|
||||
func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
|
||||
// Initialize the harmony parser
|
||||
if h.HarmonyParser == nil {
|
||||
h.HarmonyParser = &HarmonyParser{
|
||||
|
||||
@@ -14,6 +14,23 @@ import (
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func assertBytesMatchToken(t *testing.T, label, token string, ints []int) {
|
||||
t.Helper()
|
||||
|
||||
raw := []byte(token)
|
||||
if len(ints) != len(raw) {
|
||||
t.Errorf("%s expected %d bytes for token %q, got %d (%v)", label, len(raw), token, len(ints), ints)
|
||||
return
|
||||
}
|
||||
|
||||
for i, b := range raw {
|
||||
if ints[i] != int(b) {
|
||||
t.Errorf("%s byte[%d] mismatch for token %q: got %d want %d", label, i, token, ints[i], int(b))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAPIGenerate(t *testing.T) {
|
||||
initialTimeout := 60 * time.Second
|
||||
streamTimeout := 30 * time.Second
|
||||
@@ -466,6 +483,7 @@ func TestAPIGenerateLogprobs(t *testing.T) {
|
||||
if lp.Logprob > 0 {
|
||||
t.Errorf("logprob[%d] has positive logprob %f (should be <= 0)", i, lp.Logprob)
|
||||
}
|
||||
assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d]", i), lp.Token, lp.Bytes)
|
||||
|
||||
// Check top_logprobs if requested
|
||||
if test.topLogprobs > 0 {
|
||||
@@ -482,6 +500,9 @@ func TestAPIGenerateLogprobs(t *testing.T) {
|
||||
t.Errorf("logprob[%d].top_logprobs not sorted: %f < %f", i, lp.TopLogprobs[j-1].Logprob, lp.TopLogprobs[j].Logprob)
|
||||
}
|
||||
}
|
||||
for j, top := range lp.TopLogprobs {
|
||||
assertBytesMatchToken(t, fmt.Sprintf("generate logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
|
||||
}
|
||||
} else if len(lp.TopLogprobs) > 0 {
|
||||
t.Errorf("logprob[%d] has top_logprobs but none were requested", i)
|
||||
}
|
||||
@@ -544,11 +565,15 @@ func TestAPIChatLogprobs(t *testing.T) {
|
||||
if lp.Logprob > 0 {
|
||||
t.Errorf("logprob[%d] has positive logprob %f", i, lp.Logprob)
|
||||
}
|
||||
assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d]", i), lp.Token, lp.Bytes)
|
||||
if len(lp.TopLogprobs) == 0 {
|
||||
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
|
||||
}
|
||||
if len(lp.TopLogprobs) > 3 {
|
||||
t.Errorf("logprob[%d] has %d top_logprobs, expected max 3", i, len(lp.TopLogprobs))
|
||||
}
|
||||
for j, top := range lp.TopLogprobs {
|
||||
assertBytesMatchToken(t, fmt.Sprintf("chat logprob[%d].top[%d]", i, j), top.Token, top.Bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,9 @@ package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"math"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -204,8 +206,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
||||
t.Fatalf("expected %v, got %v (similarity: %f)", expected[0:5], res.Embeddings[0][0:5], sim)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 6 {
|
||||
t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 8 {
|
||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,8 +253,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
t.Fatalf("expected %v, got %v (similarity: %f)", expected[1][0:5], res.Embeddings[1][0:5], sim)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 12 {
|
||||
t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 16 {
|
||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -275,7 +277,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
request api.EmbedRequest
|
||||
check func(*api.EmbedResponse, error)
|
||||
check func(*testing.T, *api.EmbedResponse, error)
|
||||
}{
|
||||
{
|
||||
name: "target truncation",
|
||||
@@ -283,7 +285,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Model: "all-minilm",
|
||||
Input: "why",
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -300,10 +302,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Input: "why is the sky blue?",
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
|
||||
if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
|
||||
t.Errorf("embedding mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
@@ -317,10 +320,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
|
||||
if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
|
||||
t.Errorf("embedding mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
@@ -334,21 +338,21 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input exceeds maximum context length" {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "the input length exceeds the context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "input after truncate error",
|
||||
name: "input after truncate error with context length of 1",
|
||||
request: api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: "why is the sky blue?",
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 1},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input after truncation exceeds maximum context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
@@ -362,7 +366,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 0},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input after truncation exceeds maximum context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
@@ -375,7 +379,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
Input: "why is the sky blue? Why is the sky blue? hi there my",
|
||||
Options: map[string]any{"num_ctx": 16},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -385,7 +389,8 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
|
||||
for _, req := range cases {
|
||||
t.Run(req.name, func(t *testing.T) {
|
||||
req.check(embedTestHelper(ctx, client, t, req.request))
|
||||
resp, err := embedTestHelper(ctx, client, t, req.request)
|
||||
req.check(t, resp, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -409,3 +414,230 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req
|
||||
|
||||
return client.Embed(ctx, &req)
|
||||
}
|
||||
|
||||
func TestEmbedTruncation(t *testing.T) {
|
||||
// Use test deadline if set, otherwise default to 2 minutes
|
||||
timeout := 2 * time.Minute
|
||||
if deadline, ok := t.Deadline(); ok {
|
||||
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
// Check if we're running out of time (reserve 20s for current model)
|
||||
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||
t.Skip("skipping remaining tests to avoid timeout")
|
||||
}
|
||||
|
||||
// Give each model its own budget to account for first-time pulls/loads
|
||||
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
t.Run("truncation batch", func(t *testing.T) {
|
||||
truncTrue := true
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{"short", strings.Repeat("long ", 100), "medium text"},
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 30},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(mctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 3 {
|
||||
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if res.PromptEvalCount > 90 {
|
||||
t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("runner token count accuracy", func(t *testing.T) {
|
||||
baseline := api.EmbedRequest{Model: model, Input: "test"}
|
||||
baseRes, err := embedTestHelper(mctx, client, t, baseline)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
batch := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{"test", "test", "test"},
|
||||
}
|
||||
batchRes, err := embedTestHelper(mctx, client, t, batch)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expectedCount := baseRes.PromptEvalCount * 3
|
||||
if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
|
||||
t.Fatalf("expected ~%d tokens (3 × %d), got %d",
|
||||
expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
|
||||
func TestEmbedLargeInput(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
// Test with progressively larger inputs
|
||||
testCases := []struct {
|
||||
name string
|
||||
inputWords int
|
||||
}{
|
||||
{"medium_input_256_words", 256},
|
||||
{"large_input_512_words", 512},
|
||||
{"very_large_input_800_words", 800},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
words := make([]string, tc.inputWords)
|
||||
for i := range words {
|
||||
words[i] = "word"
|
||||
}
|
||||
input := strings.Join(words, " ")
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: input,
|
||||
KeepAlive: &api.Duration{Duration: 30 * time.Second},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(mctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatalf("embedding failed for %d words: %v", tc.inputWords, err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 1 {
|
||||
t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if len(res.Embeddings[0]) == 0 {
|
||||
t.Fatal("expected non-empty embedding")
|
||||
}
|
||||
|
||||
t.Logf("Successfully embedded %d words (%d tokens)", tc.inputWords, res.PromptEvalCount)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEmbedStatusCode tests that errors from the embedding endpoint
|
||||
// properly preserve their HTTP status codes when returned to the client.
|
||||
// This test specifically checks the error handling path in EmbedHandler
|
||||
// where api.StatusError errors should maintain their original status code.
|
||||
func TestEmbedStatusCode(t *testing.T) {
|
||||
// Use test deadline if set, otherwise default to 2 minutes
|
||||
timeout := 2 * time.Minute
|
||||
if deadline, ok := t.Deadline(); ok {
|
||||
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
// Check if we're running out of time (reserve 20s for current model)
|
||||
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||
t.Skip("skipping remaining tests to avoid timeout")
|
||||
}
|
||||
|
||||
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
// Pull the model if needed
|
||||
if err := PullIfMissing(mctx, client, model); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
t.Run("truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
longInput := strings.Repeat("word ", 100)
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: longInput,
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(mctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when truncate=false with long input")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error (likely 400 Bad Request)
|
||||
// not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
|
||||
// Verify the error message is meaningful
|
||||
if !strings.Contains(err.Error(), "context length") {
|
||||
t.Errorf("expected error message to mention context length, got: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("batch truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{
|
||||
"short input",
|
||||
strings.Repeat("very long input ", 100),
|
||||
"another short input",
|
||||
},
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(mctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when one input exceeds context with truncate=false")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error, not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,9 @@ func TestVisionModels(t *testing.T) {
|
||||
// Qwen 3 VL mixture of experts
|
||||
model: "qwen3-vl:30b",
|
||||
},
|
||||
{
|
||||
model: "ministral-3",
|
||||
},
|
||||
}
|
||||
|
||||
for _, v := range testCases {
|
||||
|
||||
@@ -30,6 +30,7 @@ func TestAPIToolCalling(t *testing.T) {
|
||||
"mistral": 6,
|
||||
"qwen2.5": 6,
|
||||
"qwen2": 6,
|
||||
"ministral-3": 20,
|
||||
"mistral-nemo": 9,
|
||||
"mistral-small": 16,
|
||||
"mixtral:8x22b": 80,
|
||||
|
||||
@@ -38,6 +38,7 @@ var (
|
||||
|
||||
// Note: add newer models at the top of the list to test them first
|
||||
ollamaEngineChatModels = []string{
|
||||
"ministral-3",
|
||||
"qwen3-coder:30b",
|
||||
"gpt-oss:20b",
|
||||
"gemma3n:e2b",
|
||||
@@ -167,6 +168,7 @@ var (
|
||||
"medllama2",
|
||||
"megadolphin",
|
||||
"minicpm-v",
|
||||
"ministral-3",
|
||||
"mistral-large",
|
||||
"mistral-nemo",
|
||||
"mistral-openorca",
|
||||
@@ -270,6 +272,7 @@ var (
|
||||
"mistral",
|
||||
"qwen2.5",
|
||||
"qwen2",
|
||||
"ministral-3",
|
||||
"mistral-nemo",
|
||||
"mistral-small",
|
||||
"mixtral:8x22b",
|
||||
|
||||
@@ -3,7 +3,6 @@ package kvcache
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
@@ -40,18 +39,18 @@ type Causal struct {
|
||||
|
||||
// ** current forward pass **
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
// starting location for data storage for this batch
|
||||
curLoc int
|
||||
|
||||
// size of the current batch
|
||||
curBatchSize int
|
||||
|
||||
// locations for data storage for this batch
|
||||
curLoc ml.Tensor
|
||||
|
||||
// mask of the cache as used by this batch
|
||||
curMask ml.Tensor
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
// locations in the cache that are needed for this batch
|
||||
curCellRange cellRange
|
||||
|
||||
@@ -206,45 +205,47 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
|
||||
c.curPositions = batch.Positions
|
||||
c.opts.Except = nil
|
||||
|
||||
var locs []int32
|
||||
if !reserve {
|
||||
c.updateSlidingWindow()
|
||||
|
||||
var err error
|
||||
c.curLoc, err = c.findStartLoc()
|
||||
if errors.Is(err, ErrKvCacheFull) {
|
||||
c.defrag()
|
||||
c.curLoc, err = c.findStartLoc()
|
||||
}
|
||||
locs, err = c.findLocs()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for i, pos := range batch.Positions {
|
||||
seq := batch.Sequences[i]
|
||||
loc := int(locs[i])
|
||||
|
||||
c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
|
||||
c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}
|
||||
|
||||
seqRange, ok := c.cellRanges[seq]
|
||||
if !ok {
|
||||
seqRange = newRange()
|
||||
}
|
||||
|
||||
seqRange.min = min(seqRange.min, c.curLoc+i)
|
||||
c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
|
||||
seqRange.min = min(seqRange.min, loc)
|
||||
c.curCellRange.min = min(c.curCellRange.min, loc)
|
||||
|
||||
seqRange.max = max(seqRange.max, c.curLoc+i)
|
||||
c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
|
||||
seqRange.max = max(seqRange.max, loc)
|
||||
c.curCellRange.max = max(c.curCellRange.max, loc)
|
||||
|
||||
c.cellRanges[seq] = seqRange
|
||||
}
|
||||
} else {
|
||||
// If we are reserving memory, don't update any of the cache metadata but set the size
|
||||
// to the worst case.
|
||||
c.curLoc = 0
|
||||
locs = make([]int32, c.curBatchSize)
|
||||
for i := range locs {
|
||||
locs[i] = int32(i)
|
||||
}
|
||||
c.curCellRange.min = 0
|
||||
c.curCellRange.max = len(c.cells) - 1
|
||||
}
|
||||
|
||||
c.curLoc = ctx.Input().FromInts(locs, len(locs))
|
||||
c.curMask = c.buildMask(ctx)
|
||||
|
||||
return nil
|
||||
@@ -257,22 +258,20 @@ func newRange() cellRange {
|
||||
}
|
||||
}
|
||||
|
||||
// Find the first contiguous block of at least curBatchSize
|
||||
func (c *Causal) findStartLoc() (int, error) {
|
||||
var start, count int
|
||||
// Returns a slice of locations where each token in the batch should be stored
|
||||
func (c *Causal) findLocs() ([]int32, error) {
|
||||
loc := make([]int32, 0, c.curBatchSize)
|
||||
|
||||
for i := range c.cells {
|
||||
if len(c.cells[i].sequences) == 0 {
|
||||
count++
|
||||
if count >= c.curBatchSize {
|
||||
return start, nil
|
||||
loc = append(loc, int32(i))
|
||||
if len(loc) >= c.curBatchSize {
|
||||
return loc, nil
|
||||
}
|
||||
} else {
|
||||
start = i + 1
|
||||
count = 0
|
||||
}
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
|
||||
return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
|
||||
}
|
||||
|
||||
func (c *Causal) updateSlidingWindow() {
|
||||
@@ -402,145 +401,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||
return maskTensor
|
||||
}
|
||||
|
||||
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
||||
for i, key := range c.keys {
|
||||
if key == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
kHeadDim := key.Dim(0)
|
||||
numKVHeads := key.Dim(1)
|
||||
rowSize := key.Stride(2)
|
||||
|
||||
kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
|
||||
kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)
|
||||
|
||||
value := c.values[i]
|
||||
var vSrcView, vDstView ml.Tensor
|
||||
if c.config.PermutedV {
|
||||
vHeadDim := value.Dim(1)
|
||||
elemSize := value.Stride(0)
|
||||
|
||||
vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
|
||||
vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
|
||||
} else {
|
||||
vHeadDim := value.Dim(0)
|
||||
rowSize := value.Stride(2)
|
||||
|
||||
vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
|
||||
vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
|
||||
}
|
||||
|
||||
ctx.Forward(
|
||||
kSrcView.Copy(ctx, kDstView),
|
||||
vSrcView.Copy(ctx, vDstView),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Causal) defrag() {
|
||||
slog.Debug("defragmenting kv cache")
|
||||
|
||||
// Defrag strategy:
|
||||
// - Search for empty holes at the beginning of the cache,
|
||||
// filling them with active data starting at the end
|
||||
// - If there are contiguous elements that need to be moved,
|
||||
// combine them into a single operation by holding new moves
|
||||
// until we see that the next one is non-contiguous
|
||||
// - Fill up the context with the maximum number of operations it
|
||||
// can hold then compute that and continue with a new context
|
||||
//
|
||||
// We could try to optimize placement by grouping blocks from
|
||||
// the same sequences together but most likely the next forward
|
||||
// pass will disrupt this anyways, so the real world benefit
|
||||
// seems limited as this time.
|
||||
|
||||
ctx := c.backend.NewContext()
|
||||
|
||||
// For every move, 6 tensors are required per layer (2 views and a
|
||||
// copy for each of k and v). We also need to refer to the original
|
||||
// k and v cache tensors - once per layer, not per move.
|
||||
layers := 0
|
||||
for _, key := range c.keys {
|
||||
if key == nil {
|
||||
continue
|
||||
}
|
||||
layers++
|
||||
}
|
||||
|
||||
maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
|
||||
moves := 0
|
||||
|
||||
var pendingSrc, pendingDst, pendingLen int
|
||||
src := len(c.cells) - 1
|
||||
|
||||
for dst := 0; dst < src; dst++ {
|
||||
if len(c.cells[dst].sequences) == 0 {
|
||||
for ; src > dst; src-- {
|
||||
if len(c.cells[src].sequences) != 0 {
|
||||
c.cells[dst] = c.cells[src]
|
||||
c.cells[src] = cacheCell{}
|
||||
|
||||
if pendingLen > 0 {
|
||||
if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
|
||||
pendingSrc = src
|
||||
pendingLen++
|
||||
break
|
||||
} else {
|
||||
c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
|
||||
moves++
|
||||
}
|
||||
}
|
||||
|
||||
pendingSrc = src
|
||||
pendingDst = dst
|
||||
pendingLen = 1
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if moves >= maxMoves {
|
||||
ctx.Compute()
|
||||
ctx.Close()
|
||||
ctx = c.backend.NewContext()
|
||||
|
||||
moves = 0
|
||||
}
|
||||
}
|
||||
|
||||
if pendingLen > 0 {
|
||||
c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
|
||||
moves++
|
||||
}
|
||||
|
||||
if moves > 0 {
|
||||
ctx.Compute()
|
||||
}
|
||||
ctx.Close()
|
||||
|
||||
// Reset range metadata
|
||||
for seq := range c.cellRanges {
|
||||
seqRange := newRange()
|
||||
|
||||
for i, cell := range c.cells {
|
||||
if slices.Contains(cell.sequences, seq) {
|
||||
if i < seqRange.min {
|
||||
seqRange.min = i
|
||||
}
|
||||
if i > seqRange.max {
|
||||
seqRange.max = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.cellRanges[seq] = seqRange
|
||||
}
|
||||
|
||||
c.updateSlidingWindow()
|
||||
}
|
||||
|
||||
func (c *Causal) SetLayer(layer int) {
|
||||
c.curLayer = layer
|
||||
}
|
||||
@@ -625,18 +485,25 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
}
|
||||
}
|
||||
|
||||
rowSize := c.keys[c.curLayer].Stride(2)
|
||||
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, rowSize*c.curLoc, kHeadDim*numKVHeads*batchSize)))
|
||||
key = key.Reshape(ctx, kHeadDim*numKVHeads, batchSize)
|
||||
keyCache := c.keys[c.curLayer]
|
||||
keyCache = keyCache.Reshape(ctx, kHeadDim*numKVHeads, len(c.cells))
|
||||
ctx.Forward(keyCache.SetRows(ctx, key, c.curLoc))
|
||||
|
||||
if c.config.PermutedV {
|
||||
elemSize := c.values[c.curLayer].Stride(0)
|
||||
value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
|
||||
value = value.Permute(ctx, 2, 0, 1, 3)
|
||||
|
||||
value = value.Permute(ctx, 1, 2, 0, 3)
|
||||
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
|
||||
valueCache := c.values[c.curLayer]
|
||||
valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)
|
||||
|
||||
ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
|
||||
} else {
|
||||
rowSize := c.values[c.curLayer].Stride(2)
|
||||
value = value.Reshape(ctx, vHeadDim*numKVHeads, batchSize)
|
||||
valueCache := c.values[c.curLayer]
|
||||
valueCache = valueCache.Reshape(ctx, vHeadDim*numKVHeads, len(c.cells))
|
||||
|
||||
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, rowSize*c.curLoc, vHeadDim*numKVHeads*batchSize)))
|
||||
ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
2
llama/build-info.cpp
generated
vendored
2
llama/build-info.cpp
generated
vendored
@@ -1,4 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = 0;
|
||||
char const *LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
|
||||
char const *LLAMA_COMMIT = "17f7f4baad8b3a716ee139da7bb56ae984e8c0fa";
|
||||
char const *LLAMA_COMPILER = "";
|
||||
char const *LLAMA_BUILD_TARGET = "";
|
||||
|
||||
@@ -22,6 +22,9 @@ include /src/llama.*
|
||||
include /src/llama-*.*
|
||||
include /src/unicode-data.*
|
||||
include /src/unicode.*
|
||||
include /src/models/
|
||||
include /src/models/*.h
|
||||
include /src/models/*.cpp
|
||||
include /vendor/
|
||||
include /vendor/miniaudio/
|
||||
include /vendor/miniaudio/*.h
|
||||
|
||||
174
llama/llama.cpp/common/common.cpp
vendored
174
llama/llama.cpp/common/common.cpp
vendored
@@ -8,6 +8,7 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
@@ -26,7 +27,6 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
@@ -60,6 +60,14 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||
|
||||
common_time_meas::~common_time_meas() {
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
@@ -355,11 +363,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
||||
}
|
||||
|
||||
void common_init() {
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}, NULL);
|
||||
llama_log_set(common_log_default_callback, NULL);
|
||||
|
||||
#ifdef NDEBUG
|
||||
const char * build_type = "";
|
||||
@@ -690,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||
|
||||
// Validate if a filename is safe to use
|
||||
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
||||
bool fs_validate_filename(const std::string & filename) {
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
||||
if (!filename.length()) {
|
||||
// Empty filename invalid
|
||||
return false;
|
||||
@@ -750,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
||||
|| c == 0xFFFD // Replacement Character (UTF-8)
|
||||
|| c == 0xFEFF // Byte Order Mark (BOM)
|
||||
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|
||||
|| c == ':' || c == '*' // Illegal characters
|
||||
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
||||
return false;
|
||||
}
|
||||
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
||||
// Subdirectories not allowed, reject path separators
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
||||
@@ -778,11 +786,29 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
#include <iostream>
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::wstring utf8_to_wstring(const std::string & str) {
|
||||
if (str.empty()) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
|
||||
|
||||
if (size <= 0) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
std::wstring wstr(size, 0);
|
||||
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
|
||||
|
||||
return wstr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// returns true if successful, false otherwise
|
||||
bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
std::wstring wpath = utf8_to_wstring(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
@@ -855,6 +881,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
bool fs_is_directory(const std::string & path) {
|
||||
std::filesystem::path dir(path);
|
||||
return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
|
||||
}
|
||||
|
||||
std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
@@ -889,6 +920,8 @@ std::string fs_get_cache_directory() {
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#elif defined(__EMSCRIPTEN__)
|
||||
GGML_ABORT("not implemented on this platform");
|
||||
#else
|
||||
# error Unknown architecture
|
||||
#endif
|
||||
@@ -908,11 +941,130 @@ std::string fs_get_cache_file(const std::string & filename) {
|
||||
return cache_directory + filename;
|
||||
}
|
||||
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
|
||||
std::vector<common_file_info> files;
|
||||
if (path.empty()) return files;
|
||||
|
||||
std::filesystem::path dir(path);
|
||||
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
|
||||
return files;
|
||||
}
|
||||
|
||||
for (const auto & entry : std::filesystem::directory_iterator(dir)) {
|
||||
try {
|
||||
// Only include regular files (skip directories)
|
||||
const auto & p = entry.path();
|
||||
if (std::filesystem::is_regular_file(p)) {
|
||||
common_file_info info;
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.is_dir = false;
|
||||
try {
|
||||
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
||||
} catch (const std::filesystem::filesystem_error &) {
|
||||
info.size = 0;
|
||||
}
|
||||
files.push_back(std::move(info));
|
||||
} else if (include_directories && std::filesystem::is_directory(p)) {
|
||||
common_file_info info;
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.size = 0; // Directories have no size
|
||||
info.is_dir = true;
|
||||
files.push_back(std::move(info));
|
||||
}
|
||||
} catch (const std::filesystem::filesystem_error &) {
|
||||
// skip entries we cannot inspect
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return files;
|
||||
}
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
bool tty_can_use_colors() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
static inline void common_init_sampler_from_model(
|
||||
const llama_model * model,
|
||||
common_params_sampling & sparams) {
|
||||
|
||||
const uint64_t config = sparams.user_sampling_config;
|
||||
|
||||
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
|
||||
if (config & user_config) return;
|
||||
|
||||
char buf[64] = {0};
|
||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||
char * end = nullptr;
|
||||
int32_t v = strtol(buf, &end, 10);
|
||||
if (end && end != buf) dst = v;
|
||||
}
|
||||
};
|
||||
|
||||
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
|
||||
if (config & user_config) return;
|
||||
|
||||
char buf[128] = {0};
|
||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||
char * end = nullptr;
|
||||
float v = strtof(buf, &end);
|
||||
if (end && end != buf) dst = v;
|
||||
}
|
||||
};
|
||||
|
||||
// Sampling sequence
|
||||
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
|
||||
char buf[512] = {0};
|
||||
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
|
||||
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
|
||||
if (!sampler_names.empty()) {
|
||||
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
|
||||
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
|
||||
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
|
||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
|
||||
}
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
@@ -924,6 +1076,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
return iparams;
|
||||
}
|
||||
|
||||
common_init_sampler_from_model(model, params.sampling);
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
73
llama/llama.cpp/common/common.h
vendored
73
llama/llama.cpp/common/common.h
vendored
@@ -2,17 +2,19 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <cmath>
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
#if defined(_WIN32) && !defined(_WIN32_WINNT)
|
||||
#define _WIN32_WINNT 0x0A00
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@@ -28,7 +30,14 @@
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
|
||||
const int64_t t_start_us;
|
||||
|
||||
int64_t & t_acc;
|
||||
};
|
||||
|
||||
struct common_adapter_lora_info {
|
||||
std::string path;
|
||||
@@ -133,6 +142,22 @@ struct common_grammar_trigger {
|
||||
llama_token token = LLAMA_TOKEN_NULL;
|
||||
};
|
||||
|
||||
enum common_params_sampling_config : uint64_t {
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
|
||||
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
|
||||
};
|
||||
|
||||
|
||||
// sampling parameters
|
||||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
@@ -165,6 +190,8 @@ struct common_params_sampling {
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
|
||||
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
||||
|
||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||
|
||||
|
||||
@@ -198,6 +225,7 @@ struct common_params_model {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
@@ -344,7 +372,7 @@ struct common_params {
|
||||
|
||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
int32_t verbosity = 0;
|
||||
int32_t verbosity = 3; // LOG_LEVEL_INFO
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
bool offline = false;
|
||||
@@ -406,6 +434,8 @@ struct common_params {
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
int image_min_tokens = -1;
|
||||
int image_max_tokens = -1;
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
@@ -451,14 +481,21 @@ struct common_params {
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
|
||||
bool log_json = false;
|
||||
|
||||
std::string slot_save_path;
|
||||
std::string media_path; // path to directory for loading media files
|
||||
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
bool is_pp_shared = false;
|
||||
bool is_tg_separate = false;
|
||||
|
||||
std::vector<int32_t> n_pp;
|
||||
std::vector<int32_t> n_tg;
|
||||
@@ -505,6 +542,10 @@ struct common_params {
|
||||
// return false from callback to abort model loading or true to continue
|
||||
llama_progress_callback load_progress_callback = NULL;
|
||||
void * load_progress_callback_user_data = NULL;
|
||||
|
||||
bool has_speculative() const {
|
||||
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
|
||||
}
|
||||
};
|
||||
|
||||
// call once at the start of a program if it uses libcommon
|
||||
@@ -599,12 +640,28 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
bool fs_validate_filename(const std::string & filename);
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
||||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
bool fs_is_directory(const std::string & path);
|
||||
|
||||
std::string fs_get_cache_directory();
|
||||
std::string fs_get_cache_file(const std::string & filename);
|
||||
|
||||
struct common_file_info {
|
||||
std::string path;
|
||||
std::string name;
|
||||
size_t size = 0; // in bytes
|
||||
bool is_dir = false;
|
||||
};
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
// Auto-detect if colors can be enabled based on terminal and environment
|
||||
bool tty_can_use_colors();
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
|
||||
}
|
||||
|
||||
std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
|
||||
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
|
||||
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
|
||||
std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
|
||||
std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
||||
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
|
||||
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
|
||||
};
|
||||
|
||||
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
@@ -303,6 +303,8 @@ static std::string format_literal(const std::string & literal) {
|
||||
return "\"" + escaped + "\"";
|
||||
}
|
||||
|
||||
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
|
||||
|
||||
class SchemaConverter {
|
||||
private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
@@ -601,7 +603,10 @@ private:
|
||||
}
|
||||
|
||||
std::string _resolve_ref(const std::string & ref) {
|
||||
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
||||
auto it = ref.find('#');
|
||||
std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
|
||||
static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
|
||||
std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
|
||||
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
||||
_refs_being_resolved.insert(ref);
|
||||
json resolved = _refs[ref];
|
||||
@@ -774,11 +779,24 @@ public:
|
||||
std::vector<std::string> tokens = string_split(pointer, "/");
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
std::string sel = tokens[i];
|
||||
if (target.is_null() || !target.contains(sel)) {
|
||||
if (target.is_object() && target.contains(sel)) {
|
||||
target = target[sel];
|
||||
} else if (target.is_array()) {
|
||||
size_t sel_index;
|
||||
try {
|
||||
sel_index = std::stoul(sel);
|
||||
} catch (const std::invalid_argument & e) {
|
||||
sel_index = target.size();
|
||||
}
|
||||
if (sel_index >= target.size()) {
|
||||
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
|
||||
return;
|
||||
}
|
||||
target = target[sel_index];
|
||||
} else {
|
||||
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
|
||||
return;
|
||||
}
|
||||
target = target[sel];
|
||||
}
|
||||
_refs[ref] = target;
|
||||
}
|
||||
@@ -956,7 +974,7 @@ public:
|
||||
|
||||
void check_errors() {
|
||||
if (!_errors.empty()) {
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
}
|
||||
if (!_warnings.empty()) {
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||
|
||||
@@ -18,4 +18,6 @@ struct common_grammar_options {
|
||||
bool dotall = false;
|
||||
};
|
||||
|
||||
std::string gbnf_format_literal(const std::string & literal);
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
|
||||
|
||||
49
llama/llama.cpp/common/log.cpp
vendored
49
llama/llama.cpp/common/log.cpp
vendored
@@ -1,3 +1,4 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <chrono>
|
||||
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
// Auto-detect if colors should be enabled based on terminal and environment
|
||||
static bool common_log_should_use_colors_auto() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
static int64_t t_us() {
|
||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
|
||||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [&]() {
|
||||
// Set default to auto-detect colors
|
||||
log.set_colors(common_log_should_use_colors_auto());
|
||||
log.set_colors(tty_can_use_colors());
|
||||
});
|
||||
|
||||
return &log;
|
||||
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {
|
||||
|
||||
void common_log_set_colors(struct common_log * log, log_colors colors) {
|
||||
if (colors == LOG_COLORS_AUTO) {
|
||||
log->set_colors(common_log_should_use_colors_auto());
|
||||
log->set_colors(tty_can_use_colors());
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -442,3 +419,23 @@ void common_log_set_prefix(struct common_log * log, bool prefix) {
|
||||
void common_log_set_timestamps(struct common_log * log, bool timestamps) {
|
||||
log->set_timestamps(timestamps);
|
||||
}
|
||||
|
||||
static int common_get_verbosity(enum ggml_log_level level) {
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
|
||||
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO;
|
||||
case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN;
|
||||
case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
|
||||
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO
|
||||
case GGML_LOG_LEVEL_NONE:
|
||||
default:
|
||||
return LOG_LEVEL_OUTPUT;
|
||||
}
|
||||
}
|
||||
|
||||
void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
auto verbosity = common_get_verbosity(level);
|
||||
if (verbosity <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}
|
||||
|
||||
33
llama/llama.cpp/common/log.h
vendored
33
llama/llama.cpp/common/log.h
vendored
@@ -21,8 +21,14 @@
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
#define LOG_LEVEL_DEBUG 4
|
||||
#define LOG_LEVEL_INFO 3
|
||||
#define LOG_LEVEL_WARN 2
|
||||
#define LOG_LEVEL_ERROR 1
|
||||
#define LOG_LEVEL_OUTPUT 0 // output data from tools
|
||||
|
||||
#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
|
||||
#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO
|
||||
|
||||
enum log_colors {
|
||||
LOG_COLORS_AUTO = -1,
|
||||
@@ -36,6 +42,8 @@ extern int common_log_verbosity_thold;
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||
|
||||
void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// the common_log uses an internal worker thread to print/write log messages
|
||||
// when the worker thread is paused, incoming log messages are discarded
|
||||
struct common_log;
|
||||
@@ -65,10 +73,11 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
|
||||
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
|
||||
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
|
||||
//
|
||||
// I - info (stdout, V = 0)
|
||||
// W - warning (stderr, V = 0)
|
||||
// E - error (stderr, V = 0)
|
||||
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
||||
// I - info (stdout, V = LOG_DEFAULT_INFO)
|
||||
// W - warning (stderr, V = LOG_DEFAULT_WARN)
|
||||
// E - error (stderr, V = LOG_DEFAULT_ERROR)
|
||||
// O - output (stdout, V = LOG_DEFAULT_OUTPUT)
|
||||
//
|
||||
|
||||
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
|
||||
@@ -93,14 +102,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps); // w
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO
|
||||
|
||||
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
|
||||
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
|
||||
|
||||
68
llama/llama.cpp/common/sampling.cpp
vendored
68
llama/llama.cpp/common/sampling.cpp
vendored
@@ -3,9 +3,10 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
@@ -112,6 +113,13 @@ struct common_sampler {
|
||||
|
||||
llama_token_data_array cur_p;
|
||||
|
||||
void reset() {
|
||||
prev.clear();
|
||||
|
||||
llama_sampler_reset(grmr);
|
||||
llama_sampler_reset(chain);
|
||||
}
|
||||
|
||||
void set_logits(struct llama_context * ctx, int idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
@@ -128,6 +136,12 @@ struct common_sampler {
|
||||
|
||||
cur_p = { cur.data(), cur.size(), -1, false };
|
||||
}
|
||||
|
||||
common_time_meas tm() {
|
||||
return common_time_meas(t_total_us, params.no_perf);
|
||||
}
|
||||
|
||||
mutable int64_t t_total_us = 0;
|
||||
};
|
||||
|
||||
std::string common_params_sampling::print() const {
|
||||
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
||||
}
|
||||
|
||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
llama_sampler_reset(gsmpl->grmr);
|
||||
|
||||
llama_sampler_reset(gsmpl->chain);
|
||||
gsmpl->reset();
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
||||
// TODO: measure grammar performance
|
||||
|
||||
const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
|
||||
|
||||
llama_perf_sampler_data data_smpl;
|
||||
llama_perf_context_data data_ctx;
|
||||
|
||||
memset(&data_smpl, 0, sizeof(data_smpl));
|
||||
memset(&data_ctx, 0, sizeof(data_ctx));
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
auto & data = data_smpl;
|
||||
|
||||
data = llama_perf_sampler(gsmpl->chain);
|
||||
|
||||
// note: the sampling time includes the samplers time + extra time spent in common/sampling
|
||||
LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
|
||||
LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
llama_perf_context_print(ctx);
|
||||
auto & data = data_ctx;
|
||||
|
||||
data = llama_perf_context(ctx);
|
||||
|
||||
const double t_end_ms = 1e-3 * ggml_time_us();
|
||||
|
||||
const double t_total_ms = t_end_ms - data.t_start_ms;
|
||||
const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
|
||||
const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
|
||||
|
||||
LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
|
||||
LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
|
||||
LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
||||
LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
||||
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
|
||||
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
||||
|
||||
llama_memory_breakdown_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
auto * res = &gsmpl->cur_p;
|
||||
|
||||
if (do_sort && !res->sorted) {
|
||||
|
||||
28
llama/llama.cpp/include/llama.h
vendored
28
llama/llama.cpp/include/llama.h
vendored
@@ -83,6 +83,7 @@ extern "C" {
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
|
||||
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
|
||||
LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
|
||||
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
|
||||
};
|
||||
|
||||
@@ -245,6 +246,21 @@ extern "C" {
|
||||
LLAMA_KV_OVERRIDE_TYPE_STR,
|
||||
};
|
||||
|
||||
enum llama_model_meta_key {
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
enum llama_model_kv_override_type tag;
|
||||
|
||||
@@ -460,7 +476,11 @@ extern "C" {
|
||||
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||
LLAMA_API bool llama_supports_rpc (void);
|
||||
|
||||
// NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
|
||||
// In some cases the requested values via llama_context_params may differ from the actual values used by the context
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
|
||||
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
@@ -481,6 +501,7 @@ extern "C" {
|
||||
|
||||
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
||||
@@ -512,6 +533,9 @@ extern "C" {
|
||||
// Get the number of metadata key/value pairs
|
||||
LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
|
||||
|
||||
// Get sampling metadata key name. Returns nullptr if the key is invalid
|
||||
LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
|
||||
|
||||
// Get metadata key name by index
|
||||
LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
@@ -584,7 +608,7 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
@@ -1110,8 +1134,6 @@ extern "C" {
|
||||
// // sample from the logits of the last token in the batch
|
||||
// const llama_token id = llama_sampler_sample(smpl, ctx, -1);
|
||||
//
|
||||
// // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
|
||||
// llama_sampler_accept(smpl, id);
|
||||
// ...
|
||||
// }
|
||||
//
|
||||
|
||||
280
llama/llama.cpp/src/llama-arch.cpp
vendored
280
llama/llama.cpp/src/llama-arch.cpp
vendored
@@ -32,6 +32,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
|
||||
{ LLM_ARCH_QWEN3, "qwen3" },
|
||||
{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
|
||||
{ LLM_ARCH_QWEN3NEXT, "qwen3next" },
|
||||
{ LLM_ARCH_QWEN3VL, "qwen3vl" },
|
||||
{ LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PHIMOE, "phimoe" },
|
||||
@@ -89,6 +92,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
|
||||
{ LLM_ARCH_DOTS1, "dots1" },
|
||||
{ LLM_ARCH_ARCEE, "arcee" },
|
||||
{ LLM_ARCH_AFMOE, "afmoe" },
|
||||
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
||||
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
|
||||
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
|
||||
@@ -104,23 +108,40 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_SEED_OSS, "seed_oss" },
|
||||
{ LLM_ARCH_GROVEMOE, "grovemoe" },
|
||||
{ LLM_ARCH_APERTUS, "apertus" },
|
||||
{ LLM_ARCH_MINIMAX_M2, "minimax-m2" },
|
||||
{ LLM_ARCH_COGVLM, "cogvlm" },
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
|
||||
{ LLM_KV_GENERAL_NAME, "general.name" },
|
||||
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
||||
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
||||
{ LLM_KV_GENERAL_URL, "general.url" },
|
||||
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
||||
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
|
||||
{ LLM_KV_GENERAL_NAME, "general.name" },
|
||||
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
||||
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
||||
{ LLM_KV_GENERAL_URL, "general.url" },
|
||||
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
||||
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
|
||||
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
@@ -146,6 +167,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
|
||||
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
@@ -184,6 +206,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
@@ -329,6 +352,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_AFMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_LLAMA4,
|
||||
{
|
||||
@@ -781,6 +834,77 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN3NEXT,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
|
||||
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
|
||||
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
|
||||
{ LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
|
||||
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
|
||||
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
|
||||
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN3VL,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN3VLMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PHI2,
|
||||
{
|
||||
@@ -2168,7 +2292,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
|
||||
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
}
|
||||
},
|
||||
@@ -2190,7 +2314,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
|
||||
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "token_embd_norm" }, // note: wrong tensor name
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
@@ -2332,6 +2456,110 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MINIMAX_M2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_COGVLM,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
|
||||
{ LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
|
||||
{ LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
|
||||
{ LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
|
||||
{ LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_RND1,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MISTRAL3,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2340,11 +2568,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
},
|
||||
};
|
||||
|
||||
// declare information about the model weight tensors:
|
||||
// - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
|
||||
// - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
|
||||
//
|
||||
// for example, input layers are usually assigned to CPU/host buffer types
|
||||
//
|
||||
// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
|
||||
// assignment of the buffer types and extra overhead during computation
|
||||
// example: https://github.com/ggml-org/llama.cpp/pull/17548
|
||||
//
|
||||
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
@@ -2361,6 +2599,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
@@ -2398,6 +2637,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
@@ -2419,6 +2659,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
|
||||
{LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
|
||||
{LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
|
||||
{LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
|
||||
{LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
@@ -2509,6 +2750,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
|
||||
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
|
||||
// These tensors only exist in the last layer(s) and are treated as output tensors
|
||||
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
@@ -2592,6 +2838,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
|
||||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -2603,6 +2850,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
|
||||
case LLM_ARCH_DREAM:
|
||||
case LLM_ARCH_LLADA:
|
||||
case LLM_ARCH_LLADA_MOE:
|
||||
case LLM_ARCH_RND1:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
31
llama/llama.cpp/src/llama-arch.h
vendored
31
llama/llama.cpp/src/llama-arch.h
vendored
@@ -36,6 +36,9 @@ enum llm_arch {
|
||||
LLM_ARCH_QWEN2VL,
|
||||
LLM_ARCH_QWEN3,
|
||||
LLM_ARCH_QWEN3MOE,
|
||||
LLM_ARCH_QWEN3NEXT,
|
||||
LLM_ARCH_QWEN3VL,
|
||||
LLM_ARCH_QWEN3VLMOE,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PHIMOE,
|
||||
@@ -93,6 +96,7 @@ enum llm_arch {
|
||||
LLM_ARCH_BAILINGMOE2,
|
||||
LLM_ARCH_DOTS1,
|
||||
LLM_ARCH_ARCEE,
|
||||
LLM_ARCH_AFMOE,
|
||||
LLM_ARCH_ERNIE4_5,
|
||||
LLM_ARCH_ERNIE4_5_MOE,
|
||||
LLM_ARCH_HUNYUAN_MOE,
|
||||
@@ -108,6 +112,11 @@ enum llm_arch {
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_GROVEMOE,
|
||||
LLM_ARCH_APERTUS,
|
||||
LLM_ARCH_MINIMAX_M2,
|
||||
LLM_ARCH_COGVLM,
|
||||
LLM_ARCH_RND1,
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -117,6 +126,18 @@ enum llm_kv {
|
||||
LLM_KV_GENERAL_QUANTIZATION_VERSION,
|
||||
LLM_KV_GENERAL_ALIGNMENT,
|
||||
LLM_KV_GENERAL_FILE_TYPE,
|
||||
LLM_KV_GENERAL_SAMPLING_SEQUENCE,
|
||||
LLM_KV_GENERAL_SAMPLING_TOP_K,
|
||||
LLM_KV_GENERAL_SAMPLING_TOP_P,
|
||||
LLM_KV_GENERAL_SAMPLING_MIN_P,
|
||||
LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
|
||||
LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
|
||||
LLM_KV_GENERAL_SAMPLING_TEMP,
|
||||
LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
|
||||
LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
|
||||
LLM_KV_GENERAL_NAME,
|
||||
LLM_KV_GENERAL_AUTHOR,
|
||||
LLM_KV_GENERAL_VERSION,
|
||||
@@ -150,6 +171,7 @@ enum llm_kv {
|
||||
LLM_KV_EXPERTS_PER_GROUP,
|
||||
LLM_KV_MOE_EVERY_N_LAYERS,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_NUM_DEEPSTACK_LAYERS,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
@@ -188,6 +210,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
@@ -308,6 +331,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ATTN_POST_NORM,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_ATTN_SINKS,
|
||||
LLM_TENSOR_ATTN_GATE,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
@@ -357,11 +381,13 @@ enum llm_tensor {
|
||||
LLM_TENSOR_SSM_DT,
|
||||
LLM_TENSOR_SSM_DT_NORM,
|
||||
LLM_TENSOR_SSM_A,
|
||||
LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
|
||||
LLM_TENSOR_SSM_B_NORM,
|
||||
LLM_TENSOR_SSM_C_NORM,
|
||||
LLM_TENSOR_SSM_D,
|
||||
LLM_TENSOR_SSM_NORM,
|
||||
LLM_TENSOR_SSM_OUT,
|
||||
LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
|
||||
LLM_TENSOR_TIME_MIX_W0,
|
||||
LLM_TENSOR_TIME_MIX_W1,
|
||||
LLM_TENSOR_TIME_MIX_W2,
|
||||
@@ -458,6 +484,11 @@ enum llm_tensor {
|
||||
LLM_TENSOR_SHORTCONV_CONV,
|
||||
LLM_TENSOR_SHORTCONV_INPROJ,
|
||||
LLM_TENSOR_SHORTCONV_OUTPROJ,
|
||||
LLM_TENSOR_VISEXP_ATTN_QKV,
|
||||
LLM_TENSOR_VISEXP_ATTN_OUT,
|
||||
LLM_TENSOR_VISEXP_FFN_GATE,
|
||||
LLM_TENSOR_VISEXP_FFN_DOWN,
|
||||
LLM_TENSOR_VISEXP_FFN_UP,
|
||||
LLM_TENSOR_NEXTN_EH_PROJ,
|
||||
LLM_TENSOR_NEXTN_EMBED_TOKENS,
|
||||
LLM_TENSOR_NEXTN_ENORM,
|
||||
|
||||
96
llama/llama.cpp/src/llama-batch.cpp
vendored
96
llama/llama.cpp/src/llama-batch.cpp
vendored
@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
|
||||
/*.n_seq_tokens =*/ (uint32_t) 1,
|
||||
/*.n_seqs =*/ (uint32_t) batch.n_tokens,
|
||||
/*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(),
|
||||
/*.n_pos =*/ n_pos_per_embd,
|
||||
/*.token =*/ batch.token,
|
||||
/*.embd =*/ batch.embd,
|
||||
/*.pos =*/ batch.pos,
|
||||
@@ -251,46 +252,72 @@ bool llama_batch_allocr::init(
|
||||
// consistency checks
|
||||
//
|
||||
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
if (n_pos_per_embd > 1) {
|
||||
// M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
|
||||
|
||||
if (p0 >= 0) {
|
||||
bool ok = true;
|
||||
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
|
||||
|
||||
if (batch.token) {
|
||||
if (p0 >= 0 && p0 >= seq_pos_min(s)) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" for M-RoPE, it is required that the position satisfies: X < Y\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// embedding inputs can have overlapping positions
|
||||
if (p0 >= 0 && p0 > seq_pos_min(s)) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" for M-RoPE, it is required that the position satisfies: X <= Y\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
|
||||
|
||||
if (p0 >= 0) {
|
||||
bool ok = true;
|
||||
|
||||
if (seq_pos_min(s) != p0 + 1) {
|
||||
ok = false;
|
||||
}
|
||||
} else {
|
||||
assert(batch.embd);
|
||||
|
||||
// for embeddings (typically used as vision input), we allow them to have repeating positions
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
|
||||
if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
|
||||
ok = false;
|
||||
if (!ok) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" it is required that the sequence positions remain consecutive: Y = X + 1\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
LLAMA_LOG_ERROR(
|
||||
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
|
||||
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
|
||||
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
|
||||
" it is required that the sequence positions remain consecutive: Y = X + 1\n",
|
||||
__func__, s, s, p0, s, seq_pos_min(s));
|
||||
|
||||
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (memory) {
|
||||
@@ -389,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.n_seqs_unq =*/ n_seqs,
|
||||
/*.n_pos =*/ n_pos_per_embd,
|
||||
|
||||
/*.token =*/ udata->token.data(),
|
||||
/*.embd =*/ nullptr,
|
||||
@@ -655,10 +683,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
|
||||
auto udata = std::make_shared<llama_ubatch::data_t>();
|
||||
|
||||
const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
|
||||
|
||||
const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
|
||||
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
|
||||
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
|
||||
|
||||
udata->token .resize(n_tokens);
|
||||
udata->embd .resize(n_embd_all);
|
||||
@@ -680,8 +706,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
|
||||
}
|
||||
|
||||
for (int j = 0; j < n_pos_cur; ++j) {
|
||||
udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
|
||||
for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
|
||||
// if we are using M-RoPE
|
||||
// if the current batch is text, we need to broadcast the same position across all RoPE sections
|
||||
// otherwise, the input batch is image embeddings, we copy the positions as-is
|
||||
// if we are not using M-RoPE, there is only one position per token (this loop runs only once)
|
||||
size_t src_off = batch.token ? 0 : j*batch.n_tokens;
|
||||
udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
|
||||
}
|
||||
|
||||
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
|
||||
@@ -710,6 +741,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
/*.n_seq_tokens =*/ n_tokens/n_seqs,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
|
||||
/*.n_pos =*/ n_pos_per_embd,
|
||||
|
||||
/*.token =*/ batch.token ? udata->token.data() : nullptr,
|
||||
/*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
|
||||
|
||||
13
llama/llama.cpp/src/llama-batch.h
vendored
13
llama/llama.cpp/src/llama-batch.h
vendored
@@ -17,6 +17,16 @@ struct llama_ubatch {
|
||||
return b_equal_seqs != 0;
|
||||
}
|
||||
|
||||
// typical for M-RoPE cases:
|
||||
// 0 - sequantial position of the tokens/embeddings in the sequence
|
||||
// 1 - y position in the image
|
||||
// 2 - x position in the image
|
||||
// 3 - other
|
||||
bool is_pos_2d() const {
|
||||
// TODO @ngxson : we may need to check for model arch when more models use >1 positions
|
||||
return n_pos >= 3;
|
||||
}
|
||||
|
||||
uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
|
||||
// otherwise address sanitizer complains
|
||||
// TODO: whole_seqs for embeddings?
|
||||
@@ -25,6 +35,7 @@ struct llama_ubatch {
|
||||
uint32_t n_seq_tokens; // tokens per sequence set
|
||||
uint32_t n_seqs; // sequence sets in the ubatch
|
||||
uint32_t n_seqs_unq; // unique sequence ids in the ubatch
|
||||
uint32_t n_pos; // number of position inputs for each token/embedding
|
||||
|
||||
// seq_id_unq: unique sequence ids in the ubatch
|
||||
// seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
|
||||
@@ -33,7 +44,7 @@ struct llama_ubatch {
|
||||
// // size | idx | val
|
||||
llama_token * token; // [n_tokens] | i | id, token
|
||||
float * embd; // [n_embd, n_tokens] | i | embd
|
||||
llama_pos * pos; // [n_tokens] | i | pos
|
||||
llama_pos * pos; // [n_tokens*n_pos] | i | pos
|
||||
int32_t * n_seq_id; // [n_tokens] | i | -
|
||||
llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id
|
||||
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
|
||||
|
||||
32
llama/llama.cpp/src/llama-chat.cpp
vendored
32
llama/llama.cpp/src/llama-chat.cpp
vendored
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
|
||||
};
|
||||
|
||||
llm_chat_template llm_chat_template_from_str(const std::string & name) {
|
||||
@@ -213,6 +214,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_SEED_OSS;
|
||||
} else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GROK_2;
|
||||
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
|
||||
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@@ -813,6 +816,35 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "Assistant:";
|
||||
}
|
||||
}else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
|
||||
// [unused9]系统:xxx[unused10]
|
||||
// [unused9]用户:xxx[unused10]
|
||||
// [unused9]助手:xxx[unused10]
|
||||
// ...
|
||||
for (size_t i = 0; i < chat.size(); ++i) {
|
||||
const auto & msg = chat[i];
|
||||
const std::string & role = msg->role;
|
||||
const std::string & content = msg->content;
|
||||
|
||||
if (i == 0 && role != "system") {
|
||||
ss << "[unused9]系统:[unused10]";
|
||||
}
|
||||
|
||||
if (role == "system") {
|
||||
ss << "[unused9]系统:" << content << "[unused10]";
|
||||
} else if (role == "user") {
|
||||
ss << "[unused9]用户:" << content << "[unused10]";
|
||||
} else if (role == "assistant") {
|
||||
ss << "[unused9]助手:" << content << "[unused10]";
|
||||
} else if (role == "tool") {
|
||||
ss << "[unused9]工具:" << content << "[unused10]";
|
||||
} else if (role == "function") {
|
||||
ss << "[unused9]方法:" << content << "[unused10]";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "[unused9]助手:";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
||||
1
llama/llama.cpp/src/llama-chat.h
vendored
1
llama/llama.cpp/src/llama-chat.h
vendored
@@ -53,6 +53,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||
LLM_CHAT_TEMPLATE_SEED_OSS,
|
||||
LLM_CHAT_TEMPLATE_GROK_2,
|
||||
LLM_CHAT_TEMPLATE_PANGU_EMBED,
|
||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
78
llama/llama.cpp/src/llama-context.cpp
vendored
78
llama/llama.cpp/src/llama-context.cpp
vendored
@@ -1,5 +1,6 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-io.h"
|
||||
@@ -21,6 +22,8 @@ llama_context::llama_context(
|
||||
llama_context_params params) :
|
||||
model(model),
|
||||
balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
|
||||
// TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
|
||||
// may need to be backend-dependent
|
||||
LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
|
||||
|
||||
t_start_us = model.t_start_us;
|
||||
@@ -112,11 +115,28 @@ llama_context::llama_context(
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
|
||||
|
||||
if (cparams.kv_unified) {
|
||||
cparams.n_ctx_seq = cparams.n_ctx;
|
||||
} else {
|
||||
cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
|
||||
|
||||
if (cparams.n_ctx_seq == 0) {
|
||||
throw std::runtime_error("n_ctx_seq == 0");
|
||||
}
|
||||
|
||||
if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
|
||||
cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
|
||||
LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
|
||||
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
|
||||
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
||||
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
||||
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
|
||||
@@ -125,14 +145,14 @@ llama_context::llama_context(
|
||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||
|
||||
if (n_ctx_per_seq < hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
if (cparams.n_ctx_seq < hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||
__func__, cparams.n_ctx_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
if (n_ctx_per_seq > hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
if (cparams.n_ctx_seq > hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||
__func__, cparams.n_ctx_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
@@ -228,7 +248,10 @@ llama_context::llama_context(
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes();
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
@@ -268,9 +291,7 @@ llama_context::llama_context(
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
|
||||
}
|
||||
}
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
|
||||
@@ -282,9 +303,6 @@ llama_context::llama_context(
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
n_outputs = n_seqs;
|
||||
|
||||
@@ -343,7 +361,14 @@ llama_context::llama_context(
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
|
||||
gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
}
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
}
|
||||
|
||||
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
|
||||
@@ -448,8 +473,8 @@ uint32_t llama_context::n_ctx() const {
|
||||
return cparams.n_ctx;
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_ctx_per_seq() const {
|
||||
return cparams.n_ctx / cparams.n_seq_max;
|
||||
uint32_t llama_context::n_ctx_seq() const {
|
||||
return cparams.n_ctx_seq;
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_batch() const {
|
||||
@@ -518,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
|
||||
throw std::runtime_error("failed to initialize memory context");
|
||||
}
|
||||
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
@@ -803,7 +828,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
const int64_t n_vocab = model.vocab.n_tokens();
|
||||
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
@@ -972,7 +997,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
|
||||
const bool output_all = false;
|
||||
|
||||
@@ -1223,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
if (!sorted_output && n_outputs > 1) {
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
@@ -1360,7 +1385,10 @@ void llama_context::output_reorder() {
|
||||
// graph
|
||||
//
|
||||
|
||||
uint32_t llama_context::graph_max_nodes() const {
|
||||
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
}
|
||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
}
|
||||
|
||||
@@ -2129,7 +2157,7 @@ void llama_context::opt_epoch_iter(
|
||||
batch.logits [pos_batch] = true;
|
||||
}
|
||||
|
||||
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
|
||||
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
return;
|
||||
}
|
||||
@@ -2377,6 +2405,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
|
||||
return ctx->n_ctx();
|
||||
}
|
||||
|
||||
uint32_t llama_n_ctx_seq(const llama_context * ctx) {
|
||||
return ctx->n_ctx_seq();
|
||||
}
|
||||
|
||||
uint32_t llama_n_batch(const llama_context * ctx) {
|
||||
return ctx->n_batch();
|
||||
}
|
||||
|
||||
12
llama/llama.cpp/src/llama-context.h
vendored
12
llama/llama.cpp/src/llama-context.h
vendored
@@ -43,11 +43,11 @@ struct llama_context {
|
||||
|
||||
ggml_backend_sched_t get_sched() const;
|
||||
|
||||
uint32_t n_ctx() const;
|
||||
uint32_t n_ctx_per_seq() const;
|
||||
uint32_t n_batch() const;
|
||||
uint32_t n_ubatch() const;
|
||||
uint32_t n_seq_max() const;
|
||||
uint32_t n_ctx() const;
|
||||
uint32_t n_ctx_seq() const;
|
||||
uint32_t n_batch() const;
|
||||
uint32_t n_ubatch() const;
|
||||
uint32_t n_seq_max() const;
|
||||
|
||||
uint32_t n_threads() const;
|
||||
uint32_t n_threads_batch() const;
|
||||
@@ -197,7 +197,7 @@ private:
|
||||
//
|
||||
|
||||
public:
|
||||
uint32_t graph_max_nodes() const;
|
||||
uint32_t graph_max_nodes(uint32_t n_tokens) const;
|
||||
|
||||
// can reuse the llm_graph_result instance of the context (for example to update a memory module)
|
||||
llm_graph_result * get_gf_res_reserve() const;
|
||||
|
||||
1
llama/llama.cpp/src/llama-cparams.h
vendored
1
llama/llama.cpp/src/llama-cparams.h
vendored
@@ -8,6 +8,7 @@
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
uint32_t n_ctx_seq; // context for a single sequence
|
||||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user