Compare commits
33 Commits
v0.5.8-rc3
...
v0.5.8-rc1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1f766c36fb | ||
|
|
484a99e428 | ||
|
|
ec6121c331 | ||
|
|
b86c0a1500 | ||
|
|
7e402ebb8c | ||
|
|
b901a712c6 | ||
|
|
abb8dd57f8 | ||
|
|
a400df48c0 | ||
|
|
6ab4ba4c26 | ||
|
|
e8d4eb3e68 | ||
|
|
ae7e368f75 | ||
|
|
31acd1ebf9 | ||
|
|
9a4757ae66 | ||
|
|
7814019708 | ||
|
|
b698f9a0d8 | ||
|
|
32285a6d19 | ||
|
|
1c198977ec | ||
|
|
330b6c50b0 | ||
|
|
928911bc68 | ||
|
|
5b446cc815 | ||
|
|
451c1596af | ||
|
|
932bded12f | ||
|
|
070ad913ac | ||
|
|
8d8b9f83ae | ||
|
|
f00d359a67 | ||
|
|
291def6adb | ||
|
|
cd3fbf1c49 | ||
|
|
c852b8e021 | ||
|
|
d8932c55e7 | ||
|
|
63f0269f7f | ||
|
|
4759ecae19 | ||
|
|
65b7ecac7b | ||
|
|
f9d2d89135 |
4
.gitattributes
vendored
4
.gitattributes
vendored
@@ -15,6 +15,10 @@ ml/backend/**/*.cu linguist-vendored
|
||||
ml/backend/**/*.cuh linguist-vendored
|
||||
ml/backend/**/*.m linguist-vendored
|
||||
ml/backend/**/*.metal linguist-vendored
|
||||
ml/backend/**/CMakeLists.txt linguist-vendored
|
||||
|
||||
llama/build-info.cpp linguist-generated
|
||||
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
|
||||
|
||||
* text=auto
|
||||
*.go text eol=lf
|
||||
|
||||
8
.github/ISSUE_TEMPLATE/10_bug_report.yml
vendored
8
.github/ISSUE_TEMPLATE/10_bug_report.yml
vendored
@@ -9,6 +9,14 @@ body:
|
||||
description: What happened? What did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
- type: dropdown
|
||||
id: os
|
||||
attributes:
|
||||
|
||||
145
.github/workflows/release.yaml
vendored
145
.github/workflows/release.yaml
vendored
@@ -242,7 +242,7 @@ jobs:
|
||||
dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
|
||||
|
||||
windows-sign:
|
||||
runs-on: windows
|
||||
runs-on: windows-2022
|
||||
environment: release
|
||||
needs: [windows-depends, windows-build]
|
||||
steps:
|
||||
@@ -303,59 +303,104 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.os }}/${{ matrix.arch }}
|
||||
target: ${{ matrix.target }}
|
||||
build-args: |
|
||||
GOFLAGS=${{ env.GOFLAGS }}
|
||||
CGO_CFLAGS=${{ env.CGO_CFLAGS }}
|
||||
CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
|
||||
outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
|
||||
cache-from: type=registry,ref=ollama/ollama:latest
|
||||
cache-to: type=inline
|
||||
- run: |
|
||||
sudo apt-get update && sudo apt-get install pigz
|
||||
docker buildx build --platform $PLATFORM --target ${{ matrix.target }} --build-arg GOFLAGS --build-arg CGO_CFLAGS --build-arg CGO_CXXFLAGS --output type=local,dest=dist/$PLATFORM .
|
||||
|
||||
for COMPONENTS in dist/$PLATFORM/* dist/$PLATFORM/lib/ollama/*; do
|
||||
if [ -d "$COMPONENTS" ]; then
|
||||
case "$COMPONENTS" in
|
||||
*/bin) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in ;;
|
||||
*/lib/ollama) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in;;
|
||||
*/lib/ollama/cuda_v11) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in;;
|
||||
*/lib/ollama/cuda_v12) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in;;
|
||||
*/lib/ollama/cuda_jetpack5) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}-jetpack5.tar.in ;;
|
||||
*/lib/ollama/cuda_jetpack6) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}-jetpack6.tar.in ;;
|
||||
*/lib/ollama/rocm) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}-rocm.tar.in ;;
|
||||
esac
|
||||
fi
|
||||
for COMPONENT in bin/* lib/ollama/*; do
|
||||
case "$COMPONENT" in
|
||||
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
|
||||
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
|
||||
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
|
||||
esac
|
||||
done
|
||||
|
||||
for ARCHIVE in dist/*.tar.in; do tar c -T $ARCHIVE --strip-components 3 | pigz -9cv >${ARCHIVE//.*/}.tgz; done
|
||||
env:
|
||||
PLATFORM: ${{ matrix.os }}/${{ matrix.arch }}
|
||||
working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
|
||||
- run: |
|
||||
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-${{ matrix.os }}-${{ matrix.arch }}
|
||||
name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
|
||||
path: |
|
||||
dist/*.tgz
|
||||
*.tgz
|
||||
|
||||
docker-build:
|
||||
# Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
|
||||
docker-build-push:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- flavor: 'latest=false'
|
||||
platforms: linux/amd64,linux/arm64
|
||||
- os: linux
|
||||
arch: arm64
|
||||
build-args: |
|
||||
CGO_CFLAGS
|
||||
CGO_CXXFLAGS
|
||||
GOFLAGS
|
||||
- flavor: 'latest=false,suffix=-rocm'
|
||||
platforms: linux/amd64
|
||||
- os: linux
|
||||
arch: amd64
|
||||
build-args: |
|
||||
CGO_CFLAGS
|
||||
CGO_CXXFLAGS
|
||||
GOFLAGS
|
||||
- os: linux
|
||||
arch: amd64
|
||||
suffix: '-rocm'
|
||||
build-args: |
|
||||
CGO_CFLAGS
|
||||
CGO_CXXFLAGS
|
||||
GOFLAGS
|
||||
FLAVOR=rocm
|
||||
env:
|
||||
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
|
||||
runs-on: linux
|
||||
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
|
||||
environment: release
|
||||
needs: setup-environment
|
||||
env:
|
||||
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: docker/setup-qemu-action@v2
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- id: build-push
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.os }}/${{ matrix.arch }}
|
||||
build-args: ${{ matrix.build-args }}
|
||||
outputs: type=image,name=ollama/ollama,push-by-digest=true,name-canonical=true,push=true
|
||||
cache-from: type=registry,ref=ollama/ollama:latest
|
||||
cache-to: type=inline
|
||||
- run: |
|
||||
mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
|
||||
echo "${{ steps.build-push.outputs.digest }}" >${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}.txt
|
||||
working-directory: ${{ runner.temp }}
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: digest-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}
|
||||
path: |
|
||||
${{ runner.temp }}/${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}.txt
|
||||
|
||||
# Merge Docker images for the same flavor into a single multi-arch manifest
|
||||
docker-merge-push:
|
||||
strategy:
|
||||
matrix:
|
||||
suffix: ['', '-rocm']
|
||||
runs-on: linux
|
||||
environment: release
|
||||
needs: [docker-build-push]
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
@@ -363,22 +408,23 @@ jobs:
|
||||
- id: metadata
|
||||
uses: docker/metadata-action@v4
|
||||
with:
|
||||
flavor: ${{ matrix.flavor }}
|
||||
flavor: |
|
||||
latest=false
|
||||
suffix=${{ matrix.suffix }}
|
||||
images: |
|
||||
ollama/ollama
|
||||
tags: |
|
||||
type=ref,enable=true,priority=600,prefix=pr-,event=pr
|
||||
type=semver,pattern={{version}}
|
||||
- uses: docker/build-push-action@v6
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: ${{ matrix.build-args }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
cache-from: type=registry,ref=ollama/ollama:latest
|
||||
cache-to: type=inline
|
||||
provenance: false
|
||||
pattern: digest-*
|
||||
path: ${{ runner.temp }}
|
||||
merge-multiple: true
|
||||
- run: |
|
||||
docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf 'ollama/ollama@%s ')
|
||||
docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
|
||||
working-directory: ${{ runner.temp }}
|
||||
|
||||
# Aggregate all the assets and ship a release
|
||||
release:
|
||||
@@ -391,9 +437,6 @@ jobs:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set Version
|
||||
shell: bash
|
||||
run: |
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: dist-darwin
|
||||
@@ -406,14 +449,12 @@ jobs:
|
||||
with:
|
||||
pattern: dist-linux-*
|
||||
path: dist
|
||||
- run: |
|
||||
ls -lh dist/
|
||||
(cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
|
||||
mv sha256sum.txt dist/
|
||||
cat dist/sha256sum.txt
|
||||
merge-multiple: true
|
||||
- run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt
|
||||
working-directory: dist
|
||||
- name: Create or update Release
|
||||
run: |
|
||||
RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
|
||||
RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
|
||||
|
||||
echo "Looking for existing release for ${RELEASE_VERSION}"
|
||||
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName")
|
||||
|
||||
2
.github/workflows/test.yaml
vendored
2
.github/workflows/test.yaml
vendored
@@ -163,5 +163,5 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Verify patches apply cleanly and do not change files
|
||||
run: |
|
||||
make -f Makefile.sync clean checkout sync
|
||||
make -f Makefile.sync clean sync
|
||||
git diff --compact-summary --exit-code
|
||||
|
||||
@@ -29,6 +29,11 @@ if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
|
||||
set(GGML_CPU_ALL_VARIANTS ON)
|
||||
endif()
|
||||
|
||||
if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
|
||||
set(CMAKE_BUILD_RPATH "@loader_path")
|
||||
set(CMAKE_INSTALL_RPATH "@loader_path")
|
||||
endif()
|
||||
|
||||
set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
|
||||
set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama)
|
||||
|
||||
@@ -80,6 +85,11 @@ if(CMAKE_CUDA_COMPILER)
|
||||
)
|
||||
endif()
|
||||
|
||||
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a):xnack[+-]$"
|
||||
CACHE STRING
|
||||
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a):xnack[+-]$\"."
|
||||
)
|
||||
|
||||
check_language(HIP)
|
||||
if(CMAKE_HIP_COMPILER)
|
||||
set(HIP_PLATFORM "amd")
|
||||
@@ -87,15 +97,18 @@ if(CMAKE_HIP_COMPILER)
|
||||
find_package(hip REQUIRED)
|
||||
if(NOT AMDGPU_TARGETS)
|
||||
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$")
|
||||
elseif(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
|
||||
list(FILTER AMDGPU_TARGETS EXCLUDE REGEX ${WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX})
|
||||
endif()
|
||||
|
||||
if(AMDGPU_TARGETS)
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
|
||||
|
||||
set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
|
||||
install(TARGETS ggml-hip
|
||||
RUNTIME_DEPENDENCIES
|
||||
DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
|
||||
PRE_INCLUDE_REGEXES amdhip64 hipblas rocblas amd_comgr hsa_runtime64 rocprofiler-register drm_amdgpu drm numa
|
||||
PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
|
||||
PRE_EXCLUDE_REGEXES ".*"
|
||||
POST_EXCLUDE_REGEXES "system32"
|
||||
RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
|
||||
|
||||
@@ -56,7 +56,7 @@
|
||||
"name": "ROCm 6",
|
||||
"inherits": [ "ROCm" ],
|
||||
"cacheVariables": {
|
||||
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
@@ -15,7 +15,11 @@ help:
|
||||
@echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"
|
||||
|
||||
.PHONY: sync
|
||||
sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
|
||||
sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
|
||||
|
||||
.PHONY: llama/build-info.cpp
|
||||
llama/build-info.cpp: llama/build-info.cpp.in
|
||||
sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
|
||||
|
||||
.PHONY: llama/llama.cpp
|
||||
llama/llama.cpp: llama/vendor/ apply-patches
|
||||
|
||||
53
README.md
53
README.md
@@ -18,7 +18,7 @@ Get up and running with large language models.
|
||||
|
||||
### Linux
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -fsSL https://ollama.com/install.sh | sh
|
||||
```
|
||||
|
||||
@@ -42,7 +42,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
|
||||
|
||||
To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama run llama3.2
|
||||
```
|
||||
|
||||
@@ -54,6 +54,8 @@ Here are some example models that can be downloaded:
|
||||
|
||||
| Model | Parameters | Size | Download |
|
||||
| ------------------ | ---------- | ----- | -------------------------------- |
|
||||
| DeepSeek-R1 | 7B | 4.7GB | `ollama run deepseek-r1` |
|
||||
| DeepSeek-R1 | 671B | 404GB | `ollama run deepseek-r1:671b` |
|
||||
| Llama 3.3 | 70B | 43GB | `ollama run llama3.3` |
|
||||
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
|
||||
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
|
||||
@@ -92,13 +94,13 @@ Ollama supports importing GGUF models in the Modelfile:
|
||||
|
||||
2. Create the model in Ollama
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama create example -f Modelfile
|
||||
```
|
||||
|
||||
3. Run the model
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama run example
|
||||
```
|
||||
|
||||
@@ -110,7 +112,7 @@ See the [guide](docs/import.md) on importing models for more information.
|
||||
|
||||
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.2` model:
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama pull llama3.2
|
||||
```
|
||||
|
||||
@@ -145,13 +147,13 @@ For more information on working with a Modelfile, see the [Modelfile](docs/model
|
||||
|
||||
`ollama create` is used to create a model from a Modelfile.
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama create mymodel -f ./Modelfile
|
||||
```
|
||||
|
||||
### Pull a model
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama pull llama3.2
|
||||
```
|
||||
|
||||
@@ -159,13 +161,13 @@ ollama pull llama3.2
|
||||
|
||||
### Remove a model
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama rm llama3.2
|
||||
```
|
||||
|
||||
### Copy a model
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama cp llama3.2 my-model
|
||||
```
|
||||
|
||||
@@ -184,37 +186,39 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
|
||||
|
||||
```
|
||||
ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
|
||||
The image features a yellow smiley face, which is likely the central focus of the picture.
|
||||
```
|
||||
|
||||
> **Output**: The image features a yellow smiley face, which is likely the central focus of the picture.
|
||||
|
||||
### Pass the prompt as an argument
|
||||
|
||||
```shell
|
||||
ollama run llama3.2 "Summarize this file: $(cat README.md)"
|
||||
```
|
||||
$ ollama run llama3.2 "Summarize this file: $(cat README.md)"
|
||||
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
|
||||
```
|
||||
|
||||
> **Output**: Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
|
||||
|
||||
### Show model information
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama show llama3.2
|
||||
```
|
||||
|
||||
### List models on your computer
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama list
|
||||
```
|
||||
|
||||
### List which models are currently loaded
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama ps
|
||||
```
|
||||
|
||||
### Stop a model which is currently running
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama stop llama3.2
|
||||
```
|
||||
|
||||
@@ -230,13 +234,13 @@ See the [developer guide](https://github.com/ollama/ollama/blob/main/docs/develo
|
||||
|
||||
Next, start the server:
|
||||
|
||||
```
|
||||
```shell
|
||||
./ollama serve
|
||||
```
|
||||
|
||||
Finally, in a separate shell, run a model:
|
||||
|
||||
```
|
||||
```shell
|
||||
./ollama run llama3.2
|
||||
```
|
||||
|
||||
@@ -246,7 +250,7 @@ Ollama has a REST API for running and managing models.
|
||||
|
||||
### Generate a response
|
||||
|
||||
```
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3.2",
|
||||
"prompt":"Why is the sky blue?"
|
||||
@@ -255,7 +259,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||
|
||||
### Chat with a model
|
||||
|
||||
```
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3.2",
|
||||
"messages": [
|
||||
@@ -353,6 +357,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
|
||||
- [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.)
|
||||
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
|
||||
- [chat-ollama](https://github.com/annilq/chat-ollama) (a React Native client for Ollama)
|
||||
- [SpaceLlama](https://github.com/tcsenpai/spacellama) (Firefox and Chrome extension to quickly summarize web pages with ollama in a sidebar)
|
||||
- [YouLama](https://github.com/tcsenpai/youlama) (Webapp to quickly summarize any YouTube video, supporting Invidious as well)
|
||||
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
|
||||
@@ -369,9 +374,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
|
||||
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
|
||||
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
|
||||
- [Ollama Chat WebUI for Docker ](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
|
||||
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
|
||||
- [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
|
||||
- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
|
||||
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
|
||||
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
|
||||
|
||||
### Cloud
|
||||
|
||||
@@ -485,6 +493,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
|
||||
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
|
||||
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
|
||||
|
||||
### Mobile
|
||||
|
||||
@@ -535,6 +544,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
|
||||
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
|
||||
- [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
|
||||
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
|
||||
|
||||
### Supported backends
|
||||
|
||||
@@ -545,3 +555,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
|
||||
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
|
||||
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
|
||||
- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
|
||||
Run the examples in this directory with:
|
||||
|
||||
```
|
||||
```shell
|
||||
go run example_name/main.go
|
||||
```
|
||||
|
||||
## Chat - Chat with a model
|
||||
- [chat/main.go](chat/main.go)
|
||||
|
||||
|
||||
@@ -17,6 +17,6 @@ If you want to build the installer, youll need to install
|
||||
In the top directory of this repo, run the following powershell script
|
||||
to build the ollama CLI, ollama app, and ollama installer.
|
||||
|
||||
```
|
||||
```powershell
|
||||
powershell -ExecutionPolicy Bypass -File .\scripts\build_windows.ps1
|
||||
```
|
||||
|
||||
33
docs/api.md
33
docs/api.md
@@ -31,7 +31,7 @@ Certain endpoints stream responses as JSON objects. Streaming can be disabled by
|
||||
|
||||
## Generate a completion
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/generate
|
||||
```
|
||||
|
||||
@@ -485,7 +485,7 @@ A single JSON object is returned:
|
||||
|
||||
## Generate a chat completion
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/chat
|
||||
```
|
||||
|
||||
@@ -878,6 +878,7 @@ curl http://localhost:11434/api/chat -d '{
|
||||
```
|
||||
|
||||
##### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama3.2",
|
||||
@@ -924,7 +925,7 @@ A single JSON object is returned:
|
||||
|
||||
## Create a Model
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/create
|
||||
```
|
||||
|
||||
@@ -1020,7 +1021,7 @@ curl http://localhost:11434/api/create -d '{
|
||||
|
||||
A stream of JSON objects is returned:
|
||||
|
||||
```
|
||||
```json
|
||||
{"status":"quantizing F16 model to Q4_K_M"}
|
||||
{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
|
||||
{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
|
||||
@@ -1051,7 +1052,7 @@ curl http://localhost:11434/api/create -d '{
|
||||
|
||||
A stream of JSON objects is returned:
|
||||
|
||||
```
|
||||
```json
|
||||
{"status":"parsing GGUF"}
|
||||
{"status":"using existing layer sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"}
|
||||
{"status":"writing manifest"}
|
||||
@@ -1118,7 +1119,7 @@ Return 200 OK if the blob exists, 404 Not Found if it does not.
|
||||
|
||||
## Push a Blob
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/blobs/:digest
|
||||
```
|
||||
|
||||
@@ -1142,7 +1143,7 @@ Return 201 Created if the blob was successfully created, 400 Bad Request if the
|
||||
|
||||
## List Local Models
|
||||
|
||||
```shell
|
||||
```
|
||||
GET /api/tags
|
||||
```
|
||||
|
||||
@@ -1195,7 +1196,7 @@ A single JSON object will be returned.
|
||||
|
||||
## Show Model Information
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/show
|
||||
```
|
||||
|
||||
@@ -1261,7 +1262,7 @@ curl http://localhost:11434/api/show -d '{
|
||||
|
||||
## Copy a Model
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/copy
|
||||
```
|
||||
|
||||
@@ -1284,7 +1285,7 @@ Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't e
|
||||
|
||||
## Delete a Model
|
||||
|
||||
```shell
|
||||
```
|
||||
DELETE /api/delete
|
||||
```
|
||||
|
||||
@@ -1310,7 +1311,7 @@ Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't
|
||||
|
||||
## Pull a Model
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/pull
|
||||
```
|
||||
|
||||
@@ -1382,7 +1383,7 @@ if `stream` is set to false, then the response is a single JSON object:
|
||||
|
||||
## Push a Model
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/push
|
||||
```
|
||||
|
||||
@@ -1447,7 +1448,7 @@ If `stream` is set to `false`, then the response is a single JSON object:
|
||||
|
||||
## Generate Embeddings
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/embed
|
||||
```
|
||||
|
||||
@@ -1515,7 +1516,7 @@ curl http://localhost:11434/api/embed -d '{
|
||||
```
|
||||
|
||||
## List Running Models
|
||||
```shell
|
||||
```
|
||||
GET /api/ps
|
||||
```
|
||||
|
||||
@@ -1562,7 +1563,7 @@ A single JSON object will be returned.
|
||||
|
||||
> Note: this endpoint has been superseded by `/api/embed`
|
||||
|
||||
```shell
|
||||
```
|
||||
POST /api/embeddings
|
||||
```
|
||||
|
||||
@@ -1602,7 +1603,7 @@ curl http://localhost:11434/api/embeddings -d '{
|
||||
|
||||
## Version
|
||||
|
||||
```shell
|
||||
```
|
||||
GET /api/version
|
||||
```
|
||||
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
Install prerequisites:
|
||||
|
||||
- [Go](https://go.dev/doc/install)
|
||||
- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://jmeubank.github.io/tdm-gcc/download/) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.
|
||||
- C/C++ Compiler e.g. Clang on macOS, [TDM-GCC](https://github.com/jmeubank/tdm-gcc/releases/latest) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), GCC/Clang on Linux.
|
||||
|
||||
Then build and run Ollama from the root directory of the repository:
|
||||
|
||||
```
|
||||
```shell
|
||||
go run . serve
|
||||
```
|
||||
|
||||
@@ -23,14 +23,14 @@ Install prerequisites:
|
||||
|
||||
Then, configure and build the project:
|
||||
|
||||
```
|
||||
```shell
|
||||
cmake -B build
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
Lastly, run Ollama:
|
||||
|
||||
```
|
||||
```shell
|
||||
go run . serve
|
||||
```
|
||||
|
||||
@@ -57,14 +57,14 @@ Install prerequisites:
|
||||
|
||||
Then, configure and build the project:
|
||||
|
||||
```
|
||||
```shell
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
Lastly, run Ollama:
|
||||
|
||||
```
|
||||
```shell
|
||||
go run . serve
|
||||
```
|
||||
|
||||
@@ -88,26 +88,26 @@ Install prerequisites:
|
||||
|
||||
Then, configure and build the project:
|
||||
|
||||
```
|
||||
```shell
|
||||
cmake -B build
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
Lastly, run Ollama:
|
||||
|
||||
```
|
||||
```shell
|
||||
go run . serve
|
||||
```
|
||||
|
||||
## Docker
|
||||
|
||||
```
|
||||
```shell
|
||||
docker build .
|
||||
```
|
||||
|
||||
### ROCm
|
||||
|
||||
```
|
||||
```shell
|
||||
docker build --build-arg FLAVOR=rocm .
|
||||
```
|
||||
|
||||
@@ -115,6 +115,17 @@ docker build --build-arg FLAVOR=rocm .
|
||||
|
||||
To run tests, use `go test`:
|
||||
|
||||
```
|
||||
```shell
|
||||
go test ./...
|
||||
```
|
||||
|
||||
## Library detection
|
||||
|
||||
Ollama looks for acceleration libraries in the following paths relative to the `ollama` executable:
|
||||
|
||||
* `./lib/ollama` (Windows)
|
||||
* `../lib/ollama` (Linux)
|
||||
* `.` (macOS)
|
||||
* `build/lib/ollama` (for development)
|
||||
|
||||
If the libraries are not found, Ollama will not run with any acceleration libraries.
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
### CPU only
|
||||
|
||||
```bash
|
||||
```shell
|
||||
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||
```
|
||||
|
||||
@@ -11,42 +11,46 @@ Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-
|
||||
|
||||
#### Install with Apt
|
||||
1. Configure the repository
|
||||
```bash
|
||||
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
|
||||
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
|
||||
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
|
||||
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||
sudo apt-get update
|
||||
```
|
||||
|
||||
```shell
|
||||
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
|
||||
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
|
||||
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
|
||||
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||
sudo apt-get update
|
||||
```
|
||||
|
||||
2. Install the NVIDIA Container Toolkit packages
|
||||
```bash
|
||||
sudo apt-get install -y nvidia-container-toolkit
|
||||
```
|
||||
|
||||
```shell
|
||||
sudo apt-get install -y nvidia-container-toolkit
|
||||
```
|
||||
|
||||
#### Install with Yum or Dnf
|
||||
1. Configure the repository
|
||||
|
||||
```bash
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
|
||||
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
|
||||
```
|
||||
```shell
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
|
||||
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
|
||||
```
|
||||
|
||||
2. Install the NVIDIA Container Toolkit packages
|
||||
|
||||
```bash
|
||||
sudo yum install -y nvidia-container-toolkit
|
||||
```
|
||||
```shell
|
||||
sudo yum install -y nvidia-container-toolkit
|
||||
```
|
||||
|
||||
#### Configure Docker to use Nvidia driver
|
||||
```
|
||||
|
||||
```shell
|
||||
sudo nvidia-ctk runtime configure --runtime=docker
|
||||
sudo systemctl restart docker
|
||||
```
|
||||
|
||||
#### Start the container
|
||||
|
||||
```bash
|
||||
```shell
|
||||
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||
```
|
||||
|
||||
@@ -57,7 +61,7 @@ docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ol
|
||||
|
||||
To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
|
||||
|
||||
```
|
||||
```shell
|
||||
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
|
||||
```
|
||||
|
||||
@@ -65,7 +69,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
|
||||
|
||||
Now you can run a model:
|
||||
|
||||
```
|
||||
```shell
|
||||
docker exec -it ollama ollama run llama3.2
|
||||
```
|
||||
|
||||
|
||||
22
docs/faq.md
22
docs/faq.md
@@ -24,7 +24,7 @@ By default, Ollama uses a context window size of 2048 tokens.
|
||||
|
||||
To change this when using `ollama run`, use `/set parameter`:
|
||||
|
||||
```
|
||||
```shell
|
||||
/set parameter num_ctx 4096
|
||||
```
|
||||
|
||||
@@ -46,10 +46,15 @@ Use the `ollama ps` command to see what models are currently loaded into memory.
|
||||
|
||||
```shell
|
||||
ollama ps
|
||||
NAME ID SIZE PROCESSOR UNTIL
|
||||
llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now
|
||||
```
|
||||
|
||||
> **Output**:
|
||||
>
|
||||
> ```
|
||||
> NAME ID SIZE PROCESSOR UNTIL
|
||||
> llama3:70b bcfb190ca3a7 42 GB 100% GPU 4 minutes from now
|
||||
> ```
|
||||
|
||||
The `Processor` column will show which memory the model was loaded into:
|
||||
* `100% GPU` means the model was loaded entirely into the GPU
|
||||
* `100% CPU` means the model was loaded entirely in system memory
|
||||
@@ -66,7 +71,7 @@ If Ollama is run as a macOS application, environment variables should be set usi
|
||||
1. For each environment variable, call `launchctl setenv`.
|
||||
|
||||
```bash
|
||||
launchctl setenv OLLAMA_HOST "0.0.0.0"
|
||||
launchctl setenv OLLAMA_HOST "0.0.0.0:11434"
|
||||
```
|
||||
|
||||
2. Restart Ollama application.
|
||||
@@ -81,14 +86,14 @@ If Ollama is run as a systemd service, environment variables should be set using
|
||||
|
||||
```ini
|
||||
[Service]
|
||||
Environment="OLLAMA_HOST=0.0.0.0"
|
||||
Environment="OLLAMA_HOST=0.0.0.0:11434"
|
||||
```
|
||||
|
||||
3. Save and exit.
|
||||
|
||||
4. Reload `systemd` and restart Ollama:
|
||||
|
||||
```bash
|
||||
```shell
|
||||
systemctl daemon-reload
|
||||
systemctl restart ollama
|
||||
```
|
||||
@@ -221,16 +226,19 @@ properties.
|
||||
If you are using the API, you can preload a model by sending the Ollama server an empty request. This works with both the `/api/generate` and `/api/chat` API endpoints.
|
||||
|
||||
To preload the mistral model using the generate endpoint, use:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "mistral"}'
|
||||
```
|
||||
|
||||
To use the chat completions endpoint, use:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
|
||||
```
|
||||
|
||||
To preload a model using the CLI, use the command:
|
||||
|
||||
```shell
|
||||
ollama run llama3.2 ""
|
||||
```
|
||||
@@ -250,11 +258,13 @@ If you're using the API, use the `keep_alive` parameter with the `/api/generate`
|
||||
* '0' which will unload the model immediately after generating a response
|
||||
|
||||
For example, to preload a model and leave it in memory use:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": -1}'
|
||||
```
|
||||
|
||||
To unload the model and free up memory use:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": 0}'
|
||||
```
|
||||
|
||||
@@ -20,13 +20,13 @@ Make sure that you use the same base model in the `FROM` command as you used to
|
||||
|
||||
Now run `ollama create` from the directory where the `Modelfile` was created:
|
||||
|
||||
```bash
|
||||
```shell
|
||||
ollama create my-model
|
||||
```
|
||||
|
||||
Lastly, test the model:
|
||||
|
||||
```bash
|
||||
```shell
|
||||
ollama run my-model
|
||||
```
|
||||
|
||||
|
||||
@@ -119,7 +119,7 @@ sudo systemctl status ollama
|
||||
|
||||
To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
|
||||
|
||||
```
|
||||
```shell
|
||||
sudo systemctl edit ollama
|
||||
```
|
||||
|
||||
@@ -186,3 +186,9 @@ sudo rm -r /usr/share/ollama
|
||||
sudo userdel ollama
|
||||
sudo groupdel ollama
|
||||
```
|
||||
|
||||
Remove installed libraries:
|
||||
|
||||
```shell
|
||||
sudo rm -rf /usr/local/lib/ollama
|
||||
```
|
||||
|
||||
@@ -28,7 +28,7 @@ A model file is the blueprint to create and share models with Ollama.
|
||||
|
||||
The format of the `Modelfile`:
|
||||
|
||||
```modelfile
|
||||
```
|
||||
# comment
|
||||
INSTRUCTION arguments
|
||||
```
|
||||
@@ -49,7 +49,7 @@ INSTRUCTION arguments
|
||||
|
||||
An example of a `Modelfile` creating a mario blueprint:
|
||||
|
||||
```modelfile
|
||||
```
|
||||
FROM llama3.2
|
||||
# sets the temperature to 1 [higher is more creative, lower is more coherent]
|
||||
PARAMETER temperature 1
|
||||
@@ -69,24 +69,30 @@ To use this:
|
||||
|
||||
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
|
||||
|
||||
```bash
|
||||
> ollama show --modelfile llama3.2
|
||||
# Modelfile generated by "ollama show"
|
||||
# To build a new Modelfile based on this one, replace the FROM line with:
|
||||
# FROM llama3.2:latest
|
||||
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
|
||||
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
|
||||
```shell
|
||||
ollama show --modelfile llama3.2
|
||||
```
|
||||
|
||||
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
|
||||
> **Output**:
|
||||
>
|
||||
> ```
|
||||
> # Modelfile generated by "ollama show"
|
||||
> # To build a new Modelfile based on this one, replace the FROM line with:
|
||||
> # FROM llama3.2:latest
|
||||
> FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
|
||||
> TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
|
||||
>
|
||||
> {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
|
||||
>
|
||||
> {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
|
||||
>
|
||||
> {{ .Response }}<|eot_id|>"""
|
||||
> PARAMETER stop "<|start_header_id|>"
|
||||
> PARAMETER stop "<|end_header_id|>"
|
||||
> PARAMETER stop "<|eot_id|>"
|
||||
> PARAMETER stop "<|reserved_special_token"
|
||||
> ```
|
||||
|
||||
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
{{ .Response }}<|eot_id|>"""
|
||||
PARAMETER stop "<|start_header_id|>"
|
||||
PARAMETER stop "<|end_header_id|>"
|
||||
PARAMETER stop "<|eot_id|>"
|
||||
PARAMETER stop "<|reserved_special_token"
|
||||
```
|
||||
|
||||
## Instructions
|
||||
|
||||
@@ -94,13 +100,13 @@ To view the Modelfile of a given model, use the `ollama show --modelfile` comman
|
||||
|
||||
The `FROM` instruction defines the base model to use when creating a model.
|
||||
|
||||
```modelfile
|
||||
```
|
||||
FROM <model name>:<tag>
|
||||
```
|
||||
|
||||
#### Build from existing model
|
||||
|
||||
```modelfile
|
||||
```
|
||||
FROM llama3.2
|
||||
```
|
||||
|
||||
@@ -111,7 +117,7 @@ Additional models can be found at:
|
||||
|
||||
#### Build from a Safetensors model
|
||||
|
||||
```modelfile
|
||||
```
|
||||
FROM <model directory>
|
||||
```
|
||||
|
||||
@@ -125,7 +131,7 @@ Currently supported model architectures:
|
||||
|
||||
#### Build from a GGUF file
|
||||
|
||||
```modelfile
|
||||
```
|
||||
FROM ./ollama-model.gguf
|
||||
```
|
||||
|
||||
@@ -136,7 +142,7 @@ The GGUF file location should be specified as an absolute path or relative to th
|
||||
|
||||
The `PARAMETER` instruction defines a parameter that can be set when the model is run.
|
||||
|
||||
```modelfile
|
||||
```
|
||||
PARAMETER <parameter> <parametervalue>
|
||||
```
|
||||
|
||||
@@ -183,7 +189,7 @@ TEMPLATE """{{ if .System }}<|im_start|>system
|
||||
|
||||
The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.
|
||||
|
||||
```modelfile
|
||||
```
|
||||
SYSTEM """<system message>"""
|
||||
```
|
||||
|
||||
@@ -193,7 +199,7 @@ The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply
|
||||
|
||||
#### Safetensor adapter
|
||||
|
||||
```modelfile
|
||||
```
|
||||
ADAPTER <path to safetensor adapter>
|
||||
```
|
||||
|
||||
@@ -204,7 +210,7 @@ Currently supported Safetensor adapters:
|
||||
|
||||
#### GGUF adapter
|
||||
|
||||
```modelfile
|
||||
```
|
||||
ADAPTER ./ollama-lora.gguf
|
||||
```
|
||||
|
||||
@@ -212,7 +218,7 @@ ADAPTER ./ollama-lora.gguf
|
||||
|
||||
The `LICENSE` instruction allows you to specify the legal license under which the model used with this Modelfile is shared or distributed.
|
||||
|
||||
```modelfile
|
||||
```
|
||||
LICENSE """
|
||||
<license text>
|
||||
"""
|
||||
@@ -222,7 +228,7 @@ LICENSE """
|
||||
|
||||
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
|
||||
|
||||
```modelfile
|
||||
```
|
||||
MESSAGE <role> <message>
|
||||
```
|
||||
|
||||
@@ -237,7 +243,7 @@ MESSAGE <role> <message>
|
||||
|
||||
#### Example conversation
|
||||
|
||||
```modelfile
|
||||
```
|
||||
MESSAGE user Is Toronto in Canada?
|
||||
MESSAGE assistant yes
|
||||
MESSAGE user Is Sacramento in Canada?
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# OpenAI compatibility
|
||||
|
||||
> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
|
||||
> [!NOTE]
|
||||
> OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
|
||||
|
||||
Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
|
||||
|
||||
@@ -59,8 +60,10 @@ embeddings = client.embeddings.create(
|
||||
input=["why is the sky blue?", "why is the grass green?"],
|
||||
)
|
||||
```
|
||||
|
||||
#### Structured outputs
|
||||
```py
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from openai import OpenAI
|
||||
|
||||
@@ -144,7 +147,7 @@ const embedding = await openai.embeddings.create({
|
||||
|
||||
### `curl`
|
||||
|
||||
``` shell
|
||||
```shell
|
||||
curl http://localhost:11434/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
@@ -319,7 +322,7 @@ ollama pull llama3.2
|
||||
|
||||
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
|
||||
|
||||
```
|
||||
```shell
|
||||
ollama cp llama3.2 gpt-3.5-turbo
|
||||
```
|
||||
|
||||
@@ -343,7 +346,7 @@ curl http://localhost:11434/v1/chat/completions \
|
||||
|
||||
The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
|
||||
|
||||
```modelfile
|
||||
```
|
||||
FROM <some model>
|
||||
PARAMETER num_ctx <context size>
|
||||
```
|
||||
|
||||
@@ -17,6 +17,7 @@ When you run Ollama in a **container**, the logs go to stdout/stderr in the cont
|
||||
```shell
|
||||
docker logs <container-name>
|
||||
```
|
||||
|
||||
(Use `docker ps` to find the container name)
|
||||
|
||||
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
|
||||
@@ -28,6 +29,7 @@ When you run Ollama on **Windows**, there are a few different locations. You can
|
||||
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
|
||||
|
||||
To enable additional debug logging to help troubleshoot problems, first **Quit the running app from the tray menu**, then run the following in a PowerShell terminal:
|
||||
|
||||
```powershell
|
||||
$env:OLLAMA_DEBUG="1"
|
||||
& "ollama app.exe"
|
||||
@@ -49,12 +51,13 @@ Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
|
||||
|
||||
You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass autodetection. For example, if you have a CUDA card but want to force the CPU LLM library with AVX2 vector support, use:
|
||||
|
||||
```
|
||||
```shell
|
||||
OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
|
||||
```
|
||||
|
||||
You can see what features your CPU has with the following command:
|
||||
```
|
||||
|
||||
```shell
|
||||
cat /proc/cpuinfo| grep flags | head -1
|
||||
```
|
||||
|
||||
@@ -62,8 +65,8 @@ cat /proc/cpuinfo| grep flags | head -1
|
||||
|
||||
If you run into problems on Linux and want to install an older version, or you'd like to try out a pre-release before it's officially released, you can tell the install script which version to install.
|
||||
|
||||
```sh
|
||||
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
|
||||
```shell
|
||||
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.5.7 sh
|
||||
```
|
||||
|
||||
## Linux tmp noexec
|
||||
|
||||
@@ -47,6 +47,7 @@ If Ollama is already running, Quit the tray application and relaunch it from the
|
||||
## API Access
|
||||
|
||||
Here's a quick example showing API access from `powershell`
|
||||
|
||||
```powershell
|
||||
(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
|
||||
```
|
||||
|
||||
@@ -40,8 +40,6 @@ func HumanBytes(b int64) string {
|
||||
}
|
||||
|
||||
switch {
|
||||
case value >= 100:
|
||||
return fmt.Sprintf("%d %s", int(value), unit)
|
||||
case value >= 10:
|
||||
return fmt.Sprintf("%d %s", int(value), unit)
|
||||
case value != math.Trunc(value):
|
||||
|
||||
91
format/bytes_test.go
Normal file
91
format/bytes_test.go
Normal file
@@ -0,0 +1,91 @@
|
||||
package format
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHumanBytes(t *testing.T) {
|
||||
type testCase struct {
|
||||
input int64
|
||||
expected string
|
||||
}
|
||||
|
||||
tests := []testCase{
|
||||
// Test bytes (B)
|
||||
{0, "0 B"},
|
||||
{1, "1 B"},
|
||||
{999, "999 B"},
|
||||
|
||||
// Test kilobytes (KB)
|
||||
{1000, "1 KB"},
|
||||
{1500, "1.5 KB"},
|
||||
{999999, "999 KB"},
|
||||
|
||||
// Test megabytes (MB)
|
||||
{1000000, "1 MB"},
|
||||
{1500000, "1.5 MB"},
|
||||
{999999999, "999 MB"},
|
||||
|
||||
// Test gigabytes (GB)
|
||||
{1000000000, "1 GB"},
|
||||
{1500000000, "1.5 GB"},
|
||||
{999999999999, "999 GB"},
|
||||
|
||||
// Test terabytes (TB)
|
||||
{1000000000000, "1 TB"},
|
||||
{1500000000000, "1.5 TB"},
|
||||
{1999999999999, "2.0 TB"},
|
||||
|
||||
// Test fractional values
|
||||
{1234, "1.2 KB"},
|
||||
{1234567, "1.2 MB"},
|
||||
{1234567890, "1.2 GB"},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.expected, func(t *testing.T) {
|
||||
result := HumanBytes(tc.input)
|
||||
if result != tc.expected {
|
||||
t.Errorf("Expected %s, got %s", tc.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestHumanBytes2(t *testing.T) {
|
||||
type testCase struct {
|
||||
input uint64
|
||||
expected string
|
||||
}
|
||||
|
||||
tests := []testCase{
|
||||
// Test bytes (B)
|
||||
{0, "0 B"},
|
||||
{1, "1 B"},
|
||||
{1023, "1023 B"},
|
||||
|
||||
// Test kibibytes (KiB)
|
||||
{1024, "1.0 KiB"},
|
||||
{1536, "1.5 KiB"},
|
||||
{1048575, "1024.0 KiB"},
|
||||
|
||||
// Test mebibytes (MiB)
|
||||
{1048576, "1.0 MiB"},
|
||||
{1572864, "1.5 MiB"},
|
||||
{1073741823, "1024.0 MiB"},
|
||||
|
||||
// Test gibibytes (GiB)
|
||||
{1073741824, "1.0 GiB"},
|
||||
{1610612736, "1.5 GiB"},
|
||||
{2147483648, "2.0 GiB"},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.expected, func(t *testing.T) {
|
||||
result := HumanBytes2(tc.input)
|
||||
if result != tc.expected {
|
||||
t.Errorf("Expected %s, got %s", tc.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@ Ollama vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](h
|
||||
|
||||
If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
|
||||
|
||||
```
|
||||
```shell
|
||||
make -f Makefile.sync apply-patches
|
||||
```
|
||||
|
||||
@@ -22,7 +22,7 @@ When updating to a newer base commit, the existing patches may not apply cleanly
|
||||
|
||||
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
|
||||
|
||||
```
|
||||
```shell
|
||||
make -f Makefile.sync apply-patches
|
||||
```
|
||||
|
||||
@@ -30,7 +30,7 @@ If there are conflicts, you will see an error message. Resolve the conflicts in
|
||||
|
||||
Once all patches are applied, commit the changes to the tracking repository.
|
||||
|
||||
```
|
||||
```shell
|
||||
make -f Makefile.sync format-patches sync
|
||||
```
|
||||
|
||||
@@ -38,13 +38,13 @@ make -f Makefile.sync format-patches sync
|
||||
|
||||
When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
|
||||
|
||||
```
|
||||
```shell
|
||||
make -f Makefile.sync clean apply-patches
|
||||
```
|
||||
|
||||
Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
|
||||
|
||||
```
|
||||
```shell
|
||||
make -f Makefile.sync format-patches
|
||||
```
|
||||
|
||||
|
||||
2
llama/build-info.cpp
generated
vendored
2
llama/build-info.cpp
generated
vendored
@@ -1,4 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = 0;
|
||||
char const *LLAMA_COMMIT = "ba1cb19cdd0d92e012e0f6e009e0620f854b6afd";
|
||||
char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
|
||||
char const *LLAMA_COMPILER = "";
|
||||
char const *LLAMA_BUILD_TARGET = "";
|
||||
|
||||
4
llama/build-info.cpp.in
Normal file
4
llama/build-info.cpp.in
Normal file
@@ -0,0 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = 0;
|
||||
char const *LLAMA_COMMIT = "@FETCH_HEAD@";
|
||||
char const *LLAMA_COMPILER = "";
|
||||
char const *LLAMA_BUILD_TARGET = "";
|
||||
36
llama/llama.cpp/examples/llava/clip.cpp
vendored
36
llama/llama.cpp/examples/llava/clip.cpp
vendored
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_clip->backend = ggml_backend_metal_init();
|
||||
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
new_clip->backend = ggml_backend_cann_init(0);
|
||||
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
new_clip->backend = ggml_backend_vk_init(0);
|
||||
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
new_clip->backend = ggml_backend_sycl_init(0);
|
||||
LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
||||
#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
ggml_backend_t backend = ggml_backend_init_best();
|
||||
if (backend == nullptr) {
|
||||
LOG_ERR("%s: failed to initialize backend\n", __func__);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
new_clip->backend = backend;
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
|
||||
@@ -199,21 +199,25 @@ func (c *Context) KvCacheDefrag() {
|
||||
|
||||
// Get the embeddings for a sequence id
|
||||
func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
|
||||
embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
|
||||
if embeddings == nil {
|
||||
e := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
|
||||
if e == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
|
||||
embeddings := make([]float32, c.Model().NEmbd())
|
||||
_ = copy(embeddings, unsafe.Slice((*float32)(e), c.Model().NEmbd()))
|
||||
return embeddings
|
||||
}
|
||||
|
||||
func (c *Context) GetEmbeddingsIth(i int) []float32 {
|
||||
embeddings := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
|
||||
if embeddings == nil {
|
||||
e := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
|
||||
if e == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
|
||||
embeddings := make([]float32, c.Model().NEmbd())
|
||||
_ = copy(embeddings, unsafe.Slice((*float32)(e), c.Model().NEmbd()))
|
||||
return embeddings
|
||||
}
|
||||
|
||||
type ModelParams struct {
|
||||
|
||||
31
llama/mllama.cpp
vendored
31
llama/mllama.cpp
vendored
@@ -558,30 +558,15 @@ struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1)
|
||||
|
||||
mllama_ctx *new_mllama = new mllama_ctx{};
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
new_mllama->backend = ggml_backend_cuda_init(0);
|
||||
LOG("vision using CUDA backend");
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_mllama->backend = ggml_backend_metal_init();
|
||||
LOG("vision using Metal backend");
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
new_mllama->backend = ggml_backend_cann_init(0);
|
||||
LOG("vision using CANN backend");
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
new_mllama->backend = ggml_backend_vk_init(0);
|
||||
LOG("vision using Vulkan backend");
|
||||
#endif
|
||||
|
||||
if (!new_mllama->backend) {
|
||||
new_mllama->backend = ggml_backend_cpu_init();
|
||||
LOG("vision using CPU backend");
|
||||
ggml_backend_t backend = ggml_backend_init_best();
|
||||
if (backend == nullptr) {
|
||||
LOG("%s: failed to initialize backend\n", __func__);
|
||||
mllama_free(new_mllama);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
new_mllama->backend = backend;
|
||||
|
||||
// load tensors
|
||||
{
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Sat, 4 Jan 2025 22:52:48 -0800
|
||||
Subject: [PATCH] re-enable gpu for clip
|
||||
Subject: [PATCH] use dynamic backend loading for clip
|
||||
|
||||
---
|
||||
examples/llava/clip.cpp | 86 ++++++++++++++++++++---------------------
|
||||
1 file changed, 43 insertions(+), 43 deletions(-)
|
||||
examples/llava/clip.cpp | 74 +++++++++++++++--------------------------
|
||||
1 file changed, 27 insertions(+), 47 deletions(-)
|
||||
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index b3c1829f..718052e1 100644
|
||||
index b3c1829f..86b91d5c 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -8,25 +8,25 @@
|
||||
@@ -56,7 +56,7 @@ index b3c1829f..718052e1 100644
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
@@ -1235,30 +1235,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,30 +84,19 @@ index b3c1829f..718052e1 100644
|
||||
-// new_clip->backend = ggml_backend_sycl_init(0);
|
||||
-// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
||||
-//#endif
|
||||
+#ifdef GGML_USE_CUDA
|
||||
+ new_clip->backend = ggml_backend_cuda_init(0);
|
||||
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||
+#endif
|
||||
+
|
||||
+#ifdef GGML_USE_METAL
|
||||
+ new_clip->backend = ggml_backend_metal_init();
|
||||
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||
+#endif
|
||||
+
|
||||
+#ifdef GGML_USE_CANN
|
||||
+ new_clip->backend = ggml_backend_cann_init(0);
|
||||
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||
+#endif
|
||||
+
|
||||
+#ifdef GGML_USE_VULKAN
|
||||
+ new_clip->backend = ggml_backend_vk_init(0);
|
||||
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||
+#endif
|
||||
+
|
||||
+#ifdef GGML_USE_SYCL
|
||||
+ new_clip->backend = ggml_backend_sycl_init(0);
|
||||
+ LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
||||
+#endif
|
||||
-
|
||||
- if (!new_clip->backend) {
|
||||
- new_clip->backend = ggml_backend_cpu_init();
|
||||
- LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
+ ggml_backend_t backend = ggml_backend_init_best();
|
||||
+ if (backend == nullptr) {
|
||||
+ LOG_ERR("%s: failed to initialize backend\n", __func__);
|
||||
+ clip_free(new_clip);
|
||||
+ gguf_free(ctx);
|
||||
+ return nullptr;
|
||||
}
|
||||
+ LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
+ new_clip->backend = backend;
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
// model size and capabilities
|
||||
{
|
||||
@@ -4,18 +4,18 @@
|
||||
|
||||
A minimal runner for loading a model and running inference via an HTTP web server.
|
||||
|
||||
```
|
||||
```shell
|
||||
./runner -model <model binary>
|
||||
```
|
||||
|
||||
### Completion
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/completion
|
||||
```
|
||||
|
||||
### Embeddings
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X POST -H "Content-Type: application/json" -d '{"prompt": "turn me into an embedding"}' http://localhost:8080/embedding
|
||||
```
|
||||
|
||||
@@ -281,9 +281,14 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
finalParams = append(finalParams, params...)
|
||||
finalParams = append(finalParams, "--port", strconv.Itoa(port))
|
||||
|
||||
pathEnv := "LD_LIBRARY_PATH"
|
||||
if runtime.GOOS == "windows" {
|
||||
var pathEnv string
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
pathEnv = "PATH"
|
||||
case "darwin":
|
||||
pathEnv = "DYLD_LIBRARY_PATH"
|
||||
default:
|
||||
pathEnv = "LD_LIBRARY_PATH"
|
||||
}
|
||||
|
||||
var libraryPaths []string
|
||||
@@ -385,7 +390,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
strings.HasPrefix(ev, "HSA_") ||
|
||||
strings.HasPrefix(ev, "GGML_") ||
|
||||
strings.HasPrefix(ev, "PATH=") ||
|
||||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
|
||||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
|
||||
strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
|
||||
filteredEnv = append(filteredEnv, ev)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,14 +6,14 @@ This app builds upon Ollama to provide a desktop experience for running models.
|
||||
|
||||
First, build the `ollama` binary:
|
||||
|
||||
```
|
||||
```shell
|
||||
cd ..
|
||||
go build .
|
||||
```
|
||||
|
||||
Then run the desktop app with `npm start`:
|
||||
|
||||
```
|
||||
```shell
|
||||
cd macapp
|
||||
npm install
|
||||
npm start
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from glob import glob
|
||||
import os
|
||||
|
||||
TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
|
||||
|
||||
SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-vec-f{vkq_size}.cuh"
|
||||
|
||||
DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
|
||||
"""
|
||||
|
||||
SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-wmma-f16.cuh"
|
||||
|
||||
"""
|
||||
|
||||
SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"
|
||||
|
||||
TYPES_MMQ = [
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
||||
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
|
||||
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
|
||||
]
|
||||
|
||||
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq.cuh"
|
||||
|
||||
DECL_MMQ_CASE({type});
|
||||
"""
|
||||
|
||||
|
||||
def get_short_name(long_quant_name):
|
||||
return long_quant_name.replace("GGML_TYPE_", "").lower()
|
||||
|
||||
|
||||
def get_head_sizes(type_k, type_v):
|
||||
if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
|
||||
return [64, 128, 256]
|
||||
if type_k == "GGML_TYPE_F16":
|
||||
return [64, 128]
|
||||
return [128]
|
||||
|
||||
|
||||
for filename in glob("*.cu"):
|
||||
os.remove(filename)
|
||||
|
||||
for vkq_size in [16, 32]:
|
||||
for type_k in TYPES_KV:
|
||||
for type_v in TYPES_KV:
|
||||
for head_size in get_head_sizes(type_k, type_v):
|
||||
with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
|
||||
|
||||
for kq_acc_t in ["half", "float"]:
|
||||
for cols_per_block in [8, 16, 32]:
|
||||
if kq_acc_t == "float" and cols_per_block == 8:
|
||||
continue
|
||||
|
||||
with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_WMMA_START)
|
||||
|
||||
for head_size in [64, 80, 96, 112, 128, 256]:
|
||||
if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32
|
||||
continue
|
||||
if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance
|
||||
continue
|
||||
f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))
|
||||
|
||||
for type in TYPES_MMQ:
|
||||
with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
|
||||
f.write(SOURCE_MMQ.format(type=type))
|
||||
@@ -41,36 +41,48 @@ func sink(level C.int, text *C.char, _ unsafe.Pointer) {
|
||||
}
|
||||
|
||||
var OnceLoad = sync.OnceFunc(func() {
|
||||
var lib struct{ name, defaultValue string }
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
slog.Warn("failed to get executable path", "error", err)
|
||||
exe = "."
|
||||
}
|
||||
|
||||
// PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
|
||||
// set by the parent process, however, use a default value
|
||||
// if the environment variable is not set.
|
||||
var name, value string
|
||||
switch runtime.GOOS {
|
||||
case "darwin", "linux":
|
||||
lib.name = "LD_LIBRARY_PATH"
|
||||
lib.defaultValue = "/usr/local/lib:/usr/lib"
|
||||
case "darwin":
|
||||
// On macOS, DYLD_LIBRARY_PATH is often not set, so
|
||||
// we use the directory of the executable as the default.
|
||||
name = "DYLD_LIBRARY_PATH"
|
||||
value = filepath.Dir(exe)
|
||||
case "windows":
|
||||
lib.name = "PATH"
|
||||
lib.defaultValue = "."
|
||||
name = "PATH"
|
||||
value = filepath.Join(filepath.Dir(exe), "lib", "ollama")
|
||||
default:
|
||||
return
|
||||
name = "LD_LIBRARY_PATH"
|
||||
value = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
|
||||
}
|
||||
|
||||
paths, ok := os.LookupEnv(lib.name)
|
||||
paths, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
paths = lib.defaultValue
|
||||
}
|
||||
|
||||
if runtime.GOOS == "darwin" {
|
||||
if _, ok := os.LookupEnv("DYLD_LIBRARY_PATH"); !ok {
|
||||
os.Setenv("DYLD_LIBRARY_PATH", paths)
|
||||
}
|
||||
paths = value
|
||||
}
|
||||
|
||||
split := filepath.SplitList(paths)
|
||||
visited := make(map[string]struct{}, len(split))
|
||||
for _, path := range split {
|
||||
abspath, _ := filepath.Abs(path)
|
||||
abspath, err := filepath.Abs(path)
|
||||
if err != nil {
|
||||
slog.Error("failed to get absolute path", "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := visited[abspath]; !ok {
|
||||
func() {
|
||||
cpath := C.CString(path)
|
||||
slog.Debug("ggml backend load all from path", "path", abspath)
|
||||
cpath := C.CString(abspath)
|
||||
defer C.free(unsafe.Pointer(cpath))
|
||||
C.ggml_backend_load_all_from_path(cpath)
|
||||
}()
|
||||
|
||||
@@ -32,9 +32,10 @@ _build_darwin() {
|
||||
status "Building darwin $ARCH dynamic backends"
|
||||
cmake -B build/darwin-$ARCH \
|
||||
-DCMAKE_OSX_ARCHITECTURES=x86_64 \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 \
|
||||
-DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX
|
||||
cmake --build build/darwin-$ARCH --target ggml-cpu -j
|
||||
install build/darwin-$ARCH/lib/ollama/*.{dylib,so} $INSTALL_PREFIX
|
||||
cmake --install build/darwin-$ARCH --component CPU
|
||||
fi
|
||||
done
|
||||
}
|
||||
@@ -43,6 +44,7 @@ _sign_darwin() {
|
||||
status "Creating universal binary..."
|
||||
mkdir -p dist/darwin
|
||||
lipo -create -output dist/darwin/ollama dist/darwin-*/ollama
|
||||
chmod +x dist/darwin/ollama
|
||||
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
for F in dist/darwin/ollama dist/darwin-amd64/lib/ollama/*; do
|
||||
|
||||
@@ -162,8 +162,11 @@ function gatherDependencies() {
|
||||
$depArch=$script:TARGET_ARCH
|
||||
}
|
||||
if ($depArch -eq "x64") {
|
||||
write-host "cp ${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll ${script:DIST_DIR}\lib\ollama\"
|
||||
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\"
|
||||
write-host "cp ${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll ${script:DIST_DIR}\lib\ollama\"
|
||||
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\"
|
||||
write-host "cp ${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll ${script:DIST_DIR}\lib\ollama\"
|
||||
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\"
|
||||
$llvmCrtDir="$env:VCToolsRedistDir\..\..\..\Tools\Llvm\${depArch}\bin"
|
||||
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
|
||||
|
||||
@@ -172,7 +172,10 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
|
||||
if len(b.Parts) > 0 {
|
||||
slog.Info(fmt.Sprintf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -365,7 +368,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
|
||||
lastUpdated := part.lastUpdated
|
||||
part.lastUpdatedMu.Unlock()
|
||||
|
||||
if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second {
|
||||
if !lastUpdated.IsZero() && time.Since(lastUpdated) > 30*time.Second {
|
||||
const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
|
||||
slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
|
||||
// reset last updated
|
||||
|
||||
@@ -108,7 +108,9 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *reg
|
||||
offset += size
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
|
||||
if len(b.Parts) > 0 {
|
||||
slog.Info(fmt.Sprintf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
|
||||
}
|
||||
|
||||
requestURL, err = url.Parse(location)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user