Compare commits
2 commits: native...bmizerany/

| Author | SHA1       | Date |
| ------ | ---------- | ---- |
|        | cfd4152eb6 |      |
|        | 0fbb379373 |      |
.github/ISSUE_TEMPLATE/10_bug_report.yml (60 changes, deleted)
@@ -1,60 +0,0 @@
-name: Bug report
-labels: [bug]
-description: Something isn't working right.
-body:
-  - type: textarea
-    id: description
-    attributes:
-      label: What is the issue?
-      description: What happened? What did you expect to happen?
-    validations:
-      required: true
-  - type: dropdown
-    id: os
-    attributes:
-      label: OS
-      description: Which operating system are you using?
-      multiple: true
-      options:
-        - Linux
-        - macOS
-        - Windows
-        - Docker
-        - WSL2
-    validations:
-      required: false
-  - type: dropdown
-    id: gpu
-    attributes:
-      label: GPU
-      description: Which GPU are you using?
-      multiple: true
-      options:
-        - Nvidia
-        - AMD
-        - Intel
-        - Apple
-        - Other
-    validations:
-      required: false
-  - type: dropdown
-    id: cpu
-    attributes:
-      label: CPU
-      description: Which CPU are you using?
-      multiple: true
-      options:
-        - Intel
-        - AMD
-        - Apple
-        - Other
-    validations:
-      required: false
-  - type: input
-    id: version
-    attributes:
-      label: Ollama version
-      description: What version of Ollama are you using? (`ollama --version`)
-      placeholder: e.g., 0.1.32
-    validations:
-      required: false
.github/ISSUE_TEMPLATE/10_model_request.yml (18 changes, new file)
@@ -0,0 +1,18 @@
+name: Model request
+description: Request a new model for the library
+labels: [mr]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Please check if your Model request is [already available](https://ollama.com/search) or that you cannot [import it](https://github.com/ollama/ollama/blob/main/docs/import.md#import-a-model) yourself.
+        Tell us about which Model you'd like to see in the library!
+  - type: textarea
+    id: problem
+    attributes:
+      label: What model would you like?
+      description: Please provide a link to the model.
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for filing a model request!
.github/ISSUE_TEMPLATE/20_feature_request.md (6 changes, deleted)
@@ -1,6 +0,0 @@
----
-name: Feature request
-about: Request a new feature
-labels: feature request
----
-
.github/ISSUE_TEMPLATE/20_feature_request.yml (41 changes, new file)
@@ -0,0 +1,41 @@
+name: Feature request
+description: Propose a new feature
+labels: [needs-triage, fr]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Please check if your feature request is [already filed](https://github.com/ollama/ollama/issues).
+        Tell us about your idea!
+  - type: textarea
+    id: problem
+    attributes:
+      label: What are you trying to do?
+      description: Tell us about the problem you're trying to solve.
+    validations:
+      required: false
+  - type: textarea
+    id: solution
+    attributes:
+      label: How should we solve this?
+      description: If you have an idea of how you'd like to see this feature work, let us know.
+    validations:
+      required: false
+  - type: textarea
+    id: alternative
+    attributes:
+      label: What is the impact of not solving this?
+      description: (How) Are you currently working around the issue?
+    validations:
+      required: false
+  - type: textarea
+    id: context
+    attributes:
+      label: Anything else?
+      description: Any additional context to share, e.g., links
+    validations:
+      required: false
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for filing a feature request!
.github/ISSUE_TEMPLATE/30_model_request.md (5 changes, deleted)
@@ -1,5 +0,0 @@
----
-name: Model request
-about: Request support for a new model to be added to Ollama
-labels: model request
----
.github/ISSUE_TEMPLATE/90_bug_report.yml (125 changes, new file)
@@ -0,0 +1,125 @@
+name: Bug report
+description: File a bug report. If you need help, please join our Discord server.
+labels: [needs-triage, bug]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Please check if your bug is [already filed](https://github.com/ollama/ollama/issues) before filing a new one.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What is the issue?
+      description: What happened? What did you expect to happen?
+    validations:
+      required: true
+  - type: textarea
+    id: what-was-expected
+    attributes:
+      label: What did you expect to see?
+      description: What did you expect to see/happen instead?
+    validations:
+      required: false
+  - type: textarea
+    id: steps
+    attributes:
+      label: Steps to reproduce
+      description: What are the steps you took that hit this issue?
+    validations:
+      required: false
+  - type: textarea
+    id: changes
+    attributes:
+      label: Are there any recent changes that introduced the issue?
+      description: If so, what are those changes?
+    validations:
+      required: false
+  - type: dropdown
+    id: os
+    attributes:
+      label: OS
+      description: What OS are you using? You may select more than one.
+      multiple: true
+      options:
+        - Linux
+        - macOS
+        - Windows
+        - Other
+    validations:
+      required: false
+  - type: dropdown
+    id: architecture
+    attributes:
+      label: Architecture
+      description: What architecture are you using? You may select more than one.
+      multiple: true
+      options:
+        - arm64
+        - amd64
+        - x86
+        - Other
+  - type: dropdown
+    id: platform
+    attributes:
+      label: Platform
+      description: What platform are you using? You may select more than one.
+      multiple: true
+      options:
+        - Docker
+        - WSL
+        - WSL2
+    validations:
+      required: false
+  - type: input
+    id: ollama-version
+    attributes:
+      label: Ollama version
+      description: What Ollama version are you using? (`ollama --version`)
+      placeholder: e.g., 1.14.4
+    validations:
+      required: false
+  - type: dropdown
+    id: gpu
+    attributes:
+      label: GPU
+      description: What GPU, if any, are you using? You may select more than one.
+      multiple: true
+      options:
+        - Nvidia
+        - AMD
+        - Intel
+        - Apple
+        - Other
+    validations:
+      required: false
+  - type: textarea
+    id: gpu-info
+    attributes:
+      label: GPU info
+      description: What GPU info do you have? (`nvidia-smi`, `rocminfo`, `system_profiler SPDisplaysDataType`, etc.)
+    validations:
+      required: false
+  - type: dropdown
+    id: cpu
+    attributes:
+      label: CPU
+      description: What CPU are you using? You may select more than one.
+      multiple: true
+      options:
+        - Intel
+        - AMD
+        - Apple
+        - Other
+    validations:
+      required: false
+  - type: textarea
+    id: other-software
+    attributes:
+      label: Other software
+      description: What other software are you using that might be related to this issue?
+    validations:
+      required: false
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for filing a bug report!
.github/workflows/release.yaml (20 changes)
@@ -103,7 +103,6 @@ jobs:
           path: |
             llm/build/**/bin/*
             llm/build/**/*.a
-            dist/windows-amd64/**
 
   # ROCm generation step
   generate-windows-rocm:
@@ -174,9 +173,7 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-rocm
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
+          path: llm/build/**/bin/*
       - uses: actions/upload-artifact@v4
         with:
           name: windows-rocm-deps
@@ -256,9 +253,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
+          path: llm/build/**/bin/*
      - uses: actions/upload-artifact@v4
        with:
          name: windows-cuda-deps
@@ -311,18 +306,23 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cpu
+          path: llm/build
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cuda
+          path: llm/build
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps
+          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
+          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
+          path: llm/build
      - run: dir llm/build
      - run: |
          $gopath=(get-command go).source | split-path -parent
@@ -331,13 +331,13 @@ jobs:
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_GENERATE="1"
+          $env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
+          $env:HIP_PATH=$(resolve-path ".\dist\deps")
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
          name: dist-windows
-          path: |
-            dist/OllamaSetup.exe
-            dist/ollama-windows-*.zip
+          path: dist/*.exe
 
  # Linux x86 assets built using the container based build
  build-linux-amd64:
.github/workflows/test.yaml (34 changes)
@@ -1,15 +1,5 @@
 name: test
-
-concurrency:
-  # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
-  # cancels running CI jobs and starts all new ones.
-  #
-  # For non-PR pushes, concurrency.group needs to be unique for every distinct
-  # CI run we want to have happen. Use run_id, which in practice means all
-  # non-PR CI runs will be allowed to run without preempting each other.
-  group: ${{ github.workflow }}-$${{ github.pull_request.number || github.run_id }}
-  cancel-in-progress: true
 
 on:
   pull_request:
     paths:
@@ -31,9 +21,7 @@ jobs:
       - id: changes
         run: |
           changed() {
-            git diff-tree -r --no-commit-id --name-only \
-              $(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
-              ${{ github.event.pull_request.head.sha }} \
+            git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
              | xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
           }
 
@@ -115,9 +103,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: cuda-${{ matrix.cuda-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
+          path: llm/build/**/bin/*
  generate-rocm:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -148,9 +134,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: rocm-${{ matrix.rocm-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
+          path: llm/build/**/bin/*
 
  # ROCm generation step
  generate-windows-rocm:
@@ -269,9 +253,14 @@ jobs:
          mkdir -p llm/build/darwin/$ARCH/stub/bin
          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
        if: ${{ startsWith(matrix.os, 'macos-') }}
+      - run: |
+          mkdir -p llm/build/windows/$ARCH/stub/bin
+          touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        shell: bash
      - uses: golangci/golangci-lint-action@v4
        with:
-          args: --timeout 8m0s -v
+          args: --timeout 8m0s
  test:
    strategy:
      matrix:
@@ -295,6 +284,7 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
+      - run: go get
      - run: |
          case ${{ matrix.arch }} in
            amd64) echo ARCH=x86_64 ;;
@@ -309,6 +299,10 @@ jobs:
          mkdir -p llm/build/darwin/$ARCH/stub/bin
          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
        if: ${{ startsWith(matrix.os, 'macos-') }}
+      - run: |
+          mkdir -p llm/build/windows/$ARCH/stub/bin
+          touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
+        if: ${{ startsWith(matrix.os, 'windows-') }}
        shell: bash
      - run: go generate ./...
      - run: go build
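On the `changed()` hunk above: the base-branch form and the merge-base form differ when the base branch has advanced after the PR forked. Diffing against `$(git merge-base base head)` attributes only the PR's own changes, while diffing `base head` directly also picks up files that changed on the base branch in the meantime. A minimal sketch of the merge-base form in Go, shelling out to the same git commands; the function and ref names here are illustrative, not part of this repository:

```
package main

import (
	"fmt"
	"os/exec"
	"strings"
)

// changedFiles lists files the PR itself touched, relative to the point
// where it forked from base.
func changedFiles(base, head string) ([]string, error) {
	// git merge-base finds the common ancestor of the two refs.
	mb, err := exec.Command("git", "merge-base", base, head).Output()
	if err != nil {
		return nil, err
	}
	// Diffing ancestor..head excludes changes that landed on base later.
	out, err := exec.Command("git", "diff-tree", "-r", "--no-commit-id",
		"--name-only", strings.TrimSpace(string(mb)), head).Output()
	if err != nil {
		return nil, err
	}
	return strings.Fields(string(out)), nil
}

func main() {
	files, err := changedFiles("origin/main", "HEAD")
	if err != nil {
		panic(err)
	}
	fmt.Println(files)
}
```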
.gitignore (2 changes)
@@ -11,4 +11,4 @@ ggml-metal.metal
 .idea
 test_data
 *.crt
 llm/build
Dockerfile (14 changes)
@@ -18,7 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
@@ -28,7 +28,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -40,7 +40,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 RUN mkdir /tmp/scratch && \
     for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
         cp ${dep} /tmp/scratch/ || exit 1 ; \
@@ -64,11 +64,11 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
 RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
 
 FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
 ARG CMAKE_VERSION
@@ -84,7 +84,7 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
 RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 
 
 # Intermediate stage used for ./scripts/build_linux.sh
README.md (49 changes)
@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 
 ## Quickstart
 
-To run and chat with [Llama 3](https://ollama.com/library/llama3):
+To run and chat with [Llama 2](https://ollama.com/library/llama2):
 
 ```
-ollama run llama3
+ollama run llama2
 ```
 
 ## Model library
@@ -49,14 +49,18 @@ Here are some example models that can be downloaded:
 
 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
-| Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
-| Phi-3              | 3.8B       | 2.3GB | `ollama run phi3`              |
+| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
+| Dolphin Phi        | 2.7B       | 1.6GB | `ollama run dolphin-phi`       |
+| Phi-2              | 2.7B       | 1.7GB | `ollama run phi`               |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
 | Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
+| Llama 2 13B        | 13B        | 7.3GB | `ollama run llama2:13b`        |
+| Llama 2 70B        | 70B        | 39GB  | `ollama run llama2:70b`        |
+| Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
+| Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
 | Gemma              | 2B         | 1.4GB | `ollama run gemma:2b`          |
 | Gemma              | 7B         | 4.8GB | `ollama run gemma:7b`          |
@@ -94,16 +98,16 @@ See the [guide](docs/import.md) on importing models for more information.
 
 ### Customize a prompt
 
-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama2` model:
 
 ```
-ollama pull llama3
+ollama pull llama2
 ```
 
 Create a `Modelfile`:
 
 ```
-FROM llama3
+FROM llama2
 
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -138,7 +142,7 @@ ollama create mymodel -f ./Modelfile
 ### Pull a model
 
 ```
-ollama pull llama3
+ollama pull llama2
 ```
 
 > This command can also be used to update a local model. Only the diff will be pulled.
@@ -146,13 +150,13 @@ ollama pull llama3
 ### Remove a model
 
 ```
-ollama rm llama3
+ollama rm llama2
 ```
 
 ### Copy a model
 
 ```
-ollama cp llama3 my-model
+ollama cp llama2 my-llama2
 ```
 
 ### Multiline input
@@ -176,7 +180,7 @@ The image features a yellow smiley face, which is likely the central focus of th
 ### Pass in prompt as arguments
 
 ```
-$ ollama run llama3 "Summarize this file: $(cat README.md)"
+$ ollama run llama2 "Summarize this file: $(cat README.md)"
 Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```
 
@@ -223,7 +227,7 @@ Next, start the server:
 Finally, in a separate shell, run a model:
 
 ```
-./ollama run llama3
+./ollama run llama2
 ```
 
 ## REST API
@@ -234,7 +238,7 @@ Ollama has a REST API for running and managing models.
 
 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama2",
   "prompt":"Why is the sky blue?"
 }'
 ```
@@ -243,7 +247,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "mistral",
   "messages": [
     { "role": "user", "content": "why is the sky blue?" }
   ]
@@ -256,17 +260,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 ### Web & Desktop
 
-- [Open WebUI](https://github.com/open-webui/open-webui)
-- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
 - [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
 - [LibreChat](https://github.com/danny-avila/LibreChat)
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
+- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Saddle](https://github.com/jikkuatwork/saddle)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
-- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
+- [Open WebUI](https://github.com/open-webui/open-webui)
 - [Ollamac](https://github.com/kevinhermawan/Ollamac)
 - [big-AGI](https://github.com/enricoros/big-AGI/blob/main/docs/config-local-ollama.md)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
@@ -288,13 +291,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
-- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
 - [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
 - [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
 - [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
-- [chat: chat web app for teams](https://github.com/swuecho/chat)
-- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
-- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
 
 ### Terminal
 
@@ -310,13 +309,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
 - [cmdh](https://github.com/pgibler/cmdh)
 - [ooo](https://github.com/npahlfer/ooo)
-- [shell-pilot](https://github.com/reid41/shell-pilot)
 - [tenere](https://github.com/pythops/tenere)
 - [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
 - [typechat-cli](https://github.com/anaisbetts/typechat-cli)
 - [ShellOracle](https://github.com/djcopley/ShellOracle)
 - [tlm](https://github.com/yusufcanb/tlm)
-- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 
 ### Database
 
@@ -381,7 +378,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
-- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
-
-### Supported backends
-- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
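For reference alongside the curl examples in the README hunks above, a minimal sketch of the same generate call from Go. The endpoint and the `model`/`prompt` fields are exactly as shown in the README; the `stream` field is an assumption here, used to request one consolidated JSON response instead of streamed chunks:

```
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Same request the README issues with curl.
	payload, err := json.Marshal(map[string]any{
		"model":  "llama2",
		"prompt": "Why is the sky blue?",
		"stream": false, // assumption: disable chunked streaming
	})
	if err != nil {
		panic(err)
	}
	resp, err := http.Post("http://localhost:11434/api/generate",
		"application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}
```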
@@ -91,13 +91,6 @@ func ClientFromEnvironment() (*Client, error) {
 	}, nil
 }
 
-func NewClient(base *url.URL, http *http.Client) *Client {
-	return &Client{
-		base: base,
-		http: http,
-	}
-}
-
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
 	var reqBody io.Reader
 	var data []byte
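With the standalone `NewClient` constructor removed in this hunk, callers construct a client through `ClientFromEnvironment`, which derives the base URL from the environment instead of taking one as an argument. A minimal usage sketch, assuming the `github.com/ollama/ollama/api` import path used elsewhere in this diff and the `OLLAMA_HOST` variable that function reads:

```
package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Honors OLLAMA_HOST when set; otherwise falls back to the default
	// local server address.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	fmt.Printf("client ready: %T\n", client)
}
```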
@@ -2,7 +2,6 @@ package api
 
 import (
 	"encoding/json"
-	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -308,7 +307,7 @@ func (m *Metrics) Summary() {
 	}
 }
 
-var ErrInvalidOpts = errors.New("invalid options")
+var ErrInvalidOpts = fmt.Errorf("invalid options")
 
 func (opts *Options) FromMap(m map[string]interface{}) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
@@ -396,10 +395,8 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 func DefaultOptions() Options {
 	return Options{
 		// options set on request to runner
 		NumPredict:  -1,
-		// set a minimal num_keep to avoid issues on context shifts
-		NumKeep:     4,
+		NumKeep:     0,
 		Temperature: 0.8,
 		TopK:        40,
 		TopP:        0.9,
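On the `ErrInvalidOpts` change: for a fixed message the two constructors produce equivalent sentinel errors, and `errors.New` is the conventional choice when no formatting verbs are involved, since `fmt.Errorf` adds a needless format-string pass. A small illustration with hypothetical names:

```
package main

import (
	"errors"
	"fmt"
)

// Equivalent sentinels for a constant message; errors.New is idiomatic
// when there is nothing to format.
var errA = errors.New("invalid options")
var errB = fmt.Errorf("invalid options")

func main() {
	// Sentinel matching is by identity either way, so wrapping behaves
	// the same with both constructors.
	wrapped := fmt.Errorf("loading options: %w", errA)
	fmt.Println(errors.Is(wrapped, errA)) // true
	fmt.Println(errors.Is(wrapped, errB)) // false: a distinct value
}
```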
app/.gitignore (1 change)
@@ -1,2 +1 @@
 ollama.syso
-app
@@ -1,7 +0,0 @@
-#import <Cocoa/Cocoa.h>
-
-@interface AppDelegate : NSObject <NSApplicationDelegate>
-
-- (void)applicationDidFinishLaunching:(NSNotification *)aNotification;
-
-@end
@@ -1,6 +1,10 @@
 # Ollama App
 
-## macOS
+## Linux
+
+TODO
+
+## MacOS
 
 TODO
@@ -1,76 +0,0 @@
-package main
-
-// #cgo CFLAGS: -x objective-c
-// #cgo LDFLAGS: -framework Cocoa -framework LocalAuthentication -framework ServiceManagement
-// #include "app_darwin.h"
-import "C"
-import (
-	"context"
-	"fmt"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"syscall"
-)
-
-func init() {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		panic(err)
-	}
-
-	ServerLogFile = filepath.Join(home, ".ollama", "logs", "server.log")
-}
-
-func run() {
-	initLogging()
-	slog.Info("ollama macOS app started")
-
-	// Ask to move to applications directory
-	moving := C.askToMoveToApplications()
-	if moving {
-		return
-	}
-
-	C.killOtherInstances()
-
-	code := C.installSymlink()
-	if code != 0 {
-		slog.Error("Failed to install symlink")
-	}
-
-	exe, err := os.Executable()
-	if err != nil {
-		panic(err)
-	}
-
-	var options ServerOptions
-
-	ctx, cancel := context.WithCancel(context.Background())
-	var done chan int
-
-	done, err = SpawnServer(ctx, filepath.Join(filepath.Dir(exe), "..", "Resources", "ollama"), options)
-	if err != nil {
-		slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
-		done = make(chan int, 1)
-		done <- 1
-	}
-
-	// Run the native macOS app
-	// Note: this will block until the app is closed
-	C.run()
-
-	slog.Info("ollama macOS app closed")
-
-	cancel()
-	slog.Info("Waiting for ollama server to shutdown...")
-	if done != nil {
-		<-done
-	}
-	slog.Info("Ollama app exiting")
-}
-
-//export Quit
-func Quit() {
-	syscall.Kill(os.Getpid(), syscall.SIGTERM)
-}
@@ -1,13 +0,0 @@
-#import <Cocoa/Cocoa.h>
-
-@interface AppDelegate : NSObject <NSApplicationDelegate>
-- (void)applicationDidFinishLaunching:(NSNotification *)aNotification;
-@end
-
-void run();
-void killOtherInstances();
-bool askToMoveToApplications();
-int createSymlinkWithAuthorization();
-int installSymlink();
-extern void Restart();
-extern void Quit();
app/app_darwin.m (282 changes, deleted)
@@ -1,282 +0,0 @@
-#import <AppKit/AppKit.h>
-#import <Cocoa/Cocoa.h>
-#import <CoreServices/CoreServices.h>
-#import <Security/Security.h>
-#import <ServiceManagement/ServiceManagement.h>
-#import "app_darwin.h"
-
-@interface AppDelegate ()
-
-@property (strong, nonatomic) NSStatusItem *statusItem;
-
-@end
-
-@implementation AppDelegate
-
-- (void)applicationDidFinishLaunching:(NSNotification *)aNotification {
-    // show status menu
-    NSMenu *menu = [[NSMenu alloc] init];
-
-    NSMenuItem *aboutMenuItem = [[NSMenuItem alloc] initWithTitle:@"About Ollama" action:@selector(aboutOllama) keyEquivalent:@""];
-    [aboutMenuItem setTarget:self];
-    [menu addItem:aboutMenuItem];
-
-    // Settings submenu
-    NSMenu *settingsMenu = [[NSMenu alloc] initWithTitle:@"Settings"];
-
-    // Submenu items
-    NSMenuItem *chooseModelDirectoryItem = [[NSMenuItem alloc] initWithTitle:@"Choose model directory..." action:@selector(chooseModelDirectory) keyEquivalent:@""];
-    [chooseModelDirectoryItem setTarget:self];
-    [chooseModelDirectoryItem setEnabled:YES];
-    [settingsMenu addItem:chooseModelDirectoryItem];
-
-    NSMenuItem *exposeExternallyItem = [[NSMenuItem alloc] initWithTitle:@"Allow external connections" action:@selector(toggleExposeExternally:) keyEquivalent:@""];
-    [exposeExternallyItem setTarget:self];
-    [exposeExternallyItem setState:NSOffState]; // Set initial state to off
-    [exposeExternallyItem setEnabled:YES];
-    [settingsMenu addItem:exposeExternallyItem];
-
-    NSMenuItem *allowCrossOriginItem = [[NSMenuItem alloc] initWithTitle:@"Allow browser requests" action:@selector(toggleCrossOrigin:) keyEquivalent:@""];
-    [allowCrossOriginItem setTarget:self];
-    [allowCrossOriginItem setState:NSOffState]; // Set initial state to off
-    [allowCrossOriginItem setEnabled:YES];
-    [settingsMenu addItem:allowCrossOriginItem];
-
-    NSMenuItem *settingsMenuItem = [[NSMenuItem alloc] initWithTitle:@"Settings" action:nil keyEquivalent:@""];
-    [settingsMenuItem setSubmenu:settingsMenu];
-    [menu addItem:settingsMenuItem];
-
-    [menu addItemWithTitle:@"Quit Ollama" action:@selector(quit) keyEquivalent:@"q"];
-
-    self.statusItem = [[NSStatusBar systemStatusBar] statusItemWithLength:NSVariableStatusItemLength];
-    [self.statusItem addObserver:self forKeyPath:@"button.effectiveAppearance" options:NSKeyValueObservingOptionNew|NSKeyValueObservingOptionInitial context:nil];
-
-    self.statusItem.menu = menu;
-    [self showIcon];
-}
-
-- (void)aboutOllama {
-    [[NSApplication sharedApplication] orderFrontStandardAboutPanel:nil];
-}
-
-- (void)toggleCrossOrigin:(id)sender {
-    NSMenuItem *item = (NSMenuItem *)sender;
-    if ([item state] == NSOffState) {
-        // Do something when cross-origin requests are allowed
-        [item setState:NSOnState];
-    } else {
-        // Do something when cross-origin requests are disallowed
-        [item setState:NSOffState];
-    }
-}
-
-- (void)toggleExposeExternally:(id)sender {
-    NSMenuItem *item = (NSMenuItem *)sender;
-    if ([item state] == NSOffState) {
-        // Do something when Ollama is exposed externally
-        [item setState:NSOnState];
-    } else {
-        // Do something when Ollama is not exposed externally
-        [item setState:NSOffState];
-    }
-}
-
-- (void)chooseModelDirectory {
-    NSOpenPanel *openPanel = [NSOpenPanel openPanel];
-    [openPanel setCanChooseFiles:NO];
-    [openPanel setCanChooseDirectories:YES];
-    [openPanel setAllowsMultipleSelection:NO];
-
-    NSInteger result = [openPanel runModal];
-    if (result == NSModalResponseOK) {
-        NSURL *selectedDirectoryURL = [openPanel URLs].firstObject;
-        // Do something with the selected directory URL
-    }
-}
-
--(void) showIcon {
-    NSAppearance* appearance = self.statusItem.button.effectiveAppearance;
-    NSString* appearanceName = (NSString*)(appearance.name);
-    NSString* iconName = [[appearanceName lowercaseString] containsString:@"dark"] ? @"iconDark" : @"icon";
-    NSImage* statusImage = [NSImage imageNamed:iconName];
-    [statusImage setTemplate:YES];
-    self.statusItem.button.image = statusImage;
-}
-
--(void)observeValueForKeyPath:(NSString *)keyPath ofObject:(id)object change:(NSDictionary<NSKeyValueChangeKey,id> *)change context:(void *)context {
-    [self showIcon];
-}
-
-- (void)quit {
-    [NSApp stop:nil];
-}
-
-@end
-
-void run() {
-    @autoreleasepool {
-        [NSApplication sharedApplication];
-        AppDelegate *appDelegate = [[AppDelegate alloc] init];
-        [NSApp setDelegate:appDelegate];
-        [NSApp run];
-    }
-}
-
-// killOtherInstances kills all other instances of the app currently
-// running. This way we can ensure that only the most recently started
-// instance of Ollama is running
-void killOtherInstances() {
-    pid_t pid = getpid();
-    NSArray *all = [[NSWorkspace sharedWorkspace] runningApplications];
-    NSMutableArray *apps = [NSMutableArray array];
-
-    for (NSRunningApplication *app in all) {
-        if ([app.bundleIdentifier isEqualToString:[[NSBundle mainBundle] bundleIdentifier]] ||
-            [app.bundleIdentifier isEqualToString:@"ai.ollama.ollama"] ||
-            [app.bundleIdentifier isEqualToString:@"com.electron.ollama"]) {
-            if (app.processIdentifier != pid) {
-                [apps addObject:app];
-            }
-        }
-    }
-
-    for (NSRunningApplication *app in apps) {
-        kill(app.processIdentifier, SIGTERM);
-    }
-
-    NSDate *startTime = [NSDate date];
-    for (NSRunningApplication *app in apps) {
-        while (!app.terminated) {
-            if (-[startTime timeIntervalSinceNow] >= 5) {
-                kill(app.processIdentifier, SIGKILL);
-                break;
-            }
-
-            [[NSRunLoop currentRunLoop] runUntilDate:[NSDate dateWithTimeIntervalSinceNow:0.1]];
-        }
-    }
-}
-
-bool askToMoveToApplications() {
-    NSString *bundlePath = [[NSBundle mainBundle] bundlePath];
-    if ([bundlePath hasPrefix:@"/Applications"]) {
-        return false;
-    }
-
-    NSAlert *alert = [[NSAlert alloc] init];
-    [alert setMessageText:@"Move to Applications?"];
-    [alert setInformativeText:@"Ollama works best when run from the Applications directory."];
-    [alert addButtonWithTitle:@"Move to Applications"];
-    [alert addButtonWithTitle:@"Don't move"];
-
-    [NSApp activateIgnoringOtherApps:YES];
-
-    if ([alert runModal] != NSAlertFirstButtonReturn) {
-        return false;
-    }
-
-    // move to applications
-    NSString *applicationsPath = @"/Applications";
-    NSString *newPath = [applicationsPath stringByAppendingPathComponent:@"Ollama.app"];
-    NSFileManager *fileManager = [NSFileManager defaultManager];
-
-    // Check if the newPath already exists
-    if ([fileManager fileExistsAtPath:newPath]) {
-        NSError *removeError = nil;
-        [fileManager removeItemAtPath:newPath error:&removeError];
-        if (removeError) {
-            NSLog(@"Error removing file at %@: %@", newPath, removeError);
-            return false; // or handle the error
-        }
-    }
-
-    NSError *moveError = nil;
-    [fileManager moveItemAtPath:bundlePath toPath:newPath error:&moveError];
-    if (moveError) {
-        NSLog(@"Error moving file from %@ to %@: %@", bundlePath, newPath, moveError);
-        return false;
-    }
-
-    NSLog(@"Opening %@", newPath);
-    NSError *error = nil;
-    NSWorkspace *workspace = [NSWorkspace sharedWorkspace];
-    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
-    [workspace launchApplicationAtURL:[NSURL fileURLWithPath:newPath]
-                              options:NSWorkspaceLaunchNewInstance | NSWorkspaceLaunchDefault
-                        configuration:@{}
-                                error:&error];
-
-    return true;
-}
-
-int installSymlink() {
-    NSString *linkPath = @"/usr/local/bin/ollama";
-    NSError *error = nil;
-
-    NSFileManager *fileManager = [NSFileManager defaultManager];
-    NSString *symlinkPath = [fileManager destinationOfSymbolicLinkAtPath:linkPath error:&error];
-    NSString *bundlePath = [[NSBundle mainBundle] bundlePath];
-    NSString *execPath = [[NSBundle mainBundle] executablePath];
-    NSString *resPath = [[NSBundle mainBundle] pathForResource:@"ollama" ofType:nil];
-
-    // if the symlink already exists and points to the right place, don't prompt
-    if ([symlinkPath isEqualToString:resPath]) {
-        NSLog(@"symbolic link already exists and points to the right place");
-        return 0;
-    }
-
-    NSString *authorizationPrompt = @"Ollama is trying to install its command line interface (CLI) tool.";
-
-    AuthorizationRef auth = NULL;
-    OSStatus createStatus = AuthorizationCreate(NULL, kAuthorizationEmptyEnvironment, kAuthorizationFlagDefaults, &auth);
-    if (createStatus != errAuthorizationSuccess) {
-        NSLog(@"Error creating authorization");
-        return -1;
-    }
-
-    NSString * bundleIdentifier = [[NSBundle mainBundle] bundleIdentifier];
-    NSString *rightNameString = [NSString stringWithFormat:@"%@.%@", bundleIdentifier, @"auth3"];
-    const char *rightName = rightNameString.UTF8String;
-
-    OSStatus getRightResult = AuthorizationRightGet(rightName, NULL);
-    if (getRightResult == errAuthorizationDenied) {
-        if (AuthorizationRightSet(auth, rightName, (__bridge CFTypeRef _Nonnull)(@(kAuthorizationRuleAuthenticateAsAdmin)), (__bridge CFStringRef _Nullable)(authorizationPrompt), NULL, NULL) != errAuthorizationSuccess) {
-            NSLog(@"Failed to set right");
-            return -1;
-        }
-    }
-
-    AuthorizationItem right = { .name = rightName, .valueLength = 0, .value = NULL, .flags = 0 };
-    AuthorizationRights rights = { .count = 1, .items = &right };
-    AuthorizationFlags flags = (AuthorizationFlags)(kAuthorizationFlagExtendRights | kAuthorizationFlagInteractionAllowed);
-    AuthorizationItem iconAuthorizationItem = {.name = kAuthorizationEnvironmentIcon, .valueLength = 0, .value = NULL, .flags = 0};
-    AuthorizationEnvironment authorizationEnvironment = {.count = 0, .items = NULL};
-
-    BOOL failedToUseSystemDomain = NO;
-    OSStatus copyStatus = AuthorizationCopyRights(auth, &rights, &authorizationEnvironment, flags, NULL);
-    if (copyStatus != errAuthorizationSuccess) {
-        failedToUseSystemDomain = YES;
-
-        if (copyStatus == errAuthorizationCanceled) {
-            NSLog(@"User cancelled authorization");
-            return -1;
-        } else {
-            NSLog(@"Failed copying system domain rights: %d", copyStatus);
-            return -1;
-        }
-    }
-
-    const char *toolPath = "/bin/ln";
-    const char *args[] = {"-s", "-F", [resPath UTF8String], "/usr/local/bin/ollama", NULL};
-    FILE *pipe = NULL;
-
-    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
-    OSStatus status = AuthorizationExecuteWithPrivileges(auth, toolPath, kAuthorizationFlagDefaults, (char *const *)args, &pipe);
-    if (status != errAuthorizationSuccess) {
-        NSLog(@"Failed to create symlink");
-        return -1;
-    }
-
-    AuthorizationFree(auth, kAuthorizationFlagDestroyRights);
-    return 0;
-}
@@ -1,166 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"log"
|
|
||||||
"log/slog"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"os/signal"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
"syscall"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/app/lifecycle"
|
|
||||||
"github.com/ollama/ollama/app/store"
|
|
||||||
"github.com/ollama/ollama/app/tray"
|
|
||||||
"github.com/ollama/ollama/app/updater"
|
|
||||||
)
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
AppName += ".exe"
|
|
||||||
CLIName += ".exe"
|
|
||||||
// Logs, configs, downloads go to LOCALAPPDATA
|
|
||||||
localAppData := os.Getenv("LOCALAPPDATA")
|
|
||||||
AppDataDir = filepath.Join(localAppData, "Ollama")
|
|
||||||
AppLogFile = filepath.Join(AppDataDir, "app.log")
|
|
||||||
ServerLogFile = filepath.Join(AppDataDir, "server.log")
|
|
||||||
|
|
||||||
// Executables are stored in APPDATA
|
|
||||||
AppDir = filepath.Join(localAppData, "Programs", "Ollama")
|
|
||||||
|
|
||||||
// Make sure we have PATH set correctly for any spawned children
|
|
||||||
paths := strings.Split(os.Getenv("PATH"), ";")
|
|
||||||
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
|
|
||||||
found := false
|
|
||||||
for _, path := range paths {
|
|
||||||
d, err := filepath.Abs(path)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if strings.EqualFold(AppDir, d) {
|
|
||||||
found = true
|
		}
	}
	if !found {
		paths = append(paths, AppDir)

		pathVal := strings.Join(paths, ";")
		slog.Debug("setting PATH=" + pathVal)
		err := os.Setenv("PATH", pathVal)
		if err != nil {
			slog.Error(fmt.Sprintf("failed to update PATH: %s", err))
		}
	}

	// Make sure our logging dir exists
	_, err := os.Stat(AppDataDir)
	if errors.Is(err, os.ErrNotExist) {
		if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
			slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
		}
	}
}

func ShowLogs() {
	cmd_path := "c:\\Windows\\system32\\cmd.exe"
	slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
	cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
	cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
	err := cmd.Start()
	if err != nil {
		slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
	}
}

func Start() {
	cmd_path := "c:\\Windows\\system32\\cmd.exe"
	slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
	cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
	cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
	err := cmd.Start()
	if err != nil {
		slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
	}
}

func run() {
	initLogging()

	slog.Info("ollama windows app started")

	ctx, cancel := context.WithCancel(context.Background())
	var done chan int

	t, err := tray.NewTray()
	if err != nil {
		log.Fatalf("Failed to start: %s", err)
	}
	callbacks := t.GetCallbacks()

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		slog.Debug("starting callback loop")
		for {
			select {
			case <-callbacks.Quit:
				slog.Debug("quit called")
				t.Quit()
			case <-signals:
				slog.Debug("shutting down due to signal")
				t.Quit()
			case <-callbacks.Update:
				err := updater.DoUpgrade(cancel, done)
				if err != nil {
					slog.Warn(fmt.Sprintf("upgrade attempt failed: %s", err))
				}
			case <-callbacks.ShowLogs:
				ShowLogs()
			case <-callbacks.DoFirstUse:
				err := lifecycle.GetStarted()
				if err != nil {
					slog.Warn(fmt.Sprintf("Failed to launch getting started shell: %s", err))
				}
			}
		}
	}()

	if !store.GetFirstTimeRun() {
		slog.Debug("First time run")
		err = t.DisplayFirstUseNotification()
		if err != nil {
			slog.Debug(fmt.Sprintf("XXX failed to display first use notification %v", err))
		}
		store.SetFirstTimeRun(true)
	} else {
		slog.Debug("Not first time, skipping first run notification")
	}

	if isServerRunning(ctx) {
		slog.Info("Detected another instance of ollama running, exiting")
		os.Exit(1)
	}

	done, err = SpawnServer(ctx, CLIName)
	if err != nil {
		// TODO - should we retry in a backoff loop?
		// TODO - should we pop up a warning and maybe add a menu item to view application logs?
		slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
		done = make(chan int, 1)
		done <- 1
	}

	updater.StartBackgroundUpdaterChecker(ctx, t.UpdateAvailable)

	t.Run()
	cancel()
	slog.Info("Waiting for ollama server to shutdown...")
	if done != nil {
		<-done
	}
	slog.Info("Ollama app exiting")
}
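For context on the `CreationFlags` value used in `ShowLogs` above: `0x08000000` is the Win32 `CREATE_NO_WINDOW` process-creation flag, which keeps `cmd.exe` from flashing a console window. A minimal, Windows-only sketch of the same idea (the folder path is a placeholder, not from the diff; `SysProcAttr.CreationFlags` only exists on Windows):

```go
//go:build windows

package main

import (
	"log"
	"os/exec"
	"syscall"
)

func main() {
	dir := `C:\Users\Public` // placeholder folder to open in Explorer
	cmd := exec.Command(`c:\Windows\system32\cmd.exe`, "/c", "start", dir)
	// CREATE_NO_WINDOW: launch cmd.exe without a visible console.
	cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
	if err := cmd.Start(); err != nil {
		log.Printf("failed to open folder: %s", err)
	}
}
```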
@@ -1,40 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
	<key>CFBundleDisplayName</key>
	<string>Ollama</string>
	<key>CFBundleExecutable</key>
	<string>Ollama</string>
	<key>CFBundleIconFile</key>
	<string>icon.icns</string>
	<key>CFBundleIdentifier</key>
	<string>com.ollama.ollama</string>
	<key>CFBundleInfoDictionaryVersion</key>
	<string>6.0</string>
	<key>CFBundleName</key>
	<string>Ollama</string>
	<key>CFBundlePackageType</key>
	<string>APPL</string>
	<key>CFBundleShortVersionString</key>
	<string>0.0.0</string>
	<key>CFBundleVersion</key>
	<string>0.0.0</string>
	<key>DTCompiler</key>
	<string>com.apple.compilers.llvm.clang.1_0</string>
	<key>DTSDKBuild</key>
	<string>22E245</string>
	<key>DTSDKName</key>
	<string>macosx13.3</string>
	<key>DTXcode</key>
	<string>1431</string>
	<key>DTXcodeBuild</key>
	<string>14E300c</string>
	<key>LSApplicationCategoryType</key>
	<string>public.app-category.developer-tools</string>
	<key>LSMinimumSystemVersion</key>
	<string>11.0</string>
	<key>LSUIElement</key>
	<true/>
</dict>
</plist>
Four binary image assets changed (previous sizes: 382 B, 691 B, 382 B, 721 B).
@@ -1,3 +1,5 @@
+//go:build !windows
+
 package lifecycle

 import "fmt"
92
app/lifecycle/lifecycle.go
Normal file
@@ -0,0 +1,92 @@
package lifecycle

import (
	"context"
	"fmt"
	"log"
	"log/slog"
	"os"
	"os/signal"
	"syscall"

	"github.com/ollama/ollama/app/store"
	"github.com/ollama/ollama/app/tray"
)

func Run() {
	InitLogging()

	ctx, cancel := context.WithCancel(context.Background())
	var done chan int

	t, err := tray.NewTray()
	if err != nil {
		log.Fatalf("Failed to start: %s", err)
	}
	callbacks := t.GetCallbacks()

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		slog.Debug("starting callback loop")
		for {
			select {
			case <-callbacks.Quit:
				slog.Debug("quit called")
				t.Quit()
			case <-signals:
				slog.Debug("shutting down due to signal")
				t.Quit()
			case <-callbacks.Update:
				err := DoUpgrade(cancel, done)
				if err != nil {
					slog.Warn(fmt.Sprintf("upgrade attempt failed: %s", err))
				}
			case <-callbacks.ShowLogs:
				ShowLogs()
			case <-callbacks.DoFirstUse:
				err := GetStarted()
				if err != nil {
					slog.Warn(fmt.Sprintf("Failed to launch getting started shell: %s", err))
				}
			}
		}
	}()

	// Are we first use?
	if !store.GetFirstTimeRun() {
		slog.Debug("First time run")
		err = t.DisplayFirstUseNotification()
		if err != nil {
			slog.Debug(fmt.Sprintf("XXX failed to display first use notification %v", err))
		}
		store.SetFirstTimeRun(true)
	} else {
		slog.Debug("Not first time, skipping first run notification")
	}

	if IsServerRunning(ctx) {
		slog.Info("Detected another instance of ollama running, exiting")
		os.Exit(1)
	} else {
		done, err = SpawnServer(ctx, CLIName)
		if err != nil {
			// TODO - should we retry in a backoff loop?
			// TODO - should we pop up a warning and maybe add a menu item to view application logs?
			slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
			done = make(chan int, 1)
			done <- 1
		}
	}

	StartBackgroundUpdaterChecker(ctx, t.UpdateAvailable)

	t.Run()
	cancel()
	slog.Info("Waiting for ollama server to shutdown...")
	if done != nil {
		<-done
	}
	slog.Info("Ollama app exiting")
}
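`Run` above multiplexes tray menu callbacks and OS signals with a single `select` loop in a goroutine. As a self-contained illustration of that fan-in pattern (channel names here are hypothetical, not from the diff):

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	quit := make(chan struct{}) // stand-in for a tray "Quit" callback channel
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

	finished := make(chan struct{})
	go func() {
		defer close(finished)
		for {
			select {
			case <-quit:
				fmt.Println("quit requested")
				return
			case s := <-signals:
				fmt.Println("shutting down on signal:", s)
				return
			}
		}
	}()

	close(quit) // simulate the tray "Quit" callback firing
	<-finished  // wait for the loop goroutine to exit
}
```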
@@ -1,4 +1,4 @@
-package main
+package lifecycle

 import (
 	"fmt"
@@ -7,7 +7,7 @@ import (
 	"path/filepath"
 )

-func initLogging() {
+func InitLogging() {
 	level := slog.LevelInfo

 	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
@@ -41,4 +41,6 @@ func initLogging() {
 	})

 	slog.SetDefault(slog.New(handler))
+
+	slog.Info("ollama app started")
 }
9
app/lifecycle/logging_nonwindows.go
Normal file
@@ -0,0 +1,9 @@
//go:build !windows

package lifecycle

import "log/slog"

func ShowLogs() {
	slog.Warn("ShowLogs not yet implemented")
}
19
app/lifecycle/logging_windows.go
Normal file
@@ -0,0 +1,19 @@
package lifecycle

import (
	"fmt"
	"log/slog"
	"os/exec"
	"syscall"
)

func ShowLogs() {
	cmd_path := "c:\\Windows\\system32\\cmd.exe"
	slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
	cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
	cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
	err := cmd.Start()
	if err != nil {
		slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
	}
}
@@ -70,5 +70,10 @@ func init() {
 		}
 	}
+	} else if runtime.GOOS == "darwin" {
+		// TODO
+		AppName += ".app"
+		// } else if runtime.GOOS == "linux" {
+		// TODO
 	}
 }

@@ -1,4 +1,4 @@
-package main
+package lifecycle

 import (
 	"context"
@@ -14,41 +14,65 @@ import (
 	"github.com/ollama/ollama/api"
 )

-type ServerOptions struct {
-	Cors       bool
-	Expose     bool
-	ModelsPath string
+func getCLIFullPath(command string) string {
+	cmdPath := ""
+	appExe, err := os.Executable()
+	if err == nil {
+		cmdPath = filepath.Join(filepath.Dir(appExe), command)
+		_, err := os.Stat(cmdPath)
+		if err == nil {
+			return cmdPath
+		}
+	}
+	cmdPath, err = exec.LookPath(command)
+	if err == nil {
+		_, err := os.Stat(cmdPath)
+		if err == nil {
+			return cmdPath
+		}
+	}
+	pwd, err := os.Getwd()
+	if err == nil {
+		cmdPath = filepath.Join(pwd, command)
+		_, err = os.Stat(cmdPath)
+		if err == nil {
+			return cmdPath
+		}
+	}
+
+	return command
 }

-func start(ctx context.Context, command string, options ServerOptions) (*exec.Cmd, error) {
-	cmd := getCmd(ctx, command)
-
-	// set environment variables
-	if options.ModelsPath != "" {
-		cmd.Env = append(cmd.Env, fmt.Sprintf("OLLAMA_MODELS=%s", options.ModelsPath))
-	}
-
-	if options.Cors {
-		cmd.Env = append(cmd.Env, "OLLAMA_ORIGINS=*")
-	}
-
-	if options.Expose {
-		cmd.Env = append(cmd.Env, "OLLAMA_HOST=0.0.0.0")
-	}
+func SpawnServer(ctx context.Context, command string) (chan int, error) {
+	done := make(chan int)
+
+	logDir := filepath.Dir(ServerLogFile)
+	_, err := os.Stat(logDir)
+	if errors.Is(err, os.ErrNotExist) {
+		if err := os.MkdirAll(logDir, 0o755); err != nil {
+			return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
+		}
+	}

+	cmd := getCmd(ctx, getCLIFullPath(command))
 	// send stdout and stderr to a file
 	stdout, err := cmd.StdoutPipe()
 	if err != nil {
-		return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
+		return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
 	}
 	stderr, err := cmd.StderrPipe()
 	if err != nil {
-		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
+		return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
+	}
+	stdin, err := cmd.StdinPipe()
+	if err != nil {
+		return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
 	}

 	// TODO - rotation
 	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 	if err != nil {
-		return nil, fmt.Errorf("failed to create server log: %w", err)
+		return done, fmt.Errorf("failed to create server log %w", err)
 	}
 	go func() {
 		defer logFile.Close()
@@ -93,38 +117,19 @@ func start(ctx context.Context, command string, options ServerOptions) (*exec.Cm

 	// run the command and wait for it to finish
 	if err := cmd.Start(); err != nil {
-		return nil, fmt.Errorf("failed to start server %w", err)
+		return done, fmt.Errorf("failed to start server %w", err)
 	}
 	if cmd.Process != nil {
 		slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
 	}
 	slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))

-	return cmd, nil
-}
-
-func SpawnServer(ctx context.Context, command string, options ServerOptions) (chan int, error) {
-	logDir := filepath.Dir(ServerLogFile)
-	_, err := os.Stat(logDir)
-	if errors.Is(err, os.ErrNotExist) {
-		if err := os.MkdirAll(logDir, 0o755); err != nil {
-			return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
-		}
-	}
-
-	done := make(chan int)
-
 	go func() {
 		// Keep the server running unless we're shutting down the app
 		crashCount := 0
 		for {
-			slog.Info(fmt.Sprintf("starting server..."))
-			cmd, err := start(ctx, command, options)
-			if err != nil {
-				slog.Error(fmt.Sprintf("failed to start server %s", err))
-			}
-
 			cmd.Wait() //nolint:errcheck
+			stdin.Close()
 			var code int
 			if cmd.ProcessState != nil {
 				code = cmd.ProcessState.ExitCode()
@@ -138,16 +143,19 @@ func SpawnServer(ctx context.Context, command string, options ServerOptions) (ch
 			default:
 				crashCount++
 				slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
-				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
-				break
+				time.Sleep(500 * time.Millisecond)
+				if err := cmd.Start(); err != nil {
+					slog.Error(fmt.Sprintf("failed to restart server %s", err))
+					// Keep trying, but back off if we keep failing
+					time.Sleep(time.Duration(crashCount) * time.Second)
+				}
 			}
 		}
 	}()

 	return done, nil
 }

-func isServerRunning(ctx context.Context) bool {
+func IsServerRunning(ctx context.Context) bool {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		slog.Info("unable to connect to server")
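The respawn logic in `SpawnServer` retries with a growing sleep when the server keeps crashing. One caveat worth knowing when reading it: a Go `exec.Cmd` cannot be started again after `Wait` returns, so a supervisor normally constructs a fresh `Cmd` each iteration, as in this hedged sketch (bounded here so the example terminates; the binary path is a placeholder):

```go
package main

import (
	"fmt"
	"os/exec"
	"time"
)

func main() {
	const serverPath = "/bin/true" // placeholder: a binary that exits immediately
	crashCount := 0
	for crashCount < 3 { // bounded so the sketch terminates
		cmd := exec.Command(serverPath) // fresh Cmd each time; one cannot be restarted
		if err := cmd.Start(); err != nil {
			fmt.Println("failed to start server:", err)
			return
		}
		cmd.Wait() //nolint:errcheck
		crashCount++
		fmt.Printf("server exited, respawn #%d\n", crashCount)
		// back off harder the more often we crash
		time.Sleep(time.Duration(crashCount) * 100 * time.Millisecond)
	}
}
```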
@@ -1,4 +1,6 @@
-package main
+//go:build !windows
+
+package lifecycle

 import (
 	"context"
@@ -1,4 +1,4 @@
-package main
+package lifecycle

 import (
 	"context"
@@ -1,4 +1,4 @@
-package updater
+package lifecycle

 import (
 	"context"
@@ -22,10 +22,6 @@ import (
 	"github.com/ollama/ollama/version"
 )

-var (
-	UpdateStageDir string
-)
-
 var (
 	UpdateCheckURLBase = "https://ollama.com/api/update"
 	UpdateDownloaded   = false
@@ -127,7 +123,7 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
 		slog.Debug("no etag detected, falling back to filename based dedup")
 		etag = "_"
 	}
-	filename := "OllamaSetup.exe"
+	filename := Installer
 	_, params, err := mime.ParseMediaType(resp.Header.Get("content-disposition"))
 	if err == nil {
 		filename = params["filename"]
@@ -1,4 +1,6 @@
-package updater
+//go:build !windows
+
+package lifecycle

 import (
 	"context"
@@ -1,4 +1,4 @@
-package updater
+package lifecycle

 import (
 	"context"
@@ -9,13 +9,7 @@ import (
 	"path/filepath"
 )

-func init() {
-	UpdateStageDir = filepath.Join(os.Getenv("LOCALAPPDATA"), "Ollama", "updates")
-}
-
 func DoUpgrade(cancel context.CancelFunc, done chan int) error {
-	logFile := filepath.Join(os.Getenv("LOCALAPPDATA"), "Ollama", "upgrade.log")
-
 	files, err := filepath.Glob(filepath.Join(UpdateStageDir, "*", "*.exe")) // TODO generalize for multiplatform
 	if err != nil {
 		return fmt.Errorf("failed to lookup downloads: %s", err)
@@ -29,13 +23,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 	installerExe := files[0]

 	slog.Info("starting upgrade with " + installerExe)
-	slog.Info("upgrade log file " + logFile)
+	slog.Info("upgrade log file " + UpgradeLogFile)

 	// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
 	installArgs := []string{
 		"/CLOSEAPPLICATIONS",                    // Quit the tray app if it's still running
-		"/LOG=" + filepath.Base(logFile),        // Only relative seems reliable, so set pwd
+		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
 	}
 	// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
 	// TODO - temporarily disable since we're pinning in debug mode for the preview
@@ -59,7 +53,7 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 	}

 	slog.Debug(fmt.Sprintf("starting installer: %s %v", installerExe, installArgs))
-	os.Chdir(filepath.Dir(logFile)) //nolint:errcheck
+	os.Chdir(filepath.Dir(UpgradeLogFile)) //nolint:errcheck
 	cmd := exec.Command(installerExe, installArgs...)

 	if err := cmd.Start(); err != nil {
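The `installArgs` above are Inno Setup command-line switches. A hedged sketch of how `DoUpgrade` hands off to the installer and returns without waiting (installer path and log name are placeholders):

```go
package main

import "os/exec"

func main() {
	args := []string{
		"/CLOSEAPPLICATIONS",      // ask running instances of the app to close
		"/LOG=upgrade.log",        // relative log path; caller sets the working dir
		"/FORCECLOSEAPPLICATIONS", // close them forcibly if needed
	}
	cmd := exec.Command("OllamaSetup.exe", args...) // placeholder installer path
	// Start, not Run: the tray app must exit so the installer can replace it.
	if err := cmd.Start(); err != nil {
		panic(err)
	}
}
```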
12
app/main.go
@@ -2,15 +2,11 @@ package main

 // Compile with the following to get rid of the cmd pop up on windows
 // go build -ldflags="-H windowsgui" .
-var (
-	AppName       string
-	CLIName       string
-	AppDir        string
-	AppDataDir    string
-	AppLogFile    string
-	ServerLogFile string
+import (
+	"github.com/ollama/ollama/app/lifecycle"
 )

 func main() {
-	run()
+	lifecycle.Run()
 }

@@ -88,12 +88,15 @@ DialogFontSize=12

 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\rocm")
-Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
+; Assumes v5.7, may need adjustments for v6
+#if GetEnv("HIP_PATH") != ""
+Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
+Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
+; amdhip64.dll dependency comes from the driver and must be installed already
+Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
 #endif

@@ -129,7 +132,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

 ;FinishedHeadingLabel=Run your first model
-;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3
+;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama2
 ;ClickFinish=%n

 [Registry]
@@ -1,3 +1,5 @@
+//go:build !windows
+
 package tray

 import (
209
cmd/cmd.go
@@ -17,7 +17,6 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
-	"regexp"
 	"runtime"
 	"strings"
 	"syscall"
@@ -54,6 +53,8 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()

+	bars := make(map[string]*progress.Bar)
+
 	modelfile, err := os.ReadFile(filename)
 	if err != nil {
 		return err
@@ -94,16 +95,95 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

+	// TODO make this work w/ adapters
 	if fi.IsDir() {
-		// this is likely a safetensors or pytorch directory
-		// TODO make this work w/ adapters
-		tempfile, err := tempZipFiles(path)
+		tf, err := os.CreateTemp("", "ollama-tf")
 		if err != nil {
 			return err
 		}
-		defer os.RemoveAll(tempfile)
+		defer os.RemoveAll(tf.Name())

-		path = tempfile
+		zf := zip.NewWriter(tf)
+
+		files := []string{}
+
+		tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
+		if err != nil {
+			return err
+		} else if len(tfiles) == 0 {
+			tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
+			if err != nil {
+				return err
+			}
+		}
+
+		files = append(files, tfiles...)
+
+		if len(files) == 0 {
+			return fmt.Errorf("no models were found in '%s'", path)
+		}
+
+		// add the safetensor/torch config file + tokenizer
+		files = append(files, filepath.Join(path, "config.json"))
+		files = append(files, filepath.Join(path, "params.json"))
+		files = append(files, filepath.Join(path, "added_tokens.json"))
+		files = append(files, filepath.Join(path, "tokenizer.model"))
+
+		for _, fn := range files {
+			f, err := os.Open(fn)
+
+			// just skip whatever files aren't there
+			if os.IsNotExist(err) {
+				if strings.HasSuffix(fn, "tokenizer.model") {
+					// try the parent dir before giving up
+					parentDir := filepath.Dir(path)
+					newFn := filepath.Join(parentDir, "tokenizer.model")
+					f, err = os.Open(newFn)
+					if os.IsNotExist(err) {
+						continue
+					} else if err != nil {
+						return err
+					}
+				} else {
+					continue
+				}
+			} else if err != nil {
+				return err
+			}
+
+			fi, err := f.Stat()
+			if err != nil {
+				return err
+			}
+
+			h, err := zip.FileInfoHeader(fi)
+			if err != nil {
+				return err
+			}
+
+			h.Name = filepath.Base(fn)
+			h.Method = zip.Store
+
+			w, err := zf.CreateHeader(h)
+			if err != nil {
+				return err
+			}
+
+			_, err = io.Copy(w, f)
+			if err != nil {
+				return err
+			}
+		}
+
+		if err := zf.Close(); err != nil {
+			return err
+		}
+
+		if err := tf.Close(); err != nil {
+			return err
+		}
+		path = tf.Name()
 	}

 	digest, err := createBlob(cmd, client, path)
@@ -111,17 +191,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	name := c.Name
-	if c.Name == "model" {
-		name = "from"
-	}
-
-	re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, name, c.Args))
-	modelfile = re.ReplaceAll(modelfile, []byte("$1 @"+digest))
+	modelfile = bytes.ReplaceAll(modelfile, []byte(c.Args), []byte("@"+digest))
 		}
 	}

-	bars := make(map[string]*progress.Bar)
 	fn := func(resp api.ProgressResponse) error {
 		if resp.Digest != "" {
 			spinner.Stop()
@@ -155,114 +228,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }

-func tempZipFiles(path string) (string, error) {
-	tempfile, err := os.CreateTemp("", "ollama-tf")
-	if err != nil {
-		return "", err
-	}
-	defer tempfile.Close()
-
-	zipfile := zip.NewWriter(tempfile)
-	defer zipfile.Close()
-
-	detectContentType := func(path string) (string, error) {
-		f, err := os.Open(path)
-		if err != nil {
-			return "", err
-		}
-		defer f.Close()
-
-		var b bytes.Buffer
-		b.Grow(512)
-
-		if _, err := io.CopyN(&b, f, 512); err != nil && !errors.Is(err, io.EOF) {
-			return "", err
-		}
-
-		contentType, _, _ := strings.Cut(http.DetectContentType(b.Bytes()), ";")
-		return contentType, nil
-	}
-
-	glob := func(pattern, contentType string) ([]string, error) {
-		matches, err := filepath.Glob(pattern)
-		if err != nil {
-			return nil, err
-		}
-
-		for _, safetensor := range matches {
-			if ct, err := detectContentType(safetensor); err != nil {
-				return nil, err
-			} else if ct != contentType {
-				return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor)
-			}
-		}
-
-		return matches, nil
-	}
-
-	var files []string
-	if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// safetensors files might be unresolved git lfs references; skip if they are
-		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
-		files = append(files, st...)
-	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
-		// pytorch files might also be unresolved git lfs references; skip if they are
-		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
-		files = append(files, pt...)
-	} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
-		// pytorch files might also be unresolved git lfs references; skip if they are
-		// covers consolidated.x.pth, consolidated.pth
-		files = append(files, pt...)
-	} else {
-		return "", errors.New("no safetensors or torch files found")
-	}
-
-	// add configuration files, json files are detected as text/plain
-	js, err := glob(filepath.Join(path, "*.json"), "text/plain")
-	if err != nil {
-		return "", err
-	}
-	files = append(files, js...)
-
-	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
-		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
-		// tokenizer.model might be an unresolved git lfs reference; error if it is
-		files = append(files, tks...)
-	} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
-		// sometimes tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
-		files = append(files, tks...)
-	}
-
-	for _, file := range files {
-		f, err := os.Open(file)
-		if err != nil {
-			return "", err
-		}
-		defer f.Close()
-
-		fi, err := f.Stat()
-		if err != nil {
-			return "", err
-		}
-
-		zfi, err := zip.FileInfoHeader(fi)
-		if err != nil {
-			return "", err
-		}
-
-		zf, err := zipfile.CreateHeader(zfi)
-		if err != nil {
-			return "", err
-		}
-
-		if _, err := io.Copy(zf, f); err != nil {
-			return "", err
-		}
-	}
-
-	return tempfile.Name(), nil
-}
-
 func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
 	bin, err := os.Open(path)
 	if err != nil {
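Both versions of the zipping code set the entry method to `zip.Store`, i.e. entries are written without compression, which avoids burning CPU on already-dense model weights. A minimal, self-contained sketch of that choice (file name and contents are placeholders):

```go
package main

import (
	"archive/zip"
	"bytes"
	"fmt"
)

func main() {
	var buf bytes.Buffer
	zw := zip.NewWriter(&buf)

	// Method: zip.Store writes the entry uncompressed (no deflate pass).
	h := &zip.FileHeader{Name: "weights.bin", Method: zip.Store}
	w, err := zw.CreateHeader(h)
	if err != nil {
		panic(err)
	}
	if _, err := w.Write([]byte("fake tensor bytes")); err != nil {
		panic(err)
	}
	if err := zw.Close(); err != nil {
		panic(err)
	}
	fmt.Println("archive size:", buf.Len())
}
```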
@@ -18,23 +18,19 @@ import (
 )

 type Params struct {
 	Architectures    []string `json:"architectures"`
 	VocabSize        int      `json:"vocab_size"`
 	HiddenSize       int      `json:"hidden_size"`       // n_embd
 	HiddenLayers     int      `json:"num_hidden_layers"` // n_layer
 	ContextSize      int      `json:"max_position_embeddings"`
 	IntermediateSize int      `json:"intermediate_size"`
 	AttentionHeads   int      `json:"num_attention_heads"` // n_head
 	KeyValHeads      int      `json:"num_key_value_heads"`
 	NormEPS          float64  `json:"rms_norm_eps"`
 	BoSTokenID       int      `json:"bos_token_id"`
 	EoSTokenID       int      `json:"eos_token_id"`
 	HeadDimension    int      `json:"head_dim"`
 	PaddingTokenID   int      `json:"pad_token_id"`
-	RopeFrequencyBase float64 `json:"rope_theta"`
-
-	Experts     int `json:"num_local_experts"`
-	ExpertsUsed int `json:"num_experts_per_tok"`

 	ByteOrder
 }

@@ -1,96 +0,0 @@
package convert

import (
	"os"
	"regexp"

	"github.com/ollama/ollama/llm"
)

type MixtralModel struct {
	ModelData
}

func (m *MixtralModel) GetTensors() error {
	t, err := m.Format.GetTensors(m.Path, m.Params)
	if err != nil {
		return err
	}

	m.Tensors = []llm.Tensor{}

	pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
	re, err := regexp.Compile(pattern)
	if err != nil {
		return err
	}

	for _, l := range t {
		matches := re.FindAllStringSubmatch(l.Name, -1)
		if len(matches) > 0 {
			wt := l.WriterTo.(safetensorWriterTo)
			wt.handler = mistralLayerHandler
			l.WriterTo = wt
		}
		m.Tensors = append(m.Tensors, l)
	}

	return nil
}

func (m *MixtralModel) LoadVocab() error {
	v, err := LoadSentencePieceTokens(m.Path, m.Params)
	if err != nil {
		return err
	}
	m.Vocab = v
	return nil
}

func (m *MixtralModel) WriteGGUF() (string, error) {
	kv := llm.KV{
		"general.architecture":          "llama",
		"general.name":                  m.Name,
		"llama.block_count":             uint32(m.Params.HiddenLayers),
		"llama.context_length":          uint32(m.Params.ContextSize),
		"llama.embedding_length":        uint32(m.Params.HiddenSize),
		"llama.feed_forward_length":     uint32(m.Params.IntermediateSize),
		"llama.attention.head_count":    uint32(m.Params.AttentionHeads),
		"llama.attention.head_count_kv": uint32(m.Params.KeyValHeads),

		"llama.rope.freq_base":                    float32(m.Params.RopeFrequencyBase),
		"llama.attention.layer_norm_rms_epsilon":  float32(m.Params.NormEPS),

		"llama.expert_count":      uint32(m.Params.Experts),
		"llama.expert_used_count": uint32(m.Params.ExpertsUsed),

		"llama.vocab_size":           uint32(len(m.Vocab.Tokens)),
		"llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads),

		"general.file_type":    uint32(1),
		"tokenizer.ggml.model": "llama",

		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
		"tokenizer.ggml.scores":     m.Vocab.Scores,
		"tokenizer.ggml.token_type": m.Vocab.Types,

		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
		"tokenizer.ggml.unknown_token_id": uint32(0),
		"tokenizer.ggml.add_bos_token":    true,
		"tokenizer.ggml.add_eos_token":    false,
	}

	f, err := os.CreateTemp("", "ollama-gguf")
	if err != nil {
		return "", err
	}
	defer f.Close()

	mod := llm.NewGGUFV3(m.Params.ByteOrder)
	if err := mod.Encode(f, kv, m.Tensors); err != nil {
		return "", err
	}

	return f.Name(), nil
}
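The `Params` struct above is populated straight from a model's HuggingFace-style `config.json` via its JSON tags. A small sketch of that mapping (trimmed struct and example values, not from the diff):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// A trimmed copy of the Params fields above, for illustration only.
type Params struct {
	Architectures []string `json:"architectures"`
	HiddenSize    int      `json:"hidden_size"`
	NormEPS       float64  `json:"rms_norm_eps"`
}

func main() {
	raw := []byte(`{"architectures":["LlamaForCausalLM"],"hidden_size":4096,"rms_norm_eps":1e-05}`)
	var p Params
	if err := json.Unmarshal(raw, &p); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", p)
}
```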
@@ -93,6 +93,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 	}

 	slices.Sort(keys)
 	slog.Info("converting layers")

 	var tensors []llm.Tensor
@@ -104,6 +105,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 		return nil, 0, err
 	}

+	slog.Debug(fmt.Sprintf("metadata = %#v", data))
 	var size uint64
 	var kind uint32
 	switch len(data.Shape) {
@@ -148,13 +150,11 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 		padding: 8 + jsonSize,
 	}

-	offset += size
 	tensors = append(tensors, t)
+	offset += size
 	}

 	slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
 	slog.Debug(fmt.Sprintf("offset = %d", offset))
 	return tensors, offset, nil
 }

@@ -185,19 +185,15 @@ func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
 	}

 	tMap := map[string]string{
 		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
 		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
 		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
 		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
 		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
 		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
 		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
 		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
 		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
-		"model.layers.(\\d+).block_sparse_moe.gate.weight":              "blk.$1.ffn_gate_inp.weight",
-		"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w1.weight": "blk.$1.ffn_gate.$2.weight",
-		"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w2.weight": "blk.$1.ffn_down.$2.weight",
-		"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w3.weight": "blk.$1.ffn_up.$2.weight",
 	}

 	v, ok := directMap[n]
@@ -290,15 +286,6 @@ func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (M
 			Format: m,
 		},
 	}, nil
-	case "MixtralForCausalLM":
-		return &MixtralModel{
-			ModelData{
-				Name:   name,
-				Path:   dirPath,
-				Params: params,
-				Format: m,
-			},
-		}, nil
 	case "GemmaForCausalLM":
 		return &GemmaModel{
 			ModelData{
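The `tMap` keys above are regular expressions whose capture groups carry the layer index into the GGUF name via `$1`. A tiny sketch of one such rewrite (example tensor name only):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// One of the tMap patterns above: (\d+) captures the layer number.
	re := regexp.MustCompile(`model.layers.(\d+).self_attn.q_proj.weight`)
	out := re.ReplaceAllString("model.layers.17.self_attn.q_proj.weight", "blk.$1.attn_q.weight")
	fmt.Println(out) // blk.17.attn_q.weight
}
```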
@@ -90,7 +90,7 @@ The final response in the stream also includes additional data about the generat

 - `load_duration`: time spent in nanoseconds loading the model
 - `prompt_eval_count`: number of tokens in the prompt
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
 - `eval_count`: number of tokens in the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
 - `response`: empty if the response was streamed, if not streamed, this will contain the full response
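These fields are enough to derive generation throughput. A minimal sketch (example numbers, not from the docs): since `eval_duration` is in nanoseconds, divide by 1e9 before computing tokens per second.

```go
package main

import "fmt"

func main() {
	evalCount := 290                      // example eval_count (tokens generated)
	evalDuration := int64(4_709_213_778) // example eval_duration in nanoseconds

	tokensPerSecond := float64(evalCount) / (float64(evalDuration) / 1e9)
	fmt.Printf("%.2f tokens/s\n", tokensPerSecond)
}
```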
@@ -228,7 +228,3 @@ To unload the model and free up memory use:

 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
 ```
-
-Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
-
-If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
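The curl call kept above can be issued from Go with nothing but the standard library; a sketch (model name as in the docs, local server assumed):

```go
package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Ask the server to unload "llama2" immediately by setting keep_alive to 0.
	body := []byte(`{"model": "llama2", "keep_alive": 0}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}
```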
@@ -1,15 +1,38 @@
 # Running Ollama on NVIDIA Jetson Devices

-Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions.
-
-The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
+With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack).
+
+NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications.
+
+Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only" mode. This can be verified by using a monitoring tool like jtop.
+
+In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameter into a cloned version of our target model.
+
+Prerequisites:
+
+- curl
+- tmux
+
+Here are the steps:
+
 - Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
+- Stop the Ollama service: `sudo systemctl stop ollama`
+- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson 'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
 - Pull the model you want to use (e.g. mistral): `ollama pull mistral`
-- Start an interactive session: `ollama run mistral`
+- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson`
+- In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:
+
+```
+FROM mistral
+PARAMETER num_gpu 999
+```
+
+- Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson`
+- Run the new model: `ollama run mistral-jetson`
+
+If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU.

 And that's it!
-
-# Running Ollama in Docker
-
-When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
@@ -14,7 +14,7 @@ As this is a preview release, you should expect a few bugs here and there. If
 you run into a problem you can reach out on
 [Discord](https://discord.gg/ollama), or file an
 [issue](https://github.com/ollama/ollama/issues).
 Logs will often be helpful in diagnosing the problem (see
 [Troubleshooting](#troubleshooting) below)

 ## System Requirements

@@ -15,7 +15,6 @@ const (

 	KibiByte = Byte * 1024
 	MebiByte = KibiByte * 1024
-	GibiByte = MebiByte * 1024
 )

 func HumanBytes(b int64) string {
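The constants above are binary (1024-based) prefixes. A trivial sketch of how a `HumanBytes`-style helper uses them (standalone constants, not the package's actual code):

```go
package main

import "fmt"

const (
	Byte     = 1
	KibiByte = Byte * 1024
	MebiByte = KibiByte * 1024
)

func main() {
	n := int64(3 * MebiByte)
	fmt.Printf("%d bytes = %.1f MiB\n", n, float64(n)/float64(MebiByte))
}
```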
@@ -7,7 +7,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
-	"runtime"
+	"strconv"
 	"strings"
 )

@@ -35,64 +35,22 @@ func GetSupportedGFX(libDir string) ([]string, error) {
 	return ret, nil
 }

-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "rocm" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
-
-func commonAMDValidateLibDir() (string, error) {
-	// We try to favor system paths first, so that we can wire up the subprocess to use
-	// the system version.  Only use our bundled version if the system version doesn't work
-	// This gives users more recovery options if versions have subtle problems at runtime
-
-	// Prefer explicit HIP env var
-	hipPath := os.Getenv("HIP_PATH")
-	if hipPath != "" {
-		hipLibDir := filepath.Join(hipPath, "bin")
-		if rocmLibUsable(hipLibDir) {
-			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
-			return hipLibDir, nil
-		}
-	}
-
-	// Scan the LD_LIBRARY_PATH or PATH
-	pathEnv := "LD_LIBRARY_PATH"
-	if runtime.GOOS == "windows" {
-		pathEnv = "PATH"
-	}
-
-	paths := os.Getenv(pathEnv)
-	for _, path := range filepath.SplitList(paths) {
-		d, err := filepath.Abs(path)
-		if err != nil {
-			continue
-		}
-		if rocmLibUsable(d) {
-			return d, nil
-		}
-	}
-
-	// Well known location(s)
-	if rocmLibUsable(RocmStandardLocation) {
-		return RocmStandardLocation, nil
-	}
-
-	// Installer payload location if we're running the installed binary
-	exe, err := os.Executable()
-	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
-		if rocmLibUsable(rocmTargetDir) {
-			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
-			return rocmTargetDir, nil
-		}
-	}
-	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
+func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
+	// Set the visible devices if not already set
+	// TODO - does sort order matter?
+	devices := []string{}
+	for i := range ids {
+		if _, skipped := skip[i]; skipped {
+			continue
+		}
+		devices = append(devices, strconv.Itoa(i))
+	}
+
+	val := strings.Join(devices, ",")
+	err := os.Setenv("HIP_VISIBLE_DEVICES", val)
+	if err != nil {
+		slog.Warn(fmt.Sprintf("failed to set env: %s", err))
+	} else {
+		slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
+	}
 }
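Both sides of this hunk build the same thing: a comma-separated list of device indices exported as `HIP_VISIBLE_DEVICES`, which the ROCm runtime uses to hide unsupported GPUs. A standalone sketch of that idea (example indices; the skip set is hypothetical):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

func main() {
	ids := []int{0, 1, 2}
	skip := map[int]interface{}{1: struct{}{}} // pretend device 1 is unsupported

	devices := []string{}
	for i := range ids {
		if _, skipped := skip[i]; skipped {
			continue
		}
		devices = append(devices, strconv.Itoa(i))
	}
	val := strings.Join(devices, ",")
	if err := os.Setenv("HIP_VISIBLE_DEVICES", val); err != nil {
		fmt.Println("failed to set env:", err)
		return
	}
	fmt.Println("HIP_VISIBLE_DEVICES=" + val) // 0,2
}
```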
@@ -69,7 +69,7 @@ func NewHipLib() (*HipLib, error) {
 func (hl *HipLib) Release() {
 	err := windows.FreeLibrary(hl.dll)
 	if err != nil {
-		slog.Warn("failed to unload amdhip64.dll", "error", err)
+		slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
 	}
 	hl.dll = 0
 }
@@ -98,7 +98,7 @@ func (hl *HipLib) HipGetDeviceCount() int {
 		return 0
 	}
 	if status != hipSuccess {
-		slog.Warn("failed call to hipGetDeviceCount", "status", status, "error", err)
+		slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
 	}
 	return count
 }
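This hunk trades `log/slog`'s structured key-value attributes for preformatted `fmt.Sprintf` strings. Both compile on Go 1.21+; a side-by-side sketch of the two styles (error value is a placeholder):

```go
package main

import (
	"errors"
	"fmt"
	"log/slog"
)

func main() {
	err := errors.New("boom") // placeholder error
	slog.Warn("failed to unload amdhip64.dll", "error", err)         // structured attributes
	slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err)) // preformatted string
}
```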
473
gpu/amd_linux.go
@@ -11,8 +11,6 @@ import (
 	"slices"
 	"strconv"
 	"strings"
-
-	"github.com/ollama/ollama/format"
 )

 // Discovery logic for AMD/ROCm GPUs
@@ -26,6 +24,9 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
 	RocmStandardLocation   = "/opt/rocm/lib"
+
+	// TODO find a better way to detect iGPU instead of minimum memory
+	IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
 )

 var (
@@ -34,11 +35,14 @@ var (
 )

 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices
+// and the user hasn't already set this variable
+func AMDGetGPUInfo(resp *GpuInfo) {
+	// TODO - DRY this out with windows
 	if !AMDDetected() {
-		return resp
+		return
 	}
+	skip := map[int]interface{}{}

 	// Opportunistic logging of driver version to aid in troubleshooting
 	ver, err := AMDDriverVersion()
@@ -46,117 +50,160 @@ func AMDGetGPUInfo() []GpuInfo {
 		slog.Info("AMD Driver: " + ver)
 	} else {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
-		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
+		slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err))
 	}

-	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
-	var visibleDevices []string
-	hipVD := os.Getenv("HIP_VISIBLE_DEVICES")   // zero based index only
-	rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
-	gpuDO := os.Getenv("GPU_DEVICE_ORDINAL")    // zero based index
-	switch {
-	// TODO is this priority order right?
-	case hipVD != "":
-		visibleDevices = strings.Split(hipVD, ",")
-	case rocrVD != "":
-		visibleDevices = strings.Split(rocrVD, ",")
-		// TODO - since we don't yet support UUIDs, consider detecting and reporting here
-		// all our test systems show GPU-XX indicating UUID is not supported
-	case gpuDO != "":
-		visibleDevices = strings.Split(gpuDO, ",")
-	}
+	// If the user has specified exactly which GPUs to use, look up their memory
+	visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES")
+	if visibleDevices != "" {
+		ids := []int{}
+		for _, idStr := range strings.Split(visibleDevices, ",") {
+			id, err := strconv.Atoi(idStr)
+			if err != nil {
+				slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err))
+			} else {
+				ids = append(ids, id)
+			}
+		}
+		amdProcMemLookup(resp, nil, ids)
+		return
+	}

+	// Gather GFX version information from all detected cards
+	gfx := AMDGFXVersions()
+	verStrings := []string{}
+	for i, v := range gfx {
+		verStrings = append(verStrings, v.ToGFXString())
+		if v.Major == 0 {
+			// Silently skip CPUs
+			skip[i] = struct{}{}
+			continue
+		}
+		if v.Major < 9 {
+			// TODO consider this a build-time setting if we can support 8xx family GPUs
+			slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString()))
+			skip[i] = struct{}{}
+		}
+	}
+	slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))
+
+	// Abort if all GPUs are skipped
+	if len(skip) >= len(gfx) {
+		slog.Info("all detected amdgpus are skipped, falling back to CPU")
+		return
+	}
+
+	// If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib
+	libDir, err := AMDValidateLibDir()
+	if err != nil {
+		slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
+		return
+	}
+	updateLibPath(libDir)
+
 	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
-	var supported []string
-	libDir := ""
-
-	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
-	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
-	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
-	cpuCount := 0
-	for _, match := range matches {
-		slog.Debug("evaluating amdgpu node " + match)
-		fp, err := os.Open(match)
-		if err != nil {
-			slog.Debug("failed to open sysfs node", "file", match, "error", err)
-			continue
-		}
-		defer fp.Close()
-		nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
-		if err != nil {
-			slog.Debug("failed to parse node ID", "error", err)
-			continue
-		}
-
-		scanner := bufio.NewScanner(fp)
-		isCPU := false
-		var major, minor, patch uint64
-		for scanner.Scan() {
-			line := strings.TrimSpace(scanner.Text())
-			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
-			if strings.HasPrefix(line, "gfx_target_version") {
-				ver := strings.Fields(line)
-
-				// Detect CPUs
-				if len(ver) == 2 && ver[1] == "0" {
-					slog.Debug("detected CPU " + match)
-					isCPU = true
-					break
-				}
-
-				if len(ver) != 2 || len(ver[1]) < 5 {
-					slog.Warn("malformed "+match, "gfx_target_version", line)
-					// If this winds up being a CPU, our offsets may be wrong
-					continue
-				}
-				l := len(ver[1])
-				var err1, err2, err3 error
-				patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
-				minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
-				major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
-				if err1 != nil || err2 != nil || err3 != nil {
-					slog.Debug("malformed int " + line)
-					continue
-				}
-			}
-
-			// TODO - any other properties we want to extract and record?
-			// vendor_id + device_id -> pci lookup for "Name"
-			// Other metrics that may help us understand relative performance between multiple GPUs
-		}
-
-		if isCPU {
-			cpuCount++
+	if gfxOverride == "" {
+		supported, err := GetSupportedGFX(libDir)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
+			return
+		}
+		slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))
+
+		for i, v := range gfx {
+			if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
+				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
+				// TODO - consider discrete markdown just for ROCM troubleshooting?
+				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
+				skip[i] = struct{}{}
+			} else {
+				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
+			}
+		}
+	} else {
+		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+	}
+
+	if len(skip) >= len(gfx) {
+		slog.Info("all detected amdgpus are skipped, falling back to CPU")
+		return
+	}
+
+	ids := make([]int, len(gfx))
+	i := 0
+	for k := range gfx {
+		ids[i] = k
+		i++
+	}
+	amdProcMemLookup(resp, skip, ids)
+	if resp.memInfo.DeviceCount == 0 {
+		return
+	}
+	if len(skip) > 0 {
+		amdSetVisibleDevices(ids, skip)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateLibPath(libDir string) {
|
||||||
|
ldPaths := []string{}
|
||||||
|
if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
|
||||||
|
ldPaths = strings.Split(val, ":")
|
||||||
|
}
|
||||||
|
for _, d := range ldPaths {
|
||||||
|
if d == libDir {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
val := strings.Join(append(ldPaths, libDir), ":")
|
||||||
|
slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
|
||||||
|
os.Setenv("LD_LIBRARY_PATH", val)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Walk the sysfs nodes for the available GPUs and gather information from them
|
||||||
|
// skipping over any devices in the skip map
|
||||||
|
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
|
resp.memInfo.DeviceCount = 0
|
||||||
|
resp.memInfo.TotalMemory = 0
|
||||||
|
resp.memInfo.FreeMemory = 0
|
||||||
|
slog.Debug("discovering VRAM for amdgpu devices")
|
||||||
|
if len(ids) == 0 {
|
||||||
|
entries, err := os.ReadDir(AMDNodesSysfsDir)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, node := range entries {
|
||||||
|
if !node.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
id, err := strconv.Atoi(node.Name())
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("malformed amdgpu sysfs node id " + node.Name())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ids = append(ids, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
|
||||||
|
|
||||||
|
for _, id := range ids {
|
||||||
|
if _, skipped := skip[id]; skipped {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// CPUs are always first in the list
|
|
||||||
gpuID := nodeID - cpuCount
|
|
||||||
|
|
||||||
// Shouldn't happen, but just in case...
|
|
||||||
if gpuID < 0 {
|
|
||||||
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
|
|
||||||
return []GpuInfo{}
|
|
||||||
}
|
|
||||||
|
|
||||||
if int(major) < RocmComputeMin {
|
|
||||||
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look up the memory for the current node
|
|
||||||
totalMemory := uint64(0)
|
totalMemory := uint64(0)
|
||||||
usedMemory := uint64(0)
|
usedMemory := uint64(0)
|
||||||
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
|
// Adjust for sysfs vs HIP ids
|
||||||
|
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
|
||||||
propFiles, err := filepath.Glob(propGlob)
|
propFiles, err := filepath.Glob(propGlob)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
|
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
|
||||||
}
|
}
|
||||||
// 1 or more memory banks - sum the values of all of them
|
// 1 or more memory banks - sum the values of all of them
|
||||||
for _, propFile := range propFiles {
|
for _, propFile := range propFiles {
|
||||||
fp, err := os.Open(propFile)
|
fp, err := os.Open(propFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
|
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
defer fp.Close()
|
defer fp.Close()
|
||||||
@@ -179,113 +226,49 @@ func AMDGetGPUInfo() []GpuInfo {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if totalMemory == 0 {
|
if totalMemory == 0 {
|
||||||
slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
|
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
|
||||||
|
skip[id] = struct{}{}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
|
if totalMemory < IGPUMemLimit {
|
||||||
|
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
|
||||||
|
skip[id] = struct{}{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
|
||||||
usedFiles, err := filepath.Glob(usedGlob)
|
usedFiles, err := filepath.Glob(usedGlob)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
|
slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, usedFile := range usedFiles {
|
for _, usedFile := range usedFiles {
|
||||||
fp, err := os.Open(usedFile)
|
fp, err := os.Open(usedFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
|
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
defer fp.Close()
|
defer fp.Close()
|
||||||
data, err := io.ReadAll(fp)
|
data, err := io.ReadAll(fp)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
|
slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
|
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("malformed used memory", "data", string(data), "error", err)
|
slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
usedMemory += used
|
usedMemory += used
|
||||||
}
|
}
|
||||||
|
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
|
||||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
|
||||||
if totalMemory < IGPUMemLimit {
|
resp.memInfo.DeviceCount++
|
||||||
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
|
resp.memInfo.TotalMemory += totalMemory
|
||||||
continue
|
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
|
|
||||||
slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
|
|
||||||
gpuInfo := GpuInfo{
|
|
||||||
Library: "rocm",
|
|
||||||
memInfo: memInfo{
|
|
||||||
TotalMemory: totalMemory,
|
|
||||||
FreeMemory: (totalMemory - usedMemory),
|
|
||||||
},
|
|
||||||
ID: fmt.Sprintf("%d", gpuID),
|
|
||||||
// Name: not exposed in sysfs directly, would require pci device id lookup
|
|
||||||
Major: int(major),
|
|
||||||
Minor: int(minor),
|
|
||||||
Patch: int(patch),
|
|
||||||
MinimumMemory: rocmMinimumMemory,
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the user wants to filter to a subset of devices, filter out if we aren't a match
|
|
||||||
if len(visibleDevices) > 0 {
|
|
||||||
include := false
|
|
||||||
for _, visible := range visibleDevices {
|
|
||||||
if visible == gpuInfo.ID {
|
|
||||||
include = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !include {
|
|
||||||
slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final validation is gfx compatibility - load the library if we haven't already loaded it
|
|
||||||
// even if the user overrides, we still need to validate the library
|
|
||||||
if libDir == "" {
|
|
||||||
libDir, err = AMDValidateLibDir()
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
|
|
||||||
return []GpuInfo{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
gpuInfo.DependencyPath = libDir
|
|
||||||
|
|
||||||
if gfxOverride == "" {
|
|
||||||
// Only load supported list once
|
|
||||||
if len(supported) == 0 {
|
|
||||||
supported, err = GetSupportedGFX(libDir)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
|
|
||||||
return []GpuInfo{}
|
|
||||||
}
|
|
||||||
slog.Debug("rocm supported GPUs", "types", supported)
|
|
||||||
}
|
|
||||||
gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
|
|
||||||
if !slices.Contains[[]string, string](supported, gfx) {
|
|
||||||
slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
|
|
||||||
// TODO - consider discrete markdown just for ROCM troubleshooting?
|
|
||||||
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
|
|
||||||
continue
|
|
||||||
} else {
|
|
||||||
slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
|
|
||||||
}
|
|
||||||
|
|
||||||
// The GPU has passed all the verification steps and is supported
|
|
||||||
resp = append(resp, gpuInfo)
|
|
||||||
}
|
}
|
||||||
if len(resp) == 0 {
|
if resp.memInfo.DeviceCount > 0 {
|
||||||
slog.Info("no compatible amdgpu devices detected")
|
resp.Library = "rocm"
|
||||||
}
|
}
|
||||||
return resp
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Quick check for AMD driver so we can skip amdgpu discovery if not present
|
// Quick check for AMD driver so we can skip amdgpu discovery if not present
|
||||||
@@ -297,24 +280,87 @@ func AMDDetected() bool {
|
|||||||
slog.Debug("amdgpu driver not detected " + sysfsDir)
|
slog.Debug("amdgpu driver not detected " + sysfsDir)
|
||||||
return false
|
return false
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
|
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func setupLink(source, target string) error {
|
||||||
|
if err := os.RemoveAll(target); err != nil {
|
||||||
|
return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
|
||||||
|
}
|
||||||
|
if err := os.Symlink(source, target); err != nil {
|
||||||
|
return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
|
||||||
|
}
|
||||||
|
slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the AMD rocm lib dir is wired up
|
||||||
// Prefer to use host installed ROCm, as long as it meets our minimum requirements
|
// Prefer to use host installed ROCm, as long as it meets our minimum requirements
|
||||||
// failing that, tell the user how to download it on their own
|
// failing that, tell the user how to download it on their own
|
||||||
func AMDValidateLibDir() (string, error) {
|
func AMDValidateLibDir() (string, error) {
|
||||||
libDir, err := commonAMDValidateLibDir()
|
// We rely on the rpath compiled into our library to find rocm
|
||||||
|
// so we establish a symlink to wherever we find it on the system
|
||||||
|
// to <payloads>/rocm
|
||||||
|
payloadsDir, err := PayloadsDir()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we already have a rocm dependency wired, nothing more to do
|
||||||
|
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
|
||||||
|
if rocmLibUsable(rocmTargetDir) {
|
||||||
|
return rocmTargetDir, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// next to the running binary
|
||||||
|
exe, err := os.Executable()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return libDir, nil
|
peerDir := filepath.Dir(exe)
|
||||||
|
if rocmLibUsable(peerDir) {
|
||||||
|
slog.Debug("detected ROCM next to ollama executable " + peerDir)
|
||||||
|
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
|
||||||
|
}
|
||||||
|
peerDir = filepath.Join(filepath.Dir(exe), "rocm")
|
||||||
|
if rocmLibUsable(peerDir) {
|
||||||
|
slog.Debug("detected ROCM next to ollama executable " + peerDir)
|
||||||
|
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Well known ollama installer path
|
// Well known ollama installer path
|
||||||
installedRocmDir := "/usr/share/ollama/lib/rocm"
|
installedRocmDir := "/usr/share/ollama/lib/rocm"
|
||||||
if rocmLibUsable(installedRocmDir) {
|
if rocmLibUsable(installedRocmDir) {
|
||||||
return installedRocmDir, nil
|
return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefer explicit HIP env var
|
||||||
|
hipPath := os.Getenv("HIP_PATH")
|
||||||
|
if hipPath != "" {
|
||||||
|
hipLibDir := filepath.Join(hipPath, "lib")
|
||||||
|
if rocmLibUsable(hipLibDir) {
|
||||||
|
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
|
||||||
|
return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scan the library path for potential matches
|
||||||
|
ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
|
||||||
|
for _, ldPath := range ldPaths {
|
||||||
|
d, err := filepath.Abs(ldPath)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if rocmLibUsable(d) {
|
||||||
|
return rocmTargetDir, setupLink(d, rocmTargetDir)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Well known location(s)
|
||||||
|
if rocmLibUsable("/opt/rocm/lib") {
|
||||||
|
return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we still haven't found a usable rocm, the user will have to install it on their own
|
// If we still haven't found a usable rocm, the user will have to install it on their own
|
||||||
@@ -338,3 +384,68 @@ func AMDDriverVersion() (string, error) {
|
|||||||
}
|
}
|
||||||
return strings.TrimSpace(string(verString)), nil
|
return strings.TrimSpace(string(verString)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func AMDGFXVersions() map[int]Version {
|
||||||
|
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
|
||||||
|
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
|
||||||
|
res := map[int]Version{}
|
||||||
|
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||||
|
for _, match := range matches {
|
||||||
|
fp, err := os.Open(match)
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if i == 0 {
|
||||||
|
// Skipping the CPU
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Align with HIP IDs (zero is first GPU, not CPU)
|
||||||
|
i -= 1
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(fp)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
if strings.HasPrefix(line, "gfx_target_version") {
|
||||||
|
ver := strings.Fields(line)
|
||||||
|
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||||
|
if ver[1] != "0" {
|
||||||
|
slog.Debug("malformed " + line)
|
||||||
|
}
|
||||||
|
res[i] = Version{
|
||||||
|
Major: 0,
|
||||||
|
Minor: 0,
|
||||||
|
Patch: 0,
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
l := len(ver[1])
|
||||||
|
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
|
||||||
|
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
|
||||||
|
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
|
||||||
|
if err1 != nil || err2 != nil || err3 != nil {
|
||||||
|
slog.Debug("malformed int " + line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
res[i] = Version{
|
||||||
|
Major: uint(major),
|
||||||
|
Minor: uint(minor),
|
||||||
|
Patch: uint(patch),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v Version) ToGFXString() string {
|
||||||
|
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
|
||||||
|
}
|
||||||
|
|||||||
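Note on the gfx_target_version parsing added above: the sysfs value is a fixed-width decimal number, so the decode is plain string slicing. A minimal standalone sketch of the same logic follows; the sample value 90012 is illustrative, not taken from this diff.

package main

import (
	"fmt"
	"strconv"
)

// decode splits a sysfs gfx_target_version value such as "90012" into its
// fields: the last two digits are the patch level, the two before that the
// minor version, and whatever remains in front is the major version.
func decode(s string) (major, minor, patch uint64, err error) {
	l := len(s)
	if l < 5 {
		return 0, 0, 0, fmt.Errorf("too short: %s", s)
	}
	patch, err = strconv.ParseUint(s[l-2:l], 10, 32)
	if err != nil {
		return
	}
	minor, err = strconv.ParseUint(s[l-4:l-2], 10, 32)
	if err != nil {
		return
	}
	major, err = strconv.ParseUint(s[:l-4], 10, 32)
	return
}

func main() {
	major, minor, patch, _ := decode("90012")
	// The old linux code logs this as gfx%d%d%x, so the patch prints in hex: gfx90c
	fmt.Printf("gfx%d%d%x\n", major, minor, patch)
}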
@@ -7,10 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
-	"strconv"
 	"strings"
-
-	"github.com/ollama/ollama/format"
 )
 
 const (
@@ -25,32 +22,36 @@ var (
 	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
 )
 
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo(resp *GpuInfo) {
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
-		return nil
+		return
 	}
 	defer hl.Release()
+	skip := map[int]interface{}{}
+	ids := []int{}
+	resp.memInfo.DeviceCount = 0
+	resp.memInfo.TotalMemory = 0
+	resp.memInfo.FreeMemory = 0
 
 	ver, err := hl.AMDDriverVersion()
 	if err == nil {
 		slog.Info("AMD Driver: " + ver)
 	} else {
 		// For now this is benign, but we may eventually need to fail compatibility checks
-		slog.Debug("error looking up amd driver version", "error", err)
+		slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
 	}
 
-	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
+	// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
 	count := hl.HipGetDeviceCount()
 	if count == 0 {
-		return nil
+		return
 	}
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
-		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-		return nil
+		slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
+		return
 	}
 
 	var supported []string
@@ -58,120 +59,95 @@ func AMDGetGPUInfo() []GpuInfo {
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
-			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-			return nil
+			slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
+			return
 		}
 	} else {
 		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
 	}
 
-	slog.Info("detected hip devices", "count", count)
-	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
+	slog.Info(fmt.Sprintf("detected %d hip devices", count))
 	for i := 0; i < count; i++ {
+		ids = append(ids, i)
 		err = hl.HipSetDevice(i)
 		if err != nil {
-			slog.Warn("set device", "id", i, "error", err)
+			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
+			skip[i] = struct{}{}
 			continue
 		}
 
 		props, err := hl.HipGetDeviceProperties(i)
 		if err != nil {
-			slog.Warn("get properties", "id", i, "error", err)
+			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
+			skip[i] = struct{}{}
 			continue
 		}
 		n := bytes.IndexByte(props.Name[:], 0)
 		name := string(props.Name[:n])
-		// TODO is UUID actually populated on windows?
-		// Can luid be used on windows for setting visible devices (and is it actually set?)
+		slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
 		n = bytes.IndexByte(props.GcnArchName[:], 0)
 		gfx := string(props.GcnArchName[:n])
-		slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
-		var major, minor, patch string
-		switch len(gfx) {
-		case 6:
-			major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
-		case 7:
-			major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
-		}
+		slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
 		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
 		// TODO Why isn't props.iGPU accurate!?
 		if strings.EqualFold(name, iGPUName) {
-			slog.Info("iGPU detected skipping", "id", i)
+			slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
+			skip[i] = struct{}{}
 			continue
 		}
 		if gfxOverride == "" {
 			if !slices.Contains[[]string, string](supported, gfx) {
-				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
+				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
+				skip[i] = struct{}{}
 				continue
 			} else {
-				slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
 			}
 		}
 
-		freeMemory, totalMemory, err := hl.HipMemGetInfo()
+		totalMemory, freeMemory, err := hl.HipMemGetInfo()
 		if err != nil {
-			slog.Warn("get mem info", "id", i, "error", err)
+			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
 			continue
 		}
 
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if totalMemory < IGPUMemLimit {
-			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
-			continue
-		}
-
-		// TODO revisit this once ROCm v6 is available on windows.
-		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
-		slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-		slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-		gpuInfo := GpuInfo{
-			Library: "rocm",
-			memInfo: memInfo{
-				TotalMemory: totalMemory,
-				FreeMemory:  freeMemory,
-			},
-			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
-			DependencyPath: libDir,
-			MinimumMemory:  rocmMinimumMemory,
-		}
-		if major != "" {
-			gpuInfo.Major, err = strconv.Atoi(major)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			}
-		}
-		if minor != "" {
-			gpuInfo.Minor, err = strconv.Atoi(minor)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			}
-		}
-		if patch != "" {
-			// Patch rev is hex; e.g. gfx90a
-			p, err := strconv.ParseInt(patch, 16, 0)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			} else {
-				gpuInfo.Patch = int(p)
-			}
-		}
-		if gpuInfo.Major < RocmComputeMin {
-			slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
-			continue
-		}
-
-		resp = append(resp, gpuInfo)
+		// TODO according to docs, freeMem may lie on windows!
+		slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
+		slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory))
+		resp.memInfo.DeviceCount++
+		resp.memInfo.TotalMemory += totalMemory
+		resp.memInfo.FreeMemory += freeMemory
 	}
-
-	return resp
+	if resp.memInfo.DeviceCount > 0 {
+		resp.Library = "rocm"
+	}
+	// Abort if all GPUs are skipped
+	if len(skip) >= count {
+		slog.Info("all detected amdgpus are skipped, falling back to CPU")
+		return
+	}
+	if len(skip) > 0 {
+		amdSetVisibleDevices(ids, skip)
+	}
+	UpdatePath(libDir)
 }
 
 func AMDValidateLibDir() (string, error) {
-	libDir, err := commonAMDValidateLibDir()
+	// On windows non-admins typically can't create links
+	// so instead of trying to rely on rpath and a link in
+	// $LibDir/rocm, we instead rely on setting PATH to point
+	// to the location of the ROCm library
+
+	// Installer payload location if we're running the installed binary
+	exe, err := os.Executable()
 	if err == nil {
-		return libDir, nil
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		if rocmLibUsable(rocmTargetDir) {
+			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
+			return rocmTargetDir, nil
+		}
 	}
 
 	// Installer payload (if we're running from some other location)
@@ -183,6 +159,21 @@ func AMDValidateLibDir() (string, error) {
 		return rocmTargetDir, nil
 	}
 
+	// Prefer explicit HIP env var
+	hipPath := os.Getenv("HIP_PATH")
+	if hipPath != "" {
+		hipLibDir := filepath.Join(hipPath, "bin")
+		if rocmLibUsable(hipLibDir) {
+			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
+			return hipLibDir, nil
+		}
+	}
+
+	// Well known location(s)
+	if rocmLibUsable(RocmStandardLocation) {
+		return RocmStandardLocation, nil
+	}
+
 	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
 	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
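The GcnArchName slicing removed on the left above is terse; an equivalent standalone sketch under the same assumptions follows (the example inputs are illustrative).

package main

import (
	"fmt"
	"strconv"
)

// splitGfx mirrors the len-based slicing removed above: "gfx90a" has a
// one-digit major version, "gfx1030" a two-digit one, and the final
// character is a hex patch revision.
func splitGfx(gfx string) (major, minor, patch int, ok bool) {
	var ma, mi, pa string
	switch len(gfx) {
	case 6:
		ma, mi, pa = gfx[3:4], gfx[4:5], gfx[5:]
	case 7:
		ma, mi, pa = gfx[3:5], gfx[5:6], gfx[6:]
	default:
		return 0, 0, 0, false
	}
	major, _ = strconv.Atoi(ma)
	minor, _ = strconv.Atoi(mi)
	p, err := strconv.ParseInt(pa, 16, 0) // patch rev is hex, e.g. the "a" in gfx90a
	if err != nil {
		return 0, 0, 0, false
	}
	return major, minor, int(p), true
}

func main() {
	fmt.Println(splitGfx("gfx90a"))  // 9 0 10 true
	fmt.Println(splitGfx("gfx1030")) // 10 3 0 true
}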
@@ -24,51 +24,6 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-		// On Windows we do not carry the payloads inside the main executable
-		if runtime.GOOS == "windows" && runnersDir == "" {
-			appExe, err := os.Executable()
-			if err != nil {
-				slog.Error("failed to lookup executable path", "error", err)
-				return "", err
-			}
-
-			cwd, err := os.Getwd()
-			if err != nil {
-				slog.Error("failed to lookup working directory", "error", err)
-				return "", err
-			}
-
-			var paths []string
-			for _, root := range []string{appExe, cwd} {
-				paths = append(paths,
-					filepath.Join(root),
-					filepath.Join(root, "windows-"+runtime.GOARCH),
-					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-				)
-			}
-
-			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, p := range paths {
-				candidate := filepath.Join(p, "ollama_runners")
-				_, err := os.Stat(candidate)
-				if err == nil {
-					runnersDir = candidate
-					break
-				}
-			}
-			if runnersDir == "" {
-				err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-				slog.Error("incomplete distribution", "error", err)
-				return "", err
-			}
-		}
-		if runnersDir != "" {
-			payloadsDir = runnersDir
-			return payloadsDir, nil
-		}
-
-		// The remainder only applies on non-windows where we still carry payloads in the main executable
 		cleanupTmpDirs()
 		tmpDir := os.Getenv("OLLAMA_TMPDIR")
 		if tmpDir == "" {
@@ -125,7 +80,7 @@ func cleanupTmpDirs() {
 		}
 		err = os.RemoveAll(d)
 		if err != nil {
-			slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
+			slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err))
 		}
 	}
 }
@@ -133,8 +88,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
+	if payloadsDir != "" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
 		slog.Debug("cleaning up", "dir", tmpDir)
@@ -166,7 +120,7 @@ func UpdatePath(dir string) {
 		}
 	}
 	newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-	slog.Info("updating", "PATH", newPath)
+	slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
 	os.Setenv("PATH", newPath)
 }
 // linux and darwin rely on rpath
@@ -1,22 +0,0 @@
-//go:build linux || windows
-
-package gpu
-
-import (
-	"log/slog"
-	"strings"
-)
-
-func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "cuda" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
-
-}
227
gpu/gpu.go
@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@@ -24,8 +25,8 @@ import (
 )
 
 type handles struct {
-	deviceCount int
+	nvml   *C.nvml_handle_t
 	cudart *C.cudart_handle_t
 }
 
 const (
@@ -38,10 +39,26 @@ var gpuMutex sync.Mutex
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
 
-var RocmComputeMin = 9
+// Possible locations for the nvidia-ml library
+var NvmlLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libnvidia-ml.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
+	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
+	"/usr/lib/wsl/lib/libnvidia-ml.so*",
+	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
+	"/opt/cuda/lib64/libnvidia-ml.so*",
+	"/usr/lib*/libnvidia-ml.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
+	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+	"/usr/local/lib*/libnvidia-ml.so*",
 
-// TODO find a better way to detect iGPU instead of minimum memory
-const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
+	// TODO: are these stubs ever valid?
+	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
+}
+
+var NvmlWindowsGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
 
 var CudartLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libcudart.so*",
@@ -71,18 +88,26 @@ func initGPUHandles() *handles {
 
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 
-	gpuHandles := &handles{}
+	gpuHandles := &handles{nil, nil}
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
 
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
 	case "windows":
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
 		cudartMgmtName = "cudart64_*.dll"
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
 	case "linux":
+		nvmlMgmtName = "libnvidia-ml.so"
+		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
+		copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
 			// TODO - add "payloads" for subprocess
@@ -93,21 +118,31 @@ func initGPUHandles() *handles {
 		return gpuHandles
 	}
 
-	slog.Info("Detecting GPUs")
+	slog.Info("Detecting GPU type")
 	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
-		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
+		cudart := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
-			slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
+			slog.Info("Nvidia GPU detected via cudart")
 			gpuHandles.cudart = cudart
-			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
+		}
+	}
+
+	// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
+	nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(nvmlLibPaths) > 0 {
+		nvml := LoadNVMLMgmt(nvmlLibPaths)
+		if nvml != nil {
+			slog.Info("Nvidia GPU detected via nvidia-ml")
+			gpuHandles.nvml = nvml
 			return gpuHandles
 		}
 	}
 	return gpuHandles
 }
 
-func GetGPUInfo() GpuInfoList {
+func GetGPUInfo() GpuInfo {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
@@ -115,6 +150,9 @@ func GetGPUInfo() GpuInfoList {
 
 	gpuHandles := initGPUHandles()
 	defer func() {
+		if gpuHandles.nvml != nil {
+			C.nvml_release(*gpuHandles.nvml)
+		}
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
@@ -127,63 +165,72 @@ func GetGPUInfo() GpuInfoList {
 	}
 
 	var memInfo C.mem_info_t
-	resp := []GpuInfo{}
-	// NVIDIA first
-	for i := 0; i < gpuHandles.deviceCount; i++ {
-		// TODO once we support CPU compilation variants of GPU libraries refine this...
-		if cpuVariant == "" && runtime.GOARCH == "amd64" {
-			continue
-		}
-		gpuInfo := GpuInfo{
-			Library: "cuda",
-		}
-		C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			continue
-		}
-		if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-			slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-			continue
-		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		gpuInfo.Major = int(memInfo.major)
-		gpuInfo.Minor = int(memInfo.minor)
-		gpuInfo.MinimumMemory = cudaMinimumMemory
-
-		// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-		resp = append(resp, gpuInfo)
-	}
-
-	// Then AMD
-	resp = append(resp, AMDGetGPUInfo()...)
-
-	if len(resp) == 0 {
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return resp
-		}
-		gpuInfo := GpuInfo{
-			Library: "cpu",
-			Variant: cpuVariant,
-		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-
-		resp = append(resp, gpuInfo)
-	}
-
+	resp := GpuInfo{}
+	if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.count > 0 {
+			// Verify minimum compute capability
+			var cc C.nvml_compute_capability_t
+			C.nvml_compute_capability(*gpuHandles.nvml, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+				resp.MinimumMemory = cudaMinimumMemory
+			} else {
+				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.count > 0 {
+			// Verify minimum compute capability
+			var cc C.cudart_compute_capability_t
+			C.cudart_compute_capability(*gpuHandles.cudart, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+				resp.MinimumMemory = cudaMinimumMemory
+			} else {
+				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else {
+		AMDGetGPUInfo(&resp)
+		if resp.Library != "" {
+			resp.MinimumMemory = rocmMinimumMemory
+			return resp
+		}
+	}
+	if resp.Library == "" {
+		C.cpu_check_ram(&memInfo)
+		resp.Library = "cpu"
+		resp.Variant = cpuVariant
+	}
+	if memInfo.err != nil {
+		slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
+		C.free(unsafe.Pointer(memInfo.err))
+		return resp
+	}
+
+	resp.DeviceCount = uint32(memInfo.count)
+	resp.FreeMemory = uint64(memInfo.free)
+	resp.TotalMemory = uint64(memInfo.total)
 	return resp
 }
 
-func GetCPUMem() (memInfo, error) {
+func getCPUMem() (memInfo, error) {
 	var ret memInfo
 	var info C.mem_info_t
 	C.cpu_check_ram(&info)
@@ -196,11 +243,29 @@ func GetCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
+func CheckVRAM() (uint64, error) {
+	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
+	if userLimit != "" {
+		avail, err := strconv.ParseInt(userLimit, 10, 64)
+		if err != nil {
+			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
+		}
+		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
+		return uint64(avail), nil
+	}
+	gpuInfo := GetGPUInfo()
+	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
+		return gpuInfo.FreeMemory, nil
+	}
+
+	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
+}
+
 func FindGPULibs(baseLibName string, patterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
 	gpuLibPaths := []string{}
-	slog.Debug("Searching for GPU library", "name", baseLibName)
+	slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName))
 
 	switch runtime.GOOS {
 	case "windows":
@@ -218,7 +283,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 		}
 		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
 	}
-	slog.Debug("gpu library search", "globs", patterns)
+	slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns))
 	for _, pattern := range patterns {
 		// Ignore glob discovery errors
 		matches, _ := filepath.Glob(pattern)
@@ -246,11 +311,28 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 			}
 		}
 	}
-	slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
+	slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
 	return gpuLibPaths
 }
 
-func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
+func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch
+		}
+	}
+	return nil
+}
+
+func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
 	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
 	for _, libPath := range cudartLibPaths {
@@ -258,13 +340,13 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 		defer C.free(unsafe.Pointer(lib))
 		C.cudart_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err))
+			slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.ch, libPath
+			return &resp.ch
 		}
 	}
-	return 0, nil, ""
+	return nil
 }
 
 func getVerboseState() C.uint16_t {
@@ -273,22 +355,3 @@ func getVerboseState() C.uint16_t {
 	}
 	return C.uint16_t(0)
 }
-
-// Given the list of GPUs this instantiation is targeted for,
-// figure out the visible devices environment variable
-//
-// If different libraries are detected, the first one is what we use
-func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
-	if len(l) == 0 {
-		return "", ""
-	}
-	switch l[0].Library {
-	case "cuda":
-		return cudaGetVisibleDevicesEnv(l)
-	case "rocm":
-		return rocmGetVisibleDevicesEnv(l)
-	default:
-		slog.Debug("no filter required for library " + l[0].Library)
-		return "", ""
-	}
-}
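Both the nvml and cudart branches above gate on the same compute-capability comparison; a minimal standalone sketch of that predicate follows (the sample values are illustrative).

package main

import "fmt"

// meetsMin reproduces the comparison used in both detection branches above:
// a device qualifies when its compute capability is at least the configured
// minimum (5.0 with the current CUDA compile flags).
func meetsMin(major, minor, minMajor, minMinor int) bool {
	return major > minMajor || (major == minMajor && minor >= minMinor)
}

func main() {
	fmt.Println(meetsMin(5, 0, 5, 0)) // true  - exactly at the floor
	fmt.Println(meetsMin(6, 1, 5, 0)) // true  - newer major always passes
	fmt.Println(meetsMin(3, 7, 5, 0)) // false - too old, falls back to CPU
}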
@@ -1,3 +1,5 @@
+//go:build darwin
+
 package gpu
 
 /*
@@ -7,41 +9,52 @@ package gpu
 */
 import "C"
 import (
+	"fmt"
+	"log/slog"
+	"os"
 	"runtime"
+	"strconv"
 )
 
-func GetGPUInfo() GpuInfoList {
-	mem, _ := GetCPUMem()
-	if runtime.GOARCH == "amd64" {
-		return []GpuInfo{
-			{
-				Library: "cpu",
-				Variant: GetCPUVariant(),
-				memInfo: mem,
-			},
-		}
-	}
-	info := GpuInfo{
-		Library: "metal",
-		ID:      "0",
-	}
-	info.TotalMemory = uint64(C.getRecommendedMaxVRAM())
-
-	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
-	info.FreeMemory = info.TotalMemory
-
-	info.MinimumMemory = 0
-	return []GpuInfo{info}
-}
-
-func GetCPUMem() (memInfo, error) {
+// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
+func CheckVRAM() (uint64, error) {
+	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
+	if userLimit != "" {
+		avail, err := strconv.ParseInt(userLimit, 10, 64)
+		if err != nil {
+			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
+		}
+		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
+		return uint64(avail), nil
+	}
+
+	if runtime.GOARCH == "amd64" {
+		// gpu not supported, this may not be metal
+		return 0, nil
+	}
+
+	return uint64(C.getRecommendedMaxVRAM()), nil
+}
+
+func GetGPUInfo() GpuInfo {
+	mem, _ := getCPUMem()
+	if runtime.GOARCH == "amd64" {
+		return GpuInfo{
+			Library: "cpu",
+			Variant: GetCPUVariant(),
+			memInfo: mem,
+		}
+	}
+	return GpuInfo{
+		Library: "metal",
+		memInfo: mem,
+	}
+}
+
+func getCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
+		DeviceCount: 0,
 	}, nil
 }
-
-func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
-	// No-op on darwin
-	return "", ""
-}
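The OLLAMA_MAX_VRAM handling added to CheckVRAM on both platforms follows the same pattern; a minimal standalone sketch follows (the 4 GiB value is illustrative).

package main

import (
	"fmt"
	"os"
	"strconv"
)

// vramOverride mirrors the env-var handling added above: a non-empty
// OLLAMA_MAX_VRAM is parsed as a decimal byte count and short-circuits
// GPU discovery.
func vramOverride() (uint64, bool, error) {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit == "" {
		return 0, false, nil
	}
	avail, err := strconv.ParseInt(userLimit, 10, 64)
	if err != nil {
		return 0, false, fmt.Errorf("invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
	}
	return uint64(avail), true, nil
}

func main() {
	os.Setenv("OLLAMA_MAX_VRAM", "4294967296") // 4 GiB, illustrative
	limit, ok, err := vramOverride()
	fmt.Println(limit, ok, err) // 4294967296 true <nil>
}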
@@ -38,17 +38,12 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GPU_ID_LEN 64
|
|
||||||
|
|
||||||
typedef struct mem_info {
|
typedef struct mem_info {
|
||||||
char *err; // If non-nill, caller responsible for freeing
|
|
||||||
char gpu_id[GPU_ID_LEN];
|
|
||||||
uint64_t total;
|
uint64_t total;
|
||||||
uint64_t free;
|
uint64_t free;
|
||||||
|
unsigned int count;
|
||||||
// Compute Capability
|
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
|
||||||
int major;
|
char *err; // If non-nill, caller responsible for freeing
|
||||||
int minor;
|
|
||||||
} mem_info_t;
|
} mem_info_t;
|
||||||
|
|
||||||
void cpu_check_ram(mem_info_t *resp);
|
void cpu_check_ram(mem_info_t *resp);
|
||||||
@@ -57,6 +52,7 @@ void cpu_check_ram(mem_info_t *resp);
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "gpu_info_nvml.h"
|
||||||
#include "gpu_info_cudart.h"
|
#include "gpu_info_cudart.h"
|
||||||
|
|
||||||
#endif // __GPU_INFO_H__
|
#endif // __GPU_INFO_H__
|
||||||
|
@@ -8,11 +8,9 @@ void cpu_check_ram(mem_info_t *resp) {
   MEMORYSTATUSEX info;
   info.dwLength = sizeof(info);
   if (GlobalMemoryStatusEx(&info) != 0) {
+    resp->count = 1;
     resp->total = info.ullTotalPhys;
     resp->free = info.ullAvailPhys;
-    resp->major = 0;
-    resp->minor = 0;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
   } else {
     resp->err = LOAD_ERR();
   }
@@ -29,11 +27,9 @@ void cpu_check_ram(mem_info_t *resp) {
   if (sysinfo(&info) != 0) {
     resp->err = strdup(strerror(errno));
   } else {
+    resp->count = 1;
     resp->total = info.totalram * info.mem_unit;
     resp->free = info.freeram * info.mem_unit;
-    resp->major = 0;
-    resp->minor = 0;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
   }
   return;
 }
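One detail worth calling out in the Linux branch above: sysinfo(2) reports totalram and freeram in units of mem_unit bytes, which is why both values are multiplied before being stored. The same calculation in Go, as a standalone sketch that assumes golang.org/x/sys/unix (the project itself does this in C):

//go:build linux

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

func main() {
    var si unix.Sysinfo_t
    if err := unix.Sysinfo(&si); err != nil {
        panic(err)
    }
    // sysinfo(2) sizes are in blocks of si.Unit bytes, hence the multiply.
    total := uint64(si.Totalram) * uint64(si.Unit)
    free := uint64(si.Freeram) * uint64(si.Unit)
    fmt.Println(total, free)
}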
@@ -6,7 +6,6 @@
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
   cudartReturn_t ret;
   resp->err = NULL;
-  resp->num_devices = 0;
   const int buflen = 256;
   char buf[buflen + 1];
   int i;
@@ -22,7 +21,6 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
       {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
       {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
       {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
-      {"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
       {NULL, NULL},
   };

@@ -38,7 +36,13 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     return;
   }

+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
+
   for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
       char *msg = LOAD_ERR();
@@ -59,7 +63,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
     if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
-      resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
+      resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
       return;
     }
     snprintf(buf, buflen, "cudart init failure: %d", ret);
@@ -81,95 +85,110 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
     LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
   }
-
-  ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
-  if (ret != CUDART_SUCCESS) {
-    LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->ch.handle);
-    resp->ch.handle = NULL;
-    snprintf(buf, buflen, "unable to get device count: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
 }

-void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
   cudartMemory_t memInfo = {0,0,0};
   cudartReturn_t ret;
   const int buflen = 256;
   char buf[buflen + 1];
+  int i;

   if (h.handle == NULL) {
     resp->err = strdup("cudart handle isn't initialized");
     return;
   }

-  ret = (*h.cudaSetDevice)(i);
+  // cudaGetDeviceCount takes int type, resp->count is uint
+  int deviceCount;
+  ret = (*h.cudaGetDeviceCount)(&deviceCount);
   if (ret != CUDART_SUCCESS) {
-    snprintf(buf, buflen, "cudart device failed to initialize");
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
     return;
-  }
-
-  cudaDeviceProp_t props;
-  ret = (*h.cudaGetDeviceProperties)(&props, i);
-  if (ret != CUDART_SUCCESS) {
-    LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
-    resp->major = 0;
-    resp->minor = 0;
   } else {
-    int allNull = 1;
-    for (int j = 0; j < 16; j++) {
-      if (props.uuid.bytes[j] != 0) {
-        allNull = 0;
-        break;
-      }
-    }
-    if (allNull != 0) {
-      snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
-    } else {
-      // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
-      snprintf(&resp->gpu_id[0], GPU_ID_LEN,
-               "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-               props.uuid.bytes[0],
-               props.uuid.bytes[1],
-               props.uuid.bytes[2],
-               props.uuid.bytes[3],
-               props.uuid.bytes[4],
-               props.uuid.bytes[5],
-               props.uuid.bytes[6],
-               props.uuid.bytes[7],
-               props.uuid.bytes[8],
-               props.uuid.bytes[9],
-               props.uuid.bytes[10],
-               props.uuid.bytes[11],
-               props.uuid.bytes[12],
-               props.uuid.bytes[13],
-               props.uuid.bytes[14],
-               props.uuid.bytes[15]
-      );
-    }
-    resp->major = props.major;
-    resp->minor = props.minor;
-
-    // TODO add other useful properties from props
+    resp->count = (unsigned int)deviceCount;
   }
-  ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
+  }
+}
+
+void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+  int major = 0;
+  int minor = 0;
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle not initialized");
+    return;
+  }
+
+  int devices;
+  ret = (*h.cudaGetDeviceCount)(&devices);
   if (ret != CUDART_SUCCESS) {
-    snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
+    snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
     resp->err = strdup(buf);
     return;
   }

-  resp->total = memInfo.total;
-  resp->free = memInfo.free;
-
-  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
-  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
-  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
+  for (i = 0; i < devices; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    // Report the lowest major.minor we detect as that limits our compatibility
+    if (resp->major == 0 || resp->major > major) {
+      resp->major = major;
+      resp->minor = minor;
+    } else if (resp->major == major && resp->minor > minor) {
+      resp->minor = minor;
+    }
+  }
 }

 void cudart_release(cudart_handle_t h) {
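The compute-capability loop above keeps the lowest major.minor seen across all devices, since a binary built for the lowest capability runs on every card present. The same selection rule, isolated as a small Go sketch:

// Sketch: picking the minimum compute capability across detected GPUs,
// the same policy cudart_compute_capability implements above.
package main

import "fmt"

type cc struct{ major, minor int }

func lowest(devices []cc) cc {
    var out cc
    for _, d := range devices {
        if out.major == 0 || d.major < out.major ||
            (d.major == out.major && d.minor < out.minor) {
            out = d
        }
    }
    return out
}

func main() {
    fmt.Println(lowest([]cc{{8, 6}, {7, 5}, {8, 0}})) // {7 5}
}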
@@ -6,8 +6,7 @@
 // Just enough typedef's to dlopen/dlsym for memory information
 typedef enum cudartReturn_enum {
   CUDART_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_MEMORY_ALLOCATION = 2,
+  CUDART_UNSUPPORTED = 1,
   CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
   // Other values omitted for now...
 } cudartReturn_t;
@@ -15,11 +14,6 @@ typedef enum cudartReturn_enum {
 typedef enum cudartDeviceAttr_enum {
   cudartDevAttrComputeCapabilityMajor = 75,
   cudartDevAttrComputeCapabilityMinor = 76,
-
-  // TODO - not yet wired up but may be useful for Jetson or other
-  // integrated GPU scenarios with shared memory
-  cudaDevAttrIntegrated = 18
-
 } cudartDeviceAttr_t;

 typedef void *cudartDevice_t;  // Opaque is sufficient
@@ -34,92 +28,6 @@ typedef struct cudartDriverVersion {
   int minor;
 } cudartDriverVersion_t;

-typedef struct cudaUUID {
-    unsigned char bytes[16];
-} cudaUUID_t;
-typedef struct cudaDeviceProp {
-    char name[256];                  /**< ASCII string identifying device */
-    cudaUUID_t uuid;                 /**< 16-byte unique identifier */
-    char luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
-    unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
-    size_t totalGlobalMem;           /**< Global memory available on device in bytes */
-    size_t sharedMemPerBlock;        /**< Shared memory available per block in bytes */
-    int regsPerBlock;                /**< 32-bit registers available per block */
-    int warpSize;                    /**< Warp size in threads */
-    size_t memPitch;                 /**< Maximum pitch in bytes allowed by memory copies */
-    int maxThreadsPerBlock;          /**< Maximum number of threads per block */
-    int maxThreadsDim[3];            /**< Maximum size of each dimension of a block */
-    int maxGridSize[3];              /**< Maximum size of each dimension of a grid */
-    int clockRate;                   /**< Clock frequency in kilohertz */
-    size_t totalConstMem;            /**< Constant memory available on device in bytes */
-    int major;                       /**< Major compute capability */
-    int minor;                       /**< Minor compute capability */
-    size_t textureAlignment;         /**< Alignment requirement for textures */
-    size_t texturePitchAlignment;    /**< Pitch alignment requirement for texture references bound to pitched memory */
-    int deviceOverlap;               /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
-    int multiProcessorCount;         /**< Number of multiprocessors on device */
-    int kernelExecTimeoutEnabled;    /**< Specified whether there is a run time limit on kernels */
-    int integrated;                  /**< Device is integrated as opposed to discrete */
-    int canMapHostMemory;            /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
-    int computeMode;                 /**< Compute mode (See ::cudaComputeMode) */
-    int maxTexture1D;                /**< Maximum 1D texture size */
-    int maxTexture1DMipmap;          /**< Maximum 1D mipmapped texture size */
-    int maxTexture1DLinear;          /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
-    int maxTexture2D[2];             /**< Maximum 2D texture dimensions */
-    int maxTexture2DMipmap[2];       /**< Maximum 2D mipmapped texture dimensions */
-    int maxTexture2DLinear[3];       /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
-    int maxTexture2DGather[2];       /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
-    int maxTexture3D[3];             /**< Maximum 3D texture dimensions */
-    int maxTexture3DAlt[3];          /**< Maximum alternate 3D texture dimensions */
-    int maxTextureCubemap;           /**< Maximum Cubemap texture dimensions */
-    int maxTexture1DLayered[2];      /**< Maximum 1D layered texture dimensions */
-    int maxTexture2DLayered[3];      /**< Maximum 2D layered texture dimensions */
-    int maxTextureCubemapLayered[2]; /**< Maximum Cubemap layered texture dimensions */
-    int maxSurface1D;                /**< Maximum 1D surface size */
-    int maxSurface2D[2];             /**< Maximum 2D surface dimensions */
-    int maxSurface3D[3];             /**< Maximum 3D surface dimensions */
-    int maxSurface1DLayered[2];      /**< Maximum 1D layered surface dimensions */
-    int maxSurface2DLayered[3];      /**< Maximum 2D layered surface dimensions */
-    int maxSurfaceCubemap;           /**< Maximum Cubemap surface dimensions */
-    int maxSurfaceCubemapLayered[2]; /**< Maximum Cubemap layered surface dimensions */
-    size_t surfaceAlignment;         /**< Alignment requirements for surfaces */
-    int concurrentKernels;           /**< Device can possibly execute multiple kernels concurrently */
-    int ECCEnabled;                  /**< Device has ECC support enabled */
-    int pciBusID;                    /**< PCI bus ID of the device */
-    int pciDeviceID;                 /**< PCI device ID of the device */
-    int pciDomainID;                 /**< PCI domain ID of the device */
-    int tccDriver;                   /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
-    int asyncEngineCount;            /**< Number of asynchronous engines */
-    int unifiedAddressing;           /**< Device shares a unified address space with the host */
-    int memoryClockRate;             /**< Peak memory clock frequency in kilohertz */
-    int memoryBusWidth;              /**< Global memory bus width in bits */
-    int l2CacheSize;                 /**< Size of L2 cache in bytes */
-    int persistingL2CacheMaxSize;    /**< Device's maximum l2 persisting lines capacity setting in bytes */
-    int maxThreadsPerMultiProcessor; /**< Maximum resident threads per multiprocessor */
-    int streamPrioritiesSupported;   /**< Device supports stream priorities */
-    int globalL1CacheSupported;      /**< Device supports caching globals in L1 */
-    int localL1CacheSupported;       /**< Device supports caching locals in L1 */
-    size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
-    int regsPerMultiprocessor;       /**< 32-bit registers available per multiprocessor */
-    int managedMemory;               /**< Device supports allocating managed memory on this system */
-    int isMultiGpuBoard;             /**< Device is on a multi-GPU board */
-    int multiGpuBoardGroupID;        /**< Unique identifier for a group of devices on the same multi-GPU board */
-    int hostNativeAtomicSupported;   /**< Link between the device and the host supports native atomic operations */
-    int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
-    int pageableMemoryAccess;        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
-    int concurrentManagedAccess;     /**< Device can coherently access managed memory concurrently with the CPU */
-    int computePreemptionSupported;  /**< Device supports Compute Preemption */
-    int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
-    int cooperativeLaunch;           /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
-    int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
-    size_t sharedMemPerBlockOptin;   /**< Per device maximum shared memory per block usable by special opt in */
-    int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
-    int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
-    int maxBlocksPerMultiProcessor;  /**< Maximum number of resident blocks per multiprocessor */
-    int accessPolicyMaxWindowSize;   /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
-    size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */
-} cudaDeviceProp_t;
-
 typedef struct cudart_handle {
   void *handle;
   uint16_t verbose;
@@ -130,17 +38,23 @@ typedef struct cudart_handle {
   cudartReturn_t (*cudaGetDeviceCount)(int *);
   cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
   cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
-  cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
 } cudart_handle_t;

 typedef struct cudart_init_resp {
   char *err; // If err is non-null handle is invalid
   cudart_handle_t ch;
-  int num_devices;
 } cudart_init_resp_t;

+typedef struct cudart_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} cudart_compute_capability_t;
+
+
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
+void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
+void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
 void cudart_release(cudart_handle_t ch);

 #endif  // __GPU_INFO_CUDART_H__
221
gpu/gpu_info_nvml.c
Normal file
@@ -0,0 +1,221 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             nvml_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!l[i].p) {
+      resp->ch.handle = NULL;
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.nvmlInit_v2)();
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
+  } else {
+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
+  }
+}
+
+void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    if (h.verbose) {
+      nvmlBrandType_t brand = 0;
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
+      }
+    }
+
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
+  }
+}
+
+void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+  nvmlDevice_t device;
+  int major = 0;
+  int minor = 0;
+  nvmlReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle not initialized");
+    return;
+  }
+
+  unsigned int devices;
+  ret = (*h.nvmlDeviceGetCount_v2)(&devices);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < devices; i++) {
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    // Report the lowest major.minor we detect as that limits our compatibility
+    if (resp->major == 0 || resp->major > major) {
+      resp->major = major;
+      resp->minor = minor;
+    } else if (resp->major == major && resp->minor > minor) {
+      resp->minor = minor;
+    }
+  }
+}
+
+void nvml_release(nvml_handle_t h) {
+  LOG(h.verbose, "releasing nvml library\n");
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
+#endif  // __APPLE__
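Note that nvml_check_vram folds every device into a single total/free pair rather than reporting per-GPU numbers. A compact Go sketch of that aggregation over made-up per-device readings:

// Sketch: aggregate VRAM the way nvml_check_vram does above.
package main

import "fmt"

type devMem struct{ total, free uint64 }

func aggregate(devs []devMem) (total, free uint64) {
    for _, d := range devs {
        total += d.total
        free += d.free
    }
    return
}

func main() {
    t, f := aggregate([]devMem{{8 << 30, 6 << 30}, {4 << 30, 1 << 30}})
    fmt.Println(t, f) // 12884901888 7516192768
}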
57
gpu/gpu_info_nvml.h
Normal file
@@ -0,0 +1,57 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN = 0,
+} nvmlBrandType_t;
+
+typedef struct nvml_handle {
+  void *handle;
+  uint16_t verbose;
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+  nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
+  nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
+} nvml_handle_t;
+
+typedef struct nvml_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  nvml_handle_t ch;
+} nvml_init_resp_t;
+
+typedef struct nvml_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} nvml_compute_capability_t;
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
+void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
+void nvml_release(nvml_handle_t ch);
+
+#endif  // __GPU_INFO_NVML_H__
+#endif  // __APPLE__
@@ -9,16 +9,23 @@ import (

 func TestBasicGetGPUInfo(t *testing.T) {
 	info := GetGPUInfo()
-	assert.Greater(t, len(info), 0)
-	assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
-	if info[0].Library != "cpu" {
-		assert.Greater(t, info[0].TotalMemory, uint64(0))
-		assert.Greater(t, info[0].FreeMemory, uint64(0))
+	assert.Contains(t, "cuda rocm cpu metal", info.Library)
+	switch runtime.GOOS {
+	case "darwin":
+		// TODO - remove this once MacOS returns some size for CPU
+		return
+	case "linux", "windows":
+		assert.Greater(t, info.TotalMemory, uint64(0))
+		assert.Greater(t, info.FreeMemory, uint64(0))
+		assert.Greater(t, info.DeviceCount, uint32(0))
+	default:
+		return
 	}
 }

 func TestCPUMemInfo(t *testing.T) {
-	info, err := GetCPUMem()
+	info, err := getCPUMem()
 	assert.NoError(t, err)
 	switch runtime.GOOS {
 	case "darwin":
49
gpu/types.go
@@ -3,6 +3,7 @@ package gpu
 type memInfo struct {
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
+	DeviceCount uint32 `json:"device_count,omitempty"`
 }

 // Beginning of an `ollama info` command
@@ -16,49 +17,11 @@ type GpuInfo struct {
 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`

-	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath string `json:"lib_path,omitempty"`
-
-	// GPU information
-	ID    string `json:"gpu_id"`          // string to use for selection of this specific GPU
-	Name  string `json:"name"`            // user friendly name if available
-	Major int    `json:"major,omitempty"` // Major compatibility version (CC or gfx)
-	Minor int    `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
-	Patch int    `json:"patch,omitempty"` // Patch compatibility only matters on AMD
-
-	// TODO other performance capability info to help in scheduling decisions
+	// TODO add other useful attributes about the card here for discovery information
 }

-type GpuInfoList []GpuInfo
-
-// Split up the set of gpu infos by Library and variant
-func (l GpuInfoList) ByLibrary() []GpuInfoList {
-	resp := []GpuInfoList{}
-	libs := []string{}
-	for _, info := range l {
-		found := false
-		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
-		}
-		for i, lib := range libs {
-			if lib == requested {
-				resp[i] = append(resp[i], info)
-				found = true
-				break
-			}
-		}
-		if !found {
-			libs = append(libs, info.Library)
-			resp = append(resp, []GpuInfo{info})
-		}
-	}
-	return resp
-}
+type Version struct {
+	Major uint
+	Minor uint
+	Patch uint
+}

-// Sort by Free Space
-type ByFreeMemory []GpuInfo
-
-func (a ByFreeMemory) Len() int           { return len(a) }
-func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
-func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
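The removed ByLibrary helper groups GPUs by library plus variant so a runner only ever targets one backend build. A hedged usage sketch against that old API; the most-free-memory heuristic here is illustrative, not the scheduler's actual policy:

// Sketch: consuming ByLibrary from the removed GpuInfoList API.
// Assumes it lives alongside the gpu package types shown above.
func pickLibrary(all GpuInfoList) GpuInfoList {
    byLib := all.ByLibrary()
    // Heuristic only: prefer the group with the most free memory in total.
    best := 0
    var bestFree uint64
    for i, group := range byLib {
        var free uint64
        for _, g := range group {
            free += g.FreeMemory
        }
        if free > bestFree {
            best, bestFree = i, free
        }
    }
    return byLib[best]
}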
@@ -4,14 +4,11 @@ package integration

 import (
 	"context"
-	"log/slog"
-	"os"
-	"runtime"
+	"net/http"
 	"testing"
 	"time"

 	"github.com/ollama/ollama/api"
-	"github.com/stretchr/testify/require"
 )

 func TestOrcaMiniBlueSky(t *testing.T) {
@@ -27,44 +24,5 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
-}
-
-func TestUnicodeModelDir(t *testing.T) {
-	// This is only useful for Windows with utf-16 characters, so skip this test for other platforms
-	if runtime.GOOS != "windows" {
-		t.Skip("Unicode test only applicable to windows")
-	}
-	// Only works for local testing
-	if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
-		t.Skip("TestUnicodeModelDir only works for local testing, skipping")
-	}
-
-	modelDir, err := os.MkdirTemp("", "ollama_埃")
-	require.NoError(t, err)
-	defer os.RemoveAll(modelDir)
-	slog.Info("unicode", "OLLAMA_MODELS", modelDir)
-
-	oldModelsDir := os.Getenv("OLLAMA_MODELS")
-	if oldModelsDir == "" {
-		defer os.Unsetenv("OLLAMA_MODELS")
-	} else {
-		defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
-	}
-	err = os.Setenv("OLLAMA_MODELS", modelDir)
-	require.NoError(t, err)
-
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
-	defer cancel()
-
-	req := api.GenerateRequest{
-		Model:  "orca-mini",
-		Prompt: "why is the sky blue?",
-		Stream: &stream,
-		Options: map[string]interface{}{
-			"temperature": 0,
-			"seed":        123,
-		},
-	}
-	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
+	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh", "scattering"})
 }
@@ -1,225 +0,0 @@
-//go:build integration
-
-package integration
-
-import (
-	"context"
-	"log/slog"
-	"os"
-	"strconv"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-	"github.com/stretchr/testify/require"
-)
-
-func TestMultiModelConcurrency(t *testing.T) {
-	var (
-		req = [2]api.GenerateRequest{
-			{
-				Model:  "orca-mini",
-				Prompt: "why is the ocean blue?",
-				Stream: &stream,
-				Options: map[string]interface{}{
-					"seed":        42,
-					"temperature": 0.0,
-				},
-			}, {
-				Model:  "tinydolphin",
-				Prompt: "what is the origin of the us thanksgiving holiday?",
-				Stream: &stream,
-				Options: map[string]interface{}{
-					"seed":        42,
-					"temperature": 0.0,
-				},
-			},
-		}
-		resp = [2][]string{
-			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
-		}
-	)
-	var wg sync.WaitGroup
-	wg.Add(len(req))
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
-	defer cancel()
-	for i := 0; i < len(req); i++ {
-		go func(i int) {
-			defer wg.Done()
-			GenerateTestHelper(ctx, t, req[i], resp[i])
-		}(i)
-	}
-	wg.Wait()
-}
-
-func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	req, resp := GenerateRequests()
-	// Get the server running (if applicable) warm the model up with a single initial request
-	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
-
-	var wg sync.WaitGroup
-	wg.Add(len(req))
-	for i := 0; i < len(req); i++ {
-		go func(i int) {
-			defer wg.Done()
-			for j := 0; j < 5; j++ {
-				slog.Info("Starting", "req", i, "iter", j)
-				// On slower GPUs it can take a while to process the 4 concurrent requests
-				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
-			}
-		}(i)
-	}
-	wg.Wait()
-}
-
-// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
-func TestMultiModelStress(t *testing.T) {
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
-	if vram == "" {
-		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
-	}
-	max, err := strconv.ParseUint(vram, 10, 64)
-	require.NoError(t, err)
-	const MB = uint64(1024 * 1024)
-	type model struct {
-		name string
-		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
-	}
-
-	smallModels := []model{
-		{
-			name: "orca-mini",
-			size: 2992 * MB,
-		},
-		{
-			name: "phi",
-			size: 2616 * MB,
-		},
-		{
-			name: "gemma:2b",
-			size: 2364 * MB,
-		},
-		{
-			name: "stable-code:3b",
-			size: 2608 * MB,
-		},
-		{
-			name: "starcoder2:3b",
-			size: 2166 * MB,
-		},
-	}
-	mediumModels := []model{
-		{
-			name: "llama2",
-			size: 5118 * MB,
-		},
-		{
-			name: "mistral",
-			size: 4620 * MB,
-		},
-		{
-			name: "orca-mini:7b",
-			size: 5118 * MB,
-		},
-		{
-			name: "dolphin-mistral",
-			size: 4620 * MB,
-		},
-		{
-			name: "gemma:7b",
-			size: 5000 * MB,
-		},
-		// TODO - uncomment this once #3565 is merged and this is rebased on it
-		// {
-		// 	name: "codellama:7b",
-		// 	size: 5118 * MB,
-		// },
-	}
-
-	// These seem to be too slow to be useful...
-	// largeModels := []model{
-	// 	{
-	// 		name: "llama2:13b",
-	// 		size: 7400 * MB,
-	// 	},
-	// 	{
-	// 		name: "codellama:13b",
-	// 		size: 7400 * MB,
-	// 	},
-	// 	{
-	// 		name: "orca-mini:13b",
-	// 		size: 7400 * MB,
-	// 	},
-	// 	{
-	// 		name: "gemma:7b",
-	// 		size: 5000 * MB,
-	// 	},
-	// 	{
-	// 		name: "starcoder2:15b",
-	// 		size: 9100 * MB,
-	// 	},
-	// }
-
-	var chosenModels []model
-	switch {
-	case max < 10000*MB:
-		slog.Info("selecting small models")
-		chosenModels = smallModels
-	// case max < 30000*MB:
-	default:
-		slog.Info("selecting medium models")
-		chosenModels = mediumModels
-	// default:
-	// 	slog.Info("selecting large models")
-	// 	chosenModels = largeModels
-	}
-
-	req, resp := GenerateRequests()
-
-	for i := range req {
-		if i > len(chosenModels) {
-			break
-		}
-		req[i].Model = chosenModels[i].name
-	}
-
-	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	// Make sure all the models are pulled before we get started
-	for _, r := range req {
-		require.NoError(t, PullIfMissing(ctx, client, r.Model))
-	}
-
-	var wg sync.WaitGroup
-	consumed := uint64(256 * MB) // Assume some baseline usage
-	for i := 0; i < len(req); i++ {
-		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
-		if i > 1 && consumed > max {
-			slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
-			break
-		}
-		consumed += chosenModels[i].size
-		slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
-
-		wg.Add(1)
-		go func(i int) {
-			defer wg.Done()
-			for j := 0; j < 3; j++ {
-				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
-				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
-			}
-		}(i)
-	}
-	wg.Wait()
-}
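The loading loop in the deleted stress test walks a VRAM budget: it always starts at least two models, then stops adding more once their combined estimated footprint exceeds OLLAMA_MAX_VRAM. That selection logic, distilled into a standalone Go sketch (sizes and budget are example values):

// Sketch: the model-count selection from TestMultiModelStress, distilled.
package main

import "fmt"

func countToLoad(sizes []uint64, budget uint64) int {
    consumed := uint64(256 << 20) // assume ~256MB baseline usage, as the test does
    n := 0
    for i, s := range sizes {
        // Always take at least 2 models, then stop once the budget is exceeded.
        if i > 1 && consumed > budget {
            break
        }
        consumed += s
        n++
    }
    return n
}

func main() {
    MB := uint64(1 << 20)
    fmt.Println(countToLoad([]uint64{2992 * MB, 2616 * MB, 2364 * MB, 2608 * MB}, 6000*MB)) // 3
}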
@@ -4,6 +4,7 @@ package integration

 import (
 	"context"
+	"net/http"
 	"testing"
 	"time"

@@ -24,5 +25,5 @@ func TestContextExhaustion(t *testing.T) {
 			"num_ctx": 128,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
+	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"once", "upon", "lived"})
 }
@@ -5,6 +5,7 @@ package integration
 import (
 	"context"
 	"encoding/base64"
+	"net/http"
 	"testing"
 	"time"

@@ -28,11 +29,10 @@ func TestIntegrationMultimodal(t *testing.T) {
 		},
 	}

-	// Note: sometimes it returns "the ollamas" sometimes "the ollams"
-	resp := "the ollam"
+	resp := "the ollamas"
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
-	GenerateTestHelper(ctx, t, req, []string{resp})
+	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{resp})
 }

 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
@@ -4,6 +4,8 @@ package integration

 import (
 	"context"
+	"net/http"
+	"sync"
 	"testing"
 	"time"

@@ -43,5 +45,25 @@ var (
 func TestIntegrationSimpleOrcaMini(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
 	defer cancel()
-	GenerateTestHelper(ctx, t, req[0], resp[0])
+	GenerateTestHelper(ctx, t, &http.Client{}, req[0], resp[0])
 }
+
+// TODO
+// The server always loads a new runner and closes the old one, which forces serial execution
+// At present this test case fails with concurrency problems. Eventually we should try to
+// get true concurrency working with n_parallel support in the backend
+func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
+	var wg sync.WaitGroup
+	wg.Add(len(req))
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
+	defer cancel()
+	for i := 0; i < len(req); i++ {
+		go func(i int) {
+			defer wg.Done()
+			GenerateTestHelper(ctx, t, &http.Client{}, req[i], resp[i])
+		}(i)
+	}
+	wg.Wait()
+}
+
+// TODO - create a parallel test with 2 different models once we support concurrency
@@ -5,14 +5,13 @@ package integration
 import (
 	"bytes"
 	"context"
-	"errors"
+	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"math/rand"
 	"net"
 	"net/http"
-	"net/url"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -24,13 +23,9 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
-	"github.com/stretchr/testify/require"
+	"github.com/stretchr/testify/assert"
 )

-func Init() {
-	lifecycle.InitLogging()
-}
-
 func FindPort() string {
 	port := 0
 	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -46,7 +41,7 @@ func FindPort() string {
 	return strconv.Itoa(port)
 }

-func GetTestEndpoint() (*api.Client, string) {
+func GetTestEndpoint() (string, string) {
 	defaultPort := "11434"
 	ollamaHost := os.Getenv("OLLAMA_HOST")

@@ -72,20 +67,16 @@ func GetTestEndpoint() (*api.Client, string) {
 		port = FindPort()
 	}

-	slog.Info("server connection", "host", host, "port", port)
-	return api.NewClient(
-		&url.URL{
-			Scheme: scheme,
-			Host:   net.JoinHostPort(host, port),
-		},
-		http.DefaultClient), fmt.Sprintf("%s:%s", host, port)
+	url := fmt.Sprintf("%s:%s", host, port)
+	slog.Info("server connection", "url", url)
+	return scheme, url
 }

+// TODO make fancier, grab logs, etc.
 var serverMutex sync.Mutex
 var serverReady bool

-func startServer(ctx context.Context, ollamaHost string) error {
+func StartServer(ctx context.Context, ollamaHost string) error {
 	// Make sure the server has been built
 	CLIName, err := filepath.Abs("../ollama")
 	if err != nil {
@@ -134,76 +125,67 @@ func startServer(ctx context.Context, ollamaHost string) error {
 	return nil
 }

-func PullIfMissing(ctx context.Context, client *api.Client, modelName string) error {
+func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
 	slog.Info("checking status of model", "model", modelName)
 	showReq := &api.ShowRequest{Name: modelName}
-
-	showCtx, cancel := context.WithDeadlineCause(
-		ctx,
-		time.Now().Add(5*time.Second),
-		fmt.Errorf("show for existing model %s took too long", modelName),
-	)
-	defer cancel()
-	_, err := client.Show(showCtx, showReq)
-	var statusError api.StatusError
-	switch {
-	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
-		break
-	case err != nil:
+	requestJSON, err := json.Marshal(showReq)
+	if err != nil {
 		return err
-	default:
+	}
+
+	req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/show", bytes.NewReader(requestJSON))
+	if err != nil {
+		return err
+	}
+
+	// Make the request with the HTTP client
+	response, err := client.Do(req.WithContext(ctx))
+	if err != nil {
+		return err
+	}
+	defer response.Body.Close()
+	if response.StatusCode == 200 {
 		slog.Info("model already present", "model", modelName)
 		return nil
 	}
-	slog.Info("model missing", "model", modelName)
+	slog.Info("model missing", "status", response.StatusCode)

-	stallDuration := 30 * time.Second // This includes checksum verification, which can take a while on larger models
-	stallTimer := time.NewTimer(stallDuration)
-	fn := func(resp api.ProgressResponse) error {
-		// fmt.Print(".")
-		if !stallTimer.Reset(stallDuration) {
-			return fmt.Errorf("stall was detected, aborting status reporting")
-		}
-		return nil
-	}
-
-	stream := true
 	pullReq := &api.PullRequest{Name: modelName, Stream: &stream}
-
-	var pullError error
-	done := make(chan int)
-	go func() {
-		pullError = client.Pull(ctx, pullReq, fn)
-		done <- 0
-	}()
-
-	select {
-	case <-stallTimer.C:
-		return fmt.Errorf("download stalled")
-	case <-done:
-		return pullError
+	requestJSON, err = json.Marshal(pullReq)
+	if err != nil {
+		return err
 	}
+
+	req, err = http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/pull", bytes.NewReader(requestJSON))
+	if err != nil {
+		return err
+	}
+	slog.Info("pulling", "model", modelName)
+
+	response, err = client.Do(req.WithContext(ctx))
+	if err != nil {
+		return err
+	}
+	defer response.Body.Close()
+	if response.StatusCode != 200 {
+		return fmt.Errorf("failed to pull model") // TODO more details perhaps
+	}
+	slog.Info("model pulled", "model", modelName)
+	return nil
 }

 var serverProcMutex sync.Mutex

-// Returns a Client, the testEndpoint, and a cleanup function, fails the test on errors
-// Starts the server if needed
-func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, string, func()) {
-	client, testEndpoint := GetTestEndpoint()
-	if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-		serverProcMutex.Lock()
-		fp, err := os.CreateTemp("", "ollama-server-*.log")
-		if err != nil {
-			t.Fatalf("failed to generate log file: %s", err)
-		}
-		lifecycle.ServerLogFile = fp.Name()
-		fp.Close()
-		require.NoError(t, startServer(ctx, testEndpoint))
-	}
-
-	return client, testEndpoint, func() {
+func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
+	// TODO maybe stuff in an init routine?
+	lifecycle.InitLogging()
+
+	requestJSON, err := json.Marshal(genReq)
+	if err != nil {
+		t.Fatalf("Error serializing request: %v", err)
+	}
+	defer func() {
 		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
 			defer serverProcMutex.Unlock()
 			if t.Failed() {
@@ -221,118 +203,63 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 				os.Stderr.Write(data)
 				slog.Warn("END OF SERVER")
 			}
-			err := os.Remove(lifecycle.ServerLogFile)
+			err = os.Remove(lifecycle.ServerLogFile)
 			if err != nil && !os.IsNotExist(err) {
 				slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
 			}
 		}
-	}
-}
-
-func GenerateTestHelper(ctx context.Context, t *testing.T, genReq api.GenerateRequest, anyResp []string) {
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, genReq.Model))
-	DoGenerate(ctx, t, client, genReq, anyResp, 30*time.Second, 10*time.Second)
-}
-
-func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq api.GenerateRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) {
-	stallTimer := time.NewTimer(initialTimeout)
-	var buf bytes.Buffer
-	fn := func(response api.GenerateResponse) error {
-		// fmt.Print(".")
-		buf.Write([]byte(response.Response))
|
|
||||||
if !stallTimer.Reset(streamTimeout) {
|
|
||||||
return fmt.Errorf("stall was detected while streaming response, aborting")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
stream := true
|
|
||||||
genReq.Stream = &stream
|
|
||||||
done := make(chan int)
|
|
||||||
var genErr error
|
|
||||||
go func() {
|
|
||||||
genErr = client.Generate(ctx, &genReq, fn)
|
|
||||||
done <- 0
|
|
||||||
}()
|
}()
|
||||||
|
scheme, testEndpoint := GetTestEndpoint()
|
||||||
|
|
||||||
select {
|
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
|
||||||
case <-stallTimer.C:
|
serverProcMutex.Lock()
|
||||||
if buf.Len() == 0 {
|
fp, err := os.CreateTemp("", "ollama-server-*.log")
|
||||||
t.Errorf("generate never started. Timed out after :%s", initialTimeout.String())
|
if err != nil {
|
||||||
} else {
|
t.Fatalf("failed to generate log file: %s", err)
|
||||||
t.Errorf("generate stalled. Response so far:%s", buf.String())
|
|
||||||
}
|
}
|
||||||
case <-done:
|
lifecycle.ServerLogFile = fp.Name()
|
||||||
require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
|
fp.Close()
|
||||||
// Verify the response contains the expected data
|
assert.NoError(t, StartServer(ctx, testEndpoint))
|
||||||
response := buf.String()
|
|
||||||
atLeastOne := false
|
|
||||||
for _, resp := range anyResp {
|
|
||||||
if strings.Contains(strings.ToLower(response), resp) {
|
|
||||||
atLeastOne = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
require.True(t, atLeastOne, "none of %v found in %s", anyResp, response)
|
|
||||||
slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
|
|
||||||
case <-ctx.Done():
|
|
||||||
t.Error("outer test context done while waiting for generate")
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Generate a set of requests
|
err = PullIfMissing(ctx, client, scheme, testEndpoint, genReq.Model)
|
||||||
// By default each request uses orca-mini as the model
|
if err != nil {
|
||||||
func GenerateRequests() ([]api.GenerateRequest, [][]string) {
|
t.Fatalf("Error pulling model: %v", err)
|
||||||
return []api.GenerateRequest{
|
}
|
||||||
{
|
|
||||||
Model: "orca-mini",
|
// Make the request and get the response
|
||||||
Prompt: "why is the ocean blue?",
|
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/generate", bytes.NewReader(requestJSON))
|
||||||
Stream: &stream,
|
if err != nil {
|
||||||
Options: map[string]interface{}{
|
t.Fatalf("Error creating request: %v", err)
|
||||||
"seed": 42,
|
}
|
||||||
"temperature": 0.0,
|
|
||||||
},
|
// Set the content type for the request
|
||||||
}, {
|
req.Header.Set("Content-Type", "application/json")
|
||||||
Model: "orca-mini",
|
|
||||||
Prompt: "why is the color of dirt brown?",
|
// Make the request with the HTTP client
|
||||||
Stream: &stream,
|
response, err := client.Do(req.WithContext(ctx))
|
||||||
Options: map[string]interface{}{
|
if err != nil {
|
||||||
"seed": 42,
|
t.Fatalf("Error making request: %v", err)
|
||||||
"temperature": 0.0,
|
}
|
||||||
},
|
defer response.Body.Close()
|
||||||
}, {
|
body, err := io.ReadAll(response.Body)
|
||||||
Model: "orca-mini",
|
assert.NoError(t, err)
|
||||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
assert.Equal(t, response.StatusCode, 200, string(body))
|
||||||
Stream: &stream,
|
|
||||||
Options: map[string]interface{}{
|
// Verify the response is valid JSON
|
||||||
"seed": 42,
|
var payload api.GenerateResponse
|
||||||
"temperature": 0.0,
|
err = json.Unmarshal(body, &payload)
|
||||||
},
|
if err != nil {
|
||||||
}, {
|
assert.NoError(t, err, body)
|
||||||
Model: "orca-mini",
|
}
|
||||||
Prompt: "what is the origin of independence day?",
|
|
||||||
Stream: &stream,
|
// Verify the response contains the expected data
|
||||||
Options: map[string]interface{}{
|
atLeastOne := false
|
||||||
"seed": 42,
|
for _, resp := range anyResp {
|
||||||
"temperature": 0.0,
|
if strings.Contains(strings.ToLower(payload.Response), resp) {
|
||||||
},
|
atLeastOne = true
|
||||||
}, {
|
break
|
||||||
Model: "orca-mini",
|
|
||||||
Prompt: "what is the composition of air?",
|
|
||||||
Stream: &stream,
|
|
||||||
Options: map[string]interface{}{
|
|
||||||
"seed": 42,
|
|
||||||
"temperature": 0.0,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
[][]string{
|
|
||||||
[]string{"sunlight"},
|
|
||||||
[]string{"soil", "organic", "earth", "black", "tan"},
|
|
||||||
[]string{"england", "english", "massachusetts", "pilgrims"},
|
|
||||||
[]string{"fourth", "july", "declaration", "independence"},
|
|
||||||
[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
assert.True(t, atLeastOne, "none of %v found in %s", anyResp, payload.Response)
|
||||||
}
|
}
|
||||||
|
|||||||
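Both branches of these helpers lean on the same stall-detection idea: a timer is reset on every progress callback, and the whole operation is abandoned if no progress arrives within the window. The sketch below isolates that pattern in self-contained Go; all names (runWithStallDetection, op, progress) are illustrative and not part of either branch.

    // Minimal sketch of the stall-detection pattern used by PullIfMissing and
    // DoGenerate: reset a timer on every progress event, fail if it ever fires.
    package main

    import (
    	"context"
    	"errors"
    	"fmt"
    	"time"
    )

    // runWithStallDetection runs op in a goroutine and fails if the progress
    // callback is not invoked at least once per stallWindow.
    func runWithStallDetection(ctx context.Context, stallWindow time.Duration,
    	op func(progress func()) error) error {
    	stallTimer := time.NewTimer(stallWindow)
    	defer stallTimer.Stop()

    	done := make(chan error, 1)
    	go func() {
    		done <- op(func() {
    			stallTimer.Reset(stallWindow) // push the deadline out again
    		})
    	}()

    	select {
    	case <-stallTimer.C:
    		return errors.New("operation stalled: no progress within window")
    	case err := <-done:
    		return err
    	case <-ctx.Done():
    		return ctx.Err()
    	}
    }

    func main() {
    	err := runWithStallDetection(context.Background(), 30*time.Second,
    		func(progress func()) error {
    			for i := 0; i < 3; i++ { // stand-in for download chunks
    				time.Sleep(time.Second)
    				progress()
    			}
    			return nil
    		})
    	fmt.Println("result:", err)
    }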
llm/ext_server/server.cpp (33 changes, vendored)

@@ -39,10 +39,6 @@
 #include "httplib.h"
 #include "json.hpp"

-#if defined(_WIN32)
-#include <windows.h>
-#endif

 #include <cstddef>
 #include <thread>
 #include <chrono>

@@ -2774,28 +2770,8 @@ inline void signal_handler(int signal) {
     shutdown_handler(signal);
 }

-#if defined(_WIN32)
-char* wchar_to_char(const wchar_t* wstr) {
-    if (wstr == nullptr) return nullptr;
-
-    // Determine the number of bytes needed for the UTF-8 string
-    int bytes = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, nullptr, 0, nullptr, nullptr);
-    char* str = new char[bytes];
-
-    // Convert the wide-character string to a UTF-8 string
-    WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, bytes, nullptr, nullptr);
-    return str;
-}
-
-int wmain(int argc, wchar_t **wargv) {
-    char** argv = new char*[argc];
-    for (int i = 0; i < argc; ++i) {
-        argv[i] = wchar_to_char(wargv[i]);
-    }
-#else
-int main(int argc, char **argv) {
-#endif
+int main(int argc, char **argv)
+{
 #if SERVER_VERBOSE != 1
     log_disable();
 #endif

@@ -3306,11 +3282,6 @@ int main(int argc, char **argv) {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
     };
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-
-    for (int i = 0; i < argc; ++i) {
-        delete[] argv[i];
-    }
-    delete[] argv;
 #endif
     llama.queue_tasks.start_loop();
     svr.stop();
llm/generate/gen_linux.sh

@@ -21,7 +21,7 @@ init_vars() {
         # TODO - add additional optimization flags...
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
     fi
     case $(uname -s) in
         "Darwin")
             LIB_EXT="dylib"
             WHOLE_ARCHIVE="-Wl,-force_load"

@@ -57,21 +57,21 @@ init_vars
 git_module_setup
 apply_patches

-init_vars
-if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
-    # Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
-    # Enables optimized Dockerfile builds using a blanket skip and targeted overrides
-    # Static build for linking into the Go binary
-    init_vars
-    CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/linux/${ARCH}_static"
-    echo "Building static library"
-    build
-fi

 init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
+    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
+        # Static build for linking into the Go binary
+        init_vars
+        CMAKE_TARGETS="--target llama --target ggml"
+        CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        BUILD_DIR="../build/linux/${ARCH}_static"
+        echo "Building static library"
+        build
+    fi

     # Users building from source can tune the exact flags we pass to cmake for configuring
     # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then

@@ -165,22 +165,14 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     fi
     if [ "${ARCH}" == "arm64" ]; then
         echo "ARM CPU detected - disabling unsupported AVX instructions"

        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
        #
        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
        # Disabling has minimal performance effect while maintaining compatibility.
        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
     fi
-    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
-    if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
-        echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
-        echo "Building custom CUDA GPU"
-    else
-        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
-    fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
+    CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build

@@ -225,12 +217,6 @@ if [ -d "${ROCM_PATH}" ]; then
     fi
     init_vars
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
-    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
-        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
-        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
-        echo "Building custom ROCM GPU"
-    fi
     BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
     EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
     build
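The "static build for linking into the Go binary" step above produces archives the Go side consumes through cgo. A rough, hypothetical illustration of what such a binding can look like follows; the library names, flags, and C++ runtime here are assumptions matching the BUILD_DIR layout above, not the repository's actual directives.

    // Hypothetical cgo stanza: link a statically built llama/ggml archive
    // straight into a Go binary. Flags and archive names are illustrative.
    package llm

    /*
    #cgo LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -lllama -lggml -lstdc++ -lm
    #include <stdint.h>
    */
    import "C"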
llm/generate/gen_windows.ps1

@@ -26,25 +26,15 @@ function amdGPUs {
     $GPU_LIST -join ';'
 }

 function init_vars {
-    if (!$script:SRC_DIR) {
-        $script:SRC_DIR = $(resolve-path "..\..\")
-    }
-    if (!$script:llamacppDir) {
-        $script:llamacppDir = "../llama.cpp"
-    }
-    if (!$script:cmakeTargets) {
-        $script:cmakeTargets = @("ollama_llama_server")
-    }
+    $script:SRC_DIR = $(resolve-path "..\..\")
+    $script:llamacppDir = "../llama.cpp"
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
         "-DLLAMA_NATIVE=off"
         )
-    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
+    $script:cmakeTargets = @("ollama_llama_server")
     $script:ARCH = "amd64" # arm not yet supported.
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
-    md "$script:DIST_BASE" -ea 0 > $null
     if ($env:CGO_CFLAGS -contains "-g") {
         $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
         $script:config = "RelWithDebInfo"

@@ -65,6 +55,7 @@ function init_vars {
     } else {
         $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
     }
+    $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
     $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
     if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
         $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"

@@ -143,18 +134,21 @@ function sign {
         }
     }
 }

-function install {
-    write-host "Installing binaries to dist dir ${script:distDir}"
-    mkdir ${script:distDir} -ErrorAction SilentlyContinue
+function compress {
+    if ($script:GZIP -eq $null) {
+        write-host "gzip not installed, not compressing files"
+        return
+    }
+    write-host "Compressing binaries..."
     $binaries = dir "${script:buildDir}/bin/*.exe"
     foreach ($file in $binaries) {
-        copy-item -Path $file -Destination ${script:distDir} -Force
+        & "$script:GZIP" --best -f $file
     }

-    write-host "Installing dlls to dist dir ${script:distDir}"
+    write-host "Compressing dlls..."
     $dlls = dir "${script:buildDir}/bin/*.dll"
     foreach ($file in $dlls) {
-        copy-item -Path $file -Destination ${script:distDir} -Force
+        & "$script:GZIP" --best -f $file
     }
 }

@@ -175,191 +169,123 @@ function cleanup {
     }
 }

+init_vars
+git_module_setup
+apply_patches

 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
 # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver

+$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")

-function build_static() {
-    if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
-        # GCC build for direct linking into the Go binary
-        init_vars
-        # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
-        # as we need this to be compiled by gcc for golang to be able to link with it
-        write-host "Checking for MinGW..."
-        # error action ensures we exit on failure
-        get-command gcc
-        get-command mingw32-make
-        $oldTargets = $script:cmakeTargets
-        $script:cmakeTargets = @("llama", "ggml")
-        $script:cmakeDefs = @(
-            "-G", "MinGW Makefiles"
-            "-DCMAKE_C_COMPILER=gcc.exe",
-            "-DCMAKE_CXX_COMPILER=g++.exe",
-            "-DBUILD_SHARED_LIBS=off",
-            "-DLLAMA_NATIVE=off",
-            "-DLLAMA_AVX=off",
-            "-DLLAMA_AVX2=off",
-            "-DLLAMA_AVX512=off",
-            "-DLLAMA_F16C=off",
-            "-DLLAMA_FMA=off")
-        $script:buildDir="../build/windows/${script:ARCH}_static"
-        write-host "Building static library"
-        build
-        $script:cmakeTargets = $oldTargets
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}

-function build_cpu() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
-        # remaining llama.cpp builds use MSVC
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu"
-        $script:distDir="$script:DIST_BASE\cpu"
-        write-host "Building LCD CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}

-function build_cpu_avx() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
-        $script:distDir="$script:DIST_BASE\cpu_avx"
-        write-host "Building AVX CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU AVX generation step as requested"
-    }
-}

-function build_cpu_avx2() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
-        $script:distDir="$script:DIST_BASE\cpu_avx2"
-        write-host "Building AVX2 CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU AVX2 generation step as requested"
-    }
-}

-function build_cuda() {
-    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
-        # Then build cuda as a dynamically loaded library
-        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
-        if ($null -ne $script:CUDA_VERSION) {
-            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
-        }
-        init_vars
-        $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-        $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
-        $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
-        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
-            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
-            $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
-            write-host "building custom CUDA GPU"
-        }
-        build
-        sign
-        install

-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-    } else {
-        write-host "Skipping CUDA generation step"
-    }
-}

-function build_rocm() {
-    if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
-        $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
-        if ($null -ne $script:ROCM_VERSION) {
-            $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
-        }

-        init_vars
-        $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
-        $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
-        $script:cmakeDefs += @(
-            "-G", "Ninja",
-            "-DCMAKE_C_COMPILER=clang.exe",
-            "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DLLAMA_HIPBLAS=on",
-            "-DHIP_PLATFORM=amd",
-            "-DLLAMA_AVX=on",
-            "-DLLAMA_AVX2=off",
-            "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
-            "-DAMDGPU_TARGETS=$(amdGPUs)",
-            "-DGPU_TARGETS=$(amdGPUs)"
-        )

-        # Make sure the ROCm binary dir is first in the path
-        $env:PATH="$env:HIP_PATH\bin;$env:PATH"

-        # We have to clobber the LIB var from the developer shell for clang to work properly
-        $env:LIB=""
-        if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
-            write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
-            $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
-            write-host "building custom ROCM GPU"
-        }
-        write-host "Building ROCm"
-        build
-        # Ninja doesn't prefix with config name
-        ${script:config}=""
-        if ($null -ne $script:DUMPBIN) {
-            & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
-        }
-        sign
-        install

-        # Assumes v5.7, may need adjustments for v6
-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
-    } else {
-        write-host "Skipping ROCm generation step"
-    }
-}
+if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {

+    # GCC build for direct linking into the Go binary
 init_vars
-if ($($args.count) -eq 0) {
-    git_module_setup
-    apply_patches
-    build_static
-    build_cpu
-    build_cpu_avx
-    build_cpu_avx2
-    build_cuda
-    build_rocm
+    # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
+    # as we need this to be compiled by gcc for golang to be able to link with it
+    write-host "Checking for MinGW..."
+    # error action ensures we exit on failure
+    get-command gcc
+    get-command mingw32-make
+    $script:cmakeTargets = @("llama", "ggml")
+    $script:cmakeDefs = @(
+        "-G", "MinGW Makefiles"
+        "-DCMAKE_C_COMPILER=gcc.exe",
+        "-DCMAKE_CXX_COMPILER=g++.exe",
+        "-DBUILD_SHARED_LIBS=off",
+        "-DLLAMA_NATIVE=off",
+        "-DLLAMA_AVX=off",
+        "-DLLAMA_AVX2=off",
+        "-DLLAMA_AVX512=off",
+        "-DLLAMA_F16C=off",
+        "-DLLAMA_FMA=off")
+    $script:buildDir="../build/windows/${script:ARCH}_static"
+    write-host "Building static library"
+    build

-    cleanup
-    write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
+    # remaining llama.cpp builds use MSVC
+    init_vars
+    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+    $script:buildDir="../build/windows/${script:ARCH}/cpu"
+    write-host "Building LCD CPU"
+    build
+    sign
+    compress

+    init_vars
+    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
+    write-host "Building AVX CPU"
+    build
+    sign
+    compress

+    init_vars
+    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
+    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
+    write-host "Building AVX2 CPU"
+    build
+    sign
+    compress
 } else {
-    for ( $i = 0; $i -lt $args.count; $i++ ) {
-        write-host "performing $($args[$i])"
-        & $($args[$i])
-    }
-}
+    write-host "Skipping CPU generation step as requested"
+}

+if ($null -ne $script:CUDA_LIB_DIR) {
+    # Then build cuda as a dynamically loaded library
+    $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
+    $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+    if ($null -ne $script:CUDA_VERSION) {
+        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
+    }
+    init_vars
+    $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    build
+    sign
+    compress
+}

+if ($null -ne $env:HIP_PATH) {
+    $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
+    if ($null -ne $script:ROCM_VERSION) {
+        $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
+    }

+    init_vars
+    $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
+    $script:cmakeDefs += @(
+        "-G", "Ninja",
+        "-DCMAKE_C_COMPILER=clang.exe",
+        "-DCMAKE_CXX_COMPILER=clang++.exe",
+        "-DLLAMA_HIPBLAS=on",
+        "-DHIP_PLATFORM=amd",
+        "-DLLAMA_AVX=on",
+        "-DLLAMA_AVX2=off",
+        "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
+        "-DAMDGPU_TARGETS=$(amdGPUs)",
+        "-DGPU_TARGETS=$(amdGPUs)"
+    )

+    # Make sure the ROCm binary dir is first in the path
+    $env:PATH="$env:HIP_PATH\bin;$env:PATH"

+    # We have to clobber the LIB var from the developer shell for clang to work properly
+    $env:LIB=""

+    write-host "Building ROCm"
+    build
+    # Ninja doesn't prefix with config name
+    ${script:config}=""
+    if ($null -ne $script:DUMPBIN) {
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
+    }
+    sign
+    compress
+}

+cleanup
+write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
llm/ggml.go (19 changes)

@@ -164,8 +164,7 @@ func (ts Tensors) Layers() map[string]Layer {
 	for _, t := range ts {
 		parts := strings.Split(t.Name, ".")
 		if parts[0] == "blk" {
-			// join first and second part, e.g. blk.%d
-			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
+			parts = parts[1:]
 		}

 		if _, ok := layers[parts[0]]; !ok {

@@ -343,15 +342,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)

-		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
-			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
-			partialOffload = max(
-				3*ffnGateExpsWeight.size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
-				4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
-			)
-		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
-			// mixtral 8x7b
+		if ffnGateWeight, ok := layers["0"]["ffn_gate.0.weight"]; ok {
 			ffnGateWeight1 := ffnGateWeight.Shape[1]
 			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
 			partialOffload = max(

@@ -389,12 +380,6 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 		)

 		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
-	case "stablelm":
-		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
-		partialOffload = max(
-			4*batch*(vocab+2*embedding),
-			fullOffload,
-		)
 	}

 	return
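The two branches key the layer map differently, which is why the mixtral lookup changes from layers["blk.0"] to layers["0"]: one joins "blk" with the layer index, the other drops the "blk" prefix and keeps the bare index. A small Go sketch of the two schemes (the tensor name is illustrative):

    // Sketch of the two tensor-name grouping schemes seen in Tensors.Layers.
    package main

    import (
    	"fmt"
    	"strings"
    )

    func layerKeyJoined(name string) string { // join "blk" and the index: "blk.0"
    	parts := strings.Split(name, ".")
    	if parts[0] == "blk" {
    		return parts[0] + "." + parts[1]
    	}
    	return parts[0]
    }

    func layerKeyBare(name string) string { // drop the "blk" prefix: "0"
    	parts := strings.Split(name, ".")
    	if parts[0] == "blk" {
    		return parts[1]
    	}
    	return parts[0]
    }

    func main() {
    	name := "blk.0.ffn_gate.0.weight"
    	fmt.Println(layerKeyJoined(name)) // blk.0
    	fmt.Println(layerKeyBare(name))   // 0
    }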
llm/gguf.go (37 changes)

@@ -190,6 +190,8 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		llm.kv[k] = v
 	}

+	slog.Debug(fmt.Sprintf("general.architecture = %s", llm.kv["general.architecture"]))
+
 	// decode tensors
 	for i := 0; uint64(i) < llm.numTensor(); i++ {
 		name, err := readGGUFString(llm, rs)

@@ -246,17 +248,13 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	}

 	padding := llm.padding(offset, int64(alignment))
-	if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
+	if _, err := rs.Seek(padding-offset, io.SeekCurrent); err != nil {
 		return err
 	}

 	for _, tensor := range llm.tensors {
-		if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil {
-			return err
-		}
-
-		padding := llm.padding(int64(tensor.size()), int64(alignment))
-		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
+		padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
+		if _, err := rs.Seek(padded, io.SeekCurrent); err != nil {
 			return err
 		}
 	}

@@ -463,13 +461,11 @@ var ggufKVOrder = map[string][]string{
 	"llama.embedding_length",
 	"llama.block_count",
 	"llama.feed_forward_length",
+	"llama.rope.dimension_count",
 	"llama.attention.head_count",
 	"llama.attention.head_count_kv",
 	"llama.attention.layer_norm_rms_epsilon",
 	"llama.rope.freq_base",
-	"llama.rope.dimension_count",
-	"llama.expert_count",
-	"llama.expert_used_count",
 	"gemma.context_length",
 	"gemma.embedding_length",
 	"gemma.block_count",

@@ -577,8 +573,6 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 				return err
 			}
 		}
-	default:
-		return fmt.Errorf("improper type for '%s'", k)
 	}
 	if err != nil {
 		return err

@@ -600,11 +594,9 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 			return err
 		}

-		dims := 0
-		for cnt := 0; cnt < len(tensor.Shape); cnt++ {
-			if tensor.Shape[cnt] > 0 {
-				dims++
-			}
+		dims := 1
+		if tensor.Shape[1] > 0 {
+			dims = 2
 		}

 		if err := binary.Write(ws, llm.ByteOrder, uint32(dims)); err != nil {

@@ -631,9 +623,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		return err
 	}

-	var alignment int64 = 32
-	padding := llm.padding(offset, alignment)
-	if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
+	padding := llm.padding(offset, 32)
+	if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil {
 		return err
 	}

@@ -647,8 +638,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 			return err
 		}

-		padding := llm.padding(offset, alignment)
-		if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
+		padding := llm.padding(offset, 32)
+		if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil {
 			return err
 		}
 	}

@@ -657,5 +648,5 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 }

 func (gguf) padding(offset, align int64) int64 {
-	return (align - offset%align) % align
+	return (offset + align - 1) / align * align
 }
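The two padding helpers above are consistent once you account for what each returns: one yields the number of pad bytes, the other yields the next aligned offset, which is why its call sites seek or write padding-offset. A runnable Go comparison:

    // The two gguf padding formulas: pad-byte count versus next aligned offset.
    package main

    import "fmt"

    func padBytes(offset, align int64) int64 { // bytes needed to reach alignment
    	return (align - offset%align) % align
    }

    func nextAligned(offset, align int64) int64 { // next offset that is aligned
    	return (offset + align - 1) / align * align
    }

    func main() {
    	for _, off := range []int64{0, 1, 31, 32, 33, 100} {
    		fmt.Printf("offset=%3d  padBytes=%2d  nextAligned=%3d  (diff=%d)\n",
    			off, padBytes(off, 32), nextAligned(off, 32),
    			nextAligned(off, 32)-off) // diff always equals padBytes
    	}
    }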
@@ -2,5 +2,5 @@ package llm

 import "embed"

-// unused on windows
+//go:embed build/windows/*/*/bin/*
 var libEmbed embed.FS
llm/memory.go (188 changes, file deleted)

@@ -1,188 +0,0 @@
-package llm
-
-import (
-	"fmt"
-	"log/slog"
-	"os"
-	"strconv"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
-)
-
-// This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
-	var estimatedVRAM uint64
-	if opts.NumCtx > int(ggml.KV().ContextLength()) {
-		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-		opts.NumCtx = int(ggml.KV().ContextLength())
-	}
-
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
-	// Split up the GPUs by type and try them
-	for _, gpus := range allGpus.ByLibrary() {
-		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
-		if opts.NumGPU < 0 {
-			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
-				return true, estimatedVRAM
-			}
-		} else {
-			if layerCount > 0 && layerCount >= opts.NumGPU {
-				return true, estimatedVRAM
-			}
-		}
-	}
-	return false, estimatedVRAM
-}
-
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
-// The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
-	var memoryAvailable uint64
-	for _, info := range gpus {
-		memoryAvailable += info.FreeMemory
-	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
-	}
-
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
-
-	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-	memoryMinimum := gpus[0].MinimumMemory
-
-	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
-
-		// multimodal models require at least 2048 context
-		opts.NumCtx = max(opts.NumCtx, 2048)
-	}
-
-	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
-
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
-	if graphPartialOffload == 0 {
-		graphPartialOffload = ggml.KV().GQA() * kv / 6
-	}
-
-	if graphFullOffload == 0 {
-		graphFullOffload = graphPartialOffload
-	}
-
-	graphFullOffload *= uint64(len(gpus))
-	graphPartialOffload *= uint64(len(gpus))
-
-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
-	layers := ggml.Tensors().Layers()
-
-	var memoryLayerOutput uint64
-	if layer, ok := layers["output_norm"]; ok {
-		memoryLayerOutput += layer.size()
-	}
-
-	if layer, ok := layers["output"]; ok {
-		memoryLayerOutput += layer.size()
-	} else if layer, ok := layers["token_embd"]; ok {
-		memoryLayerOutput += layer.size()
-	}
-
-	if gpus[0].Library == "metal" && opts.UseMMap {
-		// memory is preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-		memoryRequiredPartial += memoryLayerOutput
-	}
-
-	var layerCount int
-	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
-
-		// KV is proportional to the number of layers
-		memoryLayer += kv / ggml.KV().BlockCount()
-
-		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
-			memoryRequiredPartial += memoryLayer
-			layerCount++
-		}
-	}
-
-	if gpus[0].Library != "metal" || !opts.UseMMap {
-		// memory was not preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-	}
-
-	if memoryAvailable > memoryRequiredTotal {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
-	}
-
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
-
-	slog.Info(
-		"offload to gpu",
-		slog.Group(
-			"layers",
-			// actual number of layers offloaded
-			"real", opts.NumGPU,
-			// estimated number of layers that can be offloaded
-			"estimate", layerCount,
-		),
-		slog.Group(
-			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
-			slog.Group(
-				"required",
-				// memory required for full offloading
-				"full", format.HumanBytes2(memoryRequiredTotal),
-				// memory required to offload layers.estimate layers
-				"partial", format.HumanBytes2(memoryRequiredPartial),
-				// memory of KV cache
-				"kv", format.HumanBytes2(kv),
-			),
-			slog.Group(
-				"weights",
-				// memory of the weights
-				"total", format.HumanBytes2(memoryWeights),
-				// memory of repeating layers
-				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
-				// memory of non-repeating layers
-				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
-			),
-			slog.Group(
-				"graph",
-				// memory of graph when fully offloaded
-				"full", format.HumanBytes2(graphFullOffload),
-				// memory of graph when not fully offloaded
-				"partial", format.HumanBytes2(graphPartialOffload),
-			),
-		),
-	)
-	return layerCount, uint64(memoryRequiredPartial)
-}
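The KV-cache estimate above is kv = 2 (k and v) * 2 bytes (fp16) * n_ctx * n_layer * (n_embd / n_head) * n_head_kv. A worked example in Go; the parameter values describe a hypothetical 7B-class model and are not taken from the diff:

    // Worked example of the fp16 KV-cache estimate used by EstimateGPULayers.
    package main

    import "fmt"

    func main() {
    	var (
    		nCtx    uint64 = 2048
    		nLayer  uint64 = 32
    		nEmbd   uint64 = 4096
    		nHead   uint64 = 32
    		nHeadKV uint64 = 32 // no grouped-query attention
    	)
    	kv := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV
    	fmt.Printf("KV cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30)) // 1.00 GiB
    	// With grouped-query attention (n_head_kv = 8) the cache shrinks 4x.
    	kvGQA := 2 * 2 * nCtx * nLayer * nEmbd / nHead * 8
    	fmt.Printf("KV cache with GQA: %.2f GiB\n", float64(kvGQA)/(1<<30)) // 0.25 GiB
    }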
@@ -1,12 +0,0 @@
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index e431c7f7..f077e688 100644
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -3,6 +3,7 @@
- // I'll gradually clean and extend it
- // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
- #include "clip.h"
-+#include "common.h"
- #include "log.h"
- #include "ggml.h"
- #include "ggml-alloc.h"

@@ -1,45 +0,0 @@
-diff --git a/ggml-metal.m b/ggml-metal.m
-index 0207b787..b5e9884b 100644
---- a/ggml-metal.m
-+++ b/ggml-metal.m
-@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
-         // to the matrix-vector kernel
-         int ne11_mm_min = 1;
-
--#if 0
-         // the numbers below are measured on M2 Ultra for 7B and 13B models
-         // these numbers do not translate to other devices or model sizes
-         // TODO: need to find a better approach
--        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
--            switch (src0t) {
--                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
--                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
--                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
--                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
--                case GGML_TYPE_Q4_0:
--                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
--                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
--                case GGML_TYPE_Q5_0: // not tested yet
--                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
--                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
--                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
--                default:             ne11_mm_min = 1;  break;
--            }
-+        switch (src0t) {
-+            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-+            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-+            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-+            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-+            case GGML_TYPE_Q4_0:
-+            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-+            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-+            case GGML_TYPE_Q5_0: // not tested yet
-+            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-+            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-+            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-+            default:             ne11_mm_min = 1;  break;
-         }
--#endif
-
-         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -9,7 +9,6 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
-	"runtime"
 	"strings"

 	"golang.org/x/exp/slices"

@@ -18,7 +17,7 @@ import (
 	"github.com/ollama/ollama/gpu"
 )

-var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
+var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")

 func Init() error {
 	payloadsDir, err := gpu.PayloadsDir()

@@ -26,15 +25,13 @@ func Init() error {
 		return err
 	}

-	if runtime.GOOS != "windows" {
-		slog.Info("extracting embedded files", "dir", payloadsDir)
-		binGlob := "build/*/*/*/bin/*"
+	slog.Info("extracting embedded files", "dir", payloadsDir)
+	binGlob := "build/*/*/*/bin/*"

 	// extract server libraries
 	err = extractFiles(payloadsDir, binGlob)
 	if err != nil {
 		return fmt.Errorf("extract binaries: %v", err)
-		}
 	}

 	var variants []string

@@ -141,23 +138,6 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	return servers
 }

-// Return the optimal server for this CPU architecture
-func serverForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	variant := gpu.GetCPUVariant()
-	availableServers := availableServers()
-	if variant != "" {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant {
-				return cmp
-			}
-		}
-	}
-	return "cpu"
-}

 // extract extracts the embedded files to the target directory
 func extractFiles(targetDir string, glob string) error {
 	files, err := fs.Glob(libEmbed, glob)
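extractFiles walks the embedded payload filesystem with fs.Glob and copies matches out to disk. A minimal, self-contained sketch of that embed-and-extract pattern follows; the payloads/ directory name and glob are illustrative, and the program assumes such a directory exists next to the source at build time.

    // Sketch of the embed-and-extract pattern behind extractFiles: compile
    // payloads in with go:embed, enumerate them with fs.Glob, copy them out.
    package main

    import (
    	"embed"
    	"fmt"
    	"io/fs"
    	"os"
    	"path/filepath"
    )

    //go:embed payloads/*
    var payloads embed.FS

    func extract(targetDir, glob string) error {
    	files, err := fs.Glob(payloads, glob)
    	if err != nil {
    		return err
    	}
    	for _, file := range files {
    		data, err := payloads.ReadFile(file)
    		if err != nil {
    			return err
    		}
    		dest := filepath.Join(targetDir, filepath.Base(file))
    		if err := os.WriteFile(dest, data, 0o755); err != nil {
    			return err
    		}
    		fmt.Println("extracted", dest)
    	}
    	return nil
    }

    func main() {
    	if err := extract(os.TempDir(), "payloads/*"); err != nil {
    		fmt.Fprintln(os.Stderr, err)
    	}
    }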
285
llm/server.go
@@ -21,43 +21,21 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"golang.org/x/sync/semaphore"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
"github.com/ollama/ollama/gpu"
|
"github.com/ollama/ollama/gpu"
|
||||||
)
|
)
|
||||||
|
|
||||||
type LlamaServer interface {
|
// LlamaServer is an instance of the llama.cpp server
|
||||||
Ping(ctx context.Context) error
|
type LlamaServer struct {
|
||||||
WaitUntilRunning(ctx context.Context) error
|
|
||||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
|
||||||
Embedding(ctx context.Context, prompt string) ([]float64, error)
|
|
||||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
|
||||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
|
||||||
Close() error
|
|
||||||
EstimatedVRAM() uint64
|
|
||||||
}
|
|
||||||
|
|
||||||
// llmServer is an instance of the llama.cpp server
|
|
||||||
type llmServer struct {
|
|
||||||
port int
|
port int
|
||||||
cmd *exec.Cmd
|
cmd *exec.Cmd
|
||||||
done chan error // Channel to signal when the process exits
|
done chan error // Channel to signal when the process exits
|
||||||
status *StatusWriter
|
status *StatusWriter
|
||||||
options api.Options
|
options api.Options
|
||||||
|
|
||||||
// TODO - this should be broken down by GPU
|
|
||||||
estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
|
|
||||||
|
|
||||||
sem *semaphore.Weighted
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadModel(model string) (*GGML, error) {
|
func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
|
||||||
if _, err := os.Stat(model); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
f, err := os.Open(model)
|
f, err := os.Open(model)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -65,13 +43,10 @@ func LoadModel(model string) (*GGML, error) {
     defer f.Close()
 
     ggml, _, err := DecodeGGML(f)
-    return ggml, err
-}
-
-// NewLlamaServer will run a server for the given GPUs
-// The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
-    var err error
+    if err != nil {
+        return nil, err
+    }
+
     if opts.NumCtx > int(ggml.KV().ContextLength()) {
         slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
         opts.NumCtx = int(ggml.KV().ContextLength())
@@ -81,51 +56,91 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         opts.NumCtx = 4
     }
 
-    cpuRunner := ""
-    var estimatedVRAM uint64
-    var systemMemory uint64
-    if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
-        // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
-        cpuRunner = serverForCpu()
-    } else {
-        if gpus[0].Library == "metal" {
-            memInfo, err := gpu.GetCPUMem()
-            if err != nil {
-                slog.Error("failed to lookup system memory", "error", err)
-            } else {
-                systemMemory = memInfo.TotalMemory
-                slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
-            }
-        }
-        var layers int
-        layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
-
-        if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
-            // disable partial offloading when model is greater than total system memory as this
-            // can lead to locking up the system
-            opts.NumGPU = 0
-        } else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
-            opts.NumGPU = layers
-        }
-    }
-
-    // Loop through potential servers
-    finalErr := fmt.Errorf("no suitable llama servers found")
+    memoryAvailable, _ := gpu.CheckVRAM()
+    info := gpu.GetGPUInfo()
+
+    memoryMinimum := info.MinimumMemory
+    for _, projector := range projectors {
+        memoryMinimum += projectorMemoryRequirements(projector)
+
+        // multimodal models require at least 2048 context
+        opts.NumCtx = max(opts.NumCtx, 2048)
+    }
+
+    // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+    var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
+
+    graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+    if graphPartialOffload == 0 {
+        graphPartialOffload = ggml.KV().GQA() * kv / 6
+    }
+
+    if graphFullOffload == 0 {
+        graphFullOffload = graphPartialOffload
+    }
+
+    // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+    memoryRequiredTotal := memoryMinimum + graphFullOffload
+
+    // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+    memoryRequiredPartial := memoryMinimum + graphPartialOffload
+
+    if info.Library != "metal" {
+        if memoryRequiredPartial > memoryAvailable {
+            info.Library = "cpu"
+        }
+    }
+
+    var layerCount int
+    layers := ggml.Tensors().Layers()
+    for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+        memoryLayer := layers[fmt.Sprintf("%d", i)].size()
+
+        // KV is proportional to the number of layers
+        memoryLayer += kv / ggml.KV().BlockCount()
+
+        memoryRequiredTotal += memoryLayer
+        if memoryAvailable > memoryRequiredPartial+memoryLayer {
+            memoryRequiredPartial += memoryLayer
+            layerCount++
+        }
+    }
+
+    memoryLayerOutput := layers["output"].size()
+    memoryRequiredTotal += memoryLayerOutput
+
+    if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
+        // disable partial offloading when model is greater than total system memory
+        opts.NumGPU = 0
+    } else if memoryAvailable > memoryRequiredTotal {
+        layerCount = int(ggml.KV().BlockCount()) + 1
+        memoryRequiredPartial = memoryRequiredTotal
+    }
+
+    if opts.NumGPU < 0 {
+        opts.NumGPU = layerCount
+    }
+
+    slog.Info(
+        "offload to gpu",
+        "reallayers", opts.NumGPU,
+        "layers", layerCount,
+        "required", format.HumanBytes2(memoryRequiredTotal),
+        "used", format.HumanBytes2(memoryRequiredPartial),
+        "available", format.HumanBytes2(memoryAvailable),
+        "kv", format.HumanBytes2(kv),
+        "fulloffload", format.HumanBytes2(graphFullOffload),
+        "partialoffload", format.HumanBytes2(graphPartialOffload),
+    )
 
     if len(adapters) > 1 {
         return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
     }
 
     availableServers := availableServers()
-    var servers []string
-    if cpuRunner != "" {
-        servers = []string{cpuRunner}
-    } else {
-        servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
-    }
-
-    demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+    servers := serversForGpu(info)
+
+    demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
 	if demandLib != "" {
         serverPath := availableServers[demandLib]
         if serverPath == "" {
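Note: the added block sizes the fp16 KV cache straight from the GGUF metadata (the `kv` line above) instead of calling `EstimateGPULayers`. A self-contained worked example of that arithmetic; the metadata values are hypothetical (llama-2-7B-style), not taken from this diff:

```go
package main

import "fmt"

func main() {
    // kv = 2 (k and v) * 2 (bytes per fp16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
    var (
        nCtx    uint64 = 2048 // opts.NumCtx
        nLayer  uint64 = 32   // ggml.KV().BlockCount()
        nEmbd   uint64 = 4096 // ggml.KV().EmbeddingLength()
        nHead   uint64 = 32   // ggml.KV().HeadCount()
        nHeadKV uint64 = 32   // ggml.KV().HeadCountKV()
    )
    kv := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV
    // 4 * 2048 * 32 * 4096 / 32 * 32 = 1073741824 bytes, i.e. 1.0 GiB
    fmt.Printf("estimated fp16 KV cache: %d bytes (%.1f GiB)\n", kv, float64(kv)/(1<<30))
}
```

The layer loop above then spreads this total evenly (`kv / ggml.KV().BlockCount()`) across layers when deciding how many fit in `memoryAvailable`.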
@@ -137,7 +152,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }
 
     if len(servers) == 0 {
-        return nil, fmt.Errorf("no servers found for %v", gpus)
+        return nil, fmt.Errorf("no servers found for %v", info)
     }
 
     params := []string{
@@ -194,26 +209,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         params = append(params, "--numa")
     }
 
-    // "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-    numParallel := 1
-    if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-        numParallel, err = strconv.Atoi(onp)
-        if err != nil || numParallel <= 0 {
-            err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-            slog.Error("misconfiguration", "error", err)
-            return nil, err
-        }
-    }
-    params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
+    // Loop through potential servers
+    var finalErr error
 
     for i := 0; i < len(servers); i++ {
         dir := availableServers[servers[i]]
-        if dir == "" {
-            // Shouldn't happen
-            finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-            slog.Error("sever list inconsistent", "error", finalErr)
-            continue
-        }
-
         // Find an availableServers port, retry on each iterration in case the failure was a port conflict race
         port := 0
@@ -236,60 +235,30 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }
         // append the server directory to LD_LIBRARY_PATH/PATH
         libraryPaths := []string{dir}
 
         if libraryPath, ok := os.LookupEnv(pathEnv); ok {
             // Append our runner directory to the path
             // This will favor system libraries over our bundled library dependencies
             libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
         }
 
-        // Note: we always put the dependency path first
-        // since this was the exact version we verified for AMD GPUs
-        // and we favor what the user had in their path
-        if gpus[0].DependencyPath != "" {
-            // TODO refine for multi-gpu support
-            libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
-        }
-
         server := filepath.Join(dir, "ollama_llama_server")
         if runtime.GOOS == "windows" {
             server = server + ".exe"
         }
 
-        // Detect tmp cleaners wiping out the file
-        _, err := os.Stat(server)
-        if errors.Is(err, os.ErrNotExist) {
-            slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-            err = Init()
-            if err != nil {
-                slog.Warn("failed to reinitialize payloads", "error", err)
-                return nil, err
-            }
-        }
-
-        s := &llmServer{
-            port:          port,
-            cmd:           exec.Command(server, finalParams...),
-            status:        NewStatusWriter(os.Stderr),
-            options:       opts,
-            estimatedVRAM: estimatedVRAM,
-            sem:           semaphore.NewWeighted(int64(numParallel)),
-        }
+        s := &LlamaServer{
+            port:    port,
+            cmd:     exec.Command(server, finalParams...),
+            status:  NewStatusWriter(os.Stderr),
+            options: opts,
+        }
 
         libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
+        slog.Debug(libEnv)
         s.cmd.Env = append(os.Environ(), libEnv)
         s.cmd.Stdout = os.Stdout
         s.cmd.Stderr = s.status
 
-        // TODO - multiple GPU selection logic...
-        key, val := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
-        if key != "" {
-            s.cmd.Env = append(s.cmd.Env, key+"="+val)
-        }
-
         slog.Info("starting llama server", "cmd", s.cmd.String())
-        // Log at debug as the environment is inherited and might contain sensitive information
-        slog.Debug("subprocess", "environment", s.cmd.Env)
 
         if err = s.cmd.Start(); err != nil {
             msg := ""
@@ -307,13 +276,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         _ = s.cmd.Wait()
     }()
 
-    // TODO - make sure this is all wired up correctly
-    // if err = s.WaitUntilRunning(); err != nil {
-    // 	slog.Error("error starting llama server", "server", servers[i], "error", err)
-    // 	s.Close()
-    // 	finalErr = err
-    // 	continue
-    // }
     return s, nil
 }
 
@@ -351,21 +313,6 @@ const ( // iota is reset to 0
     ServerStatusError
 )
 
-func (s ServerStatus) ToString() string {
-    switch s {
-    case ServerStatusReady:
-        return "llm server ready"
-    case ServerStatusNoSlotsAvaialble:
-        return "llm busy - no slots available"
-    case ServerStatusLoadingModel:
-        return "llm server loading model"
-    case ServerStatusNotResponding:
-        return "llm server not responding"
-    default:
-        return "llm server error"
-    }
-}
-
 type ServerStatusResp struct {
     Status    string `json:"status"`
     SlotsIdle int    `json:"slots_idle"`
@@ -373,7 +320,7 @@ type ServerStatusResp struct {
     Error string `json:"error"`
 }
 
-func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
+func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
     // Fail fast if its exited
     if s.cmd.ProcessState != nil {
         msg := ""
@@ -420,7 +367,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
         }
     }
 
-func (s *llmServer) Ping(ctx context.Context) error {
+func (s *LlamaServer) Ping(ctx context.Context) error {
     _, err := s.getServerStatus(ctx)
     if err != nil {
         slog.Debug("server unhealthy", "error", err)
@@ -429,7 +376,7 @@ func (s *llmServer) Ping(ctx context.Context) error {
     return nil
 }
 
-func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
+func (s *LlamaServer) WaitUntilRunning() error {
     start := time.Now()
     // TODO we need to wire up a better way to detect hangs during model load and startup of the server
     expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
@@ -440,9 +387,6 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
     var lastStatus ServerStatus = -1
     for {
         select {
-        case <-ctx.Done():
-            slog.Info("context expired before server started")
-            return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
         case err := <-s.done:
             msg := ""
             if s.status != nil && s.status.LastErrMsg != "" {
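Note: removing the `ctx.Done()` case (together with the context-free `WaitUntilRunning()` signature above) means a caller can no longer cancel the startup wait; only process exit or the 10-minute deadline breaks the loop. A small self-contained sketch of the pattern being dropped, with hypothetical names:

```go
package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// waitForRunner is illustrative only: with the ctx.Done() case the caller's
// context can abort the wait; without it, only done or the deadline can.
func waitForRunner(ctx context.Context, done <-chan error) error {
    select {
    case <-ctx.Done():
        return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
    case err := <-done:
        return fmt.Errorf("llama runner process exited: %w", err)
    case <-time.After(10 * time.Minute):
        return errors.New("llama runner did not start in time")
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
    defer cancel()
    fmt.Println(waitForRunner(ctx, make(chan error))) // cancelled after 1s
}
```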
@@ -466,9 +410,9 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
             return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
         }
 
-        c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+        ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
         defer cancel()
-        status, err := s.getServerStatus(c)
+        status, err := s.getServerStatus(ctx)
         if err != nil && lastStatus != status {
             slog.Debug("server not yet available", "error", err)
             lastStatus = status
@@ -554,19 +498,7 @@ type CompletionResponse struct {
     EvalDuration time.Duration
 }
 
-func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
-    if err := s.sem.Acquire(ctx, 1); err != nil {
-        slog.Error("Failed to acquire semaphore", "error", err)
-        return err
-    }
-    defer s.sem.Release(1)
-
-    // only allow maximum 10 "context shifts" to avoid infinite generation
-    if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
-        req.Options.NumPredict = 10 * s.options.NumCtx
-        slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
-    }
-
+func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
     request := map[string]any{
         "prompt": req.Prompt,
         "stream": true,
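Note: the deleted `sem.Acquire`/`Release` pair is what throttled concurrent `Completion` (and `Embedding`) calls to the runner's `--parallel` slot count. For reference, a minimal self-contained sketch of that gating pattern with `golang.org/x/sync/semaphore`; the `gate` type and `do` method are hypothetical:

```go
package main

import (
    "context"
    "log/slog"

    "golang.org/x/sync/semaphore"
)

// gate limits concurrent requests to numParallel, mirroring the
// sem.Acquire/Release pattern removed by this diff.
type gate struct{ sem *semaphore.Weighted }

func newGate(numParallel int64) *gate {
    return &gate{sem: semaphore.NewWeighted(numParallel)}
}

func (g *gate) do(ctx context.Context, fn func() error) error {
    // Blocks until a slot frees up or ctx is cancelled.
    if err := g.sem.Acquire(ctx, 1); err != nil {
        slog.Error("Failed to acquire semaphore", "error", err)
        return err
    }
    defer g.sem.Release(1)
    return fn()
}

func main() {
    g := newGate(1)
    _ = g.do(context.Background(), func() error { return nil })
}
```

Without it, request concurrency control has to live elsewhere or be reintroduced by the caller.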
@@ -597,7 +529,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
     if err != nil {
         return err
     } else if status != ServerStatusReady {
-        return fmt.Errorf("unexpected server status: %s", status.ToString())
+        return fmt.Errorf("unexpected server status: %d", status)
     }
 
     if req.Format == "json" {
@@ -744,18 +676,13 @@ type EmbeddingResponse struct {
     Embedding []float64 `json:"embedding"`
 }
 
-func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
-    if err := s.sem.Acquire(ctx, 1); err != nil {
-        slog.Error("Failed to acquire semaphore", "error", err)
-        return nil, err
-    }
-    defer s.sem.Release(1)
-
+func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
     // Make sure the server is ready
     status, err := s.getServerStatus(ctx)
     if err != nil {
         return nil, err
     } else if status != ServerStatusReady {
-        return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
+        return nil, fmt.Errorf("unexpected server status: %d", status)
     }
 
     data, err := json.Marshal(TokenizeRequest{Content: prompt})
@@ -801,13 +728,13 @@ type TokenizeResponse struct {
     Tokens []int `json:"tokens"`
 }
 
-func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
+func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
     // Make sure the server is ready
     status, err := s.getServerStatus(ctx)
     if err != nil {
         return nil, err
-    } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
-        return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
+    } else if status != ServerStatusReady {
+        return nil, fmt.Errorf("unexpected server status: %d", status)
     }
 
     data, err := json.Marshal(TokenizeRequest{Content: content})
@@ -853,13 +780,13 @@ type DetokenizeResponse struct {
     Content string `json:"content"`
 }
 
-func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
+func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
     // Make sure the server is ready
     status, err := s.getServerStatus(ctx)
     if err != nil {
         return "", err
-    } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
-        return "", fmt.Errorf("unexpected server status: %s", status.ToString())
+    } else if status != ServerStatusReady {
+        return "", fmt.Errorf("unexpected server status: %d", status)
     }
 
     data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
@@ -897,7 +824,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
     return decoded.Content, nil
 }
 
-func (s *llmServer) Close() error {
+func (s *LlamaServer) Close() error {
     if s.cmd != nil {
         slog.Debug("stopping llama server")
         return s.cmd.Process.Kill()
@@ -906,10 +833,6 @@ func (s *llmServer) Close() error {
     return nil
 }
 
-func (s *llmServer) EstimatedVRAM() uint64 {
-    return s.estimatedVRAM
-}
-
 func parseDurationMs(ms float64) time.Duration {
     dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
     if err != nil {
16
macapp/.eslintrc.json
Normal file
@@ -0,0 +1,16 @@
+{
+  "env": {
+    "browser": true,
+    "es6": true,
+    "node": true
+  },
+  "extends": [
+    "eslint:recommended",
+    "plugin:@typescript-eslint/eslint-recommended",
+    "plugin:@typescript-eslint/recommended",
+    "plugin:import/recommended",
+    "plugin:import/electron",
+    "plugin:import/typescript"
+  ],
+  "parser": "@typescript-eslint/parser"
+}
92
macapp/.gitignore
vendored
Normal file
@@ -0,0 +1,92 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+.DS_Store
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+
+# nyc test coverage
+.nyc_output
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# TypeScript v1 declaration files
+typings/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variables file
+.env
+.env.test
+
+# parcel-bundler cache (https://parceljs.org/)
+.cache
+
+# next.js build output
+.next
+
+# nuxt.js build output
+.nuxt
+
+# vuepress build output
+.vuepress/dist
+
+# Serverless directories
+.serverless/
+
+# FuseBox cache
+.fusebox/
+
+# DynamoDB Local files
+.dynamodb/
+
+# Webpack
+.webpack/
+
+# Vite
+.vite/
+
+# Electron-Forge
+out/
21
macapp/README.md
Normal file
@@ -0,0 +1,21 @@
+# Desktop
+
+This app builds upon Ollama to provide a desktop experience for running models.
+
+## Developing
+
+First, build the `ollama` binary:
+
+```
+cd ..
+go build .
+```
+
+Then run the desktop app with `npm start`:
+
+```
+cd macapp
+npm install
+npm start
+```
BIN
macapp/assets/iconDarkTemplate.png
Normal file
Size: 402 B
BIN
macapp/assets/iconDarkTemplate@2x.png
Normal file
Size: 741 B
BIN
macapp/assets/iconDarkUpdateTemplate.png
Normal file
Size: 440 B
BIN
macapp/assets/iconDarkUpdateTemplate@2x.png
Normal file
Size: 763 B
BIN
macapp/assets/iconTemplate.png
Normal file
Size: 447 B
BIN
macapp/assets/iconTemplate@2x.png
Normal file
Size: 891 B
BIN
macapp/assets/iconUpdateTemplate.png
Normal file
Size: 443 B
BIN
macapp/assets/iconUpdateTemplate@2x.png
Normal file
Size: 844 B
78
macapp/forge.config.ts
Normal file
@@ -0,0 +1,78 @@
+import type { ForgeConfig } from '@electron-forge/shared-types'
+import { MakerSquirrel } from '@electron-forge/maker-squirrel'
+import { MakerZIP } from '@electron-forge/maker-zip'
+import { PublisherGithub } from '@electron-forge/publisher-github'
+import { AutoUnpackNativesPlugin } from '@electron-forge/plugin-auto-unpack-natives'
+import { WebpackPlugin } from '@electron-forge/plugin-webpack'
+import * as path from 'path'
+import * as fs from 'fs'
+
+import { mainConfig } from './webpack.main.config'
+import { rendererConfig } from './webpack.renderer.config'
+
+const packageJson = JSON.parse(fs.readFileSync(path.resolve(__dirname, './package.json'), 'utf8'))
+
+const config: ForgeConfig = {
+  packagerConfig: {
+    appVersion: process.env.VERSION || packageJson.version,
+    asar: true,
+    icon: './assets/icon.icns',
+    extraResource: [
+      '../dist/ollama',
+      path.join(__dirname, './assets/iconTemplate.png'),
+      path.join(__dirname, './assets/iconTemplate@2x.png'),
+      path.join(__dirname, './assets/iconUpdateTemplate.png'),
+      path.join(__dirname, './assets/iconUpdateTemplate@2x.png'),
+      path.join(__dirname, './assets/iconDarkTemplate.png'),
+      path.join(__dirname, './assets/iconDarkTemplate@2x.png'),
+      path.join(__dirname, './assets/iconDarkUpdateTemplate.png'),
+      path.join(__dirname, './assets/iconDarkUpdateTemplate@2x.png'),
+    ],
+    ...(process.env.SIGN
+      ? {
+          osxSign: {
+            identity: process.env.APPLE_IDENTITY,
+          },
+          osxNotarize: {
+            tool: 'notarytool',
+            appleId: process.env.APPLE_ID || '',
+            appleIdPassword: process.env.APPLE_PASSWORD || '',
+            teamId: process.env.APPLE_TEAM_ID || '',
+          },
+        }
+      : {}),
+    osxUniversal: {
+      x64ArchFiles: '**/ollama',
+    },
+  },
+  rebuildConfig: {},
+  makers: [new MakerSquirrel({}), new MakerZIP({}, ['darwin'])],
+  hooks: {
+    readPackageJson: async (_, packageJson) => {
+      return { ...packageJson, version: process.env.VERSION || packageJson.version }
+    },
+  },
+  plugins: [
+    new AutoUnpackNativesPlugin({}),
+    new WebpackPlugin({
+      mainConfig,
+      devContentSecurityPolicy: `default-src * 'unsafe-eval' 'unsafe-inline'; img-src data: 'self'`,
+      renderer: {
+        config: rendererConfig,
+        nodeIntegration: true,
+        entryPoints: [
+          {
+            html: './src/index.html',
+            js: './src/renderer.tsx',
+            name: 'main_window',
+            preload: {
+              js: './src/preload.ts',
+            },
+          },
+        ],
+      },
+    }),
+  ],
+}
+
+export default config