Compare commits: v0.0.19...matt/strea

8 Commits
| Author | SHA1 | Date |
|---|---|---|
| | e2389b63aa | |
| | f89c23764b | |
| | d028853879 | |
| | 949553db23 | |
| | 0c5a454361 | |
| | f59c4d03f7 | |
| | 7dee25a07f | |
| | f221637053 | |
Dockerfile.cuda · new file · 22 lines

````diff
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+
+WORKDIR /go/src/github.com/jmorganca/ollama
+RUN apt-get update && apt-get install -y git build-essential cmake
+ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
+
+COPY . .
+RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .
+
+FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
+ENV OLLAMA_HOST 0.0.0.0
+
+ARG USER=ollama
+ARG GROUP=ollama
+RUN groupadd $GROUP && useradd -m -g $GROUP $USER
+
+COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+
+USER $USER:$GROUP
+ENTRYPOINT ["/bin/ollama"]
+CMD ["serve"]
````
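This is a two-stage build: the CUDA `devel` image carries the toolchain needed by `go generate` and the statically linked `go build`, while the final `runtime` image ships only the compiled binary, listening on all interfaces as an unprivileged `ollama` user. Building it would presumably look like `docker build -f Dockerfile.cuda -t ollama-cuda .` followed by `docker run --gpus=all -p 11434:11434 ollama-cuda`; the exact invocation is an assumption, as it is not part of this changeset.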
````diff
@@ -291,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumKeep:  -1,
 		NumBatch: 512,
-		NumGPU:   1,
+		NumGPU:   -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,
````
````diff
@@ -20,6 +20,10 @@ Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` a
 All durations are returned in nanoseconds.
 
+### Streams
+
+Many API responses are streams of JSON objects showing the current status. For examples of working with streams in various languages, see [streaming.md](./streaming.md)
+
 ## Generate a completion
 
 ```
````
````diff
@@ -6,6 +6,10 @@
 Install required tools:
 
+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```
@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- run `go generate ./...`
+- run `go build .`
````
docs/streaming.md · new file · 35 lines

````diff
@@ -0,0 +1,35 @@
+# Streaming responses in the Ollama Client API
+
+## JavaScript / TypeScript / Deno
+
+```javascript
+const pull = async () => {
+  const request = await fetch("http://localhost:11434/api/pull", {
+    method: "POST",
+    body: JSON.stringify({ name: "llama2:7b-q5_0" }),
+  });
+
+  const reader = await request.body?.pipeThrough(new TextDecoderStream());
+  if (!reader) throw new Error("No reader");
+  for await (const chunk of reader) {
+    const out = JSON.parse(chunk);
+    if (out.status.startsWith("downloading")) {
+      console.log(`${out.status} - ${(out.completed / out.total) * 100}%`);
+    }
+  }
+}
+
+pull();
+```
+
+## Python
+
+```python
+import requests
+import json
+
+response = requests.post("http://localhost:11434/api/pull", json={"name": "llama2:7b-q5_0"}, stream=True)
+for data in response.iter_lines():
+    out = json.loads(data)
+    if "completed" in out:
+        print(out["completed"] / out["total"] * 100)
+```
````
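The same newline-delimited JSON stream can be consumed from Go, the language the rest of this changeset is written in. A minimal sketch, not part of the diff; endpoint, request body, and field names are taken from the examples above:

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Request the same streaming pull endpoint used in the examples above.
	body := bytes.NewBufferString(`{"name": "llama2:7b-q5_0"}`)
	resp, err := http.Post("http://localhost:11434/api/pull", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line of the response body is one JSON status object.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var out struct {
			Status    string `json:"status"`
			Completed int64  `json:"completed"`
			Total     int64  `json:"total"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &out); err != nil {
			continue // skip partial or non-JSON lines
		}
		if out.Total > 0 {
			fmt.Printf("%s - %.1f%%\n", out.Status, float64(out.Completed)/float64(out.Total)*100)
		}
	}
}
```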
llm/falcon.go · new file · 22 lines

````diff
@@ -0,0 +1,22 @@
+package llm
+
+const ModelFamilyFalcon = "falcon"
+
+const (
+	falconModelType7B   = 32
+	falconModelType40B  = 60
+	falconModelType180B = 80
+)
+
+func falconModelType(numLayer uint32) string {
+	switch numLayer {
+	case 32:
+		return "7B"
+	case 60:
+		return "40B"
+	case 80:
+		return "180B"
+	default:
+		return "Unknown"
+	}
+}
````
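For reference, a quick sketch of how the new helper maps GGUF block counts to Falcon size labels; the usage below is assumed, not shown in the diff, and would sit inside package `llm`:

```go
// Block counts come from the falcon.block_count GGUF key.
fmt.Println(falconModelType(32)) // "7B"
fmt.Println(falconModelType(60)) // "40B"
fmt.Println(falconModelType(90)) // "Unknown"
```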
llm/ggml.go · 105 changed lines

````diff
@@ -8,54 +8,77 @@ import (
 	"sync"
 )
 
-type ModelFamily string
-
-const ModelFamilyUnknown ModelFamily = "unknown"
-
-type ModelType uint32
-
-const (
-	ModelType3B  ModelType = 26
-	ModelType7B  ModelType = 32
-	ModelType13B ModelType = 40
-	ModelType34B ModelType = 48
-	ModelType30B ModelType = 60
-	ModelType65B ModelType = 80
-)
-
-func (mt ModelType) String() string {
-	switch mt {
-	case ModelType3B:
-		return "3B"
-	case ModelType7B:
-		return "7B"
-	case ModelType13B:
-		return "13B"
-	case ModelType34B:
-		return "34B"
-	case ModelType30B:
-		return "30B"
-	case ModelType65B:
-		return "65B"
-	default:
-		return "Unknown"
-	}
-}
-
-type FileType interface {
-	String() string
-}
-
 type GGML struct {
 	magic uint32
 	container
 	model
 }
 
+const (
+	fileTypeF32 uint32 = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ8_0 uint32 = iota + 2
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+)
+
+func fileType(fileType uint32) string {
+	switch fileType {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	default:
+		return "Unknown"
+	}
+}
+
 type model interface {
-	ModelFamily() ModelFamily
-	ModelType() ModelType
-	FileType() FileType
+	ModelFamily() string
+	ModelType() string
+	FileType() string
 }
 
 type container interface {
````
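One subtlety in the new `fileType` constants: `fileTypeQ8_0 uint32 = iota + 2` restarts the sequence two values higher, presumably leaving a gap for the Q4_2/Q4_3 quantization types that llama.cpp has since removed. A small sketch of the resulting values:

```go
// With iota, fileTypeQ4_1_F16 == 4; the "+ 2" offset then makes
// fileTypeQ8_0 == 7, skipping the slots once used by Q4_2 (5) and Q4_3 (6).
fmt.Println(fileTypeQ4_1_F16, fileTypeQ8_0, fileTypeQ5_0) // 4 7 8
```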
llm/gguf.go · 42 changed lines

````diff
@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"log"
 	"path"
 	"sync"
 )
@@ -87,38 +86,43 @@ func (llm *ggufModel) NumKV() uint64 {
 	return llm.V2.NumKV
 }
 
-func (llm *ggufModel) ModelFamily() ModelFamily {
+func (llm *ggufModel) ModelFamily() string {
 	t, ok := llm.kv["general.architecture"].(string)
 	if ok {
-		return ModelFamily(t)
+		return t
 	}
 
-	log.Printf("unknown model family: %T", t)
-	return ModelFamilyUnknown
+	return "unknown"
 }
 
-func (llm *ggufModel) ModelType() ModelType {
+func (llm *ggufModel) ModelType() string {
 	switch llm.ModelFamily() {
-	case ModelFamilyLlama:
-		blocks, ok := llm.kv["llama.block_count"].(uint32)
-		if ok {
-			return ModelType(blocks)
+	case "llama":
+		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
+			heads, headsOK := llm.kv["llama.head_count"].(uint32)
+			headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
+			if headsOK && headsKVsOK && heads/headKVs == 8 {
+				return "70B"
+			}
+
+			return llamaModelType(blocks)
+		}
+	case "falcon":
+		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
+			return falconModelType(blocks)
 		}
 	}
 
-	return ModelType7B
+	return "Unknown"
 }
 
-func (llm *ggufModel) FileType() FileType {
-	switch llm.ModelFamily() {
-	case ModelFamilyLlama:
-		t, ok := llm.kv["general.file_type"].(uint32)
-		if ok {
-			return llamaFileType(t)
-		}
+func (llm *ggufModel) FileType() string {
+	t, ok := llm.kv["general.file_type"].(uint32)
+	if ok {
+		return fileType(t)
 	}
 
-	return llamaFileTypeF16
+	return "Unknown"
 }
 
 func (llm *ggufModel) Decode(r io.Reader) error {
````
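The `ModelType` change deserves a note: a Llama 2 70B model cannot be distinguished from a 65B model by block count alone, so the code falls back to grouped-query attention metadata. A head count to KV head count ratio of 8 marks the model as 70B. A sketch with assumed metadata values (Llama 2 70B uses 64 attention heads over 8 KV heads):

```go
// Assumed GGUF metadata values for a Llama 2 70B model:
heads := uint32(64)  // llama.head_count
headKVs := uint32(8) // llama.head_count_kv
if heads/headKVs == 8 {
	fmt.Println("70B") // the GQA ratio, not the block count, identifies 70B
}
```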
````diff
@@ -4,12 +4,14 @@
 package llm
 
 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+
+//go:generate git submodule update --force ggml
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
+
+//go:generate git submodule update --force gguf
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build gguf/build/cpu --target server --config Release
````
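The `-command` directive used above is standard `go generate` syntax: it defines an alias that holds for the rest of the file, which is what lets the patch lines shrink to `git-apply <patch>`. A minimal illustration:

```go
package example

// Define an alias: from here on, "greet" expands to "echo Hello".
//go:generate -command greet echo Hello
//go:generate greet world
// Running `go generate` on this file executes: echo Hello world
```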
````diff
@@ -1,12 +1,16 @@
 package llm
 
 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force ggml
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmake --fresh -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force gguf
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build gguf/build/cpu --target server --config Release
````
|||||||
@@ -1,12 +1,16 @@
|
|||||||
package llm
|
package llm
|
||||||
|
|
||||||
//go:generate git submodule init
|
//go:generate git submodule init
|
||||||
//go:generate git submodule update --force ggml gguf
|
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
//go:generate git submodule update --force ggml
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
//go:generate -command git-apply git -C ggml apply
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||||
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||||
|
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||||
|
|
||||||
|
//go:generate git submodule update --force gguf
|
||||||
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
||||||
|
|||||||
llm/llama.cpp/generate_linux.go · new file · 15 lines

````diff
@@ -0,0 +1,15 @@
+package llm
+
+//go:generate git submodule init
+
+//go:generate git submodule update --force ggml
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/gpu --target server --config Release
+
+//go:generate git submodule update --force gguf
+//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/gpu --target server --config Release
````
````diff
@@ -0,0 +1,32 @@
+From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
+From: Kylin <56434533+KyL0N@users.noreply.github.com>
+Date: Tue, 22 Aug 2023 15:14:23 +0800
+Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
+
+* ggml: support CUDA's half type for aarch64(#1455)
+support CUDA's half type for aarch64 in ggml_fp16_t definition
+
+* ggml: use __CUDACC__ to recognise nvcc compiler
+---
+ ggml.h | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/ggml.h b/ggml.h
+index 544ad2d..0ec7ec5 100644
+--- a/ggml.h
++++ b/ggml.h
+@@ -259,8 +259,9 @@
+ extern "C" {
+ #endif
+ 
+-#ifdef __ARM_NEON
+-    // we use the built-in 16-bit float type
++#if defined(__ARM_NEON) && defined(__CUDACC__)
++    typedef half ggml_fp16_t;
++#elif defined(__ARM_NEON)
+     typedef __fp16 ggml_fp16_t;
+ #else
+     typedef uint16_t ggml_fp16_t;
+--
+2.39.2 (Apple Git-143)
+
````
llm/llama.go · 176 changed lines

````diff
@@ -58,6 +58,12 @@ func chooseRunner(gpuPath, cpuPath string) string {
 		if llamaPath == osPath(gpuPath) {
 			files = append(files, "ggml-metal.metal")
 		}
+	case "linux":
+		// check if there is a GPU available
+		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
+			// this error was logged on start-up, so we don't need to log it again
+			llamaPath = osPath(cpuPath)
+		}
 	}
 
 	for _, f := range files {
@@ -89,38 +95,39 @@ func chooseRunner(gpuPath, cpuPath string) string {
 	return runPath
 }
 
-const ModelFamilyLlama ModelFamily = "llama"
-
 type llamaModel struct {
 	hyperparameters llamaHyperparameters
 }
 
-func (llm *llamaModel) ModelFamily() ModelFamily {
-	return ModelFamilyLlama
+func (llm *llamaModel) ModelFamily() string {
+	return "llama"
 }
 
-func (llm *llamaModel) ModelType() ModelType {
-	switch llm.hyperparameters.NumLayer {
+func llamaModelType(numLayer uint32) string {
+	switch numLayer {
 	case 26:
-		return ModelType3B
+		return "3B"
 	case 32:
-		return ModelType7B
+		return "7B"
 	case 40:
-		return ModelType13B
+		return "13B"
 	case 48:
-		return ModelType34B
+		return "34B"
 	case 60:
-		return ModelType30B
+		return "30B"
 	case 80:
-		return ModelType65B
-	}
-
-	// TODO: find a better default
-	return ModelType7B
+		return "65B"
+	default:
+		return "Unknown"
+	}
 }
 
-func (llm *llamaModel) FileType() FileType {
-	return llm.hyperparameters.FileType
+func (llm *llamaModel) ModelType() string {
+	return llamaModelType(llm.hyperparameters.NumLayer)
+}
+
+func (llm *llamaModel) FileType() string {
+	return fileType(llm.hyperparameters.FileType)
 }
 
 type llamaHyperparameters struct {
@@ -137,70 +144,7 @@ type llamaHyperparameters struct {
 	NumRot uint32
 
 	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
-	FileType llamaFileType
-}
-
-type llamaFileType uint32
-
-const (
-	llamaFileTypeF32 llamaFileType = iota
-	llamaFileTypeF16
-	llamaFileTypeQ4_0
-	llamaFileTypeQ4_1
-	llamaFileTypeQ4_1_F16
-	llamaFileTypeQ8_0 llamaFileType = iota + 2
-	llamaFileTypeQ5_0
-	llamaFileTypeQ5_1
-	llamaFileTypeQ2_K
-	llamaFileTypeQ3_K_S
-	llamaFileTypeQ3_K_M
-	llamaFileTypeQ3_K_L
-	llamaFileTypeQ4_K_S
-	llamaFileTypeQ4_K_M
-	llamaFileTypeQ5_K_S
-	llamaFileTypeQ5_K_M
-	llamaFileTypeQ6_K
-)
-
-func (ft llamaFileType) String() string {
-	switch ft {
-	case llamaFileTypeF32:
-		return "F32"
-	case llamaFileTypeF16:
-		return "F16"
-	case llamaFileTypeQ4_0:
-		return "Q4_0"
-	case llamaFileTypeQ4_1:
-		return "Q4_1"
-	case llamaFileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case llamaFileTypeQ8_0:
-		return "Q8_0"
-	case llamaFileTypeQ5_0:
-		return "Q5_0"
-	case llamaFileTypeQ5_1:
-		return "Q5_1"
-	case llamaFileTypeQ2_K:
-		return "Q2_K"
-	case llamaFileTypeQ3_K_S:
-		return "Q3_K_S"
-	case llamaFileTypeQ3_K_M:
-		return "Q3_K_M"
-	case llamaFileTypeQ3_K_L:
-		return "Q3_K_L"
-	case llamaFileTypeQ4_K_S:
-		return "Q4_K_S"
-	case llamaFileTypeQ4_K_M:
-		return "Q4_K_M"
-	case llamaFileTypeQ5_K_S:
-		return "Q5_K_S"
-	case llamaFileTypeQ5_K_M:
-		return "Q5_K_M"
-	case llamaFileTypeQ6_K:
-		return "Q6_K"
-	default:
-		return "Unknown"
-	}
+	FileType uint32
 }
 
 type Running struct {
@@ -218,6 +162,72 @@ type llama struct {
 	Running
 }
 
+var errNoGPU = errors.New("nvidia-smi command failed")
+
+// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
+func CheckVRAM() (int, error) {
+	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
+	var stdout bytes.Buffer
+	cmd.Stdout = &stdout
+	err := cmd.Run()
+	if err != nil {
+		return 0, errNoGPU
+	}
+
+	var total int
+	scanner := bufio.NewScanner(&stdout)
+	for scanner.Scan() {
+		line := scanner.Text()
+		vram, err := strconv.Atoi(line)
+		if err != nil {
+			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
+		}
+
+		total += vram
+	}
+
+	return total, nil
+}
+
+func NumGPU(opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	n := 1 // default to enable metal on macOS
+	if runtime.GOOS == "linux" {
+		vram, err := CheckVRAM()
+		if err != nil {
+			if err.Error() != "nvidia-smi command failed" {
+				log.Print(err.Error())
+			}
+			// nvidia driver not installed or no nvidia GPU found
+			return 0
+		}
+		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
+		switch {
+		case vram < 500:
+			log.Printf("WARNING: Low VRAM detected, disabling GPU")
+			n = 0
+		case vram < 1000:
+			n = 4
+		case vram < 2000:
+			n = 8
+		case vram < 4000:
+			n = 12
+		case vram < 8000:
+			n = 16
+		case vram < 12000:
+			n = 24
+		case vram < 16000:
+			n = 32
+		default:
+			n = 48
+		}
+		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+	}
+	return n
+}
+
 func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
@@ -237,7 +247,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
 		"--embedding",
 	}
@@ -305,7 +315,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 func waitForServer(llm *llama) error {
 	// wait for the server to start responding
 	start := time.Now()
-	expiresAt := time.Now().Add(30 * time.Second)
+	expiresAt := time.Now().Add(45 * time.Second)
 	ticker := time.NewTicker(200 * time.Millisecond)
 
 	log.Print("waiting for llama.cpp server to start responding")
````
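Taken together with the `DefaultOptions` change, the flow is: `NumGPU` stays `-1` unless the user overrides it, and `newLlama` resolves it at start-up via the VRAM heuristic. A sketch of the resolution, inside package `llm`, with an assumed VRAM reading:

```go
// Sketch: a Linux machine whose nvidia-smi reports 6144 MiB total VRAM
// falls into the `vram < 8000` bucket of the switch above.
opts := api.DefaultOptions() // NumGPU == -1 after this change, i.e. decide dynamically
n := NumGPU(opts)            // CheckVRAM() -> 6144 -> 16 GPU layers
log.Printf("loading %d GPU layers", n)
```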
llm/llm.go · 24 changed lines

````diff
@@ -37,7 +37,7 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 		return nil, err
 	}
 
-	switch ggml.FileType().String() {
+	switch ggml.FileType() {
 	case "Q8_0":
 		if ggml.Name() != "gguf" && opts.NumGPU != 0 {
 			// GGML Q8_0 do not support Metal API and will
@@ -56,30 +56,36 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 
 	totalResidentMemory := memory.TotalMemory()
 	switch ggml.ModelType() {
-	case ModelType3B, ModelType7B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
+	case "3B", "7B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
 		} else if totalResidentMemory < 8*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 8GB of memory")
 		}
-	case ModelType13B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
+	case "13B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
 		} else if totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 16GB of memory")
 		}
-	case ModelType30B, ModelType34B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
+	case "30B", "34B", "40B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
 		} else if totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 32GB of memory")
 		}
-	case ModelType65B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
+	case "65B", "70B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
 		} else if totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 64GB of memory")
 		}
+	case "180B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
+			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
+		} else if totalResidentMemory < 128*1024*1024 {
+			return nil, fmt.Errorf("model requires at least 128GB of memory")
+		}
 	}
 
 	switch ggml.Name() {
````
````diff
@@ -114,11 +114,11 @@ type LayerReader struct {
 }
 
 type ConfigV2 struct {
-	ModelFamily llm.ModelFamily `json:"model_family"`
-	ModelType   string          `json:"model_type"`
-	ModelFormat string          `json:"model_format"`
+	ModelFormat string `json:"model_format"`
+	ModelFamily string `json:"model_family"`
+	ModelType   string `json:"model_type"`
 	FileType    string `json:"file_type"`
 	RootFS      RootFS `json:"rootfs"`
 
 	// required by spec
 	Architecture string `json:"architecture"`
@@ -357,10 +357,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 		return err
 	}
 
-	config.ModelFamily = ggml.ModelFamily()
-	config.ModelType = ggml.ModelType().String()
 	config.ModelFormat = ggml.Name()
-	config.FileType = ggml.FileType().String()
+	config.ModelFamily = ggml.ModelFamily()
+	config.ModelType = ggml.ModelType()
+	config.FileType = ggml.FileType()
 
 	// reset the file
 	file.Seek(0, io.SeekStart)
@@ -498,6 +498,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 		}
 	}
 
+	if config.ModelType == "65B" {
+		if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
+			config.ModelType = "70B"
+		}
+	}
+
 	bts, err := json.Marshal(formattedParams)
 	if err != nil {
 		return err
@@ -815,14 +821,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
 			return nil, fmt.Errorf("invalid float value %s", vals)
 		}
 
-		out[key] = floatVal
+		out[key] = float32(floatVal)
 	case reflect.Int:
-		intVal, err := strconv.ParseInt(vals[0], 10, 0)
+		intVal, err := strconv.ParseInt(vals[0], 10, 64)
 		if err != nil {
 			return nil, fmt.Errorf("invalid int value %s", vals)
 		}
 
-		out[key] = intVal
+		out[key] = int(intVal)
 	case reflect.Bool:
 		boolVal, err := strconv.ParseBool(vals[0])
 		if err != nil {
````
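The `formatParams` changes look cosmetic but matter for the `num_gqa` check added above: `strconv.ParseInt` returns an `int64`, so without the `int(intVal)` conversion the `formattedParams["num_gqa"].(int)` type assertion would never match. A sketch of why:

```go
// Type assertions on interface{} are exact: an int64 does not satisfy .(int).
var v interface{} = int64(8)
_, ok := v.(int)
fmt.Println(ok) // false

v = int(8)
_, ok = v.(int)
fmt.Println(ok) // true
```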
````diff
@@ -12,6 +12,7 @@ import (
 	"os/signal"
 	"path/filepath"
 	"reflect"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -548,6 +549,13 @@ func Serve(ln net.Listener, origins []string) error {
 		os.Exit(0)
 	}()
 
+	if runtime.GOOS == "linux" {
+		// check compatibility to log warnings
+		if _, err := llm.CheckVRAM(); err != nil {
+			log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
+		}
+	}
+
 	return s.Serve(ln)
 }
````