Compare commits


47 Commits

Matt Williams
e2389b63aa add examples of streaming in python and node
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-09-14 07:12:09 -07:00
Michael Yang
f89c23764b Merge pull request #525 from jmorganca/mxyng/falcon-decode
fix: add falcon.go
2023-09-13 15:08:47 -07:00
Michael Yang
d028853879 fix: add falcon.go 2023-09-13 14:47:37 -07:00
Michael Yang
949553db23 Merge pull request #519 from jmorganca/mxyng/decode
Mxyng/decode
2023-09-13 12:43:57 -07:00
Michael Yang
0c5a454361 fix model type for 70b 2023-09-12 15:12:59 -07:00
Bruce MacDonald
f59c4d03f7 fix ggml arm64 cuda build (#520) 2023-09-12 17:06:48 -04:00
Michael Yang
7dee25a07f fix falcon decode
get model and file type from bin file
2023-09-12 12:34:53 -07:00
Bruce MacDonald
f221637053 first pass at linux gpu support (#454)
* linux gpu support
* handle multiple gpus
* add cuda docker image (#488)
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-12 11:04:35 -04:00
Patrick Devine
45ac07cd02 create the blobs directory correctly (#508) 2023-09-11 14:54:52 -07:00
Jeffrey Morgan
7d749cc787 fix darwin build script 2023-09-11 16:31:46 -04:00
Patrick Devine
e7e91cd71c add autoprune to remove unused layers (#491) 2023-09-11 11:46:35 -07:00
Jeffrey Morgan
3920e15386 add model format to config layer (#497) 2023-09-09 17:53:44 -04:00
Michael Yang
41e976edde Merge pull request #492 from jmorganca/mxyng/nil-pointer
fix nil pointer dereference
2023-09-07 17:25:23 -07:00
Michael Yang
de227b620f fix nil pointer dereference 2023-09-07 17:24:31 -07:00
Michael Yang
63def6ca49 Merge pull request #487 from jmorganca/mxyng/dockerignore
update dockerignore
2023-09-07 14:16:17 -07:00
Michael Yang
738fe9c4aa Merge pull request #486 from jmorganca/mxyng/fix-push
fix: retry push on expired token
2023-09-07 13:58:34 -07:00
Michael Yang
a8da0bacbe update dockerignore 2023-09-07 13:36:25 -07:00
Michael Yang
bf146fb072 fix retry on unauthorized chunk 2023-09-07 12:02:04 -07:00
Michael Yang
f0f4943577 fix get auth token 2023-09-07 12:01:56 -07:00
Bruce MacDonald
09dd2aeff9 GGUF support (#441) 2023-09-07 13:55:37 -04:00
Alexander Pepper
07b4074e7b [docs] Improve build instructions (#482)
Go is required and not installed by default.
2023-09-07 06:43:26 -04:00
Jeffrey Morgan
61dda6a5e0 set minimum CMAKE_OSX_DEPLOYMENT_TARGET to 11.0 2023-09-06 19:56:50 -04:00
Michael Yang
e1f9ced568 Merge pull request #479 from jmorganca/mxyng/dockerfile
update dockerfile
2023-09-06 15:44:24 -07:00
Michael Yang
9795b43d93 update dockerfile 2023-09-06 15:31:25 -07:00
Michael Yang
0980d5c7e3 Merge pull request #478 from jmorganca/mxyng/cleanup
remove unused openssh key types
2023-09-06 15:18:54 -07:00
Michael Yang
0dae34b6a7 remove unused openssh key types 2023-09-06 14:34:09 -07:00
Michael Yang
83c6be1666 fix model manifests (#477) 2023-09-06 17:30:08 -04:00
Patrick Devine
1adfa67589 tighten up the error string for ollama show flags (#476) 2023-09-06 13:38:49 -07:00
Patrick Devine
790d24eb7b add show command (#474) 2023-09-06 11:04:17 -07:00
Jeffrey Morgan
7de300856b use osPath in gpu check 2023-09-05 21:52:21 -04:00
Jeffrey Morgan
213ffdb548 macos amd64 compatibility fixes 2023-09-05 21:33:31 -04:00
Michael Yang
d42d88386a Merge pull request #473 from jmorganca/mxyng/fix-manifest-path
create manifests directory
2023-09-05 17:37:41 -07:00
Ackermann Yuriy
154f24af91 Added missing options params to the embeddings docs (#472) 2023-09-05 20:18:49 -04:00
Michael Yang
a1ecdd36d5 create manifests directory 2023-09-05 17:10:40 -07:00
Bruce MacDonald
d18282bfda metal: add missing barriers for mul-mat (#469) 2023-09-05 19:37:13 -04:00
Michael Yang
9ae76ba8c9 Merge pull request #471 from jmorganca/mxyng/fix-empty-response
fix empty response
2023-09-05 15:23:05 -07:00
Michael Yang
2bc06565c7 fix empty response 2023-09-05 15:03:24 -07:00
Michael Yang
d1c2558f7e Merge pull request #461 from jmorganca/mxyng/fix-inherit-params
fix inherit params
2023-09-05 12:30:23 -07:00
Michael Yang
7b5aefb427 Merge pull request #462 from jmorganca/mxyng/rm-marshal-prompt
remove marshalPrompt which is no longer needed
2023-09-05 11:48:41 -07:00
Michael Yang
06ef90c051 fix parameter inheritance
parameters are not inherited because they are processed differently from
other layers. fix this by explicitly merging the inherited params into
the new params. parameter values defined in the new modelfile will
override those defined in the inherited modelfile. array lists are
replaced instead of appended
2023-09-05 11:40:20 -07:00
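The override-and-replace semantics described in that commit message can be pictured with a short sketch (illustrative only; `mergeParams` and the plain map representation are hypothetical stand-ins for the actual layer handling in ollama):

```go
// mergeParams sketches the fix described above: inherited params are
// copied first, then the new modelfile's params are written over them,
// so new values win and array values are replaced wholesale rather
// than appended.
func mergeParams(inherited, updated map[string]any) map[string]any {
	merged := make(map[string]any, len(inherited)+len(updated))
	for k, v := range inherited {
		merged[k] = v
	}
	for k, v := range updated {
		merged[k] = v // overrides any inherited value, including arrays
	}
	return merged
}
```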
Michael Yang
7efbc84320 Merge pull request #464 from jmorganca/mxyng/fix-num-keep
fix num_keep
2023-09-05 11:30:45 -07:00
Michael Yang
e9f6df7dca use slices.DeleteFunc 2023-09-05 09:56:59 -07:00
Jeffrey Morgan
7fa6e51686 generate binary dependencies based on GOARCH on macos (#459) 2023-09-05 12:53:57 -04:00
Michael Yang
8dc68417e7 Merge pull request #463 from jmorganca/mxyng/fix-last-token
fix not forwarding last token
2023-09-05 09:01:32 -07:00
Michael Yang
681f3c4c42 fix num_keep 2023-09-03 17:47:49 -04:00
Michael Yang
59a705525c fix not forwarding last token 2023-09-03 17:46:50 -04:00
Michael Yang
5d3f314b0b remove marshalPrompt which is no longer needed 2023-09-03 17:01:05 -04:00
35 changed files with 1603 additions and 574 deletions

@@ -1,8 +1,5 @@
-build
-llama/build
-.venv
 .vscode
 ollama
 app
-web
-.env
+llm/llama.cpp/ggml
+llm/llama.cpp/gguf

.gitmodules vendored

@@ -1,4 +1,9 @@
[submodule "llm/llama.cpp/ggml"] [submodule "llm/llama.cpp/ggml"]
path = llm/llama.cpp/ggml path = llm/llama.cpp/ggml
url = https://github.com/ggerganov/llama.cpp.git url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty ignore = dirty
shallow = true
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
shallow = true

Dockerfile

@@ -1,15 +1,21 @@
-FROM golang:1.20
+FROM golang:alpine
 WORKDIR /go/src/github.com/jmorganca/ollama
+RUN apk add --no-cache git build-base cmake
 COPY . .
-RUN CGO_ENABLED=1 go build -ldflags '-linkmode external -extldflags "-static"' .
+RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
 
 FROM alpine
-COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
-EXPOSE 11434
+ENV OLLAMA_HOST 0.0.0.0
+RUN apk add --no-cache libstdc++
+
 ARG USER=ollama
 ARG GROUP=ollama
-RUN addgroup -g 1000 $GROUP && adduser -u 1000 -DG $GROUP $USER
+RUN addgroup $GROUP && adduser -D -G $GROUP $USER
+COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+
 USER $USER:$GROUP
 ENTRYPOINT ["/bin/ollama"]
-ENV OLLAMA_HOST 0.0.0.0
 CMD ["serve"]

Dockerfile.cuda Normal file

@@ -0,0 +1,22 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
COPY . .
RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
ENV OLLAMA_HOST 0.0.0.0
ARG USER=ollama
ARG GROUP=ollama
RUN groupadd $GROUP && useradd -m -g $GROUP $USER
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
USER $USER:$GROUP
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
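Assuming a host with the NVIDIA container toolkit installed, a hypothetical build-and-run of this image would look like `docker build -f Dockerfile.cuda -t ollama-cuda .` followed by `docker run --gpus=all -p 11434:11434 ollama-cuda` (image tag and port mapping illustrative; 11434 is the server's default port).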

README.md

@@ -165,10 +165,11 @@ Ollama bundles model weights, configurations, and data into a single package, de
 ## Building
 
-Install `cmake`:
+Install `cmake` and `go`:
 
 ```
 brew install cmake
+brew install go
 ```
 
 Then generate dependencies and build:

api/client.go

@@ -255,6 +255,14 @@ func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
 	return nil
 }
 
+func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
+	var resp ShowResponse
+	if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
+		return nil, err
+	}
+	return &resp, nil
+}
+
 func (c *Client) Heartbeat(ctx context.Context) error {
 	if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
 		return err
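A minimal sketch of calling the new method, assuming the `api.FromEnv` client construction used elsewhere in this changeset (model name illustrative):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.FromEnv()
	if err != nil {
		log.Fatal(err)
	}

	// fetch the model's metadata from the new /api/show endpoint
	resp, err := client.Show(context.Background(), &api.ShowRequest{Name: "llama2"})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(resp.Modelfile)
}
```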

api/types.go

@@ -32,12 +32,11 @@ func (e StatusError) Error() string {
 }
 
 type GenerateRequest struct {
 	Model    string `json:"model"`
 	Prompt   string `json:"prompt"`
 	System   string `json:"system"`
 	Template string `json:"template"`
 	Context  []int  `json:"context,omitempty"`
-	Args     map[string]any `json:"args,omitempty"`
 
 	Options map[string]interface{} `json:"options"`
 }
@@ -62,6 +61,18 @@ type DeleteRequest struct {
 	Name string `json:"name"`
 }
 
+type ShowRequest struct {
+	Name string `json:"name"`
+}
+
+type ShowResponse struct {
+	License    string `json:"license,omitempty"`
+	Modelfile  string `json:"modelfile,omitempty"`
+	Parameters string `json:"parameters,omitempty"`
+	Template   string `json:"template,omitempty"`
+	System     string `json:"system,omitempty"`
+}
+
 type CopyRequest struct {
 	Source      string `json:"source"`
 	Destination string `json:"destination"`
@@ -280,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumKeep:  -1,
 		NumBatch: 512,
-		NumGPU:   1,
+		NumGPU:   -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,

cmd/cmd.go

@@ -230,6 +230,84 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }
 
+func ShowHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.FromEnv()
+	if err != nil {
+		return err
+	}
+
+	if len(args) != 1 {
+		return errors.New("missing model name")
+	}
+
+	license, errLicense := cmd.Flags().GetBool("license")
+	modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
+	parameters, errParams := cmd.Flags().GetBool("parameters")
+	system, errSystem := cmd.Flags().GetBool("system")
+	template, errTemplate := cmd.Flags().GetBool("template")
+
+	for _, boolErr := range []error{errLicense, errModelfile, errParams, errSystem, errTemplate} {
+		if boolErr != nil {
+			return errors.New("error retrieving flags")
+		}
+	}
+
+	flagsSet := 0
+	showType := ""
+
+	if license {
+		flagsSet++
+		showType = "license"
+	}
+
+	if modelfile {
+		flagsSet++
+		showType = "modelfile"
+	}
+
+	if parameters {
+		flagsSet++
+		showType = "parameters"
+	}
+
+	if system {
+		flagsSet++
+		showType = "system"
+	}
+
+	if template {
+		flagsSet++
+		showType = "template"
+	}
+
+	if flagsSet > 1 {
+		return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
+	} else if flagsSet == 0 {
+		return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
+	}
+
+	req := api.ShowRequest{Name: args[0]}
+	resp, err := client.Show(context.Background(), &req)
+	if err != nil {
+		return err
+	}
+
+	switch showType {
+	case "license":
+		fmt.Println(resp.License)
+	case "modelfile":
+		fmt.Println(resp.Modelfile)
+	case "parameters":
+		fmt.Println(resp.Parameters)
+	case "system":
+		fmt.Println(resp.System)
+	case "template":
+		fmt.Println(resp.Template)
+	}
+
+	return nil
+}
+
 func CopyHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.FromEnv()
 	if err != nil {
@@ -377,20 +455,6 @@ func generate(cmd *cobra.Command, model, prompt string) error {
 	return nil
 }
 
-func showLayer(l *server.Layer) {
-	filename, err := server.GetBlobsPath(l.Digest)
-	if err != nil {
-		fmt.Println("Couldn't get layer's path")
-		return
-	}
-	bts, err := os.ReadFile(filename)
-	if err != nil {
-		fmt.Println("Couldn't read layer")
-		return
-	}
-	fmt.Println(string(bts))
-}
-
 func generateInteractive(cmd *cobra.Command, model string) error {
 	home, err := os.UserHomeDir()
 	if err != nil {
@@ -413,6 +477,8 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		),
 		readline.PcItem("/show",
 			readline.PcItem("license"),
+			readline.PcItem("modelfile"),
+			readline.PcItem("parameters"),
 			readline.PcItem("system"),
 			readline.PcItem("template"),
 		),
@@ -522,42 +588,28 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		case strings.HasPrefix(line, "/show"):
 			args := strings.Fields(line)
 			if len(args) > 1 {
-				mp := server.ParseModelPath(model)
+				resp, err := server.GetModelInfo(model)
 				if err != nil {
-					return err
-				}
-
-				manifest, _, err := server.GetManifest(mp)
-				if err != nil {
-					fmt.Println("error: couldn't get a manifest for this model")
+					fmt.Println("error: couldn't get model")
 					continue
 				}
 
 				switch args[1] {
 				case "license":
-					for _, l := range manifest.Layers {
-						if l.MediaType == "application/vnd.ollama.image.license" {
-							showLayer(l)
-						}
-					}
-					continue
+					fmt.Println(resp.License)
+				case "modelfile":
+					fmt.Println(resp.Modelfile)
+				case "parameters":
+					fmt.Println(resp.Parameters)
 				case "system":
-					for _, l := range manifest.Layers {
-						if l.MediaType == "application/vnd.ollama.image.system" {
-							showLayer(l)
-						}
-					}
-					continue
+					fmt.Println(resp.System)
 				case "template":
-					for _, l := range manifest.Layers {
-						if l.MediaType == "application/vnd.ollama.image.template" {
-							showLayer(l)
-						}
-					}
-					continue
+					fmt.Println(resp.Template)
 				default:
-					usage()
-					continue
+					fmt.Println("error: unknown command")
 				}
+
+				continue
 			} else {
 				usage()
 				continue
@@ -620,6 +672,12 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		origins = strings.Split(o, ",")
 	}
 
+	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+		if err := server.PruneLayers(); err != nil {
+			return err
+		}
+	}
+
 	return server.Serve(ln, origins)
 }
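Note that with this hunk, unused layers are pruned on every server start; setting the `OLLAMA_NOPRUNE` environment variable to any non-empty value skips the `server.PruneLayers()` call.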
@@ -749,6 +807,20 @@ func NewCLI() *cobra.Command {
 	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
 
+	showCmd := &cobra.Command{
+		Use:     "show MODEL",
+		Short:   "Show information for a model",
+		Args:    cobra.MinimumNArgs(1),
+		PreRunE: checkServerHeartbeat,
+		RunE:    ShowHandler,
+	}
+
+	showCmd.Flags().Bool("license", false, "Show license of a model")
+	showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
+	showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
+	showCmd.Flags().Bool("template", false, "Show template of a model")
+	showCmd.Flags().Bool("system", false, "Show system prompt of a model")
+
 	runCmd := &cobra.Command{
 		Use:   "run MODEL [PROMPT]",
 		Short: "Run a model",
@@ -814,6 +886,7 @@ func NewCLI() *cobra.Command {
 	rootCmd.AddCommand(
 		serveCmd,
 		createCmd,
+		showCmd,
 		runCmd,
 		pullCmd,
 		pushCmd,
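With `showCmd` registered, usage reads as, e.g., `ollama show llama2 --modelfile` or `ollama show llama2 --parameters` (model name illustrative); exactly one of the five flags must be given, as enforced in `ShowHandler` above.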

docs/api.md

@@ -20,6 +20,10 @@ Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` a
 
 All durations are returned in nanoseconds.
 
+### Streams
+
+Many API responses are streams of JSON objects showing the current status. For examples of working with streams in various languages, see [streaming.md](./streaming.md)
+
 ## Generate a completion
 
 ```
@@ -238,6 +242,10 @@ Generate embeddings from a model
 - `model`: name of model to generate embeddings from
 - `prompt`: text to generate embeddings for
 
+Advanced parameters:
+
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+
 ### Request
 
 ```
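For example, a request body of `{"model": "llama2", "prompt": "Here is an article about llamas...", "options": {"temperature": 0.8}}` would pass a sampling temperature through to the model (values illustrative).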

docs/development.md

@@ -6,6 +6,10 @@
 Install required tools:
 
+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```
@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- run `go generate ./...`
+- run `go build .`

docs/streaming.md Normal file

@@ -0,0 +1,35 @@
# Streaming responses in the Ollama Client API
## JavaScript / TypeScript / Deno
```javascript
const pull = async () => {
const request = await fetch("http://localhost:11434/api/pull", {
method: "POST",
body: JSON.stringify({ name: "llama2:7b-q5_0" }),
});
const reader = await request.body?.pipeThrough(new TextDecoderStream());
if (!reader) throw new Error("No reader");
for await (const chunk of reader) {
const out = JSON.parse(chunk);
if (out.status.startsWith("downloading")) {
console.log(`${out.status} - ${(out.completed / out.total) * 100}%`);
}
}
}
pull();
```
## Python
```python
import requests
import json
response = requests.post("http://localhost:11434/api/pull", json={"name": "llama2:7b-q5_0"}, stream=True)
for data in response.iter_lines():
out = json.loads(data)
if "completed" in out:
print(out["completed"] / out["total"] * 100)
```
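A Go client can consume the same newline-delimited JSON stream; the following is a minimal sketch against the endpoint used in the examples above:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	body := strings.NewReader(`{"name": "llama2:7b-q5_0"}`)
	resp, err := http.Post("http://localhost:11434/api/pull", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// each line of the response body is one JSON status object
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var out struct {
			Status    string `json:"status"`
			Completed int64  `json:"completed"`
			Total     int64  `json:"total"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &out); err != nil {
			continue // skip blank keep-alive lines
		}
		if out.Total > 0 {
			fmt.Printf("%s - %.1f%%\n", out.Status, float64(out.Completed)/float64(out.Total)*100)
		}
	}
}
```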

format/openssh.go

@@ -10,15 +10,11 @@ package format
 
 import (
 	"crypto"
-	"crypto/ecdsa"
 	"crypto/ed25519"
-	"crypto/elliptic"
 	"crypto/rand"
-	"crypto/rsa"
 	"encoding/binary"
 	"encoding/pem"
 	"fmt"
-	"math/big"
 
 	"golang.org/x/crypto/ssh"
 )
@@ -41,25 +37,6 @@ type openSSHPrivateKey struct {
 	Rest    []byte `ssh:"rest"`
 }
 
-type openSSHRSAPrivateKey struct {
-	N       *big.Int
-	E       *big.Int
-	D       *big.Int
-	Iqmp    *big.Int
-	P       *big.Int
-	Q       *big.Int
-	Comment string
-	Pad     []byte `ssh:"rest"`
-}
-
-type openSSHECDSAPrivateKey struct {
-	Curve   string
-	Pub     []byte
-	D       *big.Int
-	Comment string
-	Pad     []byte `ssh:"rest"`
-}
-
 type openSSHEd25519PrivateKey struct {
 	Pub     []byte
 	Priv    []byte
@@ -85,64 +62,6 @@ func OpenSSHPrivateKey(key crypto.PrivateKey, comment string) (*pem.Block, error
 	}
 
 	switch k := key.(type) {
-	case *rsa.PrivateKey:
-		e := new(big.Int).SetInt64(int64(k.E))
-		key := openSSHRSAPrivateKey{
-			N:       k.N,
-			E:       e,
-			D:       k.D,
-			Iqmp:    k.Precomputed.Qinv,
-			P:       k.Primes[0],
-			Q:       k.Primes[1],
-			Comment: comment,
-		}
-
-		pk1.Keytype = ssh.KeyAlgoRSA
-		pk1.Rest = ssh.Marshal(key)
-
-		w.PubKey = ssh.Marshal(struct {
-			KeyType string
-			E       *big.Int
-			N       *big.Int
-		}{
-			ssh.KeyAlgoRSA, e, k.N,
-		})
-	case *ecdsa.PrivateKey:
-		var curve, keytype string
-		switch name := k.Curve.Params().Name; name {
-		case "P-256":
-			curve = "nistp256"
-			keytype = ssh.KeyAlgoECDSA256
-		case "P-384":
-			curve = "nistp384"
-			keytype = ssh.KeyAlgoECDSA384
-		case "P-521":
-			curve = "nistp521"
-			keytype = ssh.KeyAlgoECDSA521
-		default:
-			return nil, fmt.Errorf("ssh: unknown curve %q", name)
-		}
-
-		pub := elliptic.Marshal(k.Curve, k.X, k.Y)
-
-		key := openSSHECDSAPrivateKey{
-			Curve:   curve,
-			Pub:     pub,
-			D:       k.D,
-			Comment: comment,
-		}
-
-		pk1.Keytype = keytype
-		pk1.Rest = ssh.Marshal(key)
-
-		w.PubKey = ssh.Marshal(struct {
-			KeyType string
-			Curve   string
-			Pub     []byte
-		}{
-			keytype, curve, pub,
-		})
 	case ed25519.PrivateKey:
 		pub, priv := k[32:], k
 		key := openSSHEd25519PrivateKey{

go.mod

@@ -39,6 +39,7 @@ require (
 	github.com/ugorji/go/codec v1.2.11 // indirect
 	golang.org/x/arch v0.3.0 // indirect
 	golang.org/x/crypto v0.10.0
+	golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
 	golang.org/x/net v0.10.0 // indirect
 	golang.org/x/sys v0.11.0 // indirect
 	golang.org/x/term v0.10.0

go.sum

@@ -121,6 +121,8 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y
 golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
 golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
 golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
+golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
+golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
 golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
 golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=

llm/falcon.go Normal file

@@ -0,0 +1,22 @@
package llm
const ModelFamilyFalcon = "falcon"
const (
falconModelType7B = 32
falconModelType40B = 60
falconModelType180B = 80
)
func falconModelType(numLayer uint32) string {
switch numLayer {
case 32:
return "7B"
case 60:
return "40B"
case 80:
return "180B"
default:
return "Unknown"
}
}
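These constants correspond to `falcon.block_count` values read from GGUF metadata (see `llm/gguf.go` below): a checkpoint reporting 60 blocks is labeled `40B`, 32 maps to `7B`, 80 to `180B`, and anything else is reported as `Unknown`.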

llm/ggml.go

@@ -3,72 +3,97 @@ package llm
 
 import (
 	"encoding/binary"
 	"errors"
-	"fmt"
 	"io"
+	"path"
+	"sync"
 )
 
-type ModelFamily string
-
-type ModelType uint32
-
-const (
-	ModelType3B  ModelType = 26
-	ModelType7B  ModelType = 32
-	ModelType13B ModelType = 40
-	ModelType34B ModelType = 48
-	ModelType30B ModelType = 60
-	ModelType65B ModelType = 80
-)
-
-func (mt ModelType) String() string {
-	switch mt {
-	case ModelType3B:
-		return "3B"
-	case ModelType7B:
-		return "7B"
-	case ModelType13B:
-		return "13B"
-	case ModelType34B:
-		return "34B"
-	case ModelType30B:
-		return "30B"
-	case ModelType65B:
-		return "65B"
-	default:
-		return "Unknown"
-	}
-}
-
-type FileType interface {
-	String() string
-}
-
 type GGML struct {
 	magic uint32
 	container
 	model
 }
 
+const (
+	fileTypeF32 uint32 = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ8_0 uint32 = iota + 2
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+)
+
+func fileType(fileType uint32) string {
+	switch fileType {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	default:
+		return "Unknown"
+	}
+}
+
 type model interface {
-	ModelFamily() ModelFamily
-	ModelType() ModelType
-	FileType() FileType
+	ModelFamily() string
+	ModelType() string
+	FileType() string
 }
 
 type container interface {
 	Name() string
-	Decode(io.Reader) error
+	Decode(io.Reader) (model, error)
 }
 
-type containerGGML struct {
-}
+type containerGGML struct{}
 
 func (c *containerGGML) Name() string {
 	return "ggml"
 }
 
-func (c *containerGGML) Decode(r io.Reader) error {
-	return nil
+func (c *containerGGML) Decode(r io.Reader) (model, error) {
+	return nil, nil
 }
 
 type containerGGMF struct {
@@ -79,18 +104,18 @@ func (c *containerGGMF) Name() string {
 	return "ggmf"
 }
 
-func (c *containerGGMF) Decode(r io.Reader) error {
+func (c *containerGGMF) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)
 
 	switch version {
 	case 1:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}
 
 	c.version = version
-	return nil
+	return nil, nil
 }
 
 type containerGGJT struct {
@@ -101,18 +126,22 @@ func (c *containerGGJT) Name() string {
 	return "ggjt"
 }
 
-func (c *containerGGJT) Decode(r io.Reader) error {
+func (c *containerGGJT) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)
 
 	switch version {
 	case 1, 2, 3:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}
 
 	c.version = version
-	return nil
+
+	// different model types may have different layouts for hyperparameters
+	var llama llamaModel
+	binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
+	return &llama, nil
 }
 
 type containerLORA struct {
@@ -123,32 +152,51 @@ func (c *containerLORA) Name() string {
 	return "ggla"
 }
 
-func (c *containerLORA) Decode(r io.Reader) error {
+func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)
 
 	switch version {
 	case 1:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}
 
 	c.version = version
-	return nil
+	return nil, nil
+}
+
+var (
+	ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
+	ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
+)
+
+var (
+	ggmlInit       sync.Once
+	ggmlRunnerPath string
+)
+
+func ggmlRunner() ModelRunner {
+	ggmlInit.Do(func() {
+		ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
+	})
+	return ModelRunner{Path: ggmlRunnerPath}
 }
 
 const (
-	// / Magic constant for `ggml` files (unversioned).
+	// Magic constant for `ggml` files (unversioned).
 	FILE_MAGIC_GGML = 0x67676d6c
-	// / Magic constant for `ggml` files (versioned, ggmf).
+	// Magic constant for `ggml` files (versioned, ggmf).
 	FILE_MAGIC_GGMF = 0x67676d66
-	// / Magic constant for `ggml` files (versioned, ggjt).
+	// Magic constant for `ggml` files (versioned, ggjt).
 	FILE_MAGIC_GGJT = 0x67676a74
-	// / Magic constant for `ggla` files (LoRA adapter).
+	// Magic constant for `ggla` files (LoRA adapter).
 	FILE_MAGIC_GGLA = 0x67676C61
+	// Magic constant for `gguf` files (versioned, gguf)
+	FILE_MAGIC_GGUF = 0x46554747
 )
 
-func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
+func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	var ggml GGML
 	binary.Read(r, binary.LittleEndian, &ggml.magic)
@@ -161,24 +209,18 @@ func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
 		ggml.container = &containerGGJT{}
 	case FILE_MAGIC_GGLA:
 		ggml.container = &containerLORA{}
+	case FILE_MAGIC_GGUF:
+		ggml.container = &containerGGUF{}
 	default:
 		return nil, errors.New("invalid file magic")
 	}
 
-	if err := ggml.Decode(r); err != nil {
+	model, err := ggml.Decode(r)
+	if err != nil {
 		return nil, err
 	}
 
-	// different model types may have different layouts for hyperparameters
-	switch hint {
-	case ModelFamilyLlama:
-		var llama llamaModel
-		binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
-		ggml.model = &llama
-		// TODO: sanity check hyperparameters
-	default:
-		return nil, fmt.Errorf("unsupported model type: %s", hint)
-	}
+	ggml.model = model
 
 	// final model type
 	return &ggml, nil
llm/gguf.go Normal file

@@ -0,0 +1,389 @@
package llm
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"path"
"sync"
)
type containerGGUF struct {
Version uint32
V1 struct {
NumTensor uint32
NumKV uint32
}
V2 struct {
NumTensor uint64
NumKV uint64
}
}
func (c *containerGGUF) Name() string {
return "gguf"
}
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
binary.Read(r, binary.LittleEndian, &c.Version)
switch c.Version {
case 1:
binary.Read(r, binary.LittleEndian, &c.V1)
case 2:
binary.Read(r, binary.LittleEndian, &c.V2)
default:
return nil, errors.New("invalid version")
}
model := newGGUFModel(c)
if err := model.Decode(r); err != nil {
return nil, err
}
return model, nil
}
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
ggufTypeUint16
ggufTypeInt16
ggufTypeUint32
ggufTypeInt32
ggufTypeFloat32
ggufTypeBool
ggufTypeString
ggufTypeArray
ggufTypeUint64
ggufTypeInt64
ggufTypeFloat64
)
type kv map[string]any
type ggufModel struct {
*containerGGUF
kv
}
func newGGUFModel(container *containerGGUF) *ggufModel {
return &ggufModel{
containerGGUF: container,
kv: make(kv),
}
}
func (llm *ggufModel) NumKV() uint64 {
if llm.Version == 1 {
return uint64(llm.V1.NumKV)
}
return llm.V2.NumKV
}
func (llm *ggufModel) ModelFamily() string {
t, ok := llm.kv["general.architecture"].(string)
if ok {
return t
}
return "unknown"
}
func (llm *ggufModel) ModelType() string {
switch llm.ModelFamily() {
case "llama":
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
heads, headsOK := llm.kv["llama.head_count"].(uint32)
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
if headsOK && headsKVsOK && heads/headKVs == 8 {
return "70B"
}
return llamaModelType(blocks)
}
case "falcon":
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
return falconModelType(blocks)
}
}
return "Unknown"
}
func (llm *ggufModel) FileType() string {
t, ok := llm.kv["general.file_type"].(uint32)
if ok {
return fileType(t)
}
return "Unknown"
}
func (llm *ggufModel) Decode(r io.Reader) error {
read := llm.readString
if llm.Version == 1 {
read = llm.readStringV1
}
for i := 0; uint64(i) < llm.NumKV(); i++ {
k, err := read(r)
if err != nil {
return err
}
vtype := llm.readU32(r)
var v any
switch vtype {
case ggufTypeUint8:
v = llm.readU8(r)
case ggufTypeInt8:
v = llm.readI8(r)
case ggufTypeUint16:
v = llm.readU16(r)
case ggufTypeInt16:
v = llm.readI16(r)
case ggufTypeUint32:
v = llm.readU32(r)
case ggufTypeInt32:
v = llm.readI32(r)
case ggufTypeUint64:
v = llm.readU64(r)
case ggufTypeInt64:
v = llm.readI64(r)
case ggufTypeFloat32:
v = llm.readF32(r)
case ggufTypeFloat64:
v = llm.readF64(r)
case ggufTypeBool:
v = llm.readBool(r)
case ggufTypeString:
fn := llm.readString
if llm.Version == 1 {
fn = llm.readStringV1
}
s, err := fn(r)
if err != nil {
return err
}
v = s
case ggufTypeArray:
fn := llm.readArray
if llm.Version == 1 {
fn = llm.readArrayV1
}
a, err := fn(r)
if err != nil {
return err
}
v = a
default:
return fmt.Errorf("invalid type: %d", vtype)
}
llm.kv[k] = v
}
return nil
}
func (ggufModel) readU8(r io.Reader) uint8 {
var u8 uint8
binary.Read(r, binary.LittleEndian, &u8)
return u8
}
func (ggufModel) readI8(r io.Reader) int8 {
var i8 int8
binary.Read(r, binary.LittleEndian, &i8)
return i8
}
func (ggufModel) readU16(r io.Reader) uint16 {
var u16 uint16
binary.Read(r, binary.LittleEndian, &u16)
return u16
}
func (ggufModel) readI16(r io.Reader) int16 {
var i16 int16
binary.Read(r, binary.LittleEndian, &i16)
return i16
}
func (ggufModel) readU32(r io.Reader) uint32 {
var u32 uint32
binary.Read(r, binary.LittleEndian, &u32)
return u32
}
func (ggufModel) readI32(r io.Reader) int32 {
var i32 int32
binary.Read(r, binary.LittleEndian, &i32)
return i32
}
func (ggufModel) readU64(r io.Reader) uint64 {
var u64 uint64
binary.Read(r, binary.LittleEndian, &u64)
return u64
}
func (ggufModel) readI64(r io.Reader) int64 {
var i64 int64
binary.Read(r, binary.LittleEndian, &i64)
return i64
}
func (ggufModel) readF32(r io.Reader) float32 {
var f32 float32
binary.Read(r, binary.LittleEndian, &f32)
return f32
}
func (ggufModel) readF64(r io.Reader) float64 {
var f64 float64
binary.Read(r, binary.LittleEndian, &f64)
return f64
}
func (ggufModel) readBool(r io.Reader) bool {
var b bool
binary.Read(r, binary.LittleEndian, &b)
return b
}
func (ggufModel) readStringV1(r io.Reader) (string, error) {
var nameLength uint32
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
// gguf v1 strings are null-terminated
b.Truncate(b.Len() - 1)
return b.String(), nil
}
func (llm ggufModel) readString(r io.Reader) (string, error) {
var nameLength uint64
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
return b.String(), nil
}
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU32(r)
for i := 0; uint32(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readStringV1(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU64(r)
for i := 0; uint64(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeUint64:
arr = append(arr, llm.readU64(r))
case ggufTypeInt64:
arr = append(arr, llm.readI64(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeFloat64:
arr = append(arr, llm.readF64(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readString(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
var (
ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
)
var (
ggufInit sync.Once
ggufRunnerPath string
)
func ggufRunner() ModelRunner {
ggufInit.Do(func() {
ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
})
return ModelRunner{Path: ggufRunnerPath}
}
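A worked example of the head-count rule above: llama-2 70B GGUFs carry `llama.head_count` 64 and `llama.head_count_kv` 8, so 64/8 = 8 and the model is reported as `70B` without consulting the block count; for other llama models the `llama.block_count` lookup (e.g. 32 → `7B`) decides.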


@@ -1,8 +1,17 @@
+//go:build !darwin
+// +build !darwin
+
 package llm
 
 //go:generate git submodule init
+
 //go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
+
+//go:generate git submodule update --force gguf
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cpu --target server --config Release


@@ -1,11 +0,0 @@
-//go:build darwin
-// +build darwin
-
-package llm
-
-//go:generate git submodule init
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/gpu --target server --config Release


@@ -0,0 +1,16 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/cpu --target server --config Release


@@ -0,0 +1,16 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/gpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/gpu --target server --config Release


@@ -0,0 +1,15 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/gpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/gpu --target server --config Release


@@ -0,0 +1,32 @@
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)


@@ -0,0 +1,30 @@
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
kernel (#2686)
---
ggml-metal.metal | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
+ threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0


@@ -0,0 +1,41 @@
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Tue, 22 Aug 2023 02:18:40 -0400
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
---
ggml-metal.metal | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 88d48f6..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
- threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_device);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0


@@ -0,0 +1,32 @@
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:14:23 +0800
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
* ggml: support CUDA's half type for aarch64(#1455)
support CUDA's half type for aarch64 in ggml_fp16_t definition
* ggml: use __CUDACC__ to recognise nvcc compiler
---
ggml.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml.h b/ggml.h
index 544ad2d..0ec7ec5 100644
--- a/ggml.h
+++ b/ggml.h
@@ -259,8 +259,9 @@
extern "C" {
#endif
-#ifdef __ARM_NEON
- // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
--
2.39.2 (Apple Git-143)

llm/llama.cpp/gguf Submodule

Submodule llm/llama.cpp/gguf added at 53885d7256

llm/llama.go

@@ -20,124 +20,114 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/jmorganca/ollama/api"
 )
 
-const ModelFamilyLlama ModelFamily = "llama"
-
-//go:embed llama.cpp/ggml/build/*/bin/*
+//go:embed llama.cpp/*/build/*/bin/*
 var llamaCppEmbed embed.FS
 
-var (
-	ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
-	ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
-)
-
-var (
-	ggmlInit       sync.Once
-	ggmlRunnerPath string
-)
-
 func osPath(llamaPath string) string {
 	if runtime.GOOS == "windows" {
 		return path.Join(llamaPath, "Release")
 	}
 
 	return llamaPath
 }
 
-func initGGML() {
-	ggmlInit.Do(func() {
-		tmpDir, err := os.MkdirTemp("", "llama-*")
-		if err != nil {
-			log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
-		}
+func chooseRunner(gpuPath, cpuPath string) string {
+	tmpDir, err := os.MkdirTemp("", "llama-*")
+	if err != nil {
+		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
+	}
 
-		llamaPath := osPath(ggmlGPU)
+	llamaPath := osPath(gpuPath)
+	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
+		llamaPath = osPath(cpuPath)
 		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-			llamaPath = osPath(ggmlCPU)
-			if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-				log.Fatalf("llama.cpp executable not found")
-			}
+			log.Fatalf("llama.cpp executable not found")
 		}
+	}
 
-		files := []string{"server"}
-		switch runtime.GOOS {
-		case "windows":
-			files = []string{"server.exe"}
-		case "darwin":
+	files := []string{"server"}
+	switch runtime.GOOS {
+	case "windows":
+		files = []string{"server.exe"}
+	case "darwin":
+		if llamaPath == osPath(gpuPath) {
 			files = append(files, "ggml-metal.metal")
 		}
+	case "linux":
+		// check if there is a GPU available
+		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
+			// this error was logged on start-up, so we don't need to log it again
+			llamaPath = osPath(cpuPath)
+		}
+	}
 
-		for _, f := range files {
-			srcPath := path.Join(llamaPath, f)
-			destPath := filepath.Join(tmpDir, f)
+	for _, f := range files {
+		srcPath := path.Join(llamaPath, f)
+		destPath := filepath.Join(tmpDir, f)
 
-			srcFile, err := llamaCppEmbed.Open(srcPath)
-			if err != nil {
-				log.Fatalf("read llama.cpp %s: %v", f, err)
-			}
-			defer srcFile.Close()
+		srcFile, err := llamaCppEmbed.Open(srcPath)
+		if err != nil {
+			log.Fatalf("read llama.cpp %s: %v", f, err)
+		}
+		defer srcFile.Close()
 
-			destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-			if err != nil {
-				log.Fatalf("write llama.cpp %s: %v", f, err)
-			}
-			defer destFile.Close()
+		destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+		if err != nil {
+			log.Fatalf("write llama.cpp %s: %v", f, err)
+		}
+		defer destFile.Close()
 
-			if _, err := io.Copy(destFile, srcFile); err != nil {
-				log.Fatalf("copy llama.cpp %s: %v", f, err)
-			}
+		if _, err := io.Copy(destFile, srcFile); err != nil {
+			log.Fatalf("copy llama.cpp %s: %v", f, err)
 		}
+	}
 
-		ggmlRunnerPath = filepath.Join(tmpDir, "server")
-		if runtime.GOOS == "windows" {
-			ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
-		}
-	})
-}
-
-type ModelRunner struct {
-	Path string // path to the model runner executable
-}
-
-func ggmlRunner() ModelRunner {
-	initGGML()
-	return ModelRunner{Path: ggmlRunnerPath}
+	runPath := filepath.Join(tmpDir, "server")
+	if runtime.GOOS == "windows" {
+		runPath = filepath.Join(tmpDir, "server.exe")
+	}
+
+	return runPath
 }
 
 type llamaModel struct {
 	hyperparameters llamaHyperparameters
 }
 
-func (llm *llamaModel) ModelFamily() ModelFamily {
-	return ModelFamilyLlama
+func (llm *llamaModel) ModelFamily() string {
+	return "llama"
 }
 
-func (llm *llamaModel) ModelType() ModelType {
-	switch llm.hyperparameters.NumLayer {
+func llamaModelType(numLayer uint32) string {
+	switch numLayer {
 	case 26:
-		return ModelType3B
+		return "3B"
 	case 32:
-		return ModelType7B
+		return "7B"
 	case 40:
-		return ModelType13B
+		return "13B"
 	case 48:
-		return ModelType34B
+		return "34B"
 	case 60:
-		return ModelType30B
+		return "30B"
 	case 80:
-		return ModelType65B
+		return "65B"
+	default:
+		return "Unknown"
 	}
-
-	// TODO: find a better default
-	return ModelType7B
 }
 
-func (llm *llamaModel) FileType() FileType {
-	return llm.hyperparameters.FileType
+func (llm *llamaModel) ModelType() string {
+	return llamaModelType(llm.hyperparameters.NumLayer)
+}
+
+func (llm *llamaModel) FileType() string {
+	return fileType(llm.hyperparameters.FileType)
 }
 
 type llamaHyperparameters struct {
@@ -154,70 +144,7 @@ type llamaHyperparameters struct {
 	NumRot uint32
 
 	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
-	FileType llamaFileType
+	FileType uint32
 }
-
-type llamaFileType uint32
-
-const (
-	llamaFileTypeF32 llamaFileType = iota
-	llamaFileTypeF16
-	llamaFileTypeQ4_0
-	llamaFileTypeQ4_1
-	llamaFileTypeQ4_1_F16
-	llamaFileTypeQ8_0 llamaFileType = iota + 2
-	llamaFileTypeQ5_0
-	llamaFileTypeQ5_1
-	llamaFileTypeQ2_K
-	llamaFileTypeQ3_K_S
-	llamaFileTypeQ3_K_M
-	llamaFileTypeQ3_K_L
-	llamaFileTypeQ4_K_S
-	llamaFileTypeQ4_K_M
-	llamaFileTypeQ5_K_S
-	llamaFileTypeQ5_K_M
-	llamaFileTypeQ6_K
-)
-
-func (ft llamaFileType) String() string {
-	switch ft {
-	case llamaFileTypeF32:
-		return "F32"
-	case llamaFileTypeF16:
-		return "F16"
-	case llamaFileTypeQ4_0:
-		return "Q4_0"
-	case llamaFileTypeQ4_1:
-		return "Q4_1"
-	case llamaFileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case llamaFileTypeQ8_0:
-		return "Q8_0"
-	case llamaFileTypeQ5_0:
-		return "Q5_0"
-	case llamaFileTypeQ5_1:
-		return "Q5_1"
-	case llamaFileTypeQ2_K:
-		return "Q2_K"
-	case llamaFileTypeQ3_K_S:
-		return "Q3_K_S"
-	case llamaFileTypeQ3_K_M:
-		return "Q3_K_M"
-	case llamaFileTypeQ3_K_L:
-		return "Q3_K_L"
-	case llamaFileTypeQ4_K_S:
-		return "Q4_K_S"
-	case llamaFileTypeQ4_K_M:
-		return "Q4_K_M"
-	case llamaFileTypeQ5_K_S:
-		return "Q5_K_S"
-	case llamaFileTypeQ5_K_M:
-		return "Q5_K_M"
-	case llamaFileTypeQ6_K:
-		return "Q6_K"
-	default:
-		return "Unknown"
-	}
-}
 
 type Running struct {
@@ -226,11 +153,81 @@ type Running struct {
 	Cancel context.CancelFunc
 }
 
+type ModelRunner struct {
+	Path string // path to the model runner executable
+}
+
 type llama struct {
 	api.Options
 	Running
 }
 
+var errNoGPU = errors.New("nvidia-smi command failed")
+
+// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
+func CheckVRAM() (int, error) {
+	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
+	var stdout bytes.Buffer
+	cmd.Stdout = &stdout
+	err := cmd.Run()
+	if err != nil {
+		return 0, errNoGPU
+	}
+
+	var total int
+	scanner := bufio.NewScanner(&stdout)
+	for scanner.Scan() {
+		line := scanner.Text()
+		vram, err := strconv.Atoi(line)
+		if err != nil {
+			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
+		}
+
+		total += vram
+	}
+
+	return total, nil
+}
+
+func NumGPU(opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+
+	n := 1 // default to enable metal on macOS
+	if runtime.GOOS == "linux" {
+		vram, err := CheckVRAM()
+		if err != nil {
+			if err.Error() != "nvidia-smi command failed" {
+				log.Print(err.Error())
+			}
+			// nvidia driver not installed or no nvidia GPU found
+			return 0
+		}
+
+		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
+		switch {
+		case vram < 500:
+			log.Printf("WARNING: Low VRAM detected, disabling GPU")
+			n = 0
+		case vram < 1000:
+			n = 4
+		case vram < 2000:
+			n = 8
+		case vram < 4000:
+			n = 12
+		case vram < 8000:
+			n = 16
+		case vram < 12000:
+			n = 24
+		case vram < 16000:
+			n = 32
+		default:
+			n = 48
+		}
+
+		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+	}
+
+	return n
+}
+
 func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
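Worked example of the VRAM heuristic above: `nvidia-smi` reporting a total of 6000 MiB falls into the `vram < 8000` bucket, so 16 layers are offloaded to the GPU; a card with 16 GiB or more hits the default case and gets the maximum of 48 layers, while anything under 500 MiB disables GPU offload entirely.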
@@ -247,14 +244,17 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 	params := []string{
 		"--model", model,
 		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
-		"--gqa", fmt.Sprintf("%d", opts.NumGQA),
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
 		"--embedding",
 	}
 
+	if opts.NumGQA > 0 {
+		params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
+	}
+
 	if len(adapters) > 0 {
 		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
 		params = append(params, "--lora", adapters[0])
@@ -286,17 +286,25 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 			runner.Path,
 			append(params, "--port", strconv.Itoa(port))...,
 		)
-		var stderr bytes.Buffer
-		cmd.Stderr = &stderr
+		cmd.Stdout = os.Stderr
+		cmd.Stderr = os.Stderr
 
 		llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
 
+		log.Print("starting llama.cpp server")
+		if err := llm.Cmd.Start(); err != nil {
+			log.Printf("error starting the external llama.cpp server: %v", err)
+			continue
+		}
+
 		if err := waitForServer(llm); err != nil {
 			log.Printf("error starting llama.cpp server: %v", err)
 			llm.Close()
 			// try again
 			continue
 		}
 
 		// server started successfully
 		return llm, nil
@@ -305,59 +313,37 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 }

 func waitForServer(llm *llama) error {
-	log.Print("starting llama.cpp server")
-	var stderr bytes.Buffer
-	llm.Cmd.Stderr = &stderr
-	err := llm.Cmd.Start()
-	if err != nil {
-		return fmt.Errorf("error starting the external llama.cpp server: %w", err)
-	}
-
-	exitChan := make(chan error, 1)
-
-	// the server is a long running process, watch for it exiting to keep track of something going wrong
-	go func() {
-		err := llm.Cmd.Wait()
-		log.Print(stderr.String())
-		exitChan <- err
-	}()
-
 	// wait for the server to start responding
 	start := time.Now()
-	expiresAt := time.Now().Add(30 * time.Second)
-	ticker := time.NewTicker(100 * time.Millisecond)
+	expiresAt := time.Now().Add(45 * time.Second)
+	ticker := time.NewTicker(200 * time.Millisecond)

 	log.Print("waiting for llama.cpp server to start responding")
-	for {
-		select {
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
-			}
-
-			if err := llm.Ping(context.Background()); err == nil {
-				log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
-				return nil
-			}
-		case err := <-exitChan:
-			return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
+	for range ticker.C {
+		if time.Now().After(expiresAt) {
+			return fmt.Errorf("llama.cpp server did not start within alloted time, retrying")
+		}
+
+		if err := llm.Ping(context.Background()); err == nil {
+			break
 		}
 	}
+
+	log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
+	return nil
 }
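A note on the change above: the old waitForServer both started the subprocess and raced a ticker against an exit channel; starting now happens in newLlama, so the wait is a plain poll against a deadline. A minimal standalone sketch of that poll-with-deadline pattern (the ping argument here is a stand-in for llm.Ping, not the repository's API):

package main

import (
	"fmt"
	"time"
)

// waitUntilReady polls ping every 200ms until it succeeds or the
// deadline passes, mirroring the rewritten waitForServer loop.
func waitUntilReady(ping func() error, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	ticker := time.NewTicker(200 * time.Millisecond)
	defer ticker.Stop()

	for range ticker.C {
		if time.Now().After(deadline) {
			return fmt.Errorf("server did not start within %s", timeout)
		}
		if err := ping(); err == nil {
			return nil
		}
	}
	return nil // unreachable: the ticker channel never closes
}

func main() {
	err := waitUntilReady(func() error { return nil }, 5*time.Second)
	fmt.Println("ready:", err == nil)
}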
 func (llm *llama) Close() {
-	llm.Running.Cmd.Cancel()
+	llm.Cancel()
+
+	if err := llm.Cmd.Wait(); err != nil {
+		log.Printf("llama.cpp server exited with error: %v", err)
+	}
 }

 func (llm *llama) SetOptions(opts api.Options) {
 	llm.Options = opts
 }

-type Prediction struct {
-	Content string `json:"content"`
-	Stop    bool   `json:"stop"`
-}
 type GenerationSettings struct {
 	FrequencyPenalty float64       `json:"frequency_penalty"`
 	IgnoreEOS        bool          `json:"ignore_eos"`
@@ -385,31 +371,19 @@ type GenerationSettings struct {
 }

 type Timings struct {
-	PredictedMS         float64 `json:"predicted_ms"`
-	PredictedN          int     `json:"predicted_n"`
-	PredictedPerSecond  float64 `json:"predicted_per_second"`
-	PredictedPerTokenMS float64 `json:"predicted_per_token_ms"`
-	PromptMS            float64 `json:"prompt_ms"`
-	PromptN             int     `json:"prompt_n"`
-	PromptPerSecond     float64 `json:"prompt_per_second"`
-	PromptPerTokenMS    float64 `json:"prompt_per_token_ms"`
+	PredictedN  int     `json:"predicted_n"`
+	PredictedMS float64 `json:"predicted_ms"`
+	PromptN     int     `json:"prompt_n"`
+	PromptMS    float64 `json:"prompt_ms"`
 }

-type PredictComplete struct {
-	Content            string             `json:"content"`
-	GenerationSettings GenerationSettings `json:"generation_settings"`
-	Model              string             `json:"model"`
-	Prompt             string             `json:"prompt"`
-	Stop               bool               `json:"stop"`
-	StoppedEOS         bool               `json:"stopped_eos"`
-	StoppedLimit       bool               `json:"stopped_limit"`
-	StoppedWord        bool               `json:"stopped_word"`
-	StoppingWord       string             `json:"stopping_word"`
-	Timings            Timings            `json:"timings"`
-	TokensCached       int                `json:"tokens_cached"`
-	TokensEvaluated    int                `json:"tokens_evaluated"`
-	TokensPredicted    int                `json:"tokens_predicted"`
-	Truncated          bool               `json:"truncated"`
+type Prediction struct {
+	Content string `json:"content"`
+	Model   string `json:"model"`
+	Prompt  string `json:"prompt"`
+	Stop    bool   `json:"stop"`
+
+	Timings `json:"timings"`
 }
 type PredictRequest struct {
@@ -437,15 +411,19 @@ type PredictRequest struct {
 	Stop             []string `json:"stop,omitempty"`
 }

-func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string, fn func(api.GenerateResponse)) error {
-	// we need to find the trimmed prompt context before predicting so that we can return it to the client
-	trimmedPrompt, err := llm.marshalPrompt(ctx, predictCtx, prompt)
+func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
+	prevConvo, err := llm.Decode(ctx, prevContext)
 	if err != nil {
-		return fmt.Errorf("marshaling prompt: %v", err)
+		return err
 	}
+
+	var nextContext strings.Builder
+	nextContext.WriteString(prevConvo)
+	nextContext.WriteString(prompt)
+
 	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
 	predReq := PredictRequest{
-		Prompt:           trimmedPrompt,
+		Prompt:           nextContext.String(),
 		Stream:           true,
 		NPredict:         llm.NumPredict,
 		NKeep:            llm.NumKeep,
@@ -491,7 +469,6 @@ func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string,
 	}

 	scanner := bufio.NewScanner(resp.Body)
-	genCtx := trimmedPrompt // start with the trimmed prompt
 	for scanner.Scan() {
 		select {
 		case <-ctx.Done():
@@ -506,34 +483,33 @@ func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string,
 			// Read data from the server-side event stream
 			if strings.HasPrefix(line, "data: ") {
 				evt := line[6:]
-				var complete PredictComplete
-				if err := json.Unmarshal([]byte(evt), &complete); err != nil {
-					return fmt.Errorf("error unmarshaling llm complete response: %v", err)
+				var p Prediction
+				if err := json.Unmarshal([]byte(evt), &p); err != nil {
+					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
 				}

-				if complete.Timings.PredictedMS > 0 {
-					genCtx += complete.Content
-					embd, err := llm.Encode(ctx, genCtx)
+				if p.Content != "" {
+					fn(api.GenerateResponse{Response: p.Content})
+					nextContext.WriteString(p.Content)
+				}
+
+				if p.Stop {
+					embd, err := llm.Encode(ctx, nextContext.String())
 					if err != nil {
 						return fmt.Errorf("encoding context: %v", err)
 					}

 					fn(api.GenerateResponse{
 						Done:               true,
 						Context:            embd,
-						PromptEvalCount:    int(complete.Timings.PromptN),
-						PromptEvalDuration: parseDurationMs(float64(complete.Timings.PromptMS)),
-						EvalCount:          int(complete.Timings.PredictedN),
-						EvalDuration:       parseDurationMs(float64(complete.Timings.PredictedMS)),
+						PromptEvalCount:    p.PromptN,
+						PromptEvalDuration: parseDurationMs(p.PromptMS),
+						EvalCount:          p.PredictedN,
+						EvalDuration:       parseDurationMs(p.PredictedMS),
 					})

 					return nil
 				}
-
-				var pred Prediction
-				if err := json.Unmarshal([]byte(evt), &pred); err != nil {
-					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
-				}
-
-				genCtx += pred.Content
-				fn(api.GenerateResponse{Response: pred.Content})
 			}
 		}
 	}
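The reworked handler above treats every event uniformly: any event may carry content, and the event with "stop": true also carries the timing counters (via the embedded Timings). A self-contained sketch of consuming such a "data: "-prefixed stream, with a trimmed struct and made-up input:

package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"strings"
)

// prediction is a reduced version of the Prediction struct above.
type prediction struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
}

func main() {
	stream := "data: {\"content\":\"Hel\"}\n" +
		"data: {\"content\":\"lo\"}\n" +
		"data: {\"content\":\"\",\"stop\":true}\n"

	scanner := bufio.NewScanner(strings.NewReader(stream))
	var out strings.Builder
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data: ") {
			continue // ignore keep-alives and comments
		}
		var p prediction
		if err := json.Unmarshal([]byte(line[6:]), &p); err != nil {
			panic(err)
		}
		out.WriteString(p.Content) // accumulate like nextContext does
		if p.Stop {
			break
		}
	}
	fmt.Println(out.String()) // Hello
}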
@@ -545,34 +521,6 @@ func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string,
 	return nil
 }

-func (llm *llama) marshalPrompt(ctx context.Context, pCtx []int, prompt string) (string, error) {
-	pEncode, err := llm.Encode(ctx, prompt)
-	if err != nil {
-		return "", fmt.Errorf("encoding prompt context: %w", err)
-	}
-
-	tokens := append(pCtx, pEncode...)
-	if llm.NumKeep < 0 {
-		llm.NumKeep = len(tokens)
-	}
-
-	// min(llm.NumCtx - 4, llm.NumKeep)
-	if llm.NumCtx-4 < llm.NumKeep {
-		llm.NumKeep = llm.NumCtx - 4
-	}
-
-	if len(tokens) >= llm.NumCtx {
-		// truncate input
-		numLeft := (llm.NumCtx - llm.NumKeep) / 2
-		truncated := tokens[:llm.NumKeep]
-		erasedBlocks := (len(tokens) - llm.NumKeep - numLeft - 1) / numLeft
-		truncated = append(truncated, tokens[llm.NumKeep+erasedBlocks*numLeft:]...)
-		tokens = truncated
-		log.Printf("input truncated: num_ctx=%d num_keep=%d num_left=%d num_tokens=%d", llm.NumCtx, llm.NumKeep, numLeft, len(truncated))
-	}
-
-	return llm.Decode(ctx, tokens)
-}
 type TokenizeRequest struct {
 	Content string `json:"content"`
 }
@@ -716,7 +664,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
 // Ping checks that the server subprocess is still running and responding to requests
 func (llm *llama) Ping(ctx context.Context) error {
-	resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port))
+	resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
 	if err != nil {
 		return fmt.Errorf("ping resp: %w", err)
 	}

View File

@@ -32,15 +32,22 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 	}
 	defer f.Close()

-	ggml, err := DecodeGGML(f, ModelFamilyLlama)
+	ggml, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}

-	switch ggml.FileType().String() {
-	case "F32", "Q5_0", "Q5_1", "Q8_0":
+	switch ggml.FileType() {
+	case "Q8_0":
+		if ggml.Name() != "gguf" && opts.NumGPU != 0 {
+			// GGML Q8_0 do not support Metal API and will
+			// cause the runner to segmentation fault so disable GPU
+			log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
+			opts.NumGPU = 0
+		}
+	case "F32", "Q5_0", "Q5_1":
 		if opts.NumGPU != 0 {
-			// F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
+			// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
 			// cause the runner to segmentation fault so disable GPU
 			log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
 			opts.NumGPU = 0
@@ -49,34 +56,43 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 	totalResidentMemory := memory.TotalMemory()
 	switch ggml.ModelType() {
-	case ModelType3B, ModelType7B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
+	case "3B", "7B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
 		} else if totalResidentMemory < 8*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 8GB of memory")
 		}
-	case ModelType13B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
+	case "13B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
 		} else if totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 16GB of memory")
 		}
-	case ModelType30B, ModelType34B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
+	case "30B", "34B", "40B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
 		} else if totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 32GB of memory")
 		}
-	case ModelType65B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
+	case "65B", "70B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
 		} else if totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 64GB of memory")
 		}
+	case "180B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
+			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
+		} else if totalResidentMemory < 128*1024*1024 {
+			return nil, fmt.Errorf("model requires at least 128GB of memory")
+		}
 	}

-	switch ggml.ModelFamily() {
-	case ModelFamilyLlama:
+	switch ggml.Name() {
+	case "gguf":
+		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
+		return newLlama(model, adapters, ggufRunner(), opts)
+	case "ggml", "ggmf", "ggjt", "ggla":
 		return newLlama(model, adapters, ggmlRunner(), opts)
 	default:
 		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
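One thing worth flagging in the hunk above: if memory.TotalMemory() reports bytes (as github.com/pbnjay/memory does), then guards like totalResidentMemory < 16*1024*1024 compare against 16 MiB while the messages talk about 16GB, so the thresholds look off by a factor of 1024. A sketch of the same table with the units made explicit; this is illustrative only, not part of this change:

package main

import "fmt"

const GiB = 1 << 30 // make the unit explicit

// minMemoryGiB maps model size to {any quantization, F16} minimums,
// following the error messages in the diff above.
var minMemoryGiB = map[string][2]uint64{
	"3B": {8, 16}, "7B": {8, 16},
	"13B": {16, 32},
	"30B": {32, 64}, "34B": {32, 64}, "40B": {32, 64},
	"65B": {64, 128}, "70B": {64, 128},
	"180B": {128, 512},
}

func check(modelType string, f16 bool, totalBytes uint64) error {
	m, ok := minMemoryGiB[modelType]
	if !ok {
		return nil // unknown size: no gate
	}
	need := m[0]
	if f16 {
		need = m[1]
	}
	if totalBytes < need*GiB {
		return fmt.Errorf("model requires at least %dGB of memory", need)
	}
	return nil
}

func main() {
	fmt.Println(check("7B", false, 4*GiB)) // model requires at least 8GB of memory
}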

View File

@@ -6,8 +6,11 @@ GO_LDFLAGS="-X github.com/jmorganca/ollama/version.Version=$VERSION"
 GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"

 # build universal binary
-CGO_ENABLED=1 GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
-CGO_ENABLED=1 GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
+GOARCH=arm64 go generate ./...
+GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
+rm -rf llm/llama.cpp/*/build/*/bin
+GOARCH=amd64 go generate ./...
+GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
 lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
 rm dist/ollama-darwin-amd64 dist/ollama-darwin-arm64
 codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama

View File

@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
 	headers := make(http.Header)
 	headers.Set("Authorization", sig)

-	resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts)
+	resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
 	if err != nil {
 		log.Printf("couldn't get token: %q", err)
 	}

View File

@@ -22,6 +22,8 @@ import (
 	"strings"
 	"text/template"

+	"golang.org/x/exp/slices"
+
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/llm"
 	"github.com/jmorganca/ollama/parser"
@@ -39,15 +41,18 @@
 }

 type Model struct {
 	Name          string `json:"name"`
-	ModelPath     string
-	AdapterPaths  []string
-	Template      string
-	System        string
-	Digest        string
-	ConfigDigest  string
-	Options       map[string]interface{}
-	Embeddings    []vector.Embedding
+	ShortName     string
+	ModelPath     string
+	OriginalModel string
+	AdapterPaths  []string
+	Template      string
+	System        string
+	License       []string
+	Digest        string
+	ConfigDigest  string
+	Options       map[string]interface{}
+	Embeddings    []vector.Embedding
 }
 func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
@@ -66,7 +71,6 @@ func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, e
 		System string
 		Prompt string
 		Embed  string
-		Args   map[string]any

 		// deprecated: versions <= 0.0.7 used this to omit the system prompt
 		Context []int
@@ -76,7 +80,6 @@ func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, e
 	vars.System = m.System
 	vars.Prompt = request.Prompt
 	vars.Context = request.Context
-	vars.Args = request.Args
 	vars.Embed = embedding

 	if request.System != "" {
@@ -111,10 +114,11 @@ type LayerReader struct {
 }

 type ConfigV2 struct {
-	ModelFamily llm.ModelFamily `json:"model_family"`
-	ModelType   string          `json:"model_type"`
-	FileType    string          `json:"file_type"`
-	RootFS      RootFS          `json:"rootfs"`
+	ModelFormat string `json:"model_format"`
+	ModelFamily string `json:"model_family"`
+	ModelType   string `json:"model_type"`
+	FileType    string `json:"file_type"`
+	RootFS      RootFS `json:"rootfs"`

 	// required by spec
 	Architecture string `json:"architecture"`
@@ -171,9 +175,11 @@ func GetModel(name string) (*Model, error) {
 	model := &Model{
 		Name:         mp.GetFullTagname(),
+		ShortName:    mp.GetShortTagname(),
 		Digest:       digest,
 		ConfigDigest: manifest.Config.Digest,
 		Template:     "{{ .Prompt }}",
+		License:      []string{},
 	}

 	for _, layer := range manifest.Layers {
@@ -185,6 +191,7 @@ func GetModel(name string) (*Model, error) {
 		switch layer.MediaType {
 		case "application/vnd.ollama.image.model":
 			model.ModelPath = filename
+			model.OriginalModel = layer.From
 		case "application/vnd.ollama.image.embed":
 			file, err := os.Open(filename)
 			if err != nil {
@@ -229,6 +236,12 @@ func GetModel(name string) (*Model, error) {
 			if err = json.NewDecoder(params).Decode(&model.Options); err != nil {
 				return nil, err
 			}
+		case "application/vnd.ollama.image.license":
+			bts, err := os.ReadFile(filename)
+			if err != nil {
+				return nil, err
+			}
+			model.License = append(model.License, string(bts))
 		}
 	}
@@ -256,6 +269,29 @@ func filenameWithPath(path, f string) (string, error) {
 }

 func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
+	mp := ParseModelPath(name)
+
+	var manifest *ManifestV2
+	var err error
+	var noprune string
+
+	// build deleteMap to prune unused layers
+	deleteMap := make(map[string]bool)
+
+	if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+		manifest, _, err = GetManifest(mp)
+		if err != nil && !errors.Is(err, os.ErrNotExist) {
+			return err
+		}
+
+		if manifest != nil {
+			for _, l := range manifest.Layers {
+				deleteMap[l.Digest] = true
+			}
+			deleteMap[manifest.Config.Digest] = true
+		}
+	}
+
 	mf, err := os.Open(path)
 	if err != nil {
 		fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
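The block added above (and its twin in PullModel further down) is a mark-and-sweep: every digest in the existing manifest starts out marked for deletion, digests that the new manifest still references are unmarked as layers are written, and deleteUnusedLayers removes whatever stays marked unless OLLAMA_NOPRUNE is set. The idea in isolation:

package main

import "fmt"

// sweep marks every existing digest, unmarks the ones still referenced,
// and removes whatever remains marked.
func sweep(existing, stillUsed []string, remove func(string)) {
	deleteMap := make(map[string]bool)
	for _, d := range existing {
		deleteMap[d] = true
	}
	for _, d := range stillUsed {
		delete(deleteMap, d)
	}
	for d := range deleteMap {
		remove(d)
	}
}

func main() {
	sweep(
		[]string{"sha256:aaa", "sha256:bbb"},
		[]string{"sha256:aaa"},
		func(d string) { fmt.Println("pruning", d) }, // prints sha256:bbb
	)
}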
@@ -276,6 +312,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 	var layers []*LayerReader
 	params := make(map[string][]string)
+	var sourceParams map[string]any
 	embed := EmbeddingParams{fn: fn}
 	for _, c := range commands {
 		log.Printf("[%s] - %s\n", c.Name, c.Args)
@@ -315,14 +352,15 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 			}
 			defer file.Close()

-			ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama)
+			ggml, err := llm.DecodeGGML(file)
 			if err != nil {
 				return err
 			}

+			config.ModelFormat = ggml.Name()
 			config.ModelFamily = ggml.ModelFamily()
-			config.ModelType = ggml.ModelType().String()
-			config.FileType = ggml.FileType().String()
+			config.ModelType = ggml.ModelType()
+			config.FileType = ggml.FileType()

 			// reset the file
 			file.Seek(0, io.SeekStart)
@@ -356,9 +394,27 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 			// copie the model metadata
 			config.ModelFamily = source.ModelFamily
 			config.ModelType = source.ModelType
+			config.ModelFormat = source.ModelFormat
 			config.FileType = source.FileType

 			for _, l := range mf.Layers {
+				if l.MediaType == "application/vnd.ollama.image.params" {
+					sourceParamsBlobPath, err := GetBlobsPath(l.Digest)
+					if err != nil {
+						return err
+					}
+
+					sourceParamsBlob, err := os.Open(sourceParamsBlobPath)
+					if err != nil {
+						return err
+					}
+					defer sourceParamsBlob.Close()
+
+					if err := json.NewDecoder(sourceParamsBlob).Decode(&sourceParams); err != nil {
+						return err
+					}
+				}
+
 				newLayer, err := GetLayerWithBufferFromLayer(l)
 				if err != nil {
 					return err
@@ -429,12 +485,25 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 	// Create a single layer for the parameters
 	if len(params) > 0 {
 		fn(api.ProgressResponse{Status: "creating parameter layer"})
+
 		layers = removeLayerFromLayers(layers, "application/vnd.ollama.image.params")
 		formattedParams, err := formatParams(params)
 		if err != nil {
 			return fmt.Errorf("couldn't create params json: %v", err)
 		}

+		for k, v := range sourceParams {
+			if _, ok := formattedParams[k]; !ok {
+				formattedParams[k] = v
+			}
+		}
+
+		if config.ModelType == "65B" {
+			if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
+				config.ModelType = "70B"
+			}
+		}
+
 		bts, err := json.Marshal(formattedParams)
 		if err != nil {
 			return err
@@ -466,6 +535,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 	var manifestLayers []*Layer
 	for _, l := range layers {
 		manifestLayers = append(manifestLayers, &l.Layer)
+		delete(deleteMap, l.Layer.Digest)
 	}

 	// Create a layer for the config object
@@ -475,6 +545,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 		return err
 	}
 	layers = append(layers, cfg)
+	delete(deleteMap, cfg.Layer.Digest)

 	if err := SaveLayers(layers, fn, false); err != nil {
 		return err
@@ -487,6 +558,14 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 		return err
 	}

+	if noprune == "" {
+		fn(api.ProgressResponse{Status: "removing any unused layers"})
+		err = deleteUnusedLayers(nil, deleteMap, false)
+		if err != nil {
+			return err
+		}
+	}
+
 	fn(api.ProgressResponse{Status: "success"})
 	return nil
 }
@@ -632,14 +711,9 @@ func existingFileEmbeddings(digest string) (map[string][]float64, error) {
 }

 func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
-	j := 0
-	for _, l := range layers {
-		if l.MediaType != mediaType {
-			layers[j] = l
-			j++
-		}
-	}
-	return layers[:j]
+	return slices.DeleteFunc(layers, func(layer *LayerReader) bool {
+		return layer.MediaType == mediaType
+	})
 }

 func SaveLayers(layers []*LayerReader, fn func(resp api.ProgressResponse), force bool) error {
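slices.DeleteFunc (from golang.org/x/exp/slices, added to the imports above) compacts in place and returns the shortened slice, which is exactly what the hand-rolled filter loop did. A quick illustration with strings instead of layers:

package main

import (
	"fmt"

	"golang.org/x/exp/slices"
)

func main() {
	mediaTypes := []string{"model", "params", "license", "params"}
	// DeleteFunc removes every element the predicate matches.
	mediaTypes = slices.DeleteFunc(mediaTypes, func(t string) bool {
		return t == "params"
	})
	fmt.Println(mediaTypes) // [model license]
}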
@@ -747,14 +821,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
 				return nil, fmt.Errorf("invalid float value %s", vals)
 			}

-			out[key] = floatVal
+			out[key] = float32(floatVal)
 		case reflect.Int:
-			intVal, err := strconv.ParseInt(vals[0], 10, 0)
+			intVal, err := strconv.ParseInt(vals[0], 10, 64)
 			if err != nil {
 				return nil, fmt.Errorf("invalid int value %s", vals)
 			}

-			out[key] = intVal
+			out[key] = int(intVal)
 		case reflect.Bool:
 			boolVal, err := strconv.ParseBool(vals[0])
 			if err != nil {
@@ -834,18 +908,7 @@ func CopyModel(src, dest string) error {
 	return nil
 }

-func DeleteModel(name string) error {
-	mp := ParseModelPath(name)
-
-	manifest, _, err := GetManifest(mp)
-	if err != nil {
-		return err
-	}
-
-	deleteMap := make(map[string]bool)
-	for _, layer := range manifest.Layers {
-		deleteMap[layer.Digest] = true
-	}
-	deleteMap[manifest.Config.Digest] = true
-
+func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error {
 	fp, err := GetManifestPath()
 	if err != nil {
 		return err
 	}
@@ -862,14 +925,13 @@ func DeleteModel(name string) error {
 		fmp := ParseModelPath(tag)

 		// skip the manifest we're trying to delete
-		if mp.GetFullTagname() == fmp.GetFullTagname() {
+		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
 			return nil
 		}

 		// save (i.e. delete from the deleteMap) any files used in other manifests
 		manifest, _, err := GetManifest(fmp)
 		if err != nil {
-			log.Printf("skipping file: %s", fp)
 			return nil
 		}
@@ -893,14 +955,72 @@ func DeleteModel(name string) error {
 			log.Printf("couldn't get file path for '%s': %v", k, err)
 			continue
 		}
-		if err := os.Remove(fp); err != nil {
-			log.Printf("couldn't remove file '%s': %v", fp, err)
-			continue
+		if !dryRun {
+			if err := os.Remove(fp); err != nil {
+				log.Printf("couldn't remove file '%s': %v", fp, err)
+				continue
+			}
+		} else {
+			log.Printf("wanted to remove: %s", fp)
 		}
 	}

-	fp, err = mp.GetManifestPath(false)
+	return nil
+}
+
+func PruneLayers() error {
+	deleteMap := make(map[string]bool)
+	p, err := GetBlobsPath("")
+	if err != nil {
+		return err
+	}
+
+	blobs, err := os.ReadDir(p)
+	if err != nil {
+		log.Printf("couldn't read dir '%s': %v", p, err)
+		return err
+	}
+
+	for _, blob := range blobs {
+		name := blob.Name()
+		if runtime.GOOS == "windows" {
+			name = strings.ReplaceAll(name, "-", ":")
+		}
+		deleteMap[name] = true
+	}
+
+	log.Printf("total blobs: %d", len(deleteMap))
+
+	err = deleteUnusedLayers(nil, deleteMap, false)
+	if err != nil {
+		return err
+	}
+
+	log.Printf("total unused blobs removed: %d", len(deleteMap))
+
+	return nil
+}
+
+func DeleteModel(name string) error {
+	mp := ParseModelPath(name)
+
+	manifest, _, err := GetManifest(mp)
+	if err != nil {
+		return err
+	}
+
+	deleteMap := make(map[string]bool)
+	for _, layer := range manifest.Layers {
+		deleteMap[layer.Digest] = true
+	}
+	deleteMap[manifest.Config.Digest] = true
+
+	err = deleteUnusedLayers(&mp, deleteMap, false)
+	if err != nil {
+		return err
+	}
+
+	fp, err := mp.GetManifestPath(false)
 	if err != nil {
 		return err
 	}
@@ -913,6 +1033,83 @@ func DeleteModel(name string) error {
 	return nil
 }

+func ShowModelfile(model *Model) (string, error) {
+	type modelTemplate struct {
+		*Model
+		From   string
+		Params string
+	}
+
+	var params []string
+	for k, v := range model.Options {
+		switch val := v.(type) {
+		case string:
+			params = append(params, fmt.Sprintf("PARAMETER %s %s", k, val))
+		case int:
+			params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.Itoa(val)))
+		case float64:
+			params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatFloat(val, 'f', 0, 64)))
+		case bool:
+			params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatBool(val)))
+		case []interface{}:
+			for _, nv := range val {
+				switch nval := nv.(type) {
+				case string:
+					params = append(params, fmt.Sprintf("PARAMETER %s %s", k, nval))
+				case int:
+					params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.Itoa(nval)))
+				case float64:
+					params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatFloat(nval, 'f', 0, 64)))
+				case bool:
+					params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatBool(nval)))
+				default:
+					log.Printf("unknown type: %s", reflect.TypeOf(nv).String())
+				}
+			}
+		default:
+			log.Printf("unknown type: %s", reflect.TypeOf(v).String())
+		}
+	}
+
+	mt := modelTemplate{
+		Model:  model,
+		From:   model.OriginalModel,
+		Params: strings.Join(params, "\n"),
+	}
+
+	if mt.From == "" {
+		mt.From = model.ModelPath
+	}
+
+	modelFile := `# Modelfile generated by "ollama show"
+# To build a new Modelfile based on this one, replace the FROM line with:
+# FROM {{ .ShortName }}
+FROM {{ .From }}
+TEMPLATE """{{ .Template }}"""
+SYSTEM """{{ .System }}"""
+{{ .Params }}
+`
+	for _, l := range mt.Model.AdapterPaths {
+		modelFile += fmt.Sprintf("ADAPTER %s\n", l)
+	}
+
+	tmpl, err := template.New("").Parse(modelFile)
+	if err != nil {
+		log.Printf("error parsing template: %q", err)
+		return "", err
+	}
+
+	var buf bytes.Buffer
+	if err = tmpl.Execute(&buf, mt); err != nil {
+		log.Printf("error executing template: %q", err)
+		return "", err
+	}
+
+	return buf.String(), nil
+}
+
 func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)
 	fn(api.ProgressResponse{Status: "retrieving manifest"})
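ShowModelfile, added above, regenerates a Modelfile by executing a text/template over the stored metadata: the embedded *Model supplies ShortName, Template, and System, while From and Params are computed first. A reduced sketch of the same rendering, with made-up field values rather than the repository's types:

package main

import (
	"os"
	"text/template"
)

func main() {
	// Hypothetical values standing in for a loaded Model.
	data := struct {
		ShortName, From, Template, System, Params string
	}{
		ShortName: "llama2:latest",
		From:      "/path/to/model/blob",
		Template:  "{{ .Prompt }}",
		System:    "you are a helpful assistant",
		Params:    "PARAMETER stop [INST]",
	}

	const modelFile = `# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM {{ .ShortName }}
FROM {{ .From }}
TEMPLATE """{{ .Template }}"""
SYSTEM """{{ .System }}"""
{{ .Params }}
`
	tmpl := template.Must(template.New("").Parse(modelFile))
	if err := tmpl.Execute(os.Stdout, data); err != nil {
		panic(err)
	}
}

Note that Execute substitutes the Template field's literal value ("{{ .Prompt }}") without re-parsing it, which is what lets a Modelfile template round-trip through show unchanged.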
@@ -1002,13 +1199,34 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

+	var manifest *ManifestV2
+	var err error
+	var noprune string
+
+	// build deleteMap to prune unused layers
+	deleteMap := make(map[string]bool)
+
+	if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+		manifest, _, err = GetManifest(mp)
+		if err != nil && !errors.Is(err, os.ErrNotExist) {
+			return err
+		}
+
+		if manifest != nil {
+			for _, l := range manifest.Layers {
+				deleteMap[l.Digest] = true
+			}
+			deleteMap[manifest.Config.Digest] = true
+		}
+	}
+
 	if mp.ProtocolScheme == "http" && !regOpts.Insecure {
 		return fmt.Errorf("insecure protocol http")
 	}

 	fn(api.ProgressResponse{Status: "pulling manifest"})

-	manifest, err := pullModelManifest(ctx, mp, regOpts)
+	manifest, err = pullModelManifest(ctx, mp, regOpts)
 	if err != nil {
 		return fmt.Errorf("pull model manifest: %s", err)
 	}
@@ -1028,7 +1246,9 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 		}); err != nil {
 			return err
 		}
+		delete(deleteMap, layer.Digest)
 	}
+	delete(deleteMap, manifest.Config.Digest)

 	fn(api.ProgressResponse{Status: "verifying sha256 digest"})
 	for _, layer := range layers {
@@ -1066,6 +1286,14 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 		return err
 	}

+	if noprune == "" {
+		fn(api.ProgressResponse{Status: "removing any unused layers"})
+		err = deleteUnusedLayers(nil, deleteMap, false)
+		if err != nil {
+			return err
+		}
+	}
+
 	fn(api.ProgressResponse{Status: "success"})

 	return nil
@@ -1191,7 +1419,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 }

 func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
-	if requestURL.Scheme != "http" && regOpts.Insecure {
+	if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
 		requestURL.Scheme = "http"
 	}
@@ -1204,10 +1432,12 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.Header = headers
 	}

-	if regOpts.Token != "" {
-		req.Header.Set("Authorization", "Bearer "+regOpts.Token)
-	} else if regOpts.Username != "" && regOpts.Password != "" {
-		req.SetBasicAuth(regOpts.Username, regOpts.Password)
+	if regOpts != nil {
+		if regOpts.Token != "" {
+			req.Header.Set("Authorization", "Bearer "+regOpts.Token)
+		} else if regOpts.Username != "" && regOpts.Password != "" {
+			req.SetBasicAuth(regOpts.Username, regOpts.Password)
+		}
 	}

 	req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
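The two hunks above make makeRequest tolerate a nil *RegistryOptions, which is what getAuthToken now passes (see the earlier hunk): the request for a fresh token should not be signed with the stale credentials it is trying to replace. The guard in isolation, with the options type trimmed to what the sketch needs:

package main

import (
	"fmt"
	"net/http"
)

type RegistryOptions struct {
	Token, Username, Password string
}

// authorize is a no-op for nil opts, allowing anonymous requests
// such as token refreshes.
func authorize(req *http.Request, opts *RegistryOptions) {
	if opts == nil {
		return
	}
	if opts.Token != "" {
		req.Header.Set("Authorization", "Bearer "+opts.Token)
	} else if opts.Username != "" && opts.Password != "" {
		req.SetBasicAuth(opts.Username, opts.Password)
	}
}

func main() {
	req, _ := http.NewRequest("GET", "http://registry.example/token", nil)
	authorize(req, nil) // anonymous request, no Authorization header
	fmt.Println(req.Header)
}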

View File

@@ -114,7 +114,12 @@ func GetManifestPath() (string, error) {
 		return "", err
 	}

-	return filepath.Join(home, ".ollama", "models", "manifests"), nil
+	path := filepath.Join(home, ".ollama", "models", "manifests")
+	if err := os.MkdirAll(path, 0o755); err != nil {
+		return "", err
+	}
+
+	return path, nil
 }
 func GetBlobsPath(digest string) (string, error) {
@@ -128,7 +133,12 @@ func GetBlobsPath(digest string) (string, error) {
 	}

 	path := filepath.Join(home, ".ollama", "models", "blobs", digest)
-	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+	dirPath := filepath.Dir(path)
+	if digest == "" {
+		dirPath = path
+	}
+
+	if err := os.MkdirAll(dirPath, 0o755); err != nil {
 		return "", err
 	}
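Both path helpers now create their directory on first use. The special case above matters because filepath.Join drops an empty digest, so GetBlobsPath("") names the blobs directory itself (PruneLayers relies on this to list blobs), and MkdirAll must then target the path rather than its parent. In isolation:

package main

import (
	"fmt"
	"path/filepath"
)

// dirToCreate mirrors the special case above: both branches resolve to
// the blobs directory, via the path itself for an empty digest and via
// the parent for a concrete blob.
func dirToCreate(home, digest string) string {
	path := filepath.Join(home, ".ollama", "models", "blobs", digest)
	if digest == "" {
		return path
	}
	return filepath.Dir(path)
}

func main() {
	fmt.Println(dirToCreate("/home/u", ""))          // .../models/blobs
	fmt.Println(dirToCreate("/home/u", "sha256-ab")) // .../models/blobs
}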

View File

@@ -12,6 +12,8 @@ import (
 	"os/signal"
 	"path/filepath"
 	"reflect"
+	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@@ -117,12 +119,13 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
 		if err != nil {
 			return err
 		}

 		tokensNoSystem, err := llmModel.Encode(ctx, promptNoSystem)
 		if err != nil {
 			return err
 		}

-		opts.NumKeep = len(tokensWithSystem) - len(tokensNoSystem) + 1
+		opts.NumKeep = len(tokensWithSystem) - len(tokensNoSystem)

 		llmModel.SetOptions(opts)
 	}
@@ -361,6 +364,78 @@ func DeleteModelHandler(c *gin.Context) {
 		}
 		return
 	}
+
+	c.JSON(http.StatusOK, nil)
+}
+
+func ShowModelHandler(c *gin.Context) {
+	var req api.ShowRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	resp, err := GetModelInfo(req.Name)
+	if err != nil {
+		if os.IsNotExist(err) {
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
+		} else {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		}
+		return
+	}
+
+	c.JSON(http.StatusOK, resp)
+}
+
+func GetModelInfo(name string) (*api.ShowResponse, error) {
+	model, err := GetModel(name)
+	if err != nil {
+		return nil, err
+	}
+
+	resp := &api.ShowResponse{
+		License:  strings.Join(model.License, "\n"),
+		System:   model.System,
+		Template: model.Template,
+	}
+
+	mf, err := ShowModelfile(model)
+	if err != nil {
+		return nil, err
+	}
+
+	resp.Modelfile = mf
+
+	var params []string
+	cs := 30
+	for k, v := range model.Options {
+		switch val := v.(type) {
+		case string:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, val))
+		case int:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(val)))
+		case float64:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(val, 'f', 0, 64)))
+		case bool:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(val)))
+		case []interface{}:
+			for _, nv := range val {
+				switch nval := nv.(type) {
+				case string:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, nval))
+				case int:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(nval)))
+				case float64:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(nval, 'f', 0, 64)))
+				case bool:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(nval)))
+				}
+			}
+		}
+	}
+
+	resp.Parameters = strings.Join(params, "\n")
+	return resp, nil
 }
 func ListModelsHandler(c *gin.Context) {
@@ -456,6 +531,7 @@ func Serve(ln net.Listener, origins []string) error {
 	r.POST("/api/copy", CopyModelHandler)
 	r.GET("/api/tags", ListModelsHandler)
 	r.DELETE("/api/delete", DeleteModelHandler)
+	r.POST("/api/show", ShowModelHandler)

 	log.Printf("Listening on %s", ln.Addr())
 	s := &http.Server{
@@ -473,6 +549,13 @@ func Serve(ln net.Listener, origins []string) error {
 		os.Exit(0)
 	}()

+	if runtime.GOOS == "linux" {
+		// check compatibility to log warnings
+		if _, err := llm.CheckVRAM(); err != nil {
+			log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
+		}
+	}
+
 	return s.Serve(ln)
 }
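With the route registered above, the new show endpoint can be exercised directly. A minimal client, assuming the server's default 127.0.0.1:11434 listen address:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// POST /api/show returns license, system prompt, template,
	// parameters, and a regenerated Modelfile for the named model.
	body := bytes.NewBufferString(`{"name": "llama2"}`)
	resp, err := http.Post("http://127.0.0.1:11434/api/show", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}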

View File

@@ -66,31 +66,39 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
 		sectionReader := io.NewSectionReader(f, int64(offset), chunk)

 		for try := 0; try < MaxRetries; try++ {
+			ch := make(chan error, 1)
+
 			r, w := io.Pipe()
 			defer r.Close()
 			go func() {
 				defer w.Close()

 				for chunked := int64(0); chunked < chunk; {
-					n, err := io.CopyN(w, sectionReader, 1024*1024)
-					if err != nil && !errors.Is(err, io.EOF) {
-						fn(api.ProgressResponse{
-							Status:    fmt.Sprintf("error reading chunk: %v", err),
-							Digest:    layer.Digest,
-							Total:     layer.Size,
-							Completed: int(offset),
-						})
-
-						return
-					}
-
-					chunked += n
-					fn(api.ProgressResponse{
-						Status:    fmt.Sprintf("uploading %s", layer.Digest),
-						Digest:    layer.Digest,
-						Total:     layer.Size,
-						Completed: int(offset) + int(chunked),
-					})
+					select {
+					case err := <-ch:
+						log.Printf("chunk interrupted: %v", err)
+						return
+					default:
+						n, err := io.CopyN(w, sectionReader, 1024*1024)
+						if err != nil && !errors.Is(err, io.EOF) {
+							fn(api.ProgressResponse{
+								Status:    fmt.Sprintf("error reading chunk: %v", err),
+								Digest:    layer.Digest,
+								Total:     layer.Size,
+								Completed: int(offset),
+							})
+
+							return
+						}
+
+						chunked += n
+						fn(api.ProgressResponse{
+							Status:    fmt.Sprintf("uploading %s", layer.Digest),
+							Digest:    layer.Digest,
+							Total:     layer.Size,
+							Completed: int(offset) + int(chunked),
+						})
+					}
 				}
 			}()
@@ -113,6 +121,8 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
 		switch {
 		case resp.StatusCode == http.StatusUnauthorized:
+			ch <- errors.New("unauthorized")
+
 			auth := resp.Header.Get("www-authenticate")
 			authRedir := ParseAuthRedirectString(auth)
 			token, err := getAuthToken(ctx, authRedir, regOpts)
@@ -121,10 +131,7 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
 			}

 			regOpts.Token = token
-			if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
-				return err
-			}
+			sectionReader = io.NewSectionReader(f, int64(offset), chunk)

 			continue
 		case resp.StatusCode >= http.StatusBadRequest:
 			body, _ := io.ReadAll(resp.Body)
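Two details in the retry path above: the buffered channel lets the 401 handler stop the goroutine feeding the pipe mid-copy, and the SectionReader is rebuilt rather than seeked because the failed attempt may have partially consumed it. The interrupt pattern in isolation (small chunk size chosen only so the example is visible):

package main

import (
	"fmt"
	"io"
	"strings"
)

// copyUntilSignal copies src to dst in small chunks, stopping early if
// a value arrives on ch. This mirrors the select/default loop above.
func copyUntilSignal(dst io.Writer, src io.Reader, ch <-chan error) error {
	for {
		select {
		case err := <-ch:
			return fmt.Errorf("interrupted: %w", err)
		default:
			if _, err := io.CopyN(dst, src, 4); err != nil {
				if err == io.EOF {
					return nil // source drained
				}
				return err
			}
		}
	}
}

func main() {
	ch := make(chan error, 1) // buffered so the sender never blocks
	var sb strings.Builder
	if err := copyUntilSignal(&sb, strings.NewReader("hello world"), ch); err != nil {
		fmt.Println(err)
	}
	fmt.Println(sb.String()) // hello world
}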