Compare commits
47 Commits
mxyng/extr
...
matt/strea
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e2389b63aa | ||
|
|
f89c23764b | ||
|
|
d028853879 | ||
|
|
949553db23 | ||
|
|
0c5a454361 | ||
|
|
f59c4d03f7 | ||
|
|
7dee25a07f | ||
|
|
f221637053 | ||
|
|
45ac07cd02 | ||
|
|
7d749cc787 | ||
|
|
e7e91cd71c | ||
|
|
3920e15386 | ||
|
|
41e976edde | ||
|
|
de227b620f | ||
|
|
63def6ca49 | ||
|
|
738fe9c4aa | ||
|
|
a8da0bacbe | ||
|
|
bf146fb072 | ||
|
|
f0f4943577 | ||
|
|
09dd2aeff9 | ||
|
|
07b4074e7b | ||
|
|
61dda6a5e0 | ||
|
|
e1f9ced568 | ||
|
|
9795b43d93 | ||
|
|
0980d5c7e3 | ||
|
|
0dae34b6a7 | ||
|
|
83c6be1666 | ||
|
|
1adfa67589 | ||
|
|
790d24eb7b | ||
|
|
7de300856b | ||
|
|
213ffdb548 | ||
|
|
d42d88386a | ||
|
|
154f24af91 | ||
|
|
a1ecdd36d5 | ||
|
|
d18282bfda | ||
|
|
9ae76ba8c9 | ||
|
|
2bc06565c7 | ||
|
|
d1c2558f7e | ||
|
|
7b5aefb427 | ||
|
|
06ef90c051 | ||
|
|
7efbc84320 | ||
|
|
e9f6df7dca | ||
|
|
7fa6e51686 | ||
|
|
8dc68417e7 | ||
|
|
681f3c4c42 | ||
|
|
59a705525c | ||
|
|
5d3f314b0b |
@@ -1,8 +1,5 @@
|
|||||||
build
|
|
||||||
llama/build
|
|
||||||
.venv
|
|
||||||
.vscode
|
.vscode
|
||||||
ollama
|
ollama
|
||||||
app
|
app
|
||||||
web
|
llm/llama.cpp/ggml
|
||||||
.env
|
llm/llama.cpp/gguf
|
||||||
|
|||||||
11
.gitmodules
vendored
11
.gitmodules
vendored
@@ -1,4 +1,9 @@
|
|||||||
[submodule "llm/llama.cpp/ggml"]
|
[submodule "llm/llama.cpp/ggml"]
|
||||||
path = llm/llama.cpp/ggml
|
path = llm/llama.cpp/ggml
|
||||||
url = https://github.com/ggerganov/llama.cpp.git
|
url = https://github.com/ggerganov/llama.cpp.git
|
||||||
ignore = dirty
|
ignore = dirty
|
||||||
|
shallow = true
|
||||||
|
[submodule "llm/llama.cpp/gguf"]
|
||||||
|
path = llm/llama.cpp/gguf
|
||||||
|
url = https://github.com/ggerganov/llama.cpp.git
|
||||||
|
shallow = true
|
||||||
|
|||||||
18
Dockerfile
18
Dockerfile
@@ -1,15 +1,21 @@
|
|||||||
FROM golang:1.20
|
FROM golang:alpine
|
||||||
|
|
||||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||||
|
RUN apk add --no-cache git build-base cmake
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN CGO_ENABLED=1 go build -ldflags '-linkmode external -extldflags "-static"' .
|
RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
|
||||||
|
|
||||||
FROM alpine
|
FROM alpine
|
||||||
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
|
ENV OLLAMA_HOST 0.0.0.0
|
||||||
EXPOSE 11434
|
RUN apk add --no-cache libstdc++
|
||||||
|
|
||||||
ARG USER=ollama
|
ARG USER=ollama
|
||||||
ARG GROUP=ollama
|
ARG GROUP=ollama
|
||||||
RUN addgroup -g 1000 $GROUP && adduser -u 1000 -DG $GROUP $USER
|
RUN addgroup $GROUP && adduser -D -G $GROUP $USER
|
||||||
|
|
||||||
|
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
|
||||||
|
|
||||||
USER $USER:$GROUP
|
USER $USER:$GROUP
|
||||||
ENTRYPOINT ["/bin/ollama"]
|
ENTRYPOINT ["/bin/ollama"]
|
||||||
ENV OLLAMA_HOST 0.0.0.0
|
|
||||||
CMD ["serve"]
|
CMD ["serve"]
|
||||||
|
|||||||
22
Dockerfile.cuda
Normal file
22
Dockerfile.cuda
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||||
|
|
||||||
|
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||||
|
RUN apt-get update && apt-get install -y git build-essential cmake
|
||||||
|
ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
|
||||||
|
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .
|
||||||
|
|
||||||
|
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
|
||||||
|
ENV OLLAMA_HOST 0.0.0.0
|
||||||
|
|
||||||
|
ARG USER=ollama
|
||||||
|
ARG GROUP=ollama
|
||||||
|
RUN groupadd $GROUP && useradd -m -g $GROUP $USER
|
||||||
|
|
||||||
|
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
|
||||||
|
|
||||||
|
USER $USER:$GROUP
|
||||||
|
ENTRYPOINT ["/bin/ollama"]
|
||||||
|
CMD ["serve"]
|
||||||
@@ -165,10 +165,11 @@ Ollama bundles model weights, configurations, and data into a single package, de
|
|||||||
|
|
||||||
## Building
|
## Building
|
||||||
|
|
||||||
Install `cmake`:
|
Install `cmake` and `go`:
|
||||||
|
|
||||||
```
|
```
|
||||||
brew install cmake
|
brew install cmake
|
||||||
|
brew install go
|
||||||
```
|
```
|
||||||
|
|
||||||
Then generate dependencies and build:
|
Then generate dependencies and build:
|
||||||
|
|||||||
@@ -255,6 +255,14 @@ func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
|
||||||
|
var resp ShowResponse
|
||||||
|
if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &resp, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (c *Client) Heartbeat(ctx context.Context) error {
|
func (c *Client) Heartbeat(ctx context.Context) error {
|
||||||
if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
|
if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
|
||||||
return err
|
return err
|
||||||
|
|||||||
25
api/types.go
25
api/types.go
@@ -32,12 +32,11 @@ func (e StatusError) Error() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type GenerateRequest struct {
|
type GenerateRequest struct {
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Prompt string `json:"prompt"`
|
Prompt string `json:"prompt"`
|
||||||
System string `json:"system"`
|
System string `json:"system"`
|
||||||
Template string `json:"template"`
|
Template string `json:"template"`
|
||||||
Context []int `json:"context,omitempty"`
|
Context []int `json:"context,omitempty"`
|
||||||
Args map[string]any `json:"args,omitempty"`
|
|
||||||
|
|
||||||
Options map[string]interface{} `json:"options"`
|
Options map[string]interface{} `json:"options"`
|
||||||
}
|
}
|
||||||
@@ -62,6 +61,18 @@ type DeleteRequest struct {
|
|||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ShowRequest struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ShowResponse struct {
|
||||||
|
License string `json:"license,omitempty"`
|
||||||
|
Modelfile string `json:"modelfile,omitempty"`
|
||||||
|
Parameters string `json:"parameters,omitempty"`
|
||||||
|
Template string `json:"template,omitempty"`
|
||||||
|
System string `json:"system,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type CopyRequest struct {
|
type CopyRequest struct {
|
||||||
Source string `json:"source"`
|
Source string `json:"source"`
|
||||||
Destination string `json:"destination"`
|
Destination string `json:"destination"`
|
||||||
@@ -280,7 +291,7 @@ func DefaultOptions() Options {
|
|||||||
NumCtx: 2048,
|
NumCtx: 2048,
|
||||||
NumKeep: -1,
|
NumKeep: -1,
|
||||||
NumBatch: 512,
|
NumBatch: 512,
|
||||||
NumGPU: 1,
|
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
||||||
NumGQA: 1,
|
NumGQA: 1,
|
||||||
LowVRAM: false,
|
LowVRAM: false,
|
||||||
F16KV: true,
|
F16KV: true,
|
||||||
|
|||||||
155
cmd/cmd.go
155
cmd/cmd.go
@@ -230,6 +230,84 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ShowHandler(cmd *cobra.Command, args []string) error {
|
||||||
|
client, err := api.FromEnv()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(args) != 1 {
|
||||||
|
return errors.New("missing model name")
|
||||||
|
}
|
||||||
|
|
||||||
|
license, errLicense := cmd.Flags().GetBool("license")
|
||||||
|
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
|
||||||
|
parameters, errParams := cmd.Flags().GetBool("parameters")
|
||||||
|
system, errSystem := cmd.Flags().GetBool("system")
|
||||||
|
template, errTemplate := cmd.Flags().GetBool("template")
|
||||||
|
|
||||||
|
for _, boolErr := range []error{errLicense, errModelfile, errParams, errSystem, errTemplate} {
|
||||||
|
if boolErr != nil {
|
||||||
|
return errors.New("error retrieving flags")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
flagsSet := 0
|
||||||
|
showType := ""
|
||||||
|
|
||||||
|
if license {
|
||||||
|
flagsSet++
|
||||||
|
showType = "license"
|
||||||
|
}
|
||||||
|
|
||||||
|
if modelfile {
|
||||||
|
flagsSet++
|
||||||
|
showType = "modelfile"
|
||||||
|
}
|
||||||
|
|
||||||
|
if parameters {
|
||||||
|
flagsSet++
|
||||||
|
showType = "parameters"
|
||||||
|
}
|
||||||
|
|
||||||
|
if system {
|
||||||
|
flagsSet++
|
||||||
|
showType = "system"
|
||||||
|
}
|
||||||
|
|
||||||
|
if template {
|
||||||
|
flagsSet++
|
||||||
|
showType = "template"
|
||||||
|
}
|
||||||
|
|
||||||
|
if flagsSet > 1 {
|
||||||
|
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
|
||||||
|
} else if flagsSet == 0 {
|
||||||
|
return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
|
||||||
|
}
|
||||||
|
|
||||||
|
req := api.ShowRequest{Name: args[0]}
|
||||||
|
resp, err := client.Show(context.Background(), &req)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
switch showType {
|
||||||
|
case "license":
|
||||||
|
fmt.Println(resp.License)
|
||||||
|
case "modelfile":
|
||||||
|
fmt.Println(resp.Modelfile)
|
||||||
|
case "parameters":
|
||||||
|
fmt.Println(resp.Parameters)
|
||||||
|
case "system":
|
||||||
|
fmt.Println(resp.System)
|
||||||
|
case "template":
|
||||||
|
fmt.Println(resp.Template)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func CopyHandler(cmd *cobra.Command, args []string) error {
|
func CopyHandler(cmd *cobra.Command, args []string) error {
|
||||||
client, err := api.FromEnv()
|
client, err := api.FromEnv()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -377,20 +455,6 @@ func generate(cmd *cobra.Command, model, prompt string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func showLayer(l *server.Layer) {
|
|
||||||
filename, err := server.GetBlobsPath(l.Digest)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Println("Couldn't get layer's path")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
bts, err := os.ReadFile(filename)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Println("Couldn't read layer")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
fmt.Println(string(bts))
|
|
||||||
}
|
|
||||||
|
|
||||||
func generateInteractive(cmd *cobra.Command, model string) error {
|
func generateInteractive(cmd *cobra.Command, model string) error {
|
||||||
home, err := os.UserHomeDir()
|
home, err := os.UserHomeDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -413,6 +477,8 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
|||||||
),
|
),
|
||||||
readline.PcItem("/show",
|
readline.PcItem("/show",
|
||||||
readline.PcItem("license"),
|
readline.PcItem("license"),
|
||||||
|
readline.PcItem("modelfile"),
|
||||||
|
readline.PcItem("parameters"),
|
||||||
readline.PcItem("system"),
|
readline.PcItem("system"),
|
||||||
readline.PcItem("template"),
|
readline.PcItem("template"),
|
||||||
),
|
),
|
||||||
@@ -522,42 +588,28 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
|||||||
case strings.HasPrefix(line, "/show"):
|
case strings.HasPrefix(line, "/show"):
|
||||||
args := strings.Fields(line)
|
args := strings.Fields(line)
|
||||||
if len(args) > 1 {
|
if len(args) > 1 {
|
||||||
mp := server.ParseModelPath(model)
|
resp, err := server.GetModelInfo(model)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
fmt.Println("error: couldn't get model")
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
manifest, _, err := server.GetManifest(mp)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Println("error: couldn't get a manifest for this model")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch args[1] {
|
switch args[1] {
|
||||||
case "license":
|
case "license":
|
||||||
for _, l := range manifest.Layers {
|
fmt.Println(resp.License)
|
||||||
if l.MediaType == "application/vnd.ollama.image.license" {
|
case "modelfile":
|
||||||
showLayer(l)
|
fmt.Println(resp.Modelfile)
|
||||||
}
|
case "parameters":
|
||||||
}
|
fmt.Println(resp.Parameters)
|
||||||
continue
|
|
||||||
case "system":
|
case "system":
|
||||||
for _, l := range manifest.Layers {
|
fmt.Println(resp.System)
|
||||||
if l.MediaType == "application/vnd.ollama.image.system" {
|
|
||||||
showLayer(l)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
case "template":
|
case "template":
|
||||||
for _, l := range manifest.Layers {
|
fmt.Println(resp.Template)
|
||||||
if l.MediaType == "application/vnd.ollama.image.template" {
|
|
||||||
showLayer(l)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
default:
|
default:
|
||||||
usage()
|
fmt.Println("error: unknown command")
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
continue
|
||||||
} else {
|
} else {
|
||||||
usage()
|
usage()
|
||||||
continue
|
continue
|
||||||
@@ -620,6 +672,12 @@ func RunServer(cmd *cobra.Command, _ []string) error {
|
|||||||
origins = strings.Split(o, ",")
|
origins = strings.Split(o, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||||
|
if err := server.PruneLayers(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return server.Serve(ln, origins)
|
return server.Serve(ln, origins)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -749,6 +807,20 @@ func NewCLI() *cobra.Command {
|
|||||||
|
|
||||||
createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
|
createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
|
||||||
|
|
||||||
|
showCmd := &cobra.Command{
|
||||||
|
Use: "show MODEL",
|
||||||
|
Short: "Show information for a model",
|
||||||
|
Args: cobra.MinimumNArgs(1),
|
||||||
|
PreRunE: checkServerHeartbeat,
|
||||||
|
RunE: ShowHandler,
|
||||||
|
}
|
||||||
|
|
||||||
|
showCmd.Flags().Bool("license", false, "Show license of a model")
|
||||||
|
showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
|
||||||
|
showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
|
||||||
|
showCmd.Flags().Bool("template", false, "Show template of a model")
|
||||||
|
showCmd.Flags().Bool("system", false, "Show system prompt of a model")
|
||||||
|
|
||||||
runCmd := &cobra.Command{
|
runCmd := &cobra.Command{
|
||||||
Use: "run MODEL [PROMPT]",
|
Use: "run MODEL [PROMPT]",
|
||||||
Short: "Run a model",
|
Short: "Run a model",
|
||||||
@@ -814,6 +886,7 @@ func NewCLI() *cobra.Command {
|
|||||||
rootCmd.AddCommand(
|
rootCmd.AddCommand(
|
||||||
serveCmd,
|
serveCmd,
|
||||||
createCmd,
|
createCmd,
|
||||||
|
showCmd,
|
||||||
runCmd,
|
runCmd,
|
||||||
pullCmd,
|
pullCmd,
|
||||||
pushCmd,
|
pushCmd,
|
||||||
|
|||||||
@@ -20,6 +20,10 @@ Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` a
|
|||||||
|
|
||||||
All durations are returned in nanoseconds.
|
All durations are returned in nanoseconds.
|
||||||
|
|
||||||
|
### Streams
|
||||||
|
|
||||||
|
Many API responses are streams of JSON objects showing the current status. For examples of working with streams in various languages, see [streaming.md](./streaming.md)
|
||||||
|
|
||||||
## Generate a completion
|
## Generate a completion
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -238,6 +242,10 @@ Generate embeddings from a model
|
|||||||
- `model`: name of model to generate embeddings from
|
- `model`: name of model to generate embeddings from
|
||||||
- `prompt`: text to generate embeddings for
|
- `prompt`: text to generate embeddings for
|
||||||
|
|
||||||
|
Advanced parameters:
|
||||||
|
|
||||||
|
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -6,6 +6,10 @@
|
|||||||
|
|
||||||
Install required tools:
|
Install required tools:
|
||||||
|
|
||||||
|
- cmake version 3.24 or higher
|
||||||
|
- go version 1.20 or higher
|
||||||
|
- gcc version 11.4.0 or higher
|
||||||
|
|
||||||
```
|
```
|
||||||
brew install go cmake gcc
|
brew install go cmake gcc
|
||||||
```
|
```
|
||||||
@@ -27,3 +31,9 @@ Now you can run `ollama`:
|
|||||||
```
|
```
|
||||||
./ollama
|
./ollama
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Building on Linux with GPU support
|
||||||
|
|
||||||
|
- Install cmake and nvidia-cuda-toolkit
|
||||||
|
- run `go generate ./...`
|
||||||
|
- run `go build .`
|
||||||
|
|||||||
35
docs/streaming.md
Normal file
35
docs/streaming.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Streaming responses in the Ollama Client API
|
||||||
|
|
||||||
|
## JavaScript / TypeScript / Deno
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
const pull = async () => {
|
||||||
|
const request = await fetch("http://localhost:11434/api/pull", {
|
||||||
|
method: "POST",
|
||||||
|
body: JSON.stringify({ name: "llama2:7b-q5_0" }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const reader = await request.body?.pipeThrough(new TextDecoderStream());
|
||||||
|
if (!reader) throw new Error("No reader");
|
||||||
|
for await (const chunk of reader) {
|
||||||
|
const out = JSON.parse(chunk);
|
||||||
|
if (out.status.startsWith("downloading")) {
|
||||||
|
console.log(`${out.status} - ${(out.completed / out.total) * 100}%`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pull();
|
||||||
|
```
|
||||||
|
|
||||||
|
## Python
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
response = requests.post("http://localhost:11434/api/pull", json={"name": "llama2:7b-q5_0"}, stream=True)
|
||||||
|
for data in response.iter_lines():
|
||||||
|
out = json.loads(data)
|
||||||
|
if "completed" in out:
|
||||||
|
print(out["completed"] / out["total"] * 100)
|
||||||
|
```
|
||||||
@@ -10,15 +10,11 @@ package format
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"crypto"
|
"crypto"
|
||||||
"crypto/ecdsa"
|
|
||||||
"crypto/ed25519"
|
"crypto/ed25519"
|
||||||
"crypto/elliptic"
|
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"crypto/rsa"
|
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"encoding/pem"
|
"encoding/pem"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math/big"
|
|
||||||
|
|
||||||
"golang.org/x/crypto/ssh"
|
"golang.org/x/crypto/ssh"
|
||||||
)
|
)
|
||||||
@@ -41,25 +37,6 @@ type openSSHPrivateKey struct {
|
|||||||
Rest []byte `ssh:"rest"`
|
Rest []byte `ssh:"rest"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type openSSHRSAPrivateKey struct {
|
|
||||||
N *big.Int
|
|
||||||
E *big.Int
|
|
||||||
D *big.Int
|
|
||||||
Iqmp *big.Int
|
|
||||||
P *big.Int
|
|
||||||
Q *big.Int
|
|
||||||
Comment string
|
|
||||||
Pad []byte `ssh:"rest"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type openSSHECDSAPrivateKey struct {
|
|
||||||
Curve string
|
|
||||||
Pub []byte
|
|
||||||
D *big.Int
|
|
||||||
Comment string
|
|
||||||
Pad []byte `ssh:"rest"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type openSSHEd25519PrivateKey struct {
|
type openSSHEd25519PrivateKey struct {
|
||||||
Pub []byte
|
Pub []byte
|
||||||
Priv []byte
|
Priv []byte
|
||||||
@@ -85,64 +62,6 @@ func OpenSSHPrivateKey(key crypto.PrivateKey, comment string) (*pem.Block, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
switch k := key.(type) {
|
switch k := key.(type) {
|
||||||
case *rsa.PrivateKey:
|
|
||||||
e := new(big.Int).SetInt64(int64(k.E))
|
|
||||||
|
|
||||||
key := openSSHRSAPrivateKey{
|
|
||||||
N: k.N,
|
|
||||||
E: e,
|
|
||||||
D: k.D,
|
|
||||||
Iqmp: k.Precomputed.Qinv,
|
|
||||||
P: k.Primes[0],
|
|
||||||
Q: k.Primes[1],
|
|
||||||
Comment: comment,
|
|
||||||
}
|
|
||||||
|
|
||||||
pk1.Keytype = ssh.KeyAlgoRSA
|
|
||||||
pk1.Rest = ssh.Marshal(key)
|
|
||||||
|
|
||||||
w.PubKey = ssh.Marshal(struct {
|
|
||||||
KeyType string
|
|
||||||
E *big.Int
|
|
||||||
N *big.Int
|
|
||||||
}{
|
|
||||||
ssh.KeyAlgoRSA, e, k.N,
|
|
||||||
})
|
|
||||||
case *ecdsa.PrivateKey:
|
|
||||||
var curve, keytype string
|
|
||||||
switch name := k.Curve.Params().Name; name {
|
|
||||||
case "P-256":
|
|
||||||
curve = "nistp256"
|
|
||||||
keytype = ssh.KeyAlgoECDSA256
|
|
||||||
case "P-384":
|
|
||||||
curve = "nistp384"
|
|
||||||
keytype = ssh.KeyAlgoECDSA384
|
|
||||||
case "P-521":
|
|
||||||
curve = "nistp521"
|
|
||||||
keytype = ssh.KeyAlgoECDSA521
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("ssh: unknown curve %q", name)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub := elliptic.Marshal(k.Curve, k.X, k.Y)
|
|
||||||
|
|
||||||
key := openSSHECDSAPrivateKey{
|
|
||||||
Curve: curve,
|
|
||||||
Pub: pub,
|
|
||||||
D: k.D,
|
|
||||||
Comment: comment,
|
|
||||||
}
|
|
||||||
|
|
||||||
pk1.Keytype = keytype
|
|
||||||
pk1.Rest = ssh.Marshal(key)
|
|
||||||
|
|
||||||
w.PubKey = ssh.Marshal(struct {
|
|
||||||
KeyType string
|
|
||||||
Curve string
|
|
||||||
Pub []byte
|
|
||||||
}{
|
|
||||||
keytype, curve, pub,
|
|
||||||
})
|
|
||||||
case ed25519.PrivateKey:
|
case ed25519.PrivateKey:
|
||||||
pub, priv := k[32:], k
|
pub, priv := k[32:], k
|
||||||
key := openSSHEd25519PrivateKey{
|
key := openSSHEd25519PrivateKey{
|
||||||
|
|||||||
1
go.mod
1
go.mod
@@ -39,6 +39,7 @@ require (
|
|||||||
github.com/ugorji/go/codec v1.2.11 // indirect
|
github.com/ugorji/go/codec v1.2.11 // indirect
|
||||||
golang.org/x/arch v0.3.0 // indirect
|
golang.org/x/arch v0.3.0 // indirect
|
||||||
golang.org/x/crypto v0.10.0
|
golang.org/x/crypto v0.10.0
|
||||||
|
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
|
||||||
golang.org/x/net v0.10.0 // indirect
|
golang.org/x/net v0.10.0 // indirect
|
||||||
golang.org/x/sys v0.11.0 // indirect
|
golang.org/x/sys v0.11.0 // indirect
|
||||||
golang.org/x/term v0.10.0
|
golang.org/x/term v0.10.0
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -121,6 +121,8 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y
|
|||||||
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
|
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
|
||||||
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
|
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
|
||||||
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
|
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
|
||||||
|
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
|
||||||
|
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
|
||||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
|
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
|
||||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||||
|
|||||||
22
llm/falcon.go
Normal file
22
llm/falcon.go
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
const ModelFamilyFalcon = "falcon"
|
||||||
|
|
||||||
|
const (
|
||||||
|
falconModelType7B = 32
|
||||||
|
falconModelType40B = 60
|
||||||
|
falconModelType180B = 80
|
||||||
|
)
|
||||||
|
|
||||||
|
func falconModelType(numLayer uint32) string {
|
||||||
|
switch numLayer {
|
||||||
|
case 32:
|
||||||
|
return "7B"
|
||||||
|
case 60:
|
||||||
|
return "40B"
|
||||||
|
case 80:
|
||||||
|
return "180B"
|
||||||
|
default:
|
||||||
|
return "Unknown"
|
||||||
|
}
|
||||||
|
}
|
||||||
182
llm/ggml.go
182
llm/ggml.go
@@ -3,72 +3,97 @@ package llm
|
|||||||
import (
|
import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
|
||||||
"io"
|
"io"
|
||||||
|
"path"
|
||||||
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ModelFamily string
|
|
||||||
|
|
||||||
type ModelType uint32
|
|
||||||
|
|
||||||
const (
|
|
||||||
ModelType3B ModelType = 26
|
|
||||||
ModelType7B ModelType = 32
|
|
||||||
ModelType13B ModelType = 40
|
|
||||||
ModelType34B ModelType = 48
|
|
||||||
ModelType30B ModelType = 60
|
|
||||||
ModelType65B ModelType = 80
|
|
||||||
)
|
|
||||||
|
|
||||||
func (mt ModelType) String() string {
|
|
||||||
switch mt {
|
|
||||||
case ModelType3B:
|
|
||||||
return "3B"
|
|
||||||
case ModelType7B:
|
|
||||||
return "7B"
|
|
||||||
case ModelType13B:
|
|
||||||
return "13B"
|
|
||||||
case ModelType34B:
|
|
||||||
return "34B"
|
|
||||||
case ModelType30B:
|
|
||||||
return "30B"
|
|
||||||
case ModelType65B:
|
|
||||||
return "65B"
|
|
||||||
default:
|
|
||||||
return "Unknown"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type FileType interface {
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
type GGML struct {
|
type GGML struct {
|
||||||
magic uint32
|
magic uint32
|
||||||
container
|
container
|
||||||
model
|
model
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
fileTypeF32 uint32 = iota
|
||||||
|
fileTypeF16
|
||||||
|
fileTypeQ4_0
|
||||||
|
fileTypeQ4_1
|
||||||
|
fileTypeQ4_1_F16
|
||||||
|
fileTypeQ8_0 uint32 = iota + 2
|
||||||
|
fileTypeQ5_0
|
||||||
|
fileTypeQ5_1
|
||||||
|
fileTypeQ2_K
|
||||||
|
fileTypeQ3_K_S
|
||||||
|
fileTypeQ3_K_M
|
||||||
|
fileTypeQ3_K_L
|
||||||
|
fileTypeQ4_K_S
|
||||||
|
fileTypeQ4_K_M
|
||||||
|
fileTypeQ5_K_S
|
||||||
|
fileTypeQ5_K_M
|
||||||
|
fileTypeQ6_K
|
||||||
|
)
|
||||||
|
|
||||||
|
func fileType(fileType uint32) string {
|
||||||
|
switch fileType {
|
||||||
|
case fileTypeF32:
|
||||||
|
return "F32"
|
||||||
|
case fileTypeF16:
|
||||||
|
return "F16"
|
||||||
|
case fileTypeQ4_0:
|
||||||
|
return "Q4_0"
|
||||||
|
case fileTypeQ4_1:
|
||||||
|
return "Q4_1"
|
||||||
|
case fileTypeQ4_1_F16:
|
||||||
|
return "Q4_1_F16"
|
||||||
|
case fileTypeQ8_0:
|
||||||
|
return "Q8_0"
|
||||||
|
case fileTypeQ5_0:
|
||||||
|
return "Q5_0"
|
||||||
|
case fileTypeQ5_1:
|
||||||
|
return "Q5_1"
|
||||||
|
case fileTypeQ2_K:
|
||||||
|
return "Q2_K"
|
||||||
|
case fileTypeQ3_K_S:
|
||||||
|
return "Q3_K_S"
|
||||||
|
case fileTypeQ3_K_M:
|
||||||
|
return "Q3_K_M"
|
||||||
|
case fileTypeQ3_K_L:
|
||||||
|
return "Q3_K_L"
|
||||||
|
case fileTypeQ4_K_S:
|
||||||
|
return "Q4_K_S"
|
||||||
|
case fileTypeQ4_K_M:
|
||||||
|
return "Q4_K_M"
|
||||||
|
case fileTypeQ5_K_S:
|
||||||
|
return "Q5_K_S"
|
||||||
|
case fileTypeQ5_K_M:
|
||||||
|
return "Q5_K_M"
|
||||||
|
case fileTypeQ6_K:
|
||||||
|
return "Q6_K"
|
||||||
|
default:
|
||||||
|
return "Unknown"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type model interface {
|
type model interface {
|
||||||
ModelFamily() ModelFamily
|
ModelFamily() string
|
||||||
ModelType() ModelType
|
ModelType() string
|
||||||
FileType() FileType
|
FileType() string
|
||||||
}
|
}
|
||||||
|
|
||||||
type container interface {
|
type container interface {
|
||||||
Name() string
|
Name() string
|
||||||
Decode(io.Reader) error
|
Decode(io.Reader) (model, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type containerGGML struct {
|
type containerGGML struct{}
|
||||||
}
|
|
||||||
|
|
||||||
func (c *containerGGML) Name() string {
|
func (c *containerGGML) Name() string {
|
||||||
return "ggml"
|
return "ggml"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *containerGGML) Decode(r io.Reader) error {
|
func (c *containerGGML) Decode(r io.Reader) (model, error) {
|
||||||
return nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type containerGGMF struct {
|
type containerGGMF struct {
|
||||||
@@ -79,18 +104,18 @@ func (c *containerGGMF) Name() string {
|
|||||||
return "ggmf"
|
return "ggmf"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *containerGGMF) Decode(r io.Reader) error {
|
func (c *containerGGMF) Decode(r io.Reader) (model, error) {
|
||||||
var version uint32
|
var version uint32
|
||||||
binary.Read(r, binary.LittleEndian, &version)
|
binary.Read(r, binary.LittleEndian, &version)
|
||||||
|
|
||||||
switch version {
|
switch version {
|
||||||
case 1:
|
case 1:
|
||||||
default:
|
default:
|
||||||
return errors.New("invalid version")
|
return nil, errors.New("invalid version")
|
||||||
}
|
}
|
||||||
|
|
||||||
c.version = version
|
c.version = version
|
||||||
return nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type containerGGJT struct {
|
type containerGGJT struct {
|
||||||
@@ -101,18 +126,22 @@ func (c *containerGGJT) Name() string {
|
|||||||
return "ggjt"
|
return "ggjt"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *containerGGJT) Decode(r io.Reader) error {
|
func (c *containerGGJT) Decode(r io.Reader) (model, error) {
|
||||||
var version uint32
|
var version uint32
|
||||||
binary.Read(r, binary.LittleEndian, &version)
|
binary.Read(r, binary.LittleEndian, &version)
|
||||||
|
|
||||||
switch version {
|
switch version {
|
||||||
case 1, 2, 3:
|
case 1, 2, 3:
|
||||||
default:
|
default:
|
||||||
return errors.New("invalid version")
|
return nil, errors.New("invalid version")
|
||||||
}
|
}
|
||||||
|
|
||||||
c.version = version
|
c.version = version
|
||||||
return nil
|
|
||||||
|
// different model types may have different layouts for hyperparameters
|
||||||
|
var llama llamaModel
|
||||||
|
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
|
||||||
|
return &llama, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type containerLORA struct {
|
type containerLORA struct {
|
||||||
@@ -123,32 +152,51 @@ func (c *containerLORA) Name() string {
|
|||||||
return "ggla"
|
return "ggla"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *containerLORA) Decode(r io.Reader) error {
|
func (c *containerLORA) Decode(r io.Reader) (model, error) {
|
||||||
var version uint32
|
var version uint32
|
||||||
binary.Read(r, binary.LittleEndian, &version)
|
binary.Read(r, binary.LittleEndian, &version)
|
||||||
|
|
||||||
switch version {
|
switch version {
|
||||||
case 1:
|
case 1:
|
||||||
default:
|
default:
|
||||||
return errors.New("invalid version")
|
return nil, errors.New("invalid version")
|
||||||
}
|
}
|
||||||
|
|
||||||
c.version = version
|
c.version = version
|
||||||
return nil
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
|
||||||
|
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
ggmlInit sync.Once
|
||||||
|
ggmlRunnerPath string
|
||||||
|
)
|
||||||
|
|
||||||
|
func ggmlRunner() ModelRunner {
|
||||||
|
ggmlInit.Do(func() {
|
||||||
|
ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
|
||||||
|
})
|
||||||
|
return ModelRunner{Path: ggmlRunnerPath}
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// / Magic constant for `ggml` files (unversioned).
|
// Magic constant for `ggml` files (unversioned).
|
||||||
FILE_MAGIC_GGML = 0x67676d6c
|
FILE_MAGIC_GGML = 0x67676d6c
|
||||||
// / Magic constant for `ggml` files (versioned, ggmf).
|
// Magic constant for `ggml` files (versioned, ggmf).
|
||||||
FILE_MAGIC_GGMF = 0x67676d66
|
FILE_MAGIC_GGMF = 0x67676d66
|
||||||
// / Magic constant for `ggml` files (versioned, ggjt).
|
// Magic constant for `ggml` files (versioned, ggjt).
|
||||||
FILE_MAGIC_GGJT = 0x67676a74
|
FILE_MAGIC_GGJT = 0x67676a74
|
||||||
// / Magic constant for `ggla` files (LoRA adapter).
|
// Magic constant for `ggla` files (LoRA adapter).
|
||||||
FILE_MAGIC_GGLA = 0x67676C61
|
FILE_MAGIC_GGLA = 0x67676C61
|
||||||
|
// Magic constant for `gguf` files (versioned, gguf)
|
||||||
|
FILE_MAGIC_GGUF = 0x46554747
|
||||||
)
|
)
|
||||||
|
|
||||||
func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
|
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
||||||
var ggml GGML
|
var ggml GGML
|
||||||
binary.Read(r, binary.LittleEndian, &ggml.magic)
|
binary.Read(r, binary.LittleEndian, &ggml.magic)
|
||||||
|
|
||||||
@@ -161,24 +209,18 @@ func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
|
|||||||
ggml.container = &containerGGJT{}
|
ggml.container = &containerGGJT{}
|
||||||
case FILE_MAGIC_GGLA:
|
case FILE_MAGIC_GGLA:
|
||||||
ggml.container = &containerLORA{}
|
ggml.container = &containerLORA{}
|
||||||
|
case FILE_MAGIC_GGUF:
|
||||||
|
ggml.container = &containerGGUF{}
|
||||||
default:
|
default:
|
||||||
return nil, errors.New("invalid file magic")
|
return nil, errors.New("invalid file magic")
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := ggml.Decode(r); err != nil {
|
model, err := ggml.Decode(r)
|
||||||
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// different model types may have different layouts for hyperparameters
|
ggml.model = model
|
||||||
switch hint {
|
|
||||||
case ModelFamilyLlama:
|
|
||||||
var llama llamaModel
|
|
||||||
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
|
|
||||||
ggml.model = &llama
|
|
||||||
// TODO: sanity check hyperparameters
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("unsupported model type: %s", hint)
|
|
||||||
}
|
|
||||||
|
|
||||||
// final model type
|
// final model type
|
||||||
return &ggml, nil
|
return &ggml, nil
|
||||||
|
|||||||
389
llm/gguf.go
Normal file
389
llm/gguf.go
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"path"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// containerGGUF holds the fixed-size gguf header fields that follow the file
// magic. Decode fills Version first and then exactly one of V1 or V2,
// depending on that version.
type containerGGUF struct {
	// Version is the gguf format version read from the header.
	Version uint32

	// V1 is the header layout used when Version is 1: 32-bit counts.
	V1 struct {
		NumTensor uint32
		NumKV     uint32
	}

	// V2 is the header layout used when Version is 2: 64-bit counts.
	V2 struct {
		NumTensor uint64
		NumKV     uint64
	}
}
|
||||||
|
|
||||||
|
func (c *containerGGUF) Name() string {
|
||||||
|
return "gguf"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
|
||||||
|
binary.Read(r, binary.LittleEndian, &c.Version)
|
||||||
|
|
||||||
|
switch c.Version {
|
||||||
|
case 1:
|
||||||
|
binary.Read(r, binary.LittleEndian, &c.V1)
|
||||||
|
case 2:
|
||||||
|
binary.Read(r, binary.LittleEndian, &c.V2)
|
||||||
|
default:
|
||||||
|
return nil, errors.New("invalid version")
|
||||||
|
}
|
||||||
|
|
||||||
|
model := newGGUFModel(c)
|
||||||
|
if err := model.Decode(r); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return model, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Value type tags that precede every gguf metadata value (and every array's
// element list). The numeric values are fixed by their position in this list.
const (
	ggufTypeUint8   uint32 = iota // 0
	ggufTypeInt8                  // 1
	ggufTypeUint16                // 2
	ggufTypeInt16                 // 3
	ggufTypeUint32                // 4
	ggufTypeInt32                 // 5
	ggufTypeFloat32               // 6
	ggufTypeBool                  // 7
	ggufTypeString                // 8
	ggufTypeArray                 // 9
	ggufTypeUint64                // 10
	ggufTypeInt64                 // 11
	ggufTypeFloat64               // 12
)
|
||||||
|
|
||||||
|
type kv map[string]any
|
||||||
|
|
||||||
|
type ggufModel struct {
|
||||||
|
*containerGGUF
|
||||||
|
kv
|
||||||
|
}
|
||||||
|
|
||||||
|
func newGGUFModel(container *containerGGUF) *ggufModel {
|
||||||
|
return &ggufModel{
|
||||||
|
containerGGUF: container,
|
||||||
|
kv: make(kv),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) NumKV() uint64 {
|
||||||
|
if llm.Version == 1 {
|
||||||
|
return uint64(llm.V1.NumKV)
|
||||||
|
}
|
||||||
|
|
||||||
|
return llm.V2.NumKV
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) ModelFamily() string {
|
||||||
|
t, ok := llm.kv["general.architecture"].(string)
|
||||||
|
if ok {
|
||||||
|
return t
|
||||||
|
}
|
||||||
|
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) ModelType() string {
|
||||||
|
switch llm.ModelFamily() {
|
||||||
|
case "llama":
|
||||||
|
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
|
||||||
|
heads, headsOK := llm.kv["llama.head_count"].(uint32)
|
||||||
|
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
|
||||||
|
if headsOK && headsKVsOK && heads/headKVs == 8 {
|
||||||
|
return "70B"
|
||||||
|
}
|
||||||
|
|
||||||
|
return llamaModelType(blocks)
|
||||||
|
}
|
||||||
|
case "falcon":
|
||||||
|
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
|
||||||
|
return falconModelType(blocks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return "Unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) FileType() string {
|
||||||
|
t, ok := llm.kv["general.file_type"].(uint32)
|
||||||
|
if ok {
|
||||||
|
return fileType(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
return "Unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) Decode(r io.Reader) error {
|
||||||
|
read := llm.readString
|
||||||
|
if llm.Version == 1 {
|
||||||
|
read = llm.readStringV1
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; uint64(i) < llm.NumKV(); i++ {
|
||||||
|
k, err := read(r)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
vtype := llm.readU32(r)
|
||||||
|
|
||||||
|
var v any
|
||||||
|
switch vtype {
|
||||||
|
case ggufTypeUint8:
|
||||||
|
v = llm.readU8(r)
|
||||||
|
case ggufTypeInt8:
|
||||||
|
v = llm.readI8(r)
|
||||||
|
case ggufTypeUint16:
|
||||||
|
v = llm.readU16(r)
|
||||||
|
case ggufTypeInt16:
|
||||||
|
v = llm.readI16(r)
|
||||||
|
case ggufTypeUint32:
|
||||||
|
v = llm.readU32(r)
|
||||||
|
case ggufTypeInt32:
|
||||||
|
v = llm.readI32(r)
|
||||||
|
case ggufTypeUint64:
|
||||||
|
v = llm.readU64(r)
|
||||||
|
case ggufTypeInt64:
|
||||||
|
v = llm.readI64(r)
|
||||||
|
case ggufTypeFloat32:
|
||||||
|
v = llm.readF32(r)
|
||||||
|
case ggufTypeFloat64:
|
||||||
|
v = llm.readF64(r)
|
||||||
|
case ggufTypeBool:
|
||||||
|
v = llm.readBool(r)
|
||||||
|
case ggufTypeString:
|
||||||
|
fn := llm.readString
|
||||||
|
if llm.Version == 1 {
|
||||||
|
fn = llm.readStringV1
|
||||||
|
}
|
||||||
|
|
||||||
|
s, err := fn(r)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
v = s
|
||||||
|
case ggufTypeArray:
|
||||||
|
fn := llm.readArray
|
||||||
|
if llm.Version == 1 {
|
||||||
|
fn = llm.readArrayV1
|
||||||
|
}
|
||||||
|
|
||||||
|
a, err := fn(r)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
v = a
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("invalid type: %d", vtype)
|
||||||
|
}
|
||||||
|
|
||||||
|
llm.kv[k] = v
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readU8(r io.Reader) uint8 {
|
||||||
|
var u8 uint8
|
||||||
|
binary.Read(r, binary.LittleEndian, &u8)
|
||||||
|
return u8
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readI8(r io.Reader) int8 {
|
||||||
|
var i8 int8
|
||||||
|
binary.Read(r, binary.LittleEndian, &i8)
|
||||||
|
return i8
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readU16(r io.Reader) uint16 {
|
||||||
|
var u16 uint16
|
||||||
|
binary.Read(r, binary.LittleEndian, &u16)
|
||||||
|
return u16
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readI16(r io.Reader) int16 {
|
||||||
|
var i16 int16
|
||||||
|
binary.Read(r, binary.LittleEndian, &i16)
|
||||||
|
return i16
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readU32(r io.Reader) uint32 {
|
||||||
|
var u32 uint32
|
||||||
|
binary.Read(r, binary.LittleEndian, &u32)
|
||||||
|
return u32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readI32(r io.Reader) int32 {
|
||||||
|
var i32 int32
|
||||||
|
binary.Read(r, binary.LittleEndian, &i32)
|
||||||
|
return i32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readU64(r io.Reader) uint64 {
|
||||||
|
var u64 uint64
|
||||||
|
binary.Read(r, binary.LittleEndian, &u64)
|
||||||
|
return u64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readI64(r io.Reader) int64 {
|
||||||
|
var i64 int64
|
||||||
|
binary.Read(r, binary.LittleEndian, &i64)
|
||||||
|
return i64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readF32(r io.Reader) float32 {
|
||||||
|
var f32 float32
|
||||||
|
binary.Read(r, binary.LittleEndian, &f32)
|
||||||
|
return f32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readF64(r io.Reader) float64 {
|
||||||
|
var f64 float64
|
||||||
|
binary.Read(r, binary.LittleEndian, &f64)
|
||||||
|
return f64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readBool(r io.Reader) bool {
|
||||||
|
var b bool
|
||||||
|
binary.Read(r, binary.LittleEndian, &b)
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ggufModel) readStringV1(r io.Reader) (string, error) {
|
||||||
|
var nameLength uint32
|
||||||
|
binary.Read(r, binary.LittleEndian, &nameLength)
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
// gguf v1 strings are null-terminated
|
||||||
|
b.Truncate(b.Len() - 1)
|
||||||
|
|
||||||
|
return b.String(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm ggufModel) readString(r io.Reader) (string, error) {
|
||||||
|
var nameLength uint64
|
||||||
|
binary.Read(r, binary.LittleEndian, &nameLength)
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.String(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
|
||||||
|
atype := llm.readU32(r)
|
||||||
|
n := llm.readU32(r)
|
||||||
|
|
||||||
|
for i := 0; uint32(i) < n; i++ {
|
||||||
|
switch atype {
|
||||||
|
case ggufTypeUint8:
|
||||||
|
arr = append(arr, llm.readU8(r))
|
||||||
|
case ggufTypeInt8:
|
||||||
|
arr = append(arr, llm.readU8(r))
|
||||||
|
case ggufTypeUint16:
|
||||||
|
arr = append(arr, llm.readU16(r))
|
||||||
|
case ggufTypeInt16:
|
||||||
|
arr = append(arr, llm.readI16(r))
|
||||||
|
case ggufTypeUint32:
|
||||||
|
arr = append(arr, llm.readU32(r))
|
||||||
|
case ggufTypeInt32:
|
||||||
|
arr = append(arr, llm.readI32(r))
|
||||||
|
case ggufTypeFloat32:
|
||||||
|
arr = append(arr, llm.readF32(r))
|
||||||
|
case ggufTypeBool:
|
||||||
|
arr = append(arr, llm.readBool(r))
|
||||||
|
case ggufTypeString:
|
||||||
|
s, err := llm.readStringV1(r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
arr = append(arr, s)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("invalid array type: %d", atype)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
|
||||||
|
atype := llm.readU32(r)
|
||||||
|
n := llm.readU64(r)
|
||||||
|
|
||||||
|
for i := 0; uint64(i) < n; i++ {
|
||||||
|
switch atype {
|
||||||
|
case ggufTypeUint8:
|
||||||
|
arr = append(arr, llm.readU8(r))
|
||||||
|
case ggufTypeInt8:
|
||||||
|
arr = append(arr, llm.readU8(r))
|
||||||
|
case ggufTypeUint16:
|
||||||
|
arr = append(arr, llm.readU16(r))
|
||||||
|
case ggufTypeInt16:
|
||||||
|
arr = append(arr, llm.readI16(r))
|
||||||
|
case ggufTypeUint32:
|
||||||
|
arr = append(arr, llm.readU32(r))
|
||||||
|
case ggufTypeInt32:
|
||||||
|
arr = append(arr, llm.readI32(r))
|
||||||
|
case ggufTypeUint64:
|
||||||
|
arr = append(arr, llm.readU64(r))
|
||||||
|
case ggufTypeInt64:
|
||||||
|
arr = append(arr, llm.readI64(r))
|
||||||
|
case ggufTypeFloat32:
|
||||||
|
arr = append(arr, llm.readF32(r))
|
||||||
|
case ggufTypeFloat64:
|
||||||
|
arr = append(arr, llm.readF64(r))
|
||||||
|
case ggufTypeBool:
|
||||||
|
arr = append(arr, llm.readBool(r))
|
||||||
|
case ggufTypeString:
|
||||||
|
s, err := llm.readString(r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
arr = append(arr, s)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("invalid array type: %d", atype)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Slash-separated locations of the gguf runner build outputs.
var (
	// ggufGPU is the directory holding the GPU build of the gguf runner.
	ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
	// ggufCPU is the directory holding the CPU build of the gguf runner.
	ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
)

// Lazily initialised state for the gguf runner.
var (
	// ggufInit guards the one-time selection of the gguf runner.
	ggufInit sync.Once
	// ggufRunnerPath is the selected runner path; empty until ggufInit has run.
	ggufRunnerPath string
)
|
||||||
|
|
||||||
|
func ggufRunner() ModelRunner {
|
||||||
|
ggufInit.Do(func() {
|
||||||
|
ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
|
||||||
|
})
|
||||||
|
|
||||||
|
return ModelRunner{Path: ggufRunnerPath}
|
||||||
|
}
|
||||||
@@ -1,8 +1,17 @@
|
|||||||
|
//go:build !darwin
|
||||||
|
// +build !darwin
|
||||||
|
|
||||||
package llm
|
package llm
|
||||||
|
|
||||||
//go:generate git submodule init
|
//go:generate git submodule init
|
||||||
|
|
||||||
//go:generate git submodule update --force ggml
|
//go:generate git submodule update --force ggml
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
//go:generate -command git-apply git -C ggml apply
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||||
|
|
||||||
|
//go:generate git submodule update --force gguf
|
||||||
|
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
|
||||||
|
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||||
|
|||||||
@@ -1,11 +0,0 @@
|
|||||||
//go:build darwin
|
|
||||||
// +build darwin
|
|
||||||
|
|
||||||
package llm
|
|
||||||
|
|
||||||
//go:generate git submodule init
|
|
||||||
//go:generate git submodule update --force ggml
|
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
|
||||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
|
||||||
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
|
||||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
|
||||||
16
llm/llama.cpp/generate_darwin_amd64.go
Normal file
16
llm/llama.cpp/generate_darwin_amd64.go
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
//go:generate git submodule init
|
||||||
|
|
||||||
|
//go:generate git submodule update --force ggml
|
||||||
|
//go:generate -command git-apply git -C ggml apply
|
||||||
|
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||||
|
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
|
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||||
|
|
||||||
|
//go:generate git submodule update --force gguf
|
||||||
|
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
|
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||||
16
llm/llama.cpp/generate_darwin_arm64.go
Normal file
16
llm/llama.cpp/generate_darwin_arm64.go
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
//go:generate git submodule init
|
||||||
|
|
||||||
|
//go:generate git submodule update --force ggml
|
||||||
|
//go:generate -command git-apply git -C ggml apply
|
||||||
|
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||||
|
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
|
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||||
|
|
||||||
|
//go:generate git submodule update --force gguf
|
||||||
|
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
|
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
||||||
15
llm/llama.cpp/generate_linux.go
Normal file
15
llm/llama.cpp/generate_linux.go
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
//go:generate git submodule init
|
||||||
|
|
||||||
|
//go:generate git submodule update --force ggml
|
||||||
|
//go:generate -command git-apply git -C ggml apply
|
||||||
|
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||||
|
//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
||||||
|
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||||
|
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||||
|
|
||||||
|
//go:generate git submodule update --force gguf
|
||||||
|
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||||
|
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
|
||||||
|
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||||
|
Date: Tue, 5 Sep 2023 16:05:08 -0400
|
||||||
|
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
|
||||||
|
|
||||||
|
---
|
||||||
|
ggml-metal.metal | 2 ++
|
||||||
|
1 file changed, 2 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||||
|
index 3f31252..ce3541f 100644
|
||||||
|
--- a/ggml-metal.metal
|
||||||
|
+++ b/ggml-metal.metal
|
||||||
|
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||||
|
//load data and store to threadgroup memory
|
||||||
|
half4x4 temp_a;
|
||||||
|
dequantize_func(x, il, temp_a);
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
#pragma unroll(16)
|
||||||
|
for (int i = 0; i < 16; i++) {
|
||||||
|
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||||
|
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||||
|
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
--
|
||||||
|
2.39.2 (Apple Git-143)
|
||||||
|
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
|
||||||
|
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Date: Mon, 21 Aug 2023 06:59:29 -0400
|
||||||
|
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
|
||||||
|
kernel (#2686)
|
||||||
|
|
||||||
|
---
|
||||||
|
ggml-metal.metal | 3 ++-
|
||||||
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||||
|
index 3f31252..88d48f6 100644
|
||||||
|
--- a/ggml-metal.metal
|
||||||
|
+++ b/ggml-metal.metal
|
||||||
|
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||||
|
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||||
|
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_device);
|
||||||
|
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||||
|
}
|
||||||
|
|
||||||
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_device);
|
||||||
|
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||||
|
if (sgitg==0) {
|
||||||
|
for (int i = 0; i < n_rows; i++) {
|
||||||
|
--
|
||||||
|
2.41.0
|
||||||
|
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Date: Tue, 22 Aug 2023 02:18:40 -0400
|
||||||
|
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
|
||||||
|
|
||||||
|
---
|
||||||
|
ggml-metal.metal | 5 +++--
|
||||||
|
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||||
|
index 88d48f6..ce3541f 100644
|
||||||
|
--- a/ggml-metal.metal
|
||||||
|
+++ b/ggml-metal.metal
|
||||||
|
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||||
|
//load data and store to threadgroup memory
|
||||||
|
half4x4 temp_a;
|
||||||
|
dequantize_func(x, il, temp_a);
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
#pragma unroll(16)
|
||||||
|
for (int i = 0; i < 16; i++) {
|
||||||
|
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||||
|
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||||
|
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
- threadgroup_barrier(mem_flags::mem_device);
|
||||||
|
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||||
|
}
|
||||||
|
|
||||||
|
- threadgroup_barrier(mem_flags::mem_device);
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||||
|
if (sgitg==0) {
|
||||||
|
for (int i = 0; i < n_rows; i++) {
|
||||||
|
--
|
||||||
|
2.41.0
|
||||||
|
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||||
|
Date: Tue, 22 Aug 2023 15:14:23 +0800
|
||||||
|
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
|
||||||
|
|
||||||
|
* ggml: support CUDA's half type for aarch64(#1455)
|
||||||
|
support CUDA's half type for aarch64 in ggml_fp16_t definition
|
||||||
|
|
||||||
|
* ggml: use __CUDACC__ to recognise nvcc compiler
|
||||||
|
---
|
||||||
|
ggml.h | 5 +++--
|
||||||
|
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ggml.h b/ggml.h
|
||||||
|
index 544ad2d..0ec7ec5 100644
|
||||||
|
--- a/ggml.h
|
||||||
|
+++ b/ggml.h
|
||||||
|
@@ -259,8 +259,9 @@
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-#ifdef __ARM_NEON
|
||||||
|
- // we use the built-in 16-bit float type
|
||||||
|
+#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||||
|
+ typedef half ggml_fp16_t;
|
||||||
|
+#elif defined(__ARM_NEON)
|
||||||
|
typedef __fp16 ggml_fp16_t;
|
||||||
|
#else
|
||||||
|
typedef uint16_t ggml_fp16_t;
|
||||||
|
--
|
||||||
|
2.39.2 (Apple Git-143)
|
||||||
|
|
||||||
1
llm/llama.cpp/gguf
Submodule
1
llm/llama.cpp/gguf
Submodule
Submodule llm/llama.cpp/gguf added at 53885d7256
@@ -20,124 +20,114 @@ import (
|
|||||||
"runtime"
|
"runtime"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/jmorganca/ollama/api"
|
"github.com/jmorganca/ollama/api"
|
||||||
)
|
)
|
||||||
|
|
||||||
const ModelFamilyLlama ModelFamily = "llama"
|
//go:embed llama.cpp/*/build/*/bin/*
|
||||||
|
|
||||||
//go:embed llama.cpp/ggml/build/*/bin/*
|
|
||||||
var llamaCppEmbed embed.FS
|
var llamaCppEmbed embed.FS
|
||||||
|
|
||||||
var (
|
|
||||||
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
|
|
||||||
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
ggmlInit sync.Once
|
|
||||||
ggmlRunnerPath string
|
|
||||||
)
|
|
||||||
|
|
||||||
func osPath(llamaPath string) string {
|
func osPath(llamaPath string) string {
|
||||||
if runtime.GOOS == "windows" {
|
if runtime.GOOS == "windows" {
|
||||||
return path.Join(llamaPath, "Release")
|
return path.Join(llamaPath, "Release")
|
||||||
}
|
}
|
||||||
|
|
||||||
return llamaPath
|
return llamaPath
|
||||||
}
|
}
|
||||||
|
|
||||||
func initGGML() {
|
func chooseRunner(gpuPath, cpuPath string) string {
|
||||||
ggmlInit.Do(func() {
|
tmpDir, err := os.MkdirTemp("", "llama-*")
|
||||||
tmpDir, err := os.MkdirTemp("", "llama-*")
|
if err != nil {
|
||||||
if err != nil {
|
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
|
||||||
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
|
}
|
||||||
}
|
|
||||||
|
|
||||||
llamaPath := osPath(ggmlGPU)
|
llamaPath := osPath(gpuPath)
|
||||||
|
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||||
|
llamaPath = osPath(cpuPath)
|
||||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||||
llamaPath = osPath(ggmlCPU)
|
log.Fatalf("llama.cpp executable not found")
|
||||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
|
||||||
log.Fatalf("llama.cpp executable not found")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
files := []string{"server"}
|
files := []string{"server"}
|
||||||
switch runtime.GOOS {
|
switch runtime.GOOS {
|
||||||
case "windows":
|
case "windows":
|
||||||
files = []string{"server.exe"}
|
files = []string{"server.exe"}
|
||||||
case "darwin":
|
case "darwin":
|
||||||
|
if llamaPath == osPath(gpuPath) {
|
||||||
files = append(files, "ggml-metal.metal")
|
files = append(files, "ggml-metal.metal")
|
||||||
}
|
}
|
||||||
|
case "linux":
|
||||||
for _, f := range files {
|
// check if there is a GPU available
|
||||||
srcPath := path.Join(llamaPath, f)
|
if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
|
||||||
destPath := filepath.Join(tmpDir, f)
|
// this error was logged on start-up, so we don't need to log it again
|
||||||
|
llamaPath = osPath(cpuPath)
|
||||||
srcFile, err := llamaCppEmbed.Open(srcPath)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("read llama.cpp %s: %v", f, err)
|
|
||||||
}
|
|
||||||
defer srcFile.Close()
|
|
||||||
|
|
||||||
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("write llama.cpp %s: %v", f, err)
|
|
||||||
}
|
|
||||||
defer destFile.Close()
|
|
||||||
|
|
||||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
|
||||||
log.Fatalf("copy llama.cpp %s: %v", f, err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ggmlRunnerPath = filepath.Join(tmpDir, "server")
|
for _, f := range files {
|
||||||
if runtime.GOOS == "windows" {
|
srcPath := path.Join(llamaPath, f)
|
||||||
ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
|
destPath := filepath.Join(tmpDir, f)
|
||||||
|
|
||||||
|
srcFile, err := llamaCppEmbed.Open(srcPath)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("read llama.cpp %s: %v", f, err)
|
||||||
}
|
}
|
||||||
})
|
defer srcFile.Close()
|
||||||
}
|
|
||||||
|
|
||||||
type ModelRunner struct {
|
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||||
Path string // path to the model runner executable
|
if err != nil {
|
||||||
}
|
log.Fatalf("write llama.cpp %s: %v", f, err)
|
||||||
|
}
|
||||||
|
defer destFile.Close()
|
||||||
|
|
||||||
func ggmlRunner() ModelRunner {
|
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||||
initGGML()
|
log.Fatalf("copy llama.cpp %s: %v", f, err)
|
||||||
return ModelRunner{Path: ggmlRunnerPath}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
runPath := filepath.Join(tmpDir, "server")
|
||||||
|
if runtime.GOOS == "windows" {
|
||||||
|
runPath = filepath.Join(tmpDir, "server.exe")
|
||||||
|
}
|
||||||
|
|
||||||
|
return runPath
|
||||||
}
|
}
|
||||||
|
|
||||||
type llamaModel struct {
|
type llamaModel struct {
|
||||||
hyperparameters llamaHyperparameters
|
hyperparameters llamaHyperparameters
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llamaModel) ModelFamily() ModelFamily {
|
func (llm *llamaModel) ModelFamily() string {
|
||||||
return ModelFamilyLlama
|
return "llama"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llamaModel) ModelType() ModelType {
|
func llamaModelType(numLayer uint32) string {
|
||||||
switch llm.hyperparameters.NumLayer {
|
switch numLayer {
|
||||||
case 26:
|
case 26:
|
||||||
return ModelType3B
|
return "3B"
|
||||||
case 32:
|
case 32:
|
||||||
return ModelType7B
|
return "7B"
|
||||||
case 40:
|
case 40:
|
||||||
return ModelType13B
|
return "13B"
|
||||||
case 48:
|
case 48:
|
||||||
return ModelType34B
|
return "34B"
|
||||||
case 60:
|
case 60:
|
||||||
return ModelType30B
|
return "30B"
|
||||||
case 80:
|
case 80:
|
||||||
return ModelType65B
|
return "65B"
|
||||||
|
default:
|
||||||
|
return "Unknown"
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: find a better default
|
|
||||||
return ModelType7B
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llamaModel) FileType() FileType {
|
func (llm *llamaModel) ModelType() string {
|
||||||
return llm.hyperparameters.FileType
|
return llamaModelType(llm.hyperparameters.NumLayer)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (llm *llamaModel) FileType() string {
|
||||||
|
return fileType(llm.hyperparameters.FileType)
|
||||||
}
|
}
|
||||||
|
|
||||||
type llamaHyperparameters struct {
|
type llamaHyperparameters struct {
|
||||||
@@ -154,70 +144,7 @@ type llamaHyperparameters struct {
|
|||||||
NumRot uint32
|
NumRot uint32
|
||||||
|
|
||||||
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
|
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
|
||||||
FileType llamaFileType
|
FileType uint32
|
||||||
}
|
|
||||||
|
|
||||||
type llamaFileType uint32
|
|
||||||
|
|
||||||
const (
|
|
||||||
llamaFileTypeF32 llamaFileType = iota
|
|
||||||
llamaFileTypeF16
|
|
||||||
llamaFileTypeQ4_0
|
|
||||||
llamaFileTypeQ4_1
|
|
||||||
llamaFileTypeQ4_1_F16
|
|
||||||
llamaFileTypeQ8_0 llamaFileType = iota + 2
|
|
||||||
llamaFileTypeQ5_0
|
|
||||||
llamaFileTypeQ5_1
|
|
||||||
llamaFileTypeQ2_K
|
|
||||||
llamaFileTypeQ3_K_S
|
|
||||||
llamaFileTypeQ3_K_M
|
|
||||||
llamaFileTypeQ3_K_L
|
|
||||||
llamaFileTypeQ4_K_S
|
|
||||||
llamaFileTypeQ4_K_M
|
|
||||||
llamaFileTypeQ5_K_S
|
|
||||||
llamaFileTypeQ5_K_M
|
|
||||||
llamaFileTypeQ6_K
|
|
||||||
)
|
|
||||||
|
|
||||||
func (ft llamaFileType) String() string {
|
|
||||||
switch ft {
|
|
||||||
case llamaFileTypeF32:
|
|
||||||
return "F32"
|
|
||||||
case llamaFileTypeF16:
|
|
||||||
return "F16"
|
|
||||||
case llamaFileTypeQ4_0:
|
|
||||||
return "Q4_0"
|
|
||||||
case llamaFileTypeQ4_1:
|
|
||||||
return "Q4_1"
|
|
||||||
case llamaFileTypeQ4_1_F16:
|
|
||||||
return "Q4_1_F16"
|
|
||||||
case llamaFileTypeQ8_0:
|
|
||||||
return "Q8_0"
|
|
||||||
case llamaFileTypeQ5_0:
|
|
||||||
return "Q5_0"
|
|
||||||
case llamaFileTypeQ5_1:
|
|
||||||
return "Q5_1"
|
|
||||||
case llamaFileTypeQ2_K:
|
|
||||||
return "Q2_K"
|
|
||||||
case llamaFileTypeQ3_K_S:
|
|
||||||
return "Q3_K_S"
|
|
||||||
case llamaFileTypeQ3_K_M:
|
|
||||||
return "Q3_K_M"
|
|
||||||
case llamaFileTypeQ3_K_L:
|
|
||||||
return "Q3_K_L"
|
|
||||||
case llamaFileTypeQ4_K_S:
|
|
||||||
return "Q4_K_S"
|
|
||||||
case llamaFileTypeQ4_K_M:
|
|
||||||
return "Q4_K_M"
|
|
||||||
case llamaFileTypeQ5_K_S:
|
|
||||||
return "Q5_K_S"
|
|
||||||
case llamaFileTypeQ5_K_M:
|
|
||||||
return "Q5_K_M"
|
|
||||||
case llamaFileTypeQ6_K:
|
|
||||||
return "Q6_K"
|
|
||||||
default:
|
|
||||||
return "Unknown"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type Running struct {
|
type Running struct {
|
||||||
@@ -226,11 +153,81 @@ type Running struct {
|
|||||||
Cancel context.CancelFunc
|
Cancel context.CancelFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ModelRunner struct {
|
||||||
|
Path string // path to the model runner executable
|
||||||
|
}
|
||||||
|
|
||||||
type llama struct {
|
type llama struct {
|
||||||
api.Options
|
api.Options
|
||||||
Running
|
Running
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// errNoGPU is returned by CheckVRAM when the nvidia-smi command cannot be
// run or exits with an error.
var errNoGPU = errors.New("nvidia-smi command failed")

// CheckVRAM returns the total VRAM in MiB on Linux machines with NVIDIA GPUs,
// summed over every GPU that nvidia-smi lists (it queries memory.total, one
// value per output line).
//
// It returns errNoGPU when nvidia-smi fails to run, and a wrapped parse or
// read error when its output is not a list of integers.
func CheckVRAM() (int, error) {
	out, err := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits").Output()
	if err != nil {
		return 0, errNoGPU
	}

	return sumVRAM(string(out))
}

// sumVRAM adds up the integer found on each non-blank line of output.
func sumVRAM(output string) (int, error) {
	var total int
	scanner := bufio.NewScanner(strings.NewReader(output))
	for scanner.Scan() {
		// tolerate padding and blank lines around the numbers
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}

		vram, err := strconv.Atoi(line)
		if err != nil {
			return 0, fmt.Errorf("failed to parse available VRAM: %w", err)
		}

		total += vram
	}

	// a scan that stopped early must not be reported as a valid (partial) total
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("failed to read nvidia-smi output: %w", err)
	}

	return total, nil
}
|
||||||
|
|
||||||
|
func NumGPU(opts api.Options) int {
|
||||||
|
if opts.NumGPU != -1 {
|
||||||
|
return opts.NumGPU
|
||||||
|
}
|
||||||
|
n := 1 // default to enable metal on macOS
|
||||||
|
if runtime.GOOS == "linux" {
|
||||||
|
vram, err := CheckVRAM()
|
||||||
|
if err != nil {
|
||||||
|
if err.Error() != "nvidia-smi command failed" {
|
||||||
|
log.Print(err.Error())
|
||||||
|
}
|
||||||
|
// nvidia driver not installed or no nvidia GPU found
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
|
||||||
|
switch {
|
||||||
|
case vram < 500:
|
||||||
|
log.Printf("WARNING: Low VRAM detected, disabling GPU")
|
||||||
|
n = 0
|
||||||
|
case vram < 1000:
|
||||||
|
n = 4
|
||||||
|
case vram < 2000:
|
||||||
|
n = 8
|
||||||
|
case vram < 4000:
|
||||||
|
n = 12
|
||||||
|
case vram < 8000:
|
||||||
|
n = 16
|
||||||
|
case vram < 12000:
|
||||||
|
n = 24
|
||||||
|
case vram < 16000:
|
||||||
|
n = 32
|
||||||
|
default:
|
||||||
|
n = 48
|
||||||
|
}
|
||||||
|
log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
|
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
|
||||||
if _, err := os.Stat(model); err != nil {
|
if _, err := os.Stat(model); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -247,14 +244,17 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
|||||||
params := []string{
|
params := []string{
|
||||||
"--model", model,
|
"--model", model,
|
||||||
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
|
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
|
||||||
"--gqa", fmt.Sprintf("%d", opts.NumGQA),
|
|
||||||
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
||||||
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
||||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||||
"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
|
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
|
||||||
"--embedding",
|
"--embedding",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if opts.NumGQA > 0 {
|
||||||
|
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
|
||||||
|
}
|
||||||
|
|
||||||
if len(adapters) > 0 {
|
if len(adapters) > 0 {
|
||||||
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
|
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
|
||||||
params = append(params, "--lora", adapters[0])
|
params = append(params, "--lora", adapters[0])
|
||||||
@@ -286,17 +286,25 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
|||||||
runner.Path,
|
runner.Path,
|
||||||
append(params, "--port", strconv.Itoa(port))...,
|
append(params, "--port", strconv.Itoa(port))...,
|
||||||
)
|
)
|
||||||
var stderr bytes.Buffer
|
|
||||||
cmd.Stderr = &stderr
|
cmd.Stdout = os.Stderr
|
||||||
|
cmd.Stderr = os.Stderr
|
||||||
|
|
||||||
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
|
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
|
||||||
|
|
||||||
|
log.Print("starting llama.cpp server")
|
||||||
|
if err := llm.Cmd.Start(); err != nil {
|
||||||
|
log.Printf("error starting the external llama.cpp server: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
if err := waitForServer(llm); err != nil {
|
if err := waitForServer(llm); err != nil {
|
||||||
log.Printf("error starting llama.cpp server: %v", err)
|
log.Printf("error starting llama.cpp server: %v", err)
|
||||||
llm.Close()
|
llm.Close()
|
||||||
// try again
|
// try again
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// server started successfully
|
// server started successfully
|
||||||
return llm, nil
|
return llm, nil
|
||||||
}
|
}
|
||||||
@@ -305,59 +313,37 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
|||||||
}
|
}
|
||||||
|
|
||||||
func waitForServer(llm *llama) error {
|
func waitForServer(llm *llama) error {
|
||||||
log.Print("starting llama.cpp server")
|
|
||||||
var stderr bytes.Buffer
|
|
||||||
llm.Cmd.Stderr = &stderr
|
|
||||||
err := llm.Cmd.Start()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error starting the external llama.cpp server: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
exitChan := make(chan error, 1)
|
|
||||||
|
|
||||||
// the server is a long running process, watch for it exiting to keep track of something going wrong
|
|
||||||
go func() {
|
|
||||||
err := llm.Cmd.Wait()
|
|
||||||
log.Print(stderr.String())
|
|
||||||
exitChan <- err
|
|
||||||
}()
|
|
||||||
|
|
||||||
// wait for the server to start responding
|
// wait for the server to start responding
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
expiresAt := time.Now().Add(30 * time.Second)
|
expiresAt := time.Now().Add(45 * time.Second)
|
||||||
ticker := time.NewTicker(100 * time.Millisecond)
|
ticker := time.NewTicker(200 * time.Millisecond)
|
||||||
|
|
||||||
log.Print("waiting for llama.cpp server to start responding")
|
log.Print("waiting for llama.cpp server to start responding")
|
||||||
|
for range ticker.C {
|
||||||
|
if time.Now().After(expiresAt) {
|
||||||
|
return fmt.Errorf("llama.cpp server did not start within alloted time, retrying")
|
||||||
|
}
|
||||||
|
|
||||||
for {
|
if err := llm.Ping(context.Background()); err == nil {
|
||||||
select {
|
break
|
||||||
case <-ticker.C:
|
|
||||||
if time.Now().After(expiresAt) {
|
|
||||||
return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
|
|
||||||
}
|
|
||||||
if err := llm.Ping(context.Background()); err == nil {
|
|
||||||
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
case err := <-exitChan:
|
|
||||||
return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llama) Close() {
|
func (llm *llama) Close() {
|
||||||
llm.Running.Cmd.Cancel()
|
llm.Cancel()
|
||||||
|
if err := llm.Cmd.Wait(); err != nil {
|
||||||
|
log.Printf("llama.cpp server exited with error: %v", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llama) SetOptions(opts api.Options) {
|
func (llm *llama) SetOptions(opts api.Options) {
|
||||||
llm.Options = opts
|
llm.Options = opts
|
||||||
}
|
}
|
||||||
|
|
||||||
type Prediction struct {
|
|
||||||
Content string `json:"content"`
|
|
||||||
Stop bool `json:"stop"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type GenerationSettings struct {
|
type GenerationSettings struct {
|
||||||
FrequencyPenalty float64 `json:"frequency_penalty"`
|
FrequencyPenalty float64 `json:"frequency_penalty"`
|
||||||
IgnoreEOS bool `json:"ignore_eos"`
|
IgnoreEOS bool `json:"ignore_eos"`
|
||||||
@@ -385,31 +371,19 @@ type GenerationSettings struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Timings struct {
|
type Timings struct {
|
||||||
PredictedMS float64 `json:"predicted_ms"`
|
PredictedN int `json:"predicted_n"`
|
||||||
PredictedN int `json:"predicted_n"`
|
PredictedMS float64 `json:"predicted_ms"`
|
||||||
PredictedPerSecond float64 `json:"predicted_per_second"`
|
PromptN int `json:"prompt_n"`
|
||||||
PredictedPerTokenMS float64 `json:"predicted_per_token_ms"`
|
PromptMS float64 `json:"prompt_ms"`
|
||||||
PromptMS float64 `json:"prompt_ms"`
|
|
||||||
PromptN int `json:"prompt_n"`
|
|
||||||
PromptPerSecond float64 `json:"prompt_per_second"`
|
|
||||||
PromptPerTokenMS float64 `json:"prompt_per_token_ms"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type PredictComplete struct {
|
type Prediction struct {
|
||||||
Content string `json:"content"`
|
Content string `json:"content"`
|
||||||
GenerationSettings GenerationSettings `json:"generation_settings"`
|
Model string `json:"model"`
|
||||||
Model string `json:"model"`
|
Prompt string `json:"prompt"`
|
||||||
Prompt string `json:"prompt"`
|
Stop bool `json:"stop"`
|
||||||
Stop bool `json:"stop"`
|
|
||||||
StoppedEOS bool `json:"stopped_eos"`
|
Timings `json:"timings"`
|
||||||
StoppedLimit bool `json:"stopped_limit"`
|
|
||||||
StoppedWord bool `json:"stopped_word"`
|
|
||||||
StoppingWord string `json:"stopping_word"`
|
|
||||||
Timings Timings `json:"timings"`
|
|
||||||
TokensCached int `json:"tokens_cached"`
|
|
||||||
TokensEvaluated int `json:"tokens_evaluated"`
|
|
||||||
TokensPredicted int `json:"tokens_predicted"`
|
|
||||||
Truncated bool `json:"truncated"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type PredictRequest struct {
|
type PredictRequest struct {
|
||||||
@@ -437,15 +411,19 @@ type PredictRequest struct {
|
|||||||
Stop []string `json:"stop,omitempty"`
|
Stop []string `json:"stop,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string, fn func(api.GenerateResponse)) error {
|
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
|
||||||
// we need to find the trimmed prompt context before predicting so that we can return it to the client
|
prevConvo, err := llm.Decode(ctx, prevContext)
|
||||||
trimmedPrompt, err := llm.marshalPrompt(ctx, predictCtx, prompt)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("marshaling prompt: %v", err)
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var nextContext strings.Builder
|
||||||
|
nextContext.WriteString(prevConvo)
|
||||||
|
nextContext.WriteString(prompt)
|
||||||
|
|
||||||
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
|
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
|
||||||
predReq := PredictRequest{
|
predReq := PredictRequest{
|
||||||
Prompt: trimmedPrompt,
|
Prompt: nextContext.String(),
|
||||||
Stream: true,
|
Stream: true,
|
||||||
NPredict: llm.NumPredict,
|
NPredict: llm.NumPredict,
|
||||||
NKeep: llm.NumKeep,
|
NKeep: llm.NumKeep,
|
||||||
@@ -491,7 +469,6 @@ func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string,
|
|||||||
}
|
}
|
||||||
|
|
||||||
scanner := bufio.NewScanner(resp.Body)
|
scanner := bufio.NewScanner(resp.Body)
|
||||||
genCtx := trimmedPrompt // start with the trimmed prompt
|
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
@@ -506,34 +483,33 @@ func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string,
|
|||||||
// Read data from the server-side event stream
|
// Read data from the server-side event stream
|
||||||
if strings.HasPrefix(line, "data: ") {
|
if strings.HasPrefix(line, "data: ") {
|
||||||
evt := line[6:]
|
evt := line[6:]
|
||||||
var complete PredictComplete
|
var p Prediction
|
||||||
if err := json.Unmarshal([]byte(evt), &complete); err != nil {
|
if err := json.Unmarshal([]byte(evt), &p); err != nil {
|
||||||
return fmt.Errorf("error unmarshaling llm complete response: %v", err)
|
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if complete.Timings.PredictedMS > 0 {
|
if p.Content != "" {
|
||||||
genCtx += complete.Content
|
fn(api.GenerateResponse{Response: p.Content})
|
||||||
embd, err := llm.Encode(ctx, genCtx)
|
nextContext.WriteString(p.Content)
|
||||||
|
}
|
||||||
|
|
||||||
|
if p.Stop {
|
||||||
|
embd, err := llm.Encode(ctx, nextContext.String())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("encoding context: %v", err)
|
return fmt.Errorf("encoding context: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn(api.GenerateResponse{
|
fn(api.GenerateResponse{
|
||||||
Done: true,
|
Done: true,
|
||||||
Context: embd,
|
Context: embd,
|
||||||
PromptEvalCount: int(complete.Timings.PromptN),
|
PromptEvalCount: p.PromptN,
|
||||||
PromptEvalDuration: parseDurationMs(float64(complete.Timings.PromptMS)),
|
PromptEvalDuration: parseDurationMs(p.PromptMS),
|
||||||
EvalCount: int(complete.Timings.PredictedN),
|
EvalCount: p.PredictedN,
|
||||||
EvalDuration: parseDurationMs(float64(complete.Timings.PredictedMS)),
|
EvalDuration: parseDurationMs(p.PredictedMS),
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var pred Prediction
|
|
||||||
if err := json.Unmarshal([]byte(evt), &pred); err != nil {
|
|
||||||
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
|
||||||
}
|
|
||||||
genCtx += pred.Content
|
|
||||||
fn(api.GenerateResponse{Response: pred.Content})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -545,34 +521,6 @@ func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string,
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *llama) marshalPrompt(ctx context.Context, pCtx []int, prompt string) (string, error) {
|
|
||||||
pEncode, err := llm.Encode(ctx, prompt)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("encoding prompt context: %w", err)
|
|
||||||
}
|
|
||||||
tokens := append(pCtx, pEncode...)
|
|
||||||
if llm.NumKeep < 0 {
|
|
||||||
llm.NumKeep = len(tokens)
|
|
||||||
}
|
|
||||||
|
|
||||||
// min(llm.NumCtx - 4, llm.NumKeep)
|
|
||||||
if llm.NumCtx-4 < llm.NumKeep {
|
|
||||||
llm.NumKeep = llm.NumCtx - 4
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(tokens) >= llm.NumCtx {
|
|
||||||
// truncate input
|
|
||||||
numLeft := (llm.NumCtx - llm.NumKeep) / 2
|
|
||||||
truncated := tokens[:llm.NumKeep]
|
|
||||||
erasedBlocks := (len(tokens) - llm.NumKeep - numLeft - 1) / numLeft
|
|
||||||
truncated = append(truncated, tokens[llm.NumKeep+erasedBlocks*numLeft:]...)
|
|
||||||
tokens = truncated
|
|
||||||
log.Printf("input truncated: num_ctx=%d num_keep=%d num_left=%d num_tokens=%d", llm.NumCtx, llm.NumKeep, numLeft, len(truncated))
|
|
||||||
}
|
|
||||||
|
|
||||||
return llm.Decode(ctx, tokens)
|
|
||||||
}
|
|
||||||
|
|
||||||
type TokenizeRequest struct {
|
type TokenizeRequest struct {
|
||||||
Content string `json:"content"`
|
Content string `json:"content"`
|
||||||
}
|
}
|
||||||
@@ -716,7 +664,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
|
|||||||
|
|
||||||
// Ping checks that the server subprocess is still running and responding to requests
|
// Ping checks that the server subprocess is still running and responding to requests
|
||||||
func (llm *llama) Ping(ctx context.Context) error {
|
func (llm *llama) Ping(ctx context.Context) error {
|
||||||
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port))
|
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("ping resp: %w", err)
|
return fmt.Errorf("ping resp: %w", err)
|
||||||
}
|
}
|
||||||
44
llm/llm.go
44
llm/llm.go
@@ -32,15 +32,22 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
|||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
ggml, err := DecodeGGML(f, ModelFamilyLlama)
|
ggml, err := DecodeGGML(f)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
switch ggml.FileType().String() {
|
switch ggml.FileType() {
|
||||||
case "F32", "Q5_0", "Q5_1", "Q8_0":
|
case "Q8_0":
|
||||||
|
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
|
||||||
|
// GGML Q8_0 do not support Metal API and will
|
||||||
|
// cause the runner to segmentation fault so disable GPU
|
||||||
|
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||||
|
opts.NumGPU = 0
|
||||||
|
}
|
||||||
|
case "F32", "Q5_0", "Q5_1":
|
||||||
if opts.NumGPU != 0 {
|
if opts.NumGPU != 0 {
|
||||||
// F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
|
// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
|
||||||
// cause the runner to segmentation fault so disable GPU
|
// cause the runner to segmentation fault so disable GPU
|
||||||
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||||
opts.NumGPU = 0
|
opts.NumGPU = 0
|
||||||
@@ -49,34 +56,43 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
|||||||
|
|
||||||
totalResidentMemory := memory.TotalMemory()
|
totalResidentMemory := memory.TotalMemory()
|
||||||
switch ggml.ModelType() {
|
switch ggml.ModelType() {
|
||||||
case ModelType3B, ModelType7B:
|
case "3B", "7B":
|
||||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
|
if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
|
||||||
return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
|
return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
|
||||||
} else if totalResidentMemory < 8*1024*1024 {
|
} else if totalResidentMemory < 8*1024*1024 {
|
||||||
return nil, fmt.Errorf("model requires at least 8GB of memory")
|
return nil, fmt.Errorf("model requires at least 8GB of memory")
|
||||||
}
|
}
|
||||||
case ModelType13B:
|
case "13B":
|
||||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
|
if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
|
||||||
return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
|
return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
|
||||||
} else if totalResidentMemory < 16*1024*1024 {
|
} else if totalResidentMemory < 16*1024*1024 {
|
||||||
return nil, fmt.Errorf("model requires at least 16GB of memory")
|
return nil, fmt.Errorf("model requires at least 16GB of memory")
|
||||||
}
|
}
|
||||||
case ModelType30B, ModelType34B:
|
case "30B", "34B", "40B":
|
||||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
|
if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
|
||||||
return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
|
return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
|
||||||
} else if totalResidentMemory < 32*1024*1024 {
|
} else if totalResidentMemory < 32*1024*1024 {
|
||||||
return nil, fmt.Errorf("model requires at least 32GB of memory")
|
return nil, fmt.Errorf("model requires at least 32GB of memory")
|
||||||
}
|
}
|
||||||
case ModelType65B:
|
case "65B", "70B":
|
||||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
|
if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
|
||||||
return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
|
return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
|
||||||
} else if totalResidentMemory < 64*1024*1024 {
|
} else if totalResidentMemory < 64*1024*1024 {
|
||||||
return nil, fmt.Errorf("model requires at least 64GB of memory")
|
return nil, fmt.Errorf("model requires at least 64GB of memory")
|
||||||
}
|
}
|
||||||
|
case "180B":
|
||||||
|
if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
|
||||||
|
return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
|
||||||
|
} else if totalResidentMemory < 128*1024*1024 {
|
||||||
|
return nil, fmt.Errorf("model requires at least 128GB of memory")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
switch ggml.ModelFamily() {
|
switch ggml.Name() {
|
||||||
case ModelFamilyLlama:
|
case "gguf":
|
||||||
|
opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
|
||||||
|
return newLlama(model, adapters, ggufRunner(), opts)
|
||||||
|
case "ggml", "ggmf", "ggjt", "ggla":
|
||||||
return newLlama(model, adapters, ggmlRunner(), opts)
|
return newLlama(model, adapters, ggmlRunner(), opts)
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
||||||
|
|||||||
@@ -6,8 +6,11 @@ GO_LDFLAGS="-X github.com/jmorganca/ollama/version.Version=$VERSION"
|
|||||||
GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
|
GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
|
||||||
|
|
||||||
# build universal binary
|
# build universal binary
|
||||||
CGO_ENABLED=1 GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
|
GOARCH=arm64 go generate ./...
|
||||||
CGO_ENABLED=1 GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
|
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
|
||||||
|
rm -rf llm/llama.cpp/*/build/*/bin
|
||||||
|
GOARCH=amd64 go generate ./...
|
||||||
|
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
|
||||||
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
||||||
rm dist/ollama-darwin-amd64 dist/ollama-darwin-arm64
|
rm dist/ollama-darwin-amd64 dist/ollama-darwin-arm64
|
||||||
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
|
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
|
|||||||
|
|
||||||
headers := make(http.Header)
|
headers := make(http.Header)
|
||||||
headers.Set("Authorization", sig)
|
headers.Set("Authorization", sig)
|
||||||
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts)
|
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("couldn't get token: %q", err)
|
log.Printf("couldn't get token: %q", err)
|
||||||
}
|
}
|
||||||
|
|||||||
336
server/images.go
336
server/images.go
@@ -22,6 +22,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"text/template"
|
"text/template"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
|
||||||
"github.com/jmorganca/ollama/api"
|
"github.com/jmorganca/ollama/api"
|
||||||
"github.com/jmorganca/ollama/llm"
|
"github.com/jmorganca/ollama/llm"
|
||||||
"github.com/jmorganca/ollama/parser"
|
"github.com/jmorganca/ollama/parser"
|
||||||
@@ -39,15 +41,18 @@ type RegistryOptions struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Model struct {
|
type Model struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
ModelPath string
|
ShortName string
|
||||||
AdapterPaths []string
|
ModelPath string
|
||||||
Template string
|
OriginalModel string
|
||||||
System string
|
AdapterPaths []string
|
||||||
Digest string
|
Template string
|
||||||
ConfigDigest string
|
System string
|
||||||
Options map[string]interface{}
|
License []string
|
||||||
Embeddings []vector.Embedding
|
Digest string
|
||||||
|
ConfigDigest string
|
||||||
|
Options map[string]interface{}
|
||||||
|
Embeddings []vector.Embedding
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
|
func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
|
||||||
@@ -66,7 +71,6 @@ func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, e
|
|||||||
System string
|
System string
|
||||||
Prompt string
|
Prompt string
|
||||||
Embed string
|
Embed string
|
||||||
Args map[string]any
|
|
||||||
|
|
||||||
// deprecated: versions <= 0.0.7 used this to omit the system prompt
|
// deprecated: versions <= 0.0.7 used this to omit the system prompt
|
||||||
Context []int
|
Context []int
|
||||||
@@ -76,7 +80,6 @@ func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, e
|
|||||||
vars.System = m.System
|
vars.System = m.System
|
||||||
vars.Prompt = request.Prompt
|
vars.Prompt = request.Prompt
|
||||||
vars.Context = request.Context
|
vars.Context = request.Context
|
||||||
vars.Args = request.Args
|
|
||||||
vars.Embed = embedding
|
vars.Embed = embedding
|
||||||
|
|
||||||
if request.System != "" {
|
if request.System != "" {
|
||||||
@@ -111,10 +114,11 @@ type LayerReader struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type ConfigV2 struct {
|
type ConfigV2 struct {
|
||||||
ModelFamily llm.ModelFamily `json:"model_family"`
|
ModelFormat string `json:"model_format"`
|
||||||
ModelType string `json:"model_type"`
|
ModelFamily string `json:"model_family"`
|
||||||
FileType string `json:"file_type"`
|
ModelType string `json:"model_type"`
|
||||||
RootFS RootFS `json:"rootfs"`
|
FileType string `json:"file_type"`
|
||||||
|
RootFS RootFS `json:"rootfs"`
|
||||||
|
|
||||||
// required by spec
|
// required by spec
|
||||||
Architecture string `json:"architecture"`
|
Architecture string `json:"architecture"`
|
||||||
@@ -171,9 +175,11 @@ func GetModel(name string) (*Model, error) {
|
|||||||
|
|
||||||
model := &Model{
|
model := &Model{
|
||||||
Name: mp.GetFullTagname(),
|
Name: mp.GetFullTagname(),
|
||||||
|
ShortName: mp.GetShortTagname(),
|
||||||
Digest: digest,
|
Digest: digest,
|
||||||
ConfigDigest: manifest.Config.Digest,
|
ConfigDigest: manifest.Config.Digest,
|
||||||
Template: "{{ .Prompt }}",
|
Template: "{{ .Prompt }}",
|
||||||
|
License: []string{},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, layer := range manifest.Layers {
|
for _, layer := range manifest.Layers {
|
||||||
@@ -185,6 +191,7 @@ func GetModel(name string) (*Model, error) {
|
|||||||
switch layer.MediaType {
|
switch layer.MediaType {
|
||||||
case "application/vnd.ollama.image.model":
|
case "application/vnd.ollama.image.model":
|
||||||
model.ModelPath = filename
|
model.ModelPath = filename
|
||||||
|
model.OriginalModel = layer.From
|
||||||
case "application/vnd.ollama.image.embed":
|
case "application/vnd.ollama.image.embed":
|
||||||
file, err := os.Open(filename)
|
file, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -229,6 +236,12 @@ func GetModel(name string) (*Model, error) {
|
|||||||
if err = json.NewDecoder(params).Decode(&model.Options); err != nil {
|
if err = json.NewDecoder(params).Decode(&model.Options); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
case "application/vnd.ollama.image.license":
|
||||||
|
bts, err := os.ReadFile(filename)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
model.License = append(model.License, string(bts))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -256,6 +269,29 @@ func filenameWithPath(path, f string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
|
func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
|
||||||
|
mp := ParseModelPath(name)
|
||||||
|
|
||||||
|
var manifest *ManifestV2
|
||||||
|
var err error
|
||||||
|
var noprune string
|
||||||
|
|
||||||
|
// build deleteMap to prune unused layers
|
||||||
|
deleteMap := make(map[string]bool)
|
||||||
|
|
||||||
|
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||||
|
manifest, _, err = GetManifest(mp)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if manifest != nil {
|
||||||
|
for _, l := range manifest.Layers {
|
||||||
|
deleteMap[l.Digest] = true
|
||||||
|
}
|
||||||
|
deleteMap[manifest.Config.Digest] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
mf, err := os.Open(path)
|
mf, err := os.Open(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
|
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
|
||||||
@@ -276,6 +312,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
|
|
||||||
var layers []*LayerReader
|
var layers []*LayerReader
|
||||||
params := make(map[string][]string)
|
params := make(map[string][]string)
|
||||||
|
var sourceParams map[string]any
|
||||||
embed := EmbeddingParams{fn: fn}
|
embed := EmbeddingParams{fn: fn}
|
||||||
for _, c := range commands {
|
for _, c := range commands {
|
||||||
log.Printf("[%s] - %s\n", c.Name, c.Args)
|
log.Printf("[%s] - %s\n", c.Name, c.Args)
|
||||||
@@ -315,14 +352,15 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama)
|
ggml, err := llm.DecodeGGML(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
config.ModelFormat = ggml.Name()
|
||||||
config.ModelFamily = ggml.ModelFamily()
|
config.ModelFamily = ggml.ModelFamily()
|
||||||
config.ModelType = ggml.ModelType().String()
|
config.ModelType = ggml.ModelType()
|
||||||
config.FileType = ggml.FileType().String()
|
config.FileType = ggml.FileType()
|
||||||
|
|
||||||
// reset the file
|
// reset the file
|
||||||
file.Seek(0, io.SeekStart)
|
file.Seek(0, io.SeekStart)
|
||||||
@@ -356,9 +394,27 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
// copie the model metadata
|
// copie the model metadata
|
||||||
config.ModelFamily = source.ModelFamily
|
config.ModelFamily = source.ModelFamily
|
||||||
config.ModelType = source.ModelType
|
config.ModelType = source.ModelType
|
||||||
|
config.ModelFormat = source.ModelFormat
|
||||||
config.FileType = source.FileType
|
config.FileType = source.FileType
|
||||||
|
|
||||||
for _, l := range mf.Layers {
|
for _, l := range mf.Layers {
|
||||||
|
if l.MediaType == "application/vnd.ollama.image.params" {
|
||||||
|
sourceParamsBlobPath, err := GetBlobsPath(l.Digest)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
sourceParamsBlob, err := os.Open(sourceParamsBlobPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer sourceParamsBlob.Close()
|
||||||
|
|
||||||
|
if err := json.NewDecoder(sourceParamsBlob).Decode(&sourceParams); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
newLayer, err := GetLayerWithBufferFromLayer(l)
|
newLayer, err := GetLayerWithBufferFromLayer(l)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -429,12 +485,25 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
// Create a single layer for the parameters
|
// Create a single layer for the parameters
|
||||||
if len(params) > 0 {
|
if len(params) > 0 {
|
||||||
fn(api.ProgressResponse{Status: "creating parameter layer"})
|
fn(api.ProgressResponse{Status: "creating parameter layer"})
|
||||||
|
|
||||||
layers = removeLayerFromLayers(layers, "application/vnd.ollama.image.params")
|
layers = removeLayerFromLayers(layers, "application/vnd.ollama.image.params")
|
||||||
formattedParams, err := formatParams(params)
|
formattedParams, err := formatParams(params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("couldn't create params json: %v", err)
|
return fmt.Errorf("couldn't create params json: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for k, v := range sourceParams {
|
||||||
|
if _, ok := formattedParams[k]; !ok {
|
||||||
|
formattedParams[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.ModelType == "65B" {
|
||||||
|
if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
|
||||||
|
config.ModelType = "70B"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bts, err := json.Marshal(formattedParams)
|
bts, err := json.Marshal(formattedParams)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -466,6 +535,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
var manifestLayers []*Layer
|
var manifestLayers []*Layer
|
||||||
for _, l := range layers {
|
for _, l := range layers {
|
||||||
manifestLayers = append(manifestLayers, &l.Layer)
|
manifestLayers = append(manifestLayers, &l.Layer)
|
||||||
|
delete(deleteMap, l.Layer.Digest)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a layer for the config object
|
// Create a layer for the config object
|
||||||
@@ -475,6 +545,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
layers = append(layers, cfg)
|
layers = append(layers, cfg)
|
||||||
|
delete(deleteMap, cfg.Layer.Digest)
|
||||||
|
|
||||||
if err := SaveLayers(layers, fn, false); err != nil {
|
if err := SaveLayers(layers, fn, false); err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -487,6 +558,14 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if noprune == "" {
|
||||||
|
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||||
|
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn(api.ProgressResponse{Status: "success"})
|
fn(api.ProgressResponse{Status: "success"})
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -632,14 +711,9 @@ func existingFileEmbeddings(digest string) (map[string][]float64, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
|
func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
|
||||||
j := 0
|
return slices.DeleteFunc(layers, func(layer *LayerReader) bool {
|
||||||
for _, l := range layers {
|
return layer.MediaType == mediaType
|
||||||
if l.MediaType != mediaType {
|
})
|
||||||
layers[j] = l
|
|
||||||
j++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return layers[:j]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveLayers(layers []*LayerReader, fn func(resp api.ProgressResponse), force bool) error {
|
func SaveLayers(layers []*LayerReader, fn func(resp api.ProgressResponse), force bool) error {
|
||||||
@@ -747,14 +821,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
|
|||||||
return nil, fmt.Errorf("invalid float value %s", vals)
|
return nil, fmt.Errorf("invalid float value %s", vals)
|
||||||
}
|
}
|
||||||
|
|
||||||
out[key] = floatVal
|
out[key] = float32(floatVal)
|
||||||
case reflect.Int:
|
case reflect.Int:
|
||||||
intVal, err := strconv.ParseInt(vals[0], 10, 0)
|
intVal, err := strconv.ParseInt(vals[0], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("invalid int value %s", vals)
|
return nil, fmt.Errorf("invalid int value %s", vals)
|
||||||
}
|
}
|
||||||
|
|
||||||
out[key] = intVal
|
out[key] = int(intVal)
|
||||||
case reflect.Bool:
|
case reflect.Bool:
|
||||||
boolVal, err := strconv.ParseBool(vals[0])
|
boolVal, err := strconv.ParseBool(vals[0])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -834,18 +908,7 @@ func CopyModel(src, dest string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func DeleteModel(name string) error {
|
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error {
|
||||||
mp := ParseModelPath(name)
|
|
||||||
manifest, _, err := GetManifest(mp)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
deleteMap := make(map[string]bool)
|
|
||||||
for _, layer := range manifest.Layers {
|
|
||||||
deleteMap[layer.Digest] = true
|
|
||||||
}
|
|
||||||
deleteMap[manifest.Config.Digest] = true
|
|
||||||
|
|
||||||
fp, err := GetManifestPath()
|
fp, err := GetManifestPath()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -862,14 +925,13 @@ func DeleteModel(name string) error {
|
|||||||
fmp := ParseModelPath(tag)
|
fmp := ParseModelPath(tag)
|
||||||
|
|
||||||
// skip the manifest we're trying to delete
|
// skip the manifest we're trying to delete
|
||||||
if mp.GetFullTagname() == fmp.GetFullTagname() {
|
if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// save (i.e. delete from the deleteMap) any files used in other manifests
|
// save (i.e. delete from the deleteMap) any files used in other manifests
|
||||||
manifest, _, err := GetManifest(fmp)
|
manifest, _, err := GetManifest(fmp)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("skipping file: %s", fp)
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -893,14 +955,72 @@ func DeleteModel(name string) error {
|
|||||||
log.Printf("couldn't get file path for '%s': %v", k, err)
|
log.Printf("couldn't get file path for '%s': %v", k, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err := os.Remove(fp); err != nil {
|
if !dryRun {
|
||||||
log.Printf("couldn't remove file '%s': %v", fp, err)
|
if err := os.Remove(fp); err != nil {
|
||||||
continue
|
log.Printf("couldn't remove file '%s': %v", fp, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.Printf("wanted to remove: %s", fp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fp, err = mp.GetManifestPath(false)
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func PruneLayers() error {
|
||||||
|
deleteMap := make(map[string]bool)
|
||||||
|
p, err := GetBlobsPath("")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
blobs, err := os.ReadDir(p)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("couldn't read dir '%s': %v", p, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, blob := range blobs {
|
||||||
|
name := blob.Name()
|
||||||
|
if runtime.GOOS == "windows" {
|
||||||
|
name = strings.ReplaceAll(name, "-", ":")
|
||||||
|
}
|
||||||
|
deleteMap[name] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("total blobs: %d", len(deleteMap))
|
||||||
|
|
||||||
|
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("total unused blobs removed: %d", len(deleteMap))
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func DeleteModel(name string) error {
|
||||||
|
mp := ParseModelPath(name)
|
||||||
|
manifest, _, err := GetManifest(mp)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
deleteMap := make(map[string]bool)
|
||||||
|
for _, layer := range manifest.Layers {
|
||||||
|
deleteMap[layer.Digest] = true
|
||||||
|
}
|
||||||
|
deleteMap[manifest.Config.Digest] = true
|
||||||
|
|
||||||
|
err = deleteUnusedLayers(&mp, deleteMap, false)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
fp, err := mp.GetManifestPath(false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -913,6 +1033,83 @@ func DeleteModel(name string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ShowModelfile(model *Model) (string, error) {
|
||||||
|
type modelTemplate struct {
|
||||||
|
*Model
|
||||||
|
From string
|
||||||
|
Params string
|
||||||
|
}
|
||||||
|
|
||||||
|
var params []string
|
||||||
|
for k, v := range model.Options {
|
||||||
|
switch val := v.(type) {
|
||||||
|
case string:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, val))
|
||||||
|
case int:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.Itoa(val)))
|
||||||
|
case float64:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatFloat(val, 'f', 0, 64)))
|
||||||
|
case bool:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatBool(val)))
|
||||||
|
case []interface{}:
|
||||||
|
for _, nv := range val {
|
||||||
|
switch nval := nv.(type) {
|
||||||
|
case string:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, nval))
|
||||||
|
case int:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.Itoa(nval)))
|
||||||
|
case float64:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatFloat(nval, 'f', 0, 64)))
|
||||||
|
case bool:
|
||||||
|
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatBool(nval)))
|
||||||
|
default:
|
||||||
|
log.Printf("unknown type: %s", reflect.TypeOf(nv).String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
log.Printf("unknown type: %s", reflect.TypeOf(v).String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mt := modelTemplate{
|
||||||
|
Model: model,
|
||||||
|
From: model.OriginalModel,
|
||||||
|
Params: strings.Join(params, "\n"),
|
||||||
|
}
|
||||||
|
|
||||||
|
if mt.From == "" {
|
||||||
|
mt.From = model.ModelPath
|
||||||
|
}
|
||||||
|
|
||||||
|
modelFile := `# Modelfile generated by "ollama show"
|
||||||
|
# To build a new Modelfile based on this one, replace the FROM line with:
|
||||||
|
# FROM {{ .ShortName }}
|
||||||
|
|
||||||
|
FROM {{ .From }}
|
||||||
|
TEMPLATE """{{ .Template }}"""
|
||||||
|
SYSTEM """{{ .System }}"""
|
||||||
|
{{ .Params }}
|
||||||
|
`
|
||||||
|
for _, l := range mt.Model.AdapterPaths {
|
||||||
|
modelFile += fmt.Sprintf("ADAPTER %s\n", l)
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpl, err := template.New("").Parse(modelFile)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("error parsing template: %q", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
|
||||||
|
if err = tmpl.Execute(&buf, mt); err != nil {
|
||||||
|
log.Printf("error executing template: %q", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return buf.String(), nil
|
||||||
|
}
|
||||||
|
|
||||||
func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
||||||
mp := ParseModelPath(name)
|
mp := ParseModelPath(name)
|
||||||
fn(api.ProgressResponse{Status: "retrieving manifest"})
|
fn(api.ProgressResponse{Status: "retrieving manifest"})
|
||||||
@@ -1002,13 +1199,34 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
|||||||
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
||||||
mp := ParseModelPath(name)
|
mp := ParseModelPath(name)
|
||||||
|
|
||||||
|
var manifest *ManifestV2
|
||||||
|
var err error
|
||||||
|
var noprune string
|
||||||
|
|
||||||
|
// build deleteMap to prune unused layers
|
||||||
|
deleteMap := make(map[string]bool)
|
||||||
|
|
||||||
|
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||||
|
manifest, _, err = GetManifest(mp)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if manifest != nil {
|
||||||
|
for _, l := range manifest.Layers {
|
||||||
|
deleteMap[l.Digest] = true
|
||||||
|
}
|
||||||
|
deleteMap[manifest.Config.Digest] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
|
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
|
||||||
return fmt.Errorf("insecure protocol http")
|
return fmt.Errorf("insecure protocol http")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn(api.ProgressResponse{Status: "pulling manifest"})
|
fn(api.ProgressResponse{Status: "pulling manifest"})
|
||||||
|
|
||||||
manifest, err := pullModelManifest(ctx, mp, regOpts)
|
manifest, err = pullModelManifest(ctx, mp, regOpts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("pull model manifest: %s", err)
|
return fmt.Errorf("pull model manifest: %s", err)
|
||||||
}
|
}
|
||||||
@@ -1028,7 +1246,9 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
|||||||
}); err != nil {
|
}); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
delete(deleteMap, layer.Digest)
|
||||||
}
|
}
|
||||||
|
delete(deleteMap, manifest.Config.Digest)
|
||||||
|
|
||||||
fn(api.ProgressResponse{Status: "verifying sha256 digest"})
|
fn(api.ProgressResponse{Status: "verifying sha256 digest"})
|
||||||
for _, layer := range layers {
|
for _, layer := range layers {
|
||||||
@@ -1066,6 +1286,14 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if noprune == "" {
|
||||||
|
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||||
|
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn(api.ProgressResponse{Status: "success"})
|
fn(api.ProgressResponse{Status: "success"})
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
@@ -1191,7 +1419,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
|
|||||||
}
|
}
|
||||||
|
|
||||||
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
|
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
|
||||||
if requestURL.Scheme != "http" && regOpts.Insecure {
|
if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
|
||||||
requestURL.Scheme = "http"
|
requestURL.Scheme = "http"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1204,10 +1432,12 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
|
|||||||
req.Header = headers
|
req.Header = headers
|
||||||
}
|
}
|
||||||
|
|
||||||
if regOpts.Token != "" {
|
if regOpts != nil {
|
||||||
req.Header.Set("Authorization", "Bearer "+regOpts.Token)
|
if regOpts.Token != "" {
|
||||||
} else if regOpts.Username != "" && regOpts.Password != "" {
|
req.Header.Set("Authorization", "Bearer "+regOpts.Token)
|
||||||
req.SetBasicAuth(regOpts.Username, regOpts.Password)
|
} else if regOpts.Username != "" && regOpts.Password != "" {
|
||||||
|
req.SetBasicAuth(regOpts.Username, regOpts.Password)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
||||||
|
|||||||
@@ -114,7 +114,12 @@ func GetManifestPath() (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return filepath.Join(home, ".ollama", "models", "manifests"), nil
|
path := filepath.Join(home, ".ollama", "models", "manifests")
|
||||||
|
if err := os.MkdirAll(path, 0o755); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return path, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetBlobsPath(digest string) (string, error) {
|
func GetBlobsPath(digest string) (string, error) {
|
||||||
@@ -128,7 +133,12 @@ func GetBlobsPath(digest string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
path := filepath.Join(home, ".ollama", "models", "blobs", digest)
|
path := filepath.Join(home, ".ollama", "models", "blobs", digest)
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
dirPath := filepath.Dir(path)
|
||||||
|
if digest == "" {
|
||||||
|
dirPath = path
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.MkdirAll(dirPath, 0o755); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ import (
|
|||||||
"os/signal"
|
"os/signal"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -117,12 +119,13 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
tokensNoSystem, err := llmModel.Encode(ctx, promptNoSystem)
|
tokensNoSystem, err := llmModel.Encode(ctx, promptNoSystem)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
opts.NumKeep = len(tokensWithSystem) - len(tokensNoSystem) + 1
|
opts.NumKeep = len(tokensWithSystem) - len(tokensNoSystem)
|
||||||
|
|
||||||
llmModel.SetOptions(opts)
|
llmModel.SetOptions(opts)
|
||||||
}
|
}
|
||||||
@@ -361,6 +364,78 @@ func DeleteModelHandler(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
c.JSON(http.StatusOK, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
func ShowModelHandler(c *gin.Context) {
|
||||||
|
var req api.ShowRequest
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := GetModelInfo(req.Name)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
|
||||||
|
} else {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetModelInfo(name string) (*api.ShowResponse, error) {
|
||||||
|
model, err := GetModel(name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
resp := &api.ShowResponse{
|
||||||
|
License: strings.Join(model.License, "\n"),
|
||||||
|
System: model.System,
|
||||||
|
Template: model.Template,
|
||||||
|
}
|
||||||
|
|
||||||
|
mf, err := ShowModelfile(model)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
resp.Modelfile = mf
|
||||||
|
|
||||||
|
var params []string
|
||||||
|
cs := 30
|
||||||
|
for k, v := range model.Options {
|
||||||
|
switch val := v.(type) {
|
||||||
|
case string:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, val))
|
||||||
|
case int:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(val)))
|
||||||
|
case float64:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(val, 'f', 0, 64)))
|
||||||
|
case bool:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(val)))
|
||||||
|
case []interface{}:
|
||||||
|
for _, nv := range val {
|
||||||
|
switch nval := nv.(type) {
|
||||||
|
case string:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, nval))
|
||||||
|
case int:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(nval)))
|
||||||
|
case float64:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(nval, 'f', 0, 64)))
|
||||||
|
case bool:
|
||||||
|
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(nval)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
resp.Parameters = strings.Join(params, "\n")
|
||||||
|
|
||||||
|
return resp, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func ListModelsHandler(c *gin.Context) {
|
func ListModelsHandler(c *gin.Context) {
|
||||||
@@ -456,6 +531,7 @@ func Serve(ln net.Listener, origins []string) error {
|
|||||||
r.POST("/api/copy", CopyModelHandler)
|
r.POST("/api/copy", CopyModelHandler)
|
||||||
r.GET("/api/tags", ListModelsHandler)
|
r.GET("/api/tags", ListModelsHandler)
|
||||||
r.DELETE("/api/delete", DeleteModelHandler)
|
r.DELETE("/api/delete", DeleteModelHandler)
|
||||||
|
r.POST("/api/show", ShowModelHandler)
|
||||||
|
|
||||||
log.Printf("Listening on %s", ln.Addr())
|
log.Printf("Listening on %s", ln.Addr())
|
||||||
s := &http.Server{
|
s := &http.Server{
|
||||||
@@ -473,6 +549,13 @@ func Serve(ln net.Listener, origins []string) error {
|
|||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if runtime.GOOS == "linux" {
|
||||||
|
// check compatibility to log warnings
|
||||||
|
if _, err := llm.CheckVRAM(); err != nil {
|
||||||
|
log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return s.Serve(ln)
|
return s.Serve(ln)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -66,31 +66,39 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
|||||||
|
|
||||||
sectionReader := io.NewSectionReader(f, int64(offset), chunk)
|
sectionReader := io.NewSectionReader(f, int64(offset), chunk)
|
||||||
for try := 0; try < MaxRetries; try++ {
|
for try := 0; try < MaxRetries; try++ {
|
||||||
|
ch := make(chan error, 1)
|
||||||
|
|
||||||
r, w := io.Pipe()
|
r, w := io.Pipe()
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
go func() {
|
go func() {
|
||||||
defer w.Close()
|
defer w.Close()
|
||||||
|
|
||||||
for chunked := int64(0); chunked < chunk; {
|
for chunked := int64(0); chunked < chunk; {
|
||||||
n, err := io.CopyN(w, sectionReader, 1024*1024)
|
select {
|
||||||
if err != nil && !errors.Is(err, io.EOF) {
|
case err := <-ch:
|
||||||
|
log.Printf("chunk interrupted: %v", err)
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
n, err := io.CopyN(w, sectionReader, 1024*1024)
|
||||||
|
if err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
fn(api.ProgressResponse{
|
||||||
|
Status: fmt.Sprintf("error reading chunk: %v", err),
|
||||||
|
Digest: layer.Digest,
|
||||||
|
Total: layer.Size,
|
||||||
|
Completed: int(offset),
|
||||||
|
})
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
chunked += n
|
||||||
fn(api.ProgressResponse{
|
fn(api.ProgressResponse{
|
||||||
Status: fmt.Sprintf("error reading chunk: %v", err),
|
Status: fmt.Sprintf("uploading %s", layer.Digest),
|
||||||
Digest: layer.Digest,
|
Digest: layer.Digest,
|
||||||
Total: layer.Size,
|
Total: layer.Size,
|
||||||
Completed: int(offset),
|
Completed: int(offset) + int(chunked),
|
||||||
})
|
})
|
||||||
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
chunked += n
|
|
||||||
fn(api.ProgressResponse{
|
|
||||||
Status: fmt.Sprintf("uploading %s", layer.Digest),
|
|
||||||
Digest: layer.Digest,
|
|
||||||
Total: layer.Size,
|
|
||||||
Completed: int(offset) + int(chunked),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -113,6 +121,8 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
|||||||
|
|
||||||
switch {
|
switch {
|
||||||
case resp.StatusCode == http.StatusUnauthorized:
|
case resp.StatusCode == http.StatusUnauthorized:
|
||||||
|
ch <- errors.New("unauthorized")
|
||||||
|
|
||||||
auth := resp.Header.Get("www-authenticate")
|
auth := resp.Header.Get("www-authenticate")
|
||||||
authRedir := ParseAuthRedirectString(auth)
|
authRedir := ParseAuthRedirectString(auth)
|
||||||
token, err := getAuthToken(ctx, authRedir, regOpts)
|
token, err := getAuthToken(ctx, authRedir, regOpts)
|
||||||
@@ -121,10 +131,7 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
|||||||
}
|
}
|
||||||
|
|
||||||
regOpts.Token = token
|
regOpts.Token = token
|
||||||
if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
|
sectionReader = io.NewSectionReader(f, int64(offset), chunk)
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
continue
|
continue
|
||||||
case resp.StatusCode >= http.StatusBadRequest:
|
case resp.StatusCode >= http.StatusBadRequest:
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
|||||||
Reference in New Issue
Block a user