Compare commits
36 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dcb6ba389a | ||
|
|
ed6abba75a | ||
|
|
b52a400cdf | ||
|
|
2ed26f0047 | ||
|
|
e64ef69e34 | ||
|
|
3d0a9b477b | ||
|
|
7226980fb6 | ||
|
|
a806b03f62 | ||
|
|
948323fa78 | ||
|
|
e243329e2e | ||
|
|
2a66a1164a | ||
|
|
62620914e9 | ||
|
|
442dec1c6f | ||
|
|
fd4792ec56 | ||
|
|
abaf7d3bda | ||
|
|
7762584fb1 | ||
|
|
317615fd5c | ||
|
|
acc31427dd | ||
|
|
a3ec1ec2a0 | ||
|
|
407a5cabf4 | ||
|
|
0859d50942 | ||
|
|
66bbf05918 | ||
|
|
edba935d67 | ||
|
|
2d49197b3b | ||
|
|
f5e2e150b8 | ||
|
|
268e362fa7 | ||
|
|
07a4c1e3fb | ||
|
|
20dae6b38f | ||
|
|
a18e6b3a40 | ||
|
|
5fb96255dc | ||
|
|
b43ddd84be | ||
|
|
993cb9fad6 | ||
|
|
a8dc0c9b5f | ||
|
|
1e97807808 | ||
|
|
840f87430a | ||
|
|
4d8b0414f7 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -3,8 +3,5 @@
|
||||
.env
|
||||
.venv
|
||||
*.spec
|
||||
build
|
||||
dist
|
||||
__pycache__
|
||||
ollama
|
||||
ggml-metal.metal
|
||||
|
||||
19
Makefile
19
Makefile
@@ -1,19 +0,0 @@
|
||||
default: ollama
|
||||
|
||||
.PHONY: llama
|
||||
llama:
|
||||
cmake -S llama -B llama/build -DLLAMA_METAL=on
|
||||
cmake --build llama/build
|
||||
|
||||
.PHONY: ollama
|
||||
ollama: llama
|
||||
go build .
|
||||
|
||||
.PHONY: app
|
||||
app: ollama
|
||||
npm install --prefix app
|
||||
npm run --prefix app make:sign
|
||||
|
||||
clean:
|
||||
go clean
|
||||
rm -rf llama/build
|
||||
12
README.md
12
README.md
@@ -42,12 +42,20 @@ ollama run vicuna "Why is the sky blue?"
|
||||
|
||||
### 🗺️ Instructions
|
||||
|
||||
Ask questions. Get answers.
|
||||
Get a helping hand.
|
||||
|
||||
```
|
||||
ollama run orca "Write an email to my boss."
|
||||
```
|
||||
|
||||
### 🔎 Ask questions about documents
|
||||
|
||||
Send the contents of a document and ask questions about it.
|
||||
|
||||
```
|
||||
ollama run nous-hermes "$(cat input.txt)", please summarize this story
|
||||
```
|
||||
|
||||
### 📖 Storytelling
|
||||
|
||||
Venture into the unknown.
|
||||
@@ -67,7 +75,7 @@ ollama run ~/Downloads/vicuna-7b-v1.3.ggmlv3.q4_1.bin
|
||||
## Building
|
||||
|
||||
```
|
||||
make
|
||||
go build .
|
||||
```
|
||||
|
||||
To run it start the server:
|
||||
|
||||
115
api/client.go
115
api/client.go
@@ -5,11 +5,25 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
)
|
||||
|
||||
type StatusError struct {
|
||||
StatusCode int
|
||||
Status string
|
||||
Message string
|
||||
}
|
||||
|
||||
func (e StatusError) Error() string {
|
||||
if e.Message != "" {
|
||||
return fmt.Sprintf("%s: %s", e.Status, e.Message)
|
||||
}
|
||||
|
||||
return e.Status
|
||||
}
|
||||
|
||||
type Client struct {
|
||||
base url.URL
|
||||
}
|
||||
@@ -25,35 +39,18 @@ func NewClient(hosts ...string) *Client {
|
||||
}
|
||||
}
|
||||
|
||||
type options struct {
|
||||
requestBody io.Reader
|
||||
responseFunc func(bts []byte) error
|
||||
}
|
||||
func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
|
||||
var buf *bytes.Buffer
|
||||
if data != nil {
|
||||
bts, err := json.Marshal(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
func OptionRequestBody(data any) func(*options) {
|
||||
bts, err := json.Marshal(data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
buf = bytes.NewBuffer(bts)
|
||||
}
|
||||
|
||||
return func(opts *options) {
|
||||
opts.requestBody = bytes.NewReader(bts)
|
||||
}
|
||||
}
|
||||
|
||||
func OptionResponseFunc(fn func([]byte) error) func(*options) {
|
||||
return func(opts *options) {
|
||||
opts.responseFunc = fn
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) stream(ctx context.Context, method, path string, fns ...func(*options)) error {
|
||||
var opts options
|
||||
for _, fn := range fns {
|
||||
fn(&opts)
|
||||
}
|
||||
|
||||
request, err := http.NewRequestWithContext(ctx, method, c.base.JoinPath(path).String(), opts.requestBody)
|
||||
request, err := http.NewRequestWithContext(ctx, method, c.base.JoinPath(path).String(), buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -67,13 +64,28 @@ func (c *Client) stream(ctx context.Context, method, path string, fns ...func(*o
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
if opts.responseFunc != nil {
|
||||
scanner := bufio.NewScanner(response.Body)
|
||||
for scanner.Scan() {
|
||||
if err := opts.responseFunc(scanner.Bytes()); err != nil {
|
||||
return err
|
||||
scanner := bufio.NewScanner(response.Body)
|
||||
for scanner.Scan() {
|
||||
var errorResponse struct {
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
bts := scanner.Bytes()
|
||||
if err := json.Unmarshal(bts, &errorResponse); err != nil {
|
||||
return fmt.Errorf("unmarshal: %w", err)
|
||||
}
|
||||
|
||||
if response.StatusCode >= 400 {
|
||||
return StatusError{
|
||||
StatusCode: response.StatusCode,
|
||||
Status: response.Status,
|
||||
Message: errorResponse.Error,
|
||||
}
|
||||
}
|
||||
|
||||
if err := fn(bts); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -82,36 +94,25 @@ func (c *Client) stream(ctx context.Context, method, path string, fns ...func(*o
|
||||
type GenerateResponseFunc func(GenerateResponse) error
|
||||
|
||||
func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn GenerateResponseFunc) error {
|
||||
return c.stream(ctx, http.MethodPost, "/api/generate",
|
||||
OptionRequestBody(req),
|
||||
OptionResponseFunc(func(bts []byte) error {
|
||||
var resp GenerateResponse
|
||||
if err := json.Unmarshal(bts, &resp); err != nil {
|
||||
return err
|
||||
}
|
||||
return c.stream(ctx, http.MethodPost, "/api/generate", req, func(bts []byte) error {
|
||||
var resp GenerateResponse
|
||||
if err := json.Unmarshal(bts, &resp); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return fn(resp)
|
||||
}),
|
||||
)
|
||||
return fn(resp)
|
||||
})
|
||||
}
|
||||
|
||||
type PullProgressFunc func(PullProgress) error
|
||||
|
||||
func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
|
||||
return c.stream(ctx, http.MethodPost, "/api/pull",
|
||||
OptionRequestBody(req),
|
||||
OptionResponseFunc(func(bts []byte) error {
|
||||
var resp PullProgress
|
||||
if err := json.Unmarshal(bts, &resp); err != nil {
|
||||
return err
|
||||
}
|
||||
return c.stream(ctx, http.MethodPost, "/api/pull", req, func(bts []byte) error {
|
||||
var resp PullProgress
|
||||
if err := json.Unmarshal(bts, &resp); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if resp.Error.Message != "" {
|
||||
// couldn't pull the model from the directory, proceed anyway
|
||||
return nil
|
||||
}
|
||||
|
||||
return fn(resp)
|
||||
}),
|
||||
)
|
||||
return fn(resp)
|
||||
})
|
||||
}
|
||||
|
||||
172
api/types.go
172
api/types.go
@@ -1,22 +1,6 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type Error struct {
|
||||
Code int32 `json:"code"`
|
||||
Message string `json:"message"`
|
||||
}
|
||||
|
||||
func (e Error) Error() string {
|
||||
if e.Message == "" {
|
||||
return fmt.Sprintf("%d %v", e.Code, strings.ToLower(http.StatusText(int(e.Code))))
|
||||
}
|
||||
return e.Message
|
||||
}
|
||||
import "runtime"
|
||||
|
||||
type PullRequest struct {
|
||||
Model string `json:"model"`
|
||||
@@ -26,100 +10,82 @@ type PullProgress struct {
|
||||
Total int64 `json:"total"`
|
||||
Completed int64 `json:"completed"`
|
||||
Percent float64 `json:"percent"`
|
||||
Error Error `json:"error"`
|
||||
}
|
||||
|
||||
type GenerateRequest struct {
|
||||
Model string `json:"model"`
|
||||
Prompt string `json:"prompt"`
|
||||
|
||||
ModelOptions `json:"model_opts,omitempty"`
|
||||
PredictOptions `json:"predict_opts,omitempty"`
|
||||
}
|
||||
|
||||
type ModelOptions struct {
|
||||
ContextSize int `json:"context_size,omitempty"`
|
||||
Seed int `json:"seed,omitempty"`
|
||||
NBatch int `json:"n_batch,omitempty"`
|
||||
F16Memory bool `json:"memory_f16,omitempty"`
|
||||
MLock bool `json:"mlock,omitempty"`
|
||||
MMap bool `json:"mmap,omitempty"`
|
||||
VocabOnly bool `json:"vocab_only,omitempty"`
|
||||
LowVRAM bool `json:"low_vram,omitempty"`
|
||||
Embeddings bool `json:"embeddings,omitempty"`
|
||||
NUMA bool `json:"numa,omitempty"`
|
||||
NGPULayers int `json:"gpu_layers,omitempty"`
|
||||
MainGPU string `json:"main_gpu,omitempty"`
|
||||
TensorSplit string `json:"tensor_split,omitempty"`
|
||||
}
|
||||
|
||||
type PredictOptions struct {
|
||||
Seed int `json:"seed,omitempty"`
|
||||
Threads int `json:"threads,omitempty"`
|
||||
Tokens int `json:"tokens,omitempty"`
|
||||
TopK int `json:"top_k,omitempty"`
|
||||
Repeat int `json:"repeat,omitempty"`
|
||||
Batch int `json:"batch,omitempty"`
|
||||
NKeep int `json:"nkeep,omitempty"`
|
||||
TopP float64 `json:"top_p,omitempty"`
|
||||
Temperature float64 `json:"temp,omitempty"`
|
||||
Penalty float64 `json:"penalty,omitempty"`
|
||||
F16KV bool
|
||||
DebugMode bool
|
||||
StopPrompts []string
|
||||
IgnoreEOS bool `json:"ignore_eos,omitempty"`
|
||||
|
||||
TailFreeSamplingZ float64 `json:"tfs_z,omitempty"`
|
||||
TypicalP float64 `json:"typical_p,omitempty"`
|
||||
FrequencyPenalty float64 `json:"freq_penalty,omitempty"`
|
||||
PresencePenalty float64 `json:"pres_penalty,omitempty"`
|
||||
Mirostat int `json:"mirostat,omitempty"`
|
||||
MirostatETA float64 `json:"mirostat_lr,omitempty"`
|
||||
MirostatTAU float64 `json:"mirostat_ent,omitempty"`
|
||||
PenalizeNL bool `json:"penalize_nl,omitempty"`
|
||||
LogitBias string `json:"logit_bias,omitempty"`
|
||||
|
||||
PathPromptCache string
|
||||
MLock bool `json:"mlock,omitempty"`
|
||||
MMap bool `json:"mmap,omitempty"`
|
||||
PromptCacheAll bool
|
||||
PromptCacheRO bool
|
||||
MainGPU string
|
||||
TensorSplit string
|
||||
}
|
||||
|
||||
var DefaultModelOptions ModelOptions = ModelOptions{
|
||||
ContextSize: 128,
|
||||
Seed: 0,
|
||||
F16Memory: true,
|
||||
MLock: false,
|
||||
Embeddings: true,
|
||||
MMap: true,
|
||||
LowVRAM: false,
|
||||
}
|
||||
|
||||
var DefaultPredictOptions PredictOptions = PredictOptions{
|
||||
Seed: -1,
|
||||
Threads: -1,
|
||||
Tokens: 512,
|
||||
Penalty: 1.1,
|
||||
Repeat: 64,
|
||||
Batch: 512,
|
||||
NKeep: 64,
|
||||
TopK: 90,
|
||||
TopP: 0.86,
|
||||
TailFreeSamplingZ: 1.0,
|
||||
TypicalP: 1.0,
|
||||
Temperature: 0.8,
|
||||
FrequencyPenalty: 0.0,
|
||||
PresencePenalty: 0.0,
|
||||
Mirostat: 0,
|
||||
MirostatTAU: 5.0,
|
||||
MirostatETA: 0.1,
|
||||
MMap: true,
|
||||
StopPrompts: []string{"llama"},
|
||||
Options `json:"options"`
|
||||
}
|
||||
|
||||
type GenerateResponse struct {
|
||||
Response string `json:"response"`
|
||||
}
|
||||
|
||||
type Options struct {
|
||||
Seed int `json:"seed,omitempty"`
|
||||
|
||||
// Backend options
|
||||
UseNUMA bool `json:"numa,omitempty"`
|
||||
|
||||
// Model options
|
||||
NumCtx int `json:"num_ctx,omitempty"`
|
||||
NumBatch int `json:"num_batch,omitempty"`
|
||||
NumGPU int `json:"num_gpu,omitempty"`
|
||||
MainGPU int `json:"main_gpu,omitempty"`
|
||||
LowVRAM bool `json:"low_vram,omitempty"`
|
||||
F16KV bool `json:"f16_kv,omitempty"`
|
||||
LogitsAll bool `json:"logits_all,omitempty"`
|
||||
VocabOnly bool `json:"vocab_only,omitempty"`
|
||||
UseMMap bool `json:"use_mmap,omitempty"`
|
||||
UseMLock bool `json:"use_mlock,omitempty"`
|
||||
EmbeddingOnly bool `json:"embedding_only,omitempty"`
|
||||
|
||||
// Predict options
|
||||
RepeatLastN int `json:"repeat_last_n,omitempty"`
|
||||
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
|
||||
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
|
||||
PresencePenalty float32 `json:"presence_penalty,omitempty"`
|
||||
Temperature float32 `json:"temperature,omitempty"`
|
||||
TopK int `json:"top_k,omitempty"`
|
||||
TopP float32 `json:"top_p,omitempty"`
|
||||
TFSZ float32 `json:"tfs_z,omitempty"`
|
||||
TypicalP float32 `json:"typical_p,omitempty"`
|
||||
Mirostat int `json:"mirostat,omitempty"`
|
||||
MirostatTau float32 `json:"mirostat_tau,omitempty"`
|
||||
MirostatEta float32 `json:"mirostat_eta,omitempty"`
|
||||
|
||||
NumThread int `json:"num_thread,omitempty"`
|
||||
}
|
||||
|
||||
func DefaultOptions() Options {
|
||||
return Options{
|
||||
Seed: -1,
|
||||
|
||||
UseNUMA: false,
|
||||
|
||||
NumCtx: 512,
|
||||
NumBatch: 512,
|
||||
NumGPU: 1,
|
||||
LowVRAM: false,
|
||||
F16KV: true,
|
||||
UseMMap: true,
|
||||
UseMLock: false,
|
||||
|
||||
RepeatLastN: 512,
|
||||
RepeatPenalty: 1.1,
|
||||
FrequencyPenalty: 0.0,
|
||||
PresencePenalty: 0.0,
|
||||
Temperature: 0.8,
|
||||
TopK: 40,
|
||||
TopP: 0.9,
|
||||
TFSZ: 1.0,
|
||||
TypicalP: 1.0,
|
||||
Mirostat: 0,
|
||||
MirostatTau: 5.0,
|
||||
MirostatEta: 0.1,
|
||||
|
||||
NumThread: runtime.NumCPU(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ const config: ForgeConfig = {
|
||||
'../ollama',
|
||||
path.join(__dirname, './assets/ollama_icon_16x16Template.png'),
|
||||
path.join(__dirname, './assets/ollama_icon_16x16Template@2x.png'),
|
||||
...(process.platform === 'darwin' ? ['../ggml-metal.metal'] : []),
|
||||
...(process.platform === 'darwin' ? ['../llama/ggml-metal.metal'] : []),
|
||||
],
|
||||
...(process.env.SIGN
|
||||
? {
|
||||
|
||||
243
app/package-lock.json
generated
243
app/package-lock.json
generated
@@ -15,7 +15,9 @@
|
||||
"electron-store": "^8.1.0",
|
||||
"react": "^18.2.0",
|
||||
"react-dom": "^18.2.0",
|
||||
"uuid": "^9.0.0"
|
||||
"uuid": "^9.0.0",
|
||||
"winston": "^3.10.0",
|
||||
"winston-daily-rotate-file": "^4.7.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/core": "^7.22.5",
|
||||
@@ -610,6 +612,14 @@
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@colors/colors": {
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
|
||||
"integrity": "sha512-ooWCrlZP11i8GImSjTHYHLkvFDP48nS4+204nGb1RiX/WXYHmJA2III9/e2DWVabCESdW7hBAEzHRqUn9OUVvQ==",
|
||||
"engines": {
|
||||
"node": ">=0.1.90"
|
||||
}
|
||||
},
|
||||
"node_modules/@cspotcode/source-map-support": {
|
||||
"version": "0.8.1",
|
||||
"resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz",
|
||||
@@ -1319,6 +1329,16 @@
|
||||
"postcss-selector-parser": "^6.0.10"
|
||||
}
|
||||
},
|
||||
"node_modules/@dabh/diagnostics": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.3.tgz",
|
||||
"integrity": "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==",
|
||||
"dependencies": {
|
||||
"colorspace": "1.1.x",
|
||||
"enabled": "2.0.x",
|
||||
"kuler": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@discoveryjs/json-ext": {
|
||||
"version": "0.5.7",
|
||||
"resolved": "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz",
|
||||
@@ -2825,6 +2845,11 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/triple-beam": {
|
||||
"version": "1.3.2",
|
||||
"resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.2.tgz",
|
||||
"integrity": "sha512-txGIh+0eDFzKGC25zORnswy+br1Ha7hj5cMVwKIU7+s0U2AxxJru/jZSMU6OC9MJWP6+pc/hc6ZjyZShpsyY2g=="
|
||||
},
|
||||
"node_modules/@types/uuid": {
|
||||
"version": "9.0.2",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.2.tgz",
|
||||
@@ -3634,6 +3659,11 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/async": {
|
||||
"version": "3.2.4",
|
||||
"resolved": "https://registry.npmjs.org/async/-/async-3.2.4.tgz",
|
||||
"integrity": "sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ=="
|
||||
},
|
||||
"node_modules/at-least-node": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/at-least-node/-/at-least-node-1.0.0.tgz",
|
||||
@@ -4394,6 +4424,15 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/color": {
|
||||
"version": "3.2.1",
|
||||
"resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz",
|
||||
"integrity": "sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==",
|
||||
"dependencies": {
|
||||
"color-convert": "^1.9.3",
|
||||
"color-string": "^1.6.0"
|
||||
}
|
||||
},
|
||||
"node_modules/color-convert": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
|
||||
@@ -4409,8 +4448,16 @@
|
||||
"node_modules/color-name": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
|
||||
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
|
||||
"dev": true
|
||||
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA=="
|
||||
},
|
||||
"node_modules/color-string": {
|
||||
"version": "1.9.1",
|
||||
"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz",
|
||||
"integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==",
|
||||
"dependencies": {
|
||||
"color-name": "^1.0.0",
|
||||
"simple-swizzle": "^0.2.2"
|
||||
}
|
||||
},
|
||||
"node_modules/color-support": {
|
||||
"version": "1.1.3",
|
||||
@@ -4421,12 +4468,34 @@
|
||||
"color-support": "bin.js"
|
||||
}
|
||||
},
|
||||
"node_modules/color/node_modules/color-convert": {
|
||||
"version": "1.9.3",
|
||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
|
||||
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
|
||||
"dependencies": {
|
||||
"color-name": "1.1.3"
|
||||
}
|
||||
},
|
||||
"node_modules/color/node_modules/color-name": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
|
||||
"integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw=="
|
||||
},
|
||||
"node_modules/colorette": {
|
||||
"version": "2.0.20",
|
||||
"resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz",
|
||||
"integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/colorspace": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.4.tgz",
|
||||
"integrity": "sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==",
|
||||
"dependencies": {
|
||||
"color": "^3.1.3",
|
||||
"text-hex": "1.0.x"
|
||||
}
|
||||
},
|
||||
"node_modules/commander": {
|
||||
"version": "5.1.0",
|
||||
"resolved": "https://registry.npmjs.org/commander/-/commander-5.1.0.tgz",
|
||||
@@ -5893,6 +5962,11 @@
|
||||
"node": ">= 4"
|
||||
}
|
||||
},
|
||||
"node_modules/enabled": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz",
|
||||
"integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ=="
|
||||
},
|
||||
"node_modules/encodeurl": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
|
||||
@@ -6654,6 +6728,11 @@
|
||||
"pend": "~1.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/fecha": {
|
||||
"version": "4.2.3",
|
||||
"resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz",
|
||||
"integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw=="
|
||||
},
|
||||
"node_modules/file-entry-cache": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz",
|
||||
@@ -6666,6 +6745,14 @@
|
||||
"node": "^10.12.0 || >=12.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/file-stream-rotator": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/file-stream-rotator/-/file-stream-rotator-0.6.1.tgz",
|
||||
"integrity": "sha512-u+dBid4PvZw17PmDeRcNOtCP9CCK/9lRN2w+r1xIS7yOL9JFrIBKTvrYsxT4P0pGtThYTn++QS5ChHaUov3+zQ==",
|
||||
"dependencies": {
|
||||
"moment": "^2.29.1"
|
||||
}
|
||||
},
|
||||
"node_modules/filename-reserved-regex": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/filename-reserved-regex/-/filename-reserved-regex-2.0.0.tgz",
|
||||
@@ -6834,6 +6921,11 @@
|
||||
"node": ">= 4.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/fn.name": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz",
|
||||
"integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw=="
|
||||
},
|
||||
"node_modules/follow-redirects": {
|
||||
"version": "1.15.2",
|
||||
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz",
|
||||
@@ -7928,8 +8020,7 @@
|
||||
"node_modules/inherits": {
|
||||
"version": "2.0.4",
|
||||
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
|
||||
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
|
||||
"dev": true
|
||||
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
|
||||
},
|
||||
"node_modules/ini": {
|
||||
"version": "1.3.8",
|
||||
@@ -8243,7 +8334,6 @@
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
|
||||
"integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
@@ -8547,6 +8637,11 @@
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/kuler": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
|
||||
"integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A=="
|
||||
},
|
||||
"node_modules/launch-editor": {
|
||||
"version": "2.6.0",
|
||||
"resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.6.0.tgz",
|
||||
@@ -8788,6 +8883,19 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/logform": {
|
||||
"version": "2.5.1",
|
||||
"resolved": "https://registry.npmjs.org/logform/-/logform-2.5.1.tgz",
|
||||
"integrity": "sha512-9FyqAm9o9NKKfiAKfZoYo9bGXXuwMkxQiQttkT4YjjVtQVIQtK6LmVtlxmCaFswo6N4AfEkHqZTV0taDtPotNg==",
|
||||
"dependencies": {
|
||||
"@colors/colors": "1.5.0",
|
||||
"@types/triple-beam": "^1.3.2",
|
||||
"fecha": "^4.2.0",
|
||||
"ms": "^2.1.1",
|
||||
"safe-stable-stringify": "^2.3.1",
|
||||
"triple-beam": "^1.3.0"
|
||||
}
|
||||
},
|
||||
"node_modules/loose-envify": {
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
|
||||
@@ -9218,6 +9326,14 @@
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/moment": {
|
||||
"version": "2.29.4",
|
||||
"resolved": "https://registry.npmjs.org/moment/-/moment-2.29.4.tgz",
|
||||
"integrity": "sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/ms": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
|
||||
@@ -9623,6 +9739,14 @@
|
||||
"wrappy": "1"
|
||||
}
|
||||
},
|
||||
"node_modules/one-time": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz",
|
||||
"integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==",
|
||||
"dependencies": {
|
||||
"fn.name": "1.x.x"
|
||||
}
|
||||
},
|
||||
"node_modules/onetime": {
|
||||
"version": "5.1.2",
|
||||
"resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
|
||||
@@ -11366,7 +11490,6 @@
|
||||
"version": "3.6.2",
|
||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
|
||||
"integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"inherits": "^2.0.3",
|
||||
"string_decoder": "^1.1.1",
|
||||
@@ -11664,7 +11787,6 @@
|
||||
"version": "5.2.1",
|
||||
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
|
||||
"integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
@@ -11694,6 +11816,14 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/safe-stable-stringify": {
|
||||
"version": "2.4.3",
|
||||
"resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.4.3.tgz",
|
||||
"integrity": "sha512-e2bDA2WJT0wxseVd4lsDP4+3ONX6HpMXQa1ZhFQ7SU+GjvORCmShbCMltrtIDfkYhVHrOcPtj+KhmDBdPdZD1g==",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/safer-buffer": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
|
||||
@@ -12023,6 +12153,19 @@
|
||||
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/simple-swizzle": {
|
||||
"version": "0.2.2",
|
||||
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
|
||||
"integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==",
|
||||
"dependencies": {
|
||||
"is-arrayish": "^0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/simple-swizzle/node_modules/is-arrayish": {
|
||||
"version": "0.3.2",
|
||||
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
|
||||
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ=="
|
||||
},
|
||||
"node_modules/slash": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz",
|
||||
@@ -12212,6 +12355,14 @@
|
||||
"node": "^14.17.0 || ^16.13.0 || >=18.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/stack-trace": {
|
||||
"version": "0.0.10",
|
||||
"resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
|
||||
"integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/statuses": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
|
||||
@@ -12225,7 +12376,6 @@
|
||||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
|
||||
"integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"safe-buffer": "~5.2.0"
|
||||
}
|
||||
@@ -12689,6 +12839,11 @@
|
||||
"integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/text-hex": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
|
||||
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
|
||||
},
|
||||
"node_modules/text-table": {
|
||||
"version": "0.2.0",
|
||||
"resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
|
||||
@@ -12814,6 +12969,14 @@
|
||||
"node": ">=0.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/triple-beam": {
|
||||
"version": "1.4.1",
|
||||
"resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz",
|
||||
"integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==",
|
||||
"engines": {
|
||||
"node": ">= 14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/ts-interface-checker": {
|
||||
"version": "0.1.13",
|
||||
"resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz",
|
||||
@@ -13231,8 +13394,7 @@
|
||||
"node_modules/util-deprecate": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
|
||||
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
|
||||
"dev": true
|
||||
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="
|
||||
},
|
||||
"node_modules/utila": {
|
||||
"version": "0.4.0",
|
||||
@@ -13764,6 +13926,65 @@
|
||||
"integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/winston": {
|
||||
"version": "3.10.0",
|
||||
"resolved": "https://registry.npmjs.org/winston/-/winston-3.10.0.tgz",
|
||||
"integrity": "sha512-nT6SIDaE9B7ZRO0u3UvdrimG0HkB7dSTAgInQnNR2SOPJ4bvq5q79+pXLftKmP52lJGW15+H5MCK0nM9D3KB/g==",
|
||||
"dependencies": {
|
||||
"@colors/colors": "1.5.0",
|
||||
"@dabh/diagnostics": "^2.0.2",
|
||||
"async": "^3.2.3",
|
||||
"is-stream": "^2.0.0",
|
||||
"logform": "^2.4.0",
|
||||
"one-time": "^1.0.0",
|
||||
"readable-stream": "^3.4.0",
|
||||
"safe-stable-stringify": "^2.3.1",
|
||||
"stack-trace": "0.0.x",
|
||||
"triple-beam": "^1.3.0",
|
||||
"winston-transport": "^4.5.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 12.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/winston-daily-rotate-file": {
|
||||
"version": "4.7.1",
|
||||
"resolved": "https://registry.npmjs.org/winston-daily-rotate-file/-/winston-daily-rotate-file-4.7.1.tgz",
|
||||
"integrity": "sha512-7LGPiYGBPNyGHLn9z33i96zx/bd71pjBn9tqQzO3I4Tayv94WPmBNwKC7CO1wPHdP9uvu+Md/1nr6VSH9h0iaA==",
|
||||
"dependencies": {
|
||||
"file-stream-rotator": "^0.6.1",
|
||||
"object-hash": "^2.0.1",
|
||||
"triple-beam": "^1.3.0",
|
||||
"winston-transport": "^4.4.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"winston": "^3"
|
||||
}
|
||||
},
|
||||
"node_modules/winston-daily-rotate-file/node_modules/object-hash": {
|
||||
"version": "2.2.0",
|
||||
"resolved": "https://registry.npmjs.org/object-hash/-/object-hash-2.2.0.tgz",
|
||||
"integrity": "sha512-gScRMn0bS5fH+IuwyIFgnh9zBdo4DV+6GhygmWM9HyNJSgS0hScp1f5vjtm7oIIOiT9trXrShAkLFSc2IqKNgw==",
|
||||
"engines": {
|
||||
"node": ">= 6"
|
||||
}
|
||||
},
|
||||
"node_modules/winston-transport": {
|
||||
"version": "4.5.0",
|
||||
"resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.5.0.tgz",
|
||||
"integrity": "sha512-YpZzcUzBedhlTAfJg6vJDlyEai/IFMIVcaEZZyl3UXIl4gmqRpU7AE89AHLkbzLUsv0NVmw7ts+iztqKxxPW1Q==",
|
||||
"dependencies": {
|
||||
"logform": "^2.3.2",
|
||||
"readable-stream": "^3.6.0",
|
||||
"triple-beam": "^1.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 6.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/word-wrap": {
|
||||
"version": "1.2.3",
|
||||
"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
|
||||
|
||||
@@ -69,6 +69,8 @@
|
||||
"electron-store": "^8.1.0",
|
||||
"react": "^18.2.0",
|
||||
"react-dom": "^18.2.0",
|
||||
"uuid": "^9.0.0"
|
||||
"uuid": "^9.0.0",
|
||||
"winston": "^3.10.0",
|
||||
"winston-daily-rotate-file": "^4.7.1"
|
||||
}
|
||||
}
|
||||
|
||||
124
app/src/index.ts
124
app/src/index.ts
@@ -1,6 +1,8 @@
|
||||
import { spawn, exec } from 'child_process'
|
||||
import { app, autoUpdater, dialog, Tray, Menu } from 'electron'
|
||||
import Store from 'electron-store'
|
||||
import winston from 'winston'
|
||||
import 'winston-daily-rotate-file'
|
||||
import * as path from 'path'
|
||||
import * as fs from 'fs'
|
||||
|
||||
@@ -11,6 +13,18 @@ require('@electron/remote/main').initialize()
|
||||
const store = new Store()
|
||||
let tray: Tray | null = null
|
||||
|
||||
// Application-wide logger. Each entry is written as the bare message text
// (no level or timestamp prefix) to the console and to
// ~/.ollama/logs/server.log, with the file transport configured for a 20 MiB
// maxsize and maxFiles of 5.
const logger = winston.createLogger({
  format: winston.format.printf(({ message }) => `${message}`),
  transports: [
    new winston.transports.Console(),
    new winston.transports.File({
      maxFiles: 5,
      maxsize: 20 * 1024 * 1024,
      filename: path.join(app.getPath('home'), '.ollama', 'logs', 'server.log'),
    }),
  ],
})
|
||||
|
||||
const SingleInstanceLock = app.requestSingleInstanceLock()
|
||||
if (!SingleInstanceLock) {
|
||||
app.quit()
|
||||
@@ -31,42 +45,35 @@ const createSystemtray = () => {
|
||||
tray.setToolTip('Ollama')
|
||||
}
|
||||
|
||||
// Handle creating/removing shortcuts on Windows when installing/uninstalling.
|
||||
if (require('electron-squirrel-startup')) {
|
||||
app.quit()
|
||||
}
|
||||
|
||||
const ollama = path.join(process.resourcesPath, 'ollama')
|
||||
|
||||
// if the app is packaged then run the server
|
||||
if (app.isPackaged) {
|
||||
// Start the executable
|
||||
console.log(`Starting server`)
|
||||
const proc = spawn(ollama, ['serve'])
|
||||
proc.stdout.on('data', data => {
|
||||
console.log(`server: ${data}`)
|
||||
})
|
||||
proc.stderr.on('data', data => {
|
||||
console.error(`server: ${data}`)
|
||||
})
|
||||
|
||||
process.on('exit', () => {
|
||||
proc.kill()
|
||||
})
|
||||
}
|
||||
|
||||
function server() {
|
||||
const binary = app.isPackaged
|
||||
? path.join(process.resourcesPath, 'ollama')
|
||||
: path.resolve(__dirname, '..', '..', 'ollama')
|
||||
: path.resolve(process.cwd(), '..', 'ollama')
|
||||
|
||||
console.log(`Starting server`)
|
||||
const proc = spawn(binary, ['serve'])
|
||||
|
||||
proc.stdout.on('data', data => {
|
||||
console.log(`server: ${data}`)
|
||||
logger.info(data.toString().trim())
|
||||
})
|
||||
|
||||
proc.stderr.on('data', data => {
|
||||
console.error(`server: ${data}`)
|
||||
logger.error(data.toString().trim())
|
||||
})
|
||||
|
||||
proc.on('exit', () => {
|
||||
logger.info('Restarting the server...')
|
||||
server()
|
||||
})
|
||||
|
||||
proc.on('disconnect', () => {
|
||||
logger.info('Server disconnected. Reconnecting...')
|
||||
server()
|
||||
})
|
||||
|
||||
process.on('exit', () => {
|
||||
@@ -95,11 +102,12 @@ function installCLI() {
|
||||
`
|
||||
exec(`osascript -e '${command}'`, (error: Error | null, stdout: string, stderr: string) => {
|
||||
if (error) {
|
||||
console.error(`exec error: ${error}`)
|
||||
logger.error(`cli: failed to install cli: ${error.message}`)
|
||||
return
|
||||
}
|
||||
console.log(`stdout: ${stdout}`)
|
||||
console.error(`stderr: ${stderr}`)
|
||||
|
||||
logger.info(stdout)
|
||||
logger.error(stderr)
|
||||
})
|
||||
}
|
||||
})
|
||||
@@ -118,44 +126,44 @@ app.on('ready', () => {
|
||||
app.setLoginItemSettings({ openAtLogin: app.getLoginItemSettings().openAtLogin })
|
||||
}
|
||||
|
||||
if (!app.isInApplicationsFolder()) {
|
||||
const chosen = dialog.showMessageBoxSync({
|
||||
type: 'question',
|
||||
buttons: ['Move to Applications', 'Do Not Move'],
|
||||
message: 'Ollama works best when run from the Applications directory.',
|
||||
defaultId: 0,
|
||||
cancelId: 1,
|
||||
})
|
||||
if (app.isPackaged) {
|
||||
if (!app.isInApplicationsFolder()) {
|
||||
const chosen = dialog.showMessageBoxSync({
|
||||
type: 'question',
|
||||
buttons: ['Move to Applications', 'Do Not Move'],
|
||||
message: 'Ollama works best when run from the Applications directory.',
|
||||
defaultId: 0,
|
||||
cancelId: 1,
|
||||
})
|
||||
|
||||
if (chosen === 0) {
|
||||
try {
|
||||
app.moveToApplicationsFolder({
|
||||
conflictHandler: conflictType => {
|
||||
if (conflictType === 'existsAndRunning') {
|
||||
dialog.showMessageBoxSync({
|
||||
type: 'info',
|
||||
message: 'Cannot move to Applications directory',
|
||||
detail:
|
||||
'Another version of Ollama is currently running from your Applications directory. Close it first and try again.',
|
||||
})
|
||||
}
|
||||
return true
|
||||
},
|
||||
})
|
||||
return
|
||||
} catch (e) {
|
||||
console.error('Failed to move to applications folder')
|
||||
console.error(e)
|
||||
if (chosen === 0) {
|
||||
try {
|
||||
app.moveToApplicationsFolder({
|
||||
conflictHandler: conflictType => {
|
||||
if (conflictType === 'existsAndRunning') {
|
||||
dialog.showMessageBoxSync({
|
||||
type: 'info',
|
||||
message: 'Cannot move to Applications directory',
|
||||
detail:
|
||||
'Another version of Ollama is currently running from your Applications directory. Close it first and try again.',
|
||||
})
|
||||
}
|
||||
return true
|
||||
},
|
||||
})
|
||||
return
|
||||
} catch (e) {
|
||||
logger.error(`[Move to Applications] Failed to move to applications folder - ${e.message}}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
installCLI()
|
||||
}
|
||||
}
|
||||
|
||||
createSystemtray()
|
||||
|
||||
if (app.isPackaged) {
|
||||
installCLI()
|
||||
}
|
||||
server()
|
||||
})
|
||||
|
||||
// Quit when all windows are closed, except on macOS. There, it's common
|
||||
@@ -183,8 +191,6 @@ async function heartbeat() {
|
||||
})
|
||||
}
|
||||
|
||||
heartbeat()
|
||||
|
||||
if (app.isPackaged) {
|
||||
heartbeat()
|
||||
autoUpdater.checkForUpdates()
|
||||
@@ -195,7 +201,7 @@ if (app.isPackaged) {
|
||||
}
|
||||
|
||||
autoUpdater.on('error', e => {
|
||||
console.error('update check failed', e)
|
||||
logger.error(`update check failed - ${e.message}`)
|
||||
})
|
||||
|
||||
autoUpdater.on('update-downloaded', (event, releaseNotes, releaseName) => {
|
||||
|
||||
@@ -4,8 +4,6 @@ import Store from 'electron-store'
|
||||
|
||||
const store = new Store()
|
||||
|
||||
console.log(process.env)
|
||||
|
||||
export const analytics = new Analytics({ writeKey: process.env.TELEMETRY_WRITE_KEY || '<empty>' })
|
||||
|
||||
export function id(): string {
|
||||
|
||||
43
cmd/cmd.go
43
cmd/cmd.go
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
@@ -34,7 +35,14 @@ func RunRun(cmd *cobra.Command, args []string) error {
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
if err := pull(args[0]); err != nil {
|
||||
return err
|
||||
var apiStatusError api.StatusError
|
||||
if !errors.As(err, &apiStatusError) {
|
||||
return err
|
||||
}
|
||||
|
||||
if apiStatusError.StatusCode != http.StatusBadGateway {
|
||||
return err
|
||||
}
|
||||
}
|
||||
case err != nil:
|
||||
return err
|
||||
@@ -50,11 +58,12 @@ func pull(model string) error {
|
||||
context.Background(),
|
||||
&api.PullRequest{Model: model},
|
||||
func(progress api.PullProgress) error {
|
||||
if bar == nil && progress.Percent == 100 {
|
||||
// already downloaded
|
||||
return nil
|
||||
}
|
||||
if bar == nil {
|
||||
if progress.Percent == 100 {
|
||||
// already downloaded
|
||||
return nil
|
||||
}
|
||||
|
||||
bar = progressbar.DefaultBytes(progress.Total)
|
||||
}
|
||||
|
||||
@@ -64,8 +73,10 @@ func pull(model string) error {
|
||||
}
|
||||
|
||||
func RunGenerate(_ *cobra.Command, args []string) error {
|
||||
// join all args into a single prompt
|
||||
prompt := strings.Join(args[1:], " ")
|
||||
if len(args) > 1 {
|
||||
return generateOneshot(args[0], args[1:]...)
|
||||
return generate(args[0], prompt)
|
||||
}
|
||||
|
||||
if term.IsTerminal(int(os.Stdin.Fd())) {
|
||||
@@ -98,28 +109,22 @@ func generate(model, prompt string) error {
|
||||
}
|
||||
}()
|
||||
|
||||
client.Generate(context.Background(), &api.GenerateRequest{Model: model, Prompt: prompt}, func(resp api.GenerateResponse) error {
|
||||
request := api.GenerateRequest{Model: model, Prompt: prompt}
|
||||
fn := func(resp api.GenerateResponse) error {
|
||||
if !spinner.IsFinished() {
|
||||
spinner.Finish()
|
||||
}
|
||||
|
||||
fmt.Print(resp.Response)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func generateOneshot(model string, prompts ...string) error {
|
||||
for _, prompt := range prompts {
|
||||
fmt.Printf(">>> %s\n", prompt)
|
||||
if err := generate(model, prompt); err != nil {
|
||||
if err := client.Generate(context.Background(), &request, fn); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
15
examples/python/README.md
Normal file
15
examples/python/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# Python
|
||||
|
||||
This is a simple example of calling the Ollama API from a Python app.
|
||||
|
||||
First, download a model:
|
||||
|
||||
```
|
||||
curl -L https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_1.bin -o orca.bin
|
||||
```
|
||||
|
||||
Then run it using the example script. You'll need to have Ollama running on your machine.
|
||||
|
||||
```
|
||||
python3 main.py orca.bin
|
||||
```
|
||||
32
examples/python/main.py
Normal file
32
examples/python/main.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Stream a short story from a local Ollama server using only the standard library.

Usage: python main.py <model file>
"""
import http.client
import json
import os
import sys


def parse_generate(data):
    """Yield each non-empty, newline-delimited event contained in ``data``.

    data: raw bytes read from the HTTP response; decoded as UTF-8.
    """
    for event in data.decode('utf-8').split("\n"):
        if not event:
            continue
        yield event


def main():
    """Send one generate request and print the streamed response text."""
    if len(sys.argv) < 2:
        print("Usage: python main.py <model file>")
        sys.exit(1)

    conn = http.client.HTTPConnection('localhost', 11434)
    headers = {'Content-Type': 'application/json'}

    try:
        # generate text from the model
        conn.request("POST", "/api/generate", json.dumps({
            'model': os.path.join(os.getcwd(), sys.argv[1]),
            'prompt': 'write me a short story',
            'stream': True
        }), headers)

        response = conn.getresponse()

        # Report failures instead of exiting silently with no output.
        if response.status != 200:
            print(f"request failed: {response.status} {response.reason}",
                  file=sys.stderr)
            sys.exit(1)

        for chunk in response:
            for event in parse_generate(chunk):
                # Events without a 'response' field contribute no text.
                print(json.loads(event).get('response', ''), end="", flush=True)
    finally:
        # Always release the socket, including on errors and early exits.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
1
ggml-metal.metal
Symbolic link
1
ggml-metal.metal
Symbolic link
@@ -0,0 +1 @@
|
||||
llama/ggml-metal.metal
|
||||
1
llama/.gitignore
vendored
1
llama/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
build
|
||||
@@ -1,23 +0,0 @@
|
||||
# Builds the `binding` library on top of a pinned llama.cpp checkout.
#
# FetchContent_MakeAvailable() was introduced in CMake 3.14, so that is the
# real minimum version for this file.
cmake_minimum_required(VERSION 3.14)
project(binding)

include(FetchContent)

# Fetch llama.cpp at a fixed revision.
FetchContent_Declare(
    llama_cpp
    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
    GIT_TAG        55dbb91
)

FetchContent_MakeAvailable(llama_cpp)

# The binding is compiled together with llama.cpp's examples/common.cpp.
add_library(binding ${CMAKE_CURRENT_SOURCE_DIR}/binding/binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp)
target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR}/examples)
target_link_libraries(binding llama ggml_static)

# With LLAMA_METAL enabled, copy ggml-metal.metal two directories above the
# binary dir.
if (LLAMA_METAL)
    configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_CURRENT_BINARY_DIR}/../../ggml-metal.metal COPYONLY)
endif()

# Copy the built llama and ggml_static artifacts into this binary dir.
add_custom_target(copy_libllama ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:llama> ${CMAKE_CURRENT_BINARY_DIR})
add_custom_target(copy_libggml_static ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:ggml_static> ${CMAKE_CURRENT_BINARY_DIR})
|
||||
@@ -1,691 +0,0 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include "binding.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <signal.h>
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
    defined(_WIN32)
// Signal handler: on SIGINT, terminate the process immediately with exit
// status 130. Any other signal number is ignored.
void sigint_handler(int signo) {
  if (signo != SIGINT) {
    return;
  }
  _exit(130);
}
#endif
|
||||
|
||||
int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings) {
|
||||
gpt_params *params_p = (gpt_params *)params_ptr;
|
||||
llama_context *ctx = (llama_context *)state_pr;
|
||||
gpt_params params = *params_p;
|
||||
|
||||
if (params.seed <= 0) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
llama_init_backend(params.numa);
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
// Add a space in front of the first character to match OG llama tokenizer
|
||||
// behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
if (embd_inp.size() > 0) {
|
||||
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past,
|
||||
params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_embd = llama_n_embd(ctx);
|
||||
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
res_embeddings[i] = embeddings[i];
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
|
||||
int tokenSize, float *res_embeddings) {
|
||||
gpt_params *params_p = (gpt_params *)params_ptr;
|
||||
llama_context *ctx = (llama_context *)state_pr;
|
||||
gpt_params params = *params_p;
|
||||
|
||||
for (int i = 0; i < tokenSize; i++) {
|
||||
auto token_str = llama_token_to_str(ctx, tokens[i]);
|
||||
if (token_str == nullptr) {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string> my_vector;
|
||||
std::string str_token(token_str); // create a new std::string from the char*
|
||||
params_p->prompt += str_token;
|
||||
}
|
||||
|
||||
return get_embeddings(params_ptr, state_pr, res_embeddings);
|
||||
}
|
||||
|
||||
int eval(void *params_ptr, void *state_pr, char *text) {
|
||||
gpt_params *params_p = (gpt_params *)params_ptr;
|
||||
llama_context *ctx = (llama_context *)state_pr;
|
||||
|
||||
auto n_past = 0;
|
||||
auto last_n_tokens_data =
|
||||
std::vector<llama_token>(params_p->repeat_last_n, 0);
|
||||
|
||||
auto tokens = std::vector<llama_token>(params_p->n_ctx);
|
||||
auto n_prompt_tokens =
|
||||
llama_tokenize(ctx, text, tokens.data(), tokens.size(), true);
|
||||
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// evaluate prompt
|
||||
return llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past,
|
||||
params_p->n_threads);
|
||||
}
|
||||
|
||||
int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug) {
|
||||
gpt_params *params_p = (gpt_params *)params_ptr;
|
||||
llama_context *ctx = (llama_context *)state_pr;
|
||||
|
||||
gpt_params params = *params_p;
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if (params.seed <= 0) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
std::string path_session = params.path_prompt_cache;
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
||||
if (!path_session.empty()) {
|
||||
if (debug) {
|
||||
fprintf(stderr, "%s: attempting to load saved session from '%s'\n",
|
||||
__func__, path_session.c_str());
|
||||
}
|
||||
// fopen to check for existing session
|
||||
FILE *fp = std::fopen(path_session.c_str(), "rb");
|
||||
if (fp != NULL) {
|
||||
std::fclose(fp);
|
||||
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(
|
||||
ctx, path_session.c_str(), session_tokens.data(),
|
||||
session_tokens.capacity(), &n_token_count_out)) {
|
||||
fprintf(stderr, "%s: error: failed to load session file '%s'\n",
|
||||
__func__, path_session.c_str());
|
||||
return 1;
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
if (debug) {
|
||||
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n",
|
||||
__func__, (int)session_tokens.size());
|
||||
}
|
||||
} else {
|
||||
if (debug) {
|
||||
fprintf(stderr, "%s: session file does not exist, will create\n",
|
||||
__func__);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
if (!params.prompt.empty() || session_tokens.empty()) {
|
||||
// Add a space in front of the first character to match OG llama tokenizer
|
||||
// behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
} else {
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
// debug message about similarity of saved session, if applicable
|
||||
size_t n_matching_session_tokens = 0;
|
||||
if (session_tokens.size()) {
|
||||
for (llama_token id : session_tokens) {
|
||||
if (n_matching_session_tokens >= embd_inp.size() ||
|
||||
id != embd_inp[n_matching_session_tokens]) {
|
||||
break;
|
||||
}
|
||||
n_matching_session_tokens++;
|
||||
}
|
||||
if (debug) {
|
||||
if (params.prompt.empty() &&
|
||||
n_matching_session_tokens == embd_inp.size()) {
|
||||
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
|
||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
fprintf(stderr, "%s: session file has exact match for prompt!\n",
|
||||
__func__);
|
||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||
fprintf(stderr,
|
||||
"%s: warning: session file has low similarity to prompt (%zu / "
|
||||
"%zu tokens); will mostly be reevaluated\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
} else {
|
||||
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
// if we will use the cache for the full prompt without reaching the end of
|
||||
// the cache, force reevaluation of the last token token to recalculate the
|
||||
// cached logits
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
|
||||
session_tokens.size() > embd_inp.size()) {
|
||||
session_tokens.resize(embd_inp.size() - 1);
|
||||
}
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> last_n_tokens(n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
|
||||
bool need_to_save_session =
|
||||
!path_session.empty() && n_matching_session_tokens < embd_inp.size();
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
int n_session_consumed = 0;
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::string res = "";
|
||||
|
||||
// do one empty run to warm up the model
|
||||
{
|
||||
const std::vector<llama_token> tmp = {
|
||||
llama_token_bos(),
|
||||
};
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_reset_timings(ctx);
|
||||
}
|
||||
|
||||
while (n_remain != 0) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the
|
||||
// logits in batches
|
||||
if (n_past + (int)embd.size() > n_ctx) {
|
||||
const int n_left = n_past - params.n_keep;
|
||||
|
||||
// always keep the first token - BOS
|
||||
n_past = std::max(1, params.n_keep);
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_n_tokens
|
||||
embd.insert(embd.begin(),
|
||||
last_n_tokens.begin() + n_ctx - n_left / 2 - embd.size(),
|
||||
last_n_tokens.end() - embd.size());
|
||||
|
||||
// stop saving session if we run out of context
|
||||
path_session.clear();
|
||||
|
||||
// printf("\n---\n");
|
||||
// printf("resetting: '");
|
||||
// for (int i = 0; i < (int) embd.size(); i++) {
|
||||
// printf("%s", llama_token_to_str(ctx, embd[i]));
|
||||
// }
|
||||
// printf("'\n");
|
||||
// printf("\n---\n");
|
||||
}
|
||||
|
||||
// try to reuse a matching prefix from the loaded session instead of
|
||||
// re-eval (via n_past)
|
||||
if (n_session_consumed < (int)session_tokens.size()) {
|
||||
size_t i = 0;
|
||||
for (; i < embd.size(); i++) {
|
||||
if (embd[i] != session_tokens[n_session_consumed]) {
|
||||
session_tokens.resize(n_session_consumed);
|
||||
break;
|
||||
}
|
||||
|
||||
n_past++;
|
||||
n_session_consumed++;
|
||||
|
||||
if (n_session_consumed >= (int)session_tokens.size()) {
|
||||
++i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i > 0) {
|
||||
embd.erase(embd.begin(), embd.begin() + i);
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not
|
||||
// always
|
||||
for (int i = 0; i < (int)embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int)embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
|
||||
if (embd.size() > 0 && !path_session.empty()) {
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
|
||||
n_session_consumed = session_tokens.size();
|
||||
}
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int)embd_inp.size() <= n_consumed) {
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k =
|
||||
params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n =
|
||||
params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
// optionally save the session on first sample (for faster prompt loading
|
||||
// next time)
|
||||
if (!path_session.empty() && need_to_save_session &&
|
||||
!params.prompt_cache_ro) {
|
||||
need_to_save_session = false;
|
||||
llama_save_session_file(ctx, path_session.c_str(),
|
||||
session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end();
|
||||
it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(
|
||||
llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = {candidates.data(),
|
||||
candidates.size(), false};
|
||||
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl()];
|
||||
auto last_n_repeat =
|
||||
std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
llama_sample_repetition_penalty(
|
||||
ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(
|
||||
ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl) {
|
||||
logits[llama_token_nl()] = nl_logit;
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau,
|
||||
mirostat_eta, mirostat_m,
|
||||
&mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(
|
||||
ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
// printf("`%d`", candidates_p.size);
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
// call the token callback, no need to check if one is actually
|
||||
// registered, that will be handled on the Go side.
|
||||
auto token_str = llama_token_to_str(ctx, id);
|
||||
if (!tokenCallback(state_pr, (char *)token_str)) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to
|
||||
// processing
|
||||
while ((int)embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int)embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto id : embd) {
|
||||
res += llama_token_to_str(ctx, id);
|
||||
}
|
||||
|
||||
// check for stop prompt
|
||||
if (params.antiprompt.size()) {
|
||||
std::string last_output;
|
||||
for (auto id : last_n_tokens) {
|
||||
last_output += llama_token_to_str(ctx, id);
|
||||
}
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
for (std::string &antiprompt : params.antiprompt) {
|
||||
// size_t extra_padding = params.interactive ? 0 : 2;
|
||||
size_t extra_padding = 2;
|
||||
size_t search_start_pos =
|
||||
last_output.length() >
|
||||
static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
? last_output.length() -
|
||||
static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
: 0;
|
||||
|
||||
if (last_output.find(antiprompt.c_str(), search_start_pos) !=
|
||||
std::string::npos) {
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params.prompt_cache_all &&
|
||||
!params.prompt_cache_ro) {
|
||||
if (debug) {
|
||||
fprintf(stderr, "\n%s: saving final output to session file '%s'\n",
|
||||
__func__, path_session.c_str());
|
||||
}
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(),
|
||||
session_tokens.size());
|
||||
}
|
||||
|
||||
end:
|
||||
#if defined(_WIN32)
|
||||
signal(SIGINT, SIG_DFL);
|
||||
#endif
|
||||
|
||||
if (debug) {
|
||||
llama_print_timings(ctx);
|
||||
llama_reset_timings(ctx);
|
||||
}
|
||||
|
||||
strcpy(result, res.c_str());
|
||||
return 0;
|
||||
}
|
||||
|
||||
void llama_binding_free_model(void *state_ptr) {
|
||||
llama_context *ctx = (llama_context *)state_ptr;
|
||||
llama_free(ctx);
|
||||
}
|
||||
|
||||
void llama_free_params(void *params_ptr) {
|
||||
gpt_params *params = (gpt_params *)params_ptr;
|
||||
delete params;
|
||||
}
|
||||
|
||||
// Builds a std::vector<std::string> from a C array of NUL-terminated strings.
//
// strings - array holding `count` C strings; may be null.
// count   - number of entries in `strings`.
//
// Returns the strings in order. A null array or a non-positive count yields an
// empty vector, and null entries are skipped. The vector is returned by value;
// the previous implementation heap-allocated it, returned a copy and never
// freed the original, leaking on every call.
std::vector<std::string> create_vector(const char **strings, int count) {
  std::vector<std::string> vec;
  if (strings == nullptr || count <= 0) {
    return vec;
  }

  vec.reserve(static_cast<size_t>(count));
  for (int i = 0; i < count; i++) {
    if (strings[i] == nullptr) {
      // Constructing std::string from a null pointer is undefined behavior.
      continue;
    }
    vec.emplace_back(strings[i]);
  }
  return vec;
}
|
||||
|
||||
// Deletes a heap-allocated string vector.
void delete_vector(std::vector<std::string> *vec) {
  delete vec;
}
|
||||
|
||||
int load_state(void *ctx, char *statefile, char *modes) {
|
||||
llama_context *state = (llama_context *)ctx;
|
||||
const llama_context *constState = static_cast<const llama_context *>(state);
|
||||
const size_t state_size = llama_get_state_size(state);
|
||||
uint8_t *state_mem = new uint8_t[state_size];
|
||||
|
||||
{
|
||||
FILE *fp_read = fopen(statefile, modes);
|
||||
if (state_size != llama_get_state_size(constState)) {
|
||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const size_t ret = fread(state_mem, 1, state_size, fp_read);
|
||||
if (ret != state_size) {
|
||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_set_state_data(
|
||||
state, state_mem); // could also read directly from memory mapped file
|
||||
fclose(fp_read);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void save_state(void *ctx, char *dst, char *modes) {
|
||||
llama_context *state = (llama_context *)ctx;
|
||||
|
||||
const size_t state_size = llama_get_state_size(state);
|
||||
uint8_t *state_mem = new uint8_t[state_size];
|
||||
|
||||
// Save state (rng, logits, embedding and kv_cache) to file
|
||||
{
|
||||
FILE *fp_write = fopen(dst, modes);
|
||||
llama_copy_state_data(
|
||||
state, state_mem); // could also copy directly to memory mapped file
|
||||
fwrite(state_mem, 1, state_size, fp_write);
|
||||
fclose(fp_write);
|
||||
}
|
||||
}
|
||||
|
||||
void *llama_allocate_params(
|
||||
const char *prompt, int seed, int threads, int tokens, int top_k,
|
||||
float top_p, float temp, float repeat_penalty, int repeat_last_n,
|
||||
bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
|
||||
const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
|
||||
float frequency_penalty, float presence_penalty, int mirostat,
|
||||
float mirostat_eta, float mirostat_tau, bool penalize_nl,
|
||||
const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
|
||||
const char *tensorsplit) {
|
||||
gpt_params *params = new gpt_params;
|
||||
params->seed = seed;
|
||||
params->n_threads = threads;
|
||||
params->n_predict = tokens;
|
||||
params->repeat_last_n = repeat_last_n;
|
||||
params->top_k = top_k;
|
||||
params->top_p = top_p;
|
||||
params->memory_f16 = memory_f16;
|
||||
params->temp = temp;
|
||||
params->use_mmap = mmap;
|
||||
params->use_mlock = mlock;
|
||||
params->repeat_penalty = repeat_penalty;
|
||||
params->n_batch = n_batch;
|
||||
params->n_keep = n_keep;
|
||||
if (maingpu[0] != '\0') {
|
||||
params->main_gpu = std::stoi(maingpu);
|
||||
}
|
||||
|
||||
if (tensorsplit[0] != '\0') {
|
||||
std::string arg_next = tensorsplit;
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params->tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
params->tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ignore_eos) {
|
||||
params->logit_bias[llama_token_eos()] = -INFINITY;
|
||||
}
|
||||
if (antiprompt_count > 0) {
|
||||
params->antiprompt = create_vector(antiprompt, antiprompt_count);
|
||||
}
|
||||
params->tfs_z = tfs_z;
|
||||
params->typical_p = typical_p;
|
||||
params->presence_penalty = presence_penalty;
|
||||
params->mirostat = mirostat;
|
||||
params->mirostat_eta = mirostat_eta;
|
||||
params->mirostat_tau = mirostat_tau;
|
||||
params->penalize_nl = penalize_nl;
|
||||
std::stringstream ss(logit_bias);
|
||||
llama_token key;
|
||||
char sign;
|
||||
std::string value_str;
|
||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) &&
|
||||
(sign == '+' || sign == '-')) {
|
||||
params->logit_bias[key] =
|
||||
std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||
}
|
||||
params->frequency_penalty = frequency_penalty;
|
||||
params->prompt = prompt;
|
||||
|
||||
return params;
|
||||
}
|
||||
|
||||
void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
|
||||
bool mlock, bool embeddings, bool mmap, bool low_vram,
|
||||
bool vocab_only, int n_gpu_layers, int n_batch,
|
||||
const char *maingpu, const char *tensorsplit, bool numa) {
|
||||
// load the model
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = n_ctx;
|
||||
lparams.seed = n_seed;
|
||||
lparams.f16_kv = memory_f16;
|
||||
lparams.embedding = embeddings;
|
||||
lparams.use_mlock = mlock;
|
||||
lparams.n_gpu_layers = n_gpu_layers;
|
||||
lparams.use_mmap = mmap;
|
||||
lparams.low_vram = low_vram;
|
||||
lparams.vocab_only = vocab_only;
|
||||
|
||||
if (maingpu[0] != '\0') {
|
||||
lparams.main_gpu = std::stoi(maingpu);
|
||||
}
|
||||
|
||||
if (tensorsplit[0] != '\0') {
|
||||
std::string arg_next = tensorsplit;
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
lparams.tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
lparams.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lparams.n_batch = n_batch;
|
||||
|
||||
llama_init_backend(numa);
|
||||
void *res = nullptr;
|
||||
try {
|
||||
res = llama_init_from_file(fname, lparams);
|
||||
} catch (std::runtime_error &e) {
|
||||
fprintf(stderr, "failed %s", e.what());
|
||||
return res;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
#ifdef __cplusplus
|
||||
#include <string>
|
||||
#include <vector>
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
extern unsigned char tokenCallback(void *, char *);
|
||||
|
||||
int load_state(void *ctx, char *statefile, char *modes);
|
||||
|
||||
int eval(void *params_ptr, void *ctx, char *text);
|
||||
|
||||
void save_state(void *ctx, char *dst, char *modes);
|
||||
|
||||
void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
|
||||
bool mlock, bool embeddings, bool mmap, bool low_vram,
|
||||
bool vocab_only, int n_gpu, int n_batch, const char *maingpu,
|
||||
const char *tensorsplit, bool numa);
|
||||
|
||||
int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings);
|
||||
|
||||
int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
|
||||
int tokenSize, float *res_embeddings);
|
||||
|
||||
void *llama_allocate_params(
|
||||
const char *prompt, int seed, int threads, int tokens, int top_k,
|
||||
float top_p, float temp, float repeat_penalty, int repeat_last_n,
|
||||
bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
|
||||
const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
|
||||
float frequency_penalty, float presence_penalty, int mirostat,
|
||||
float mirostat_eta, float mirostat_tau, bool penalize_nl,
|
||||
const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
|
||||
const char *tensorsplit);
|
||||
|
||||
void llama_free_params(void *params_ptr);
|
||||
|
||||
void llama_binding_free_model(void *state);
|
||||
|
||||
int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
std::vector<std::string> create_vector(const char **strings, int count);
|
||||
void delete_vector(std::vector<std::string> *vec);
|
||||
#endif
|
||||
3414
llama/ggml-cuda.cu
Normal file
3414
llama/ggml-cuda.cu
Normal file
File diff suppressed because it is too large
Load Diff
62
llama/ggml-cuda.h
Normal file
62
llama/ggml-cuda.h
Normal file
@@ -0,0 +1,62 @@
|
||||
/**
|
||||
* llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023 Georgi Gerganov
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
void ggml_init_cublas(void);
|
||||
void ggml_cuda_set_tensor_split(const float * tensor_split);
|
||||
|
||||
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
// TODO: export these with GGML_API
|
||||
void * ggml_cuda_host_malloc(size_t size);
|
||||
void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_set_main_device(int main_device);
|
||||
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
||||
void ggml_cuda_free_scratch(void);
|
||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
97
llama/ggml-metal.h
Normal file
97
llama/ggml-metal.h
Normal file
@@ -0,0 +1,97 @@
|
||||
/**
|
||||
* llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023 Georgi Gerganov
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
// An interface for computing a ggml_cgraph with Metal
|
||||
//
|
||||
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
|
||||
//
|
||||
// How does it work?
|
||||
//
|
||||
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
|
||||
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
|
||||
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
|
||||
//
|
||||
// You only need to make sure that all memory buffers that you used during the graph creation
|
||||
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
|
||||
// used during the graph evaluation to determine the arguments of the compute kernels.
|
||||
//
|
||||
// Synchronization between device and host memory (for example for input and output tensors)
|
||||
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
// max memory buffers that can be mapped to the device
|
||||
#define GGML_METAL_MAX_BUFFERS 16
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_cgraph;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_metal_context;
|
||||
|
||||
// number of command buffers to use
|
||||
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
||||
|
||||
// set the number of command buffers to use
|
||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
||||
|
||||
// creates a mapping between a host memory buffer and a device memory buffer
|
||||
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
||||
// - the mapping is used during computation to determine the arguments of the compute kernels
|
||||
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
||||
// - max_size specifies the maximum size of a tensor and is used to create shared views such
|
||||
// that it is guaranteed that the tensor will fit in at least one of the views
|
||||
//
|
||||
bool ggml_metal_add_buffer(
|
||||
struct ggml_metal_context * ctx,
|
||||
const char * name,
|
||||
void * data,
|
||||
size_t size,
|
||||
size_t max_size);
|
||||
|
||||
// set data from host memory into the device
|
||||
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
||||
|
||||
// get data from the device into host memory
|
||||
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
||||
|
||||
// same as ggml_graph_compute but uses Metal
|
||||
// creates gf->n_threads command buffers in parallel
|
||||
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
1014
llama/ggml-metal.m
Normal file
1014
llama/ggml-metal.m
Normal file
File diff suppressed because it is too large
Load Diff
1855
llama/ggml-metal.metal
Normal file
1855
llama/ggml-metal.metal
Normal file
File diff suppressed because it is too large
Load Diff
18380
llama/ggml.c
Normal file
18380
llama/ggml.c
Normal file
File diff suppressed because it is too large
Load Diff
1575
llama/ggml.h
Normal file
1575
llama/ggml.h
Normal file
File diff suppressed because it is too large
Load Diff
3926
llama/k_quants.c
Normal file
3926
llama/k_quants.c
Normal file
File diff suppressed because it is too large
Load Diff
183
llama/k_quants.h
Normal file
183
llama/k_quants.h
Normal file
@@ -0,0 +1,183 @@
|
||||
/**
|
||||
* llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023 Georgi Gerganov
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// Super-block size
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
#define K_SCALE_SIZE 4
|
||||
#else
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
||||
// 2-bit quantization
|
||||
// weight is represented as x = a * q + b
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 2.5625 bits per weight
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[QK_K/4]; // quants
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
} block_q2_K;
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
// 3-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 3.4375 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[2];
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[12]; // scales, quantized with 6 bits
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 4-bit quantization
|
||||
// 8 blocks of 32 elements each (with the default QK_K = 256)
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_fp16_t d[2]; // super-block scales/mins
|
||||
uint8_t scales[2]; // 4-bit block scales/mins
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 5-bit quantization
|
||||
// 8 blocks of 32 elements each (with the default QK_K = 256)
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // super-block scale
|
||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 6.5625 bits per weight
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q6_K;
|
||||
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
|
||||
|
||||
// This is only used for intermediate quantization and dot products
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
int8_t qs[QK_K]; // quants
|
||||
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
||||
} block_q8_K;
|
||||
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
||||
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
// Quantization with histogram collection
|
||||
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
530
llama/llama-util.h
Normal file
530
llama/llama-util.h
Normal file
@@ -0,0 +1,530 @@
|
||||
/**
|
||||
* llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023 Georgi Gerganov
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
// Internal header to be included only by llama.cpp.
|
||||
// Contains wrappers around OS interfaces.
|
||||
|
||||
#ifndef LLAMA_UTIL_H
|
||||
#define LLAMA_UTIL_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h> // for _fseeki64
|
||||
#endif
|
||||
|
||||
#define LLAMA_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
LLAMA_ASSERT(size >= 0 && size < INT_MAX);
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
LLAMA_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
}
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
LLAMA_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
// Converts a Win32 error code into the system-provided message text.
// Returns the literal "FormatMessageA failed" when no message can be produced.
static std::string llama_format_win_err(DWORD err) {
    const DWORD format_flags =
        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
    const DWORD language = MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT);

    // With FORMAT_MESSAGE_ALLOCATE_BUFFER the API stores a pointer to a buffer
    // it allocated into `message`, which is why its address is passed as LPSTR.
    LPSTR message;
    size_t length = FormatMessageA(format_flags, NULL, err, language, (LPSTR)&message, 0, NULL);
    if (length == 0) {
        return "FormatMessageA failed";
    }

    // copy the text out before handing the system buffer back
    std::string text(message, length);
    LocalFree(message);
    return text;
}
|
||||
#endif
|
||||
|
||||
// Maps an already opened llama_file into memory for the lifetime of the
// object. Three build variants exist: POSIX mmap, Win32 file mapping, and a
// stub whose constructor always throws. SUPPORTED tells which one was built.
struct llama_mmap {
    void * addr; // start of the mapping
    size_t size; // length of the mapping, taken from file->size

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    // prefetch: upper bound on the number of bytes to ask the kernel to preload
    //           (the default (size_t) -1 is the maximum value, 0 disables it)
    // numa:     disables prefetching and requests MADV_RANDOM instead
    // Throws std::runtime_error if mmap() fails; madvise() failures only warn.
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);

        // prefetch/readahead impairs performance on NUMA systems
        if (numa) {
            prefetch = 0;
        }

        int flags = MAP_PRIVATE;
#ifdef __linux__
        if (prefetch != 0) {
            flags |= MAP_POPULATE;
        }
#endif

        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory, but never more
            // than the file actually contains
            const size_t preload_len = prefetch < file->size ? prefetch : file->size;
            if (madvise(addr, preload_len, MADV_WILLNEED) != 0) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }

        if (numa) {
            // advise the kernel not to use readahead
            // (because the next page might not belong on the same node)
            if (madvise(addr, file->size, MADV_RANDOM) != 0) {
                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
                        strerror(errno));
            }
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    // prefetch: ask the OS to preload the mapping (needs a Windows 8+ build)
    // numa:     accepted for signature parity, unused on this platform
    // Throws std::runtime_error if the mapping or the view cannot be created.
    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
        (void) numa;

        size = file->size;

        HANDLE file_handle = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE mapping = CreateFileMappingA(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
        // read the error code right away, before another API call can replace it
        DWORD error = GetLastError();
        if (mapping == NULL) {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(mapping, FILE_MAP_COPY, 0, 0, 0);
        error = GetLastError();
        // the mapping handle is released as soon as the view attempt is done
        CloseHandle(mapping);
        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
        if (prefetch) {
            // Advise the kernel to preload the mapped memory
            WIN32_MEMORY_RANGE_ENTRY whole_view;
            whole_view.VirtualAddress = addr;
            whole_view.NumberOfBytes = (SIZE_T)size;
            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &whole_view, 0)) {
                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
            }
        }
#else
        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    // No memory-mapping facility on this platform: construction always throws.
    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
        (void) prefetch;
        (void) numa;

        throw std::runtime_error(std::string("mmap not supported"));
    }
#endif
};
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
|
||||
llama_mlock() {}
|
||||
llama_mlock(const llama_mlock &) = delete;
|
||||
|
||||
~llama_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * ptr) {
|
||||
LLAMA_ASSERT(addr == NULL && size == 0);
|
||||
addr = ptr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
LLAMA_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
} else {
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
||||
suggest = false;
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
||||
suggest = false;
|
||||
|
||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * ptr, size_t len) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(ptr, len)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
len, size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = len + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * ptr, size_t len) {
|
||||
if (!VirtualUnlock(ptr, len)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t len) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t len) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
// (When GGML_USE_METAL is defined the storage is page-aligned and zero-filled.)
//
// resize() discards the previous contents. After a failed allocation the
// buffer is left empty (addr == NULL, size == 0) so the destructor never
// releases the old block a second time.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    llama_buffer() = default;

    // Replaces the storage with a fresh block of len bytes.
    // Non-Metal builds propagate the exception thrown by operator new[];
    // Metal builds signal failure by leaving addr == NULL.
    void resize(size_t len) {
#ifdef GGML_USE_METAL
        free(addr);
        // Drop the stale pointer and size before allocating again.
        addr = NULL;
        size = 0;
        int result = posix_memalign((void **) &addr, getpagesize(), len);
        if (result != 0) {
            // Do not trust whatever posix_memalign left in addr on failure.
            addr = NULL;
            return;
        }
        memset(addr, 0, len);
#else
        delete[] addr;
        // Clear the state first: if new[] throws, addr must not keep pointing
        // at the block that was just released.
        addr = NULL;
        size = 0;
        addr = new uint8_t[len];
#endif
        size = len;
    }

    ~llama_buffer() {
#ifdef GGML_USE_METAL
        free(addr);
#else
        delete[] addr;
#endif
        addr = NULL;
    }

    // disable copy and move
    llama_buffer(const llama_buffer&) = delete;
    llama_buffer(llama_buffer&&) = delete;
    llama_buffer& operator=(const llama_buffer&) = delete;
    llama_buffer& operator=(llama_buffer&&) = delete;
};
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
struct llama_ctx_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
bool is_cuda;
|
||||
size_t size = 0;
|
||||
|
||||
llama_ctx_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
free();
|
||||
|
||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||
if (addr) {
|
||||
is_cuda = true;
|
||||
}
|
||||
else {
|
||||
// fall back to pageable memory
|
||||
addr = new uint8_t[size];
|
||||
is_cuda = false;
|
||||
}
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (addr) {
|
||||
if (is_cuda) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
else {
|
||||
delete[] addr;
|
||||
}
|
||||
}
|
||||
addr = NULL;
|
||||
}
|
||||
|
||||
~llama_ctx_buffer() {
|
||||
free();
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
||||
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
||||
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
||||
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
||||
};
|
||||
#else
|
||||
typedef llama_buffer llama_ctx_buffer;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
3700
llama/llama.cpp
Normal file
3700
llama/llama.cpp
Normal file
File diff suppressed because it is too large
Load Diff
369
llama/llama.go
369
llama/llama.go
@@ -1,217 +1,234 @@
|
||||
// MIT License
|
||||
|
||||
// Copyright (c) 2023 go-skynet authors
|
||||
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
package llama
|
||||
|
||||
// #cgo LDFLAGS: -Lbuild -lbinding -lllama -lm -lggml_static -lstdc++
|
||||
// #cgo CXXFLAGS: -std=c++11
|
||||
// #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
// #include "binding/binding.h"
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
/*
|
||||
#cgo CPPFLAGS: -O3 -DNDEBUG=1
|
||||
#cgo CXXFLAGS: -std=c++11
|
||||
#cgo darwin CPPFLAGS: -DGGML_USE_METAL=1 -DGGML_METAL_NDEBUG=1
|
||||
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
#include <stdlib.h>
|
||||
#include "llama.h"
|
||||
|
||||
// Sampling settings filled in on the Go side and consumed by llama_sample.
struct llama_sample_options
|
||||
{
|
||||
float repeat_penalty; // passed to llama_sample_repetition_penalty
|
||||
float frequency_penalty; // passed to llama_sample_frequency_and_presence_penalties
|
||||
float presence_penalty; // passed to llama_sample_frequency_and_presence_penalties
|
||||
float temperature; // <= 0 makes llama_sample pick greedily
|
||||
int32_t top_k; // passed to llama_sample_top_k
|
||||
float top_p; // passed to llama_sample_top_p
|
||||
float tfs_z; // passed to llama_sample_tail_free
|
||||
float typical_p; // passed to llama_sample_typical
|
||||
int mirostat; // 1 or 2 selects a mirostat sampler; any other value uses the top-k/top-p chain
|
||||
float mirostat_tau; // passed to the mirostat samplers; also seeds mu as 2 * tau
|
||||
float mirostat_eta; // passed to the mirostat samplers
|
||||
};
|
||||
|
||||
llama_token llama_sample(
|
||||
struct llama_context *ctx,
|
||||
struct llama_token_data *candidates,
|
||||
size_t n_candidates,
|
||||
const llama_token *last_tokens,
|
||||
size_t n_last_tokens,
|
||||
struct llama_sample_options *opts)
|
||||
{
|
||||
llama_token_data_array candidates_p = {
|
||||
candidates,
|
||||
n_candidates,
|
||||
false,
|
||||
};
|
||||
|
||||
llama_sample_repetition_penalty(
|
||||
ctx, &candidates_p,
|
||||
last_tokens, n_last_tokens,
|
||||
opts->repeat_penalty);
|
||||
|
||||
llama_sample_frequency_and_presence_penalties(
|
||||
ctx, &candidates_p,
|
||||
last_tokens, n_last_tokens,
|
||||
opts->frequency_penalty, opts->presence_penalty);
|
||||
|
||||
if (opts->temperature <= 0) {
|
||||
return llama_sample_token_greedy(ctx, &candidates_p);
|
||||
}
|
||||
|
||||
if (opts->mirostat == 1) {
|
||||
int mirostat_m = 100;
|
||||
float mirostat_mu = 2.0f * opts->mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, opts->temperature);
|
||||
return llama_sample_token_mirostat(
|
||||
ctx, &candidates_p,
|
||||
opts->mirostat_tau, opts->mirostat_eta,
|
||||
mirostat_m, &mirostat_mu);
|
||||
} else if (opts->mirostat == 2) {
|
||||
float mirostat_mu = 2.0f * opts->mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, opts->temperature);
|
||||
return llama_sample_token_mirostat_v2(
|
||||
ctx, &candidates_p,
|
||||
opts->mirostat_tau, opts->mirostat_eta,
|
||||
&mirostat_mu);
|
||||
} else {
|
||||
llama_sample_top_k(ctx, &candidates_p, opts->top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, opts->tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, opts->typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, opts->top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, opts->temperature);
|
||||
return llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"fmt"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
type LLama struct {
|
||||
ctx unsafe.Pointer
|
||||
embeddings bool
|
||||
contextSize int
|
||||
// llama holds the C-side handles of one loaded model together with the
// api.Options it was created with (embedded, so option fields are read
// directly off the receiver).
type llama struct {
|
||||
params *C.struct_llama_context_params // context parameters built in New
|
||||
model *C.struct_llama_model // handle returned by llama_load_model_from_file
|
||||
ctx *C.struct_llama_context // handle returned by llama_new_context_with_model
|
||||
|
||||
api.Options
|
||||
}
|
||||
|
||||
func New(model string, mo ModelOptions) (*LLama, error) {
|
||||
modelPath := C.CString(model)
|
||||
defer C.free(unsafe.Pointer(modelPath))
|
||||
|
||||
ctx := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA))
|
||||
if ctx == nil {
|
||||
return nil, fmt.Errorf("failed loading model")
|
||||
func New(model string, opts api.Options) (*llama, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ll := &LLama{ctx: ctx, contextSize: mo.ContextSize, embeddings: mo.Embeddings}
|
||||
llm := llama{Options: opts}
|
||||
|
||||
return ll, nil
|
||||
C.llama_backend_init(C.bool(llm.UseNUMA))
|
||||
|
||||
params := C.llama_context_default_params()
|
||||
params.seed = C.uint(llm.Seed)
|
||||
params.n_ctx = C.int(llm.NumCtx)
|
||||
params.n_batch = C.int(llm.NumBatch)
|
||||
params.n_gpu_layers = C.int(llm.NumGPU)
|
||||
params.main_gpu = C.int(llm.MainGPU)
|
||||
params.low_vram = C.bool(llm.LowVRAM)
|
||||
params.f16_kv = C.bool(llm.F16KV)
|
||||
params.logits_all = C.bool(llm.LogitsAll)
|
||||
params.vocab_only = C.bool(llm.VocabOnly)
|
||||
params.use_mmap = C.bool(llm.UseMMap)
|
||||
params.use_mlock = C.bool(llm.UseMLock)
|
||||
params.embedding = C.bool(llm.EmbeddingOnly)
|
||||
llm.params = ¶ms
|
||||
|
||||
cModel := C.CString(model)
|
||||
defer C.free(unsafe.Pointer(cModel))
|
||||
|
||||
llm.model = C.llama_load_model_from_file(cModel, params)
|
||||
llm.ctx = C.llama_new_context_with_model(llm.model, params)
|
||||
|
||||
// warm up the model
|
||||
bos := []C.llama_token{C.llama_token_bos()}
|
||||
C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
|
||||
C.llama_reset_timings(llm.ctx)
|
||||
|
||||
return &llm, nil
|
||||
}
|
||||
|
||||
func (l *LLama) Free() {
|
||||
C.llama_binding_free_model(l.ctx)
|
||||
func (llm *llama) Close() {
|
||||
defer C.llama_free_model(llm.model)
|
||||
defer C.llama_free(llm.ctx)
|
||||
|
||||
C.llama_print_timings(llm.ctx)
|
||||
}
|
||||
|
||||
func (l *LLama) Eval(text string, opts ...PredictOption) error {
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
input := C.CString(text)
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 99999999
|
||||
}
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
|
||||
reverseCount := len(po.StopPrompts)
|
||||
reversePrompt := make([]*C.char, reverseCount)
|
||||
var pass **C.char
|
||||
for i, s := range po.StopPrompts {
|
||||
cs := C.CString(s)
|
||||
reversePrompt[i] = cs
|
||||
pass = &reversePrompt[0]
|
||||
defer C.free(unsafe.Pointer(cs))
|
||||
func (llm *llama) Predict(prompt string, fn func(string)) error {
|
||||
if tokens := llm.tokenize(prompt); tokens != nil {
|
||||
return llm.generate(tokens, fn)
|
||||
}
|
||||
|
||||
cLogitBias := C.CString(po.LogitBias)
|
||||
defer C.free(unsafe.Pointer(cLogitBias))
|
||||
return errors.New("llama: tokenize")
|
||||
}
|
||||
|
||||
cMainGPU := C.CString(po.MainGPU)
|
||||
defer C.free(unsafe.Pointer(cMainGPU))
|
||||
func (llm *llama) tokenize(prompt string) []C.llama_token {
|
||||
cPrompt := C.CString(prompt)
|
||||
defer C.free(unsafe.Pointer(cPrompt))
|
||||
|
||||
cTensorSplit := C.CString(po.TensorSplit)
|
||||
defer C.free(unsafe.Pointer(cTensorSplit))
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
|
||||
C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
|
||||
)
|
||||
defer C.llama_free_params(params)
|
||||
|
||||
ret := C.eval(params, l.ctx, input)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("inference failed")
|
||||
tokens := make([]C.llama_token, llm.NumCtx)
|
||||
if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
|
||||
return tokens[:n]
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *LLama) Predict(text string, po PredictOptions) (string, error) {
|
||||
if po.TokenCallback != nil {
|
||||
setCallback(l.ctx, po.TokenCallback)
|
||||
func (llm *llama) detokenize(tokens ...C.llama_token) string {
|
||||
var sb strings.Builder
|
||||
for _, token := range tokens {
|
||||
sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
|
||||
}
|
||||
|
||||
input := C.CString(text)
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 99999999
|
||||
}
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
|
||||
out := make([]byte, po.Tokens)
|
||||
|
||||
reverseCount := len(po.StopPrompts)
|
||||
reversePrompt := make([]*C.char, reverseCount)
|
||||
var pass **C.char
|
||||
for i, s := range po.StopPrompts {
|
||||
cs := C.CString(s)
|
||||
reversePrompt[i] = cs
|
||||
pass = &reversePrompt[0]
|
||||
defer C.free(unsafe.Pointer(cs))
|
||||
}
|
||||
|
||||
cLogitBias := C.CString(po.LogitBias)
|
||||
defer C.free(unsafe.Pointer(cLogitBias))
|
||||
|
||||
cMainGPU := C.CString(po.MainGPU)
|
||||
defer C.free(unsafe.Pointer(cMainGPU))
|
||||
|
||||
cTensorSplit := C.CString(po.TensorSplit)
|
||||
defer C.free(unsafe.Pointer(cTensorSplit))
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
|
||||
C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
|
||||
)
|
||||
defer C.llama_free_params(params)
|
||||
|
||||
ret := C.llama_predict(params, l.ctx, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
|
||||
if ret != 0 {
|
||||
return "", fmt.Errorf("inference failed")
|
||||
}
|
||||
res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
|
||||
|
||||
res = strings.TrimPrefix(res, " ")
|
||||
res = strings.TrimPrefix(res, text)
|
||||
res = strings.TrimPrefix(res, "\n")
|
||||
|
||||
for _, s := range po.StopPrompts {
|
||||
res = strings.TrimRight(res, s)
|
||||
}
|
||||
|
||||
if po.TokenCallback != nil {
|
||||
setCallback(l.ctx, nil)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// CGo only allows us to use static calls from C to Go, we can't just dynamically pass in func's.
|
||||
// This is the next best thing, we register the callbacks in this map and call tokenCallback from
|
||||
// the C code. We also attach a finalizer to LLama, so it will unregister the callback when the
|
||||
// garbage collection frees it.
|
||||
func (llm *llama) generate(tokens []C.llama_token, fn func(string)) error {
|
||||
var opts C.struct_llama_sample_options
|
||||
opts.repeat_penalty = C.float(llm.RepeatPenalty)
|
||||
opts.frequency_penalty = C.float(llm.FrequencyPenalty)
|
||||
opts.presence_penalty = C.float(llm.PresencePenalty)
|
||||
opts.temperature = C.float(llm.Temperature)
|
||||
opts.top_k = C.int(llm.TopK)
|
||||
opts.top_p = C.float(llm.TopP)
|
||||
opts.tfs_z = C.float(llm.TFSZ)
|
||||
opts.typical_p = C.float(llm.TypicalP)
|
||||
opts.mirostat = C.int(llm.Mirostat)
|
||||
opts.mirostat_tau = C.float(llm.MirostatTau)
|
||||
opts.mirostat_eta = C.float(llm.MirostatEta)
|
||||
|
||||
// SetTokenCallback registers a callback for the individual tokens created when running Predict. It
|
||||
// will be called once for each token. The callback shall return true as long as the model should
|
||||
// continue predicting the next token. When the callback returns false the predictor will return.
|
||||
// The tokens are just converted into Go strings, they are not trimmed or otherwise changed. Also
|
||||
// the tokens may not be valid UTF-8.
|
||||
// Pass in nil to remove a callback.
|
||||
//
|
||||
// It is save to call this method while a prediction is running.
|
||||
func (l *LLama) SetTokenCallback(callback func(token string) bool) {
|
||||
setCallback(l.ctx, callback)
|
||||
}
|
||||
pastTokens := deque[C.llama_token]{capacity: llm.RepeatLastN}
|
||||
|
||||
var (
|
||||
m sync.Mutex
|
||||
callbacks = map[uintptr]func(string) bool{}
|
||||
)
|
||||
for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
|
||||
if retval := C.llama_eval(llm.ctx, unsafe.SliceData(tokens), C.int(len(tokens)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
|
||||
return errors.New("llama: eval")
|
||||
}
|
||||
|
||||
//export tokenCallback
|
||||
func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool {
|
||||
m.Lock()
|
||||
defer m.Unlock()
|
||||
token, err := llm.sample(pastTokens, &opts)
|
||||
switch {
|
||||
case err != nil:
|
||||
return err
|
||||
case errors.Is(err, io.EOF):
|
||||
return nil
|
||||
}
|
||||
|
||||
if callback, ok := callbacks[uintptr(statePtr)]; ok {
|
||||
return callback(C.GoString(token))
|
||||
fn(llm.detokenize(token))
|
||||
|
||||
tokens = []C.llama_token{token}
|
||||
|
||||
pastTokens.PushLeft(token)
|
||||
}
|
||||
|
||||
return true
|
||||
return nil
|
||||
}
|
||||
|
||||
// setCallback can be used to register a token callback for LLama. Pass in a nil callback to
|
||||
// remove the callback.
|
||||
func setCallback(statePtr unsafe.Pointer, callback func(string) bool) {
|
||||
m.Lock()
|
||||
defer m.Unlock()
|
||||
func (llm *llama) sample(pastTokens deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
|
||||
numVocab := int(C.llama_n_vocab(llm.ctx))
|
||||
logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)
|
||||
|
||||
if callback == nil {
|
||||
delete(callbacks, uintptr(statePtr))
|
||||
} else {
|
||||
callbacks[uintptr(statePtr)] = callback
|
||||
candidates := make([]C.struct_llama_token_data, 0, numVocab)
|
||||
for i := 0; i < numVocab; i++ {
|
||||
candidates = append(candidates, C.llama_token_data{
|
||||
id: C.int(i),
|
||||
logit: logits[i],
|
||||
p: 0,
|
||||
})
|
||||
}
|
||||
|
||||
token := C.llama_sample(
|
||||
llm.ctx,
|
||||
unsafe.SliceData(candidates), C.ulong(len(candidates)),
|
||||
unsafe.SliceData(pastTokens.Data()), C.ulong(pastTokens.Len()),
|
||||
opts)
|
||||
if token != C.llama_token_eos() {
|
||||
return token, nil
|
||||
}
|
||||
|
||||
return 0, io.EOF
|
||||
}
|
||||
|
||||
410
llama/llama.h
Normal file
410
llama/llama.h
Normal file
@@ -0,0 +1,410 @@
|
||||
/**
|
||||
* llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023 Georgi Gerganov
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef LLAMA_H
|
||||
#define LLAMA_H
|
||||
|
||||
#include "ggml.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||
#else
|
||||
#define LLAMA_MAX_DEVICES 1
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef LLAMA_BUILD
|
||||
# define LLAMA_API __declspec(dllexport)
|
||||
# else
|
||||
# define LLAMA_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define LLAMA_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
||||
#elif defined(_MSC_VER)
|
||||
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
||||
#else
|
||||
# define DEPRECATED(func, hint) func
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_FILE_VERSION 3
|
||||
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
|
||||
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// C interface
|
||||
//
|
||||
// TODO: show sample usage
|
||||
//
|
||||
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
|
||||
typedef int llama_token;
|
||||
|
||||
// One sampling candidate: a token id with its score and probability.
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
float p; // probability of the token
|
||||
} llama_token_data;
|
||||
|
||||
// A caller-owned array of sampling candidates.
typedef struct llama_token_data_array {
|
||||
llama_token_data * data; // first candidate
|
||||
size_t size; // number of candidates at data
|
||||
bool sorted; // NOTE(review): presumably whether data is already ordered - confirm against the samplers
|
||||
} llama_token_data_array;
|
||||
|
||||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||
|
||||
// Parameters passed by value to llama_load_model_from_file and
// llama_new_context_with_model; llama_context_default_params returns a
// value of this type.
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random (as uint32_t that is 0xFFFFFFFF, the value of LLAMA_DEFAULT_SEED)
|
||||
int32_t n_ctx; // text context
|
||||
int32_t n_batch; // prompt processing batch size
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
||||
// called with a progress value between 0 and 1, pass NULL to disable
|
||||
llama_progress_callback progress_callback;
|
||||
// context pointer passed to the progress callback
|
||||
void * progress_callback_user_data;
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
||||
bool f16_kv; // use fp16 for KV cache
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool embedding; // embedding mode only
|
||||
};
|
||||
// model file types
// The numeric values 5 and 6 are intentionally unused: they belonged to the
// Q4_2 and Q4_3 types whose support has been removed (kept below as comments).
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
|
||||
};
|
||||
|
||||
// model quantization parameters
// Passed by pointer to llama_model_quantize; llama_model_quantize_default_params
// returns a value of this type.
|
||||
typedef struct llama_model_quantize_params {
|
||||
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
} llama_model_quantize_params;
|
||||
|
||||
// performance timing information
// NOTE(review): the t_*_ms fields are presumably durations/timestamps in
// milliseconds and the n_* fields the matching event counts (sampling, prompt
// evaluation, evaluation) - confirm against the code that fills this struct.
|
||||
struct llama_timings {
|
||||
double t_start_ms;
|
||||
double t_end_ms;
|
||||
double t_load_ms;
|
||||
double t_sample_ms;
|
||||
double t_p_eval_ms;
|
||||
double t_eval_ms;
|
||||
|
||||
int32_t n_sample;
|
||||
int32_t n_p_eval;
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
||||
|
||||
LLAMA_API bool llama_mmap_supported();
|
||||
LLAMA_API bool llama_mlock_supported();
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Initialize the llama + ggml backend
|
||||
// If numa is true, use NUMA optimizations
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(bool numa);
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free();
|
||||
|
||||
LLAMA_API int64_t llama_time_us();
|
||||
|
||||
LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params);
|
||||
|
||||
LLAMA_API void llama_free_model(struct llama_model * model);
|
||||
|
||||
LLAMA_API struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params),
|
||||
"please use llama_load_model_from_file combined with llama_new_context_with_model instead");
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
const char * fname_out,
|
||||
const llama_model_quantize_params * params);
|
||||
|
||||
// Apply a LoRA adapter to a loaded model
|
||||
// path_base_model is the path to a higher quality model to use as a base for
|
||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads),
|
||||
"please use llama_model_apply_lora_from_file instead");
|
||||
|
||||
LLAMA_API int llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads);
|
||||
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
||||
|
||||
// Sets the current rng seed.
|
||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
||||
|
||||
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
||||
// and kv_cache) - will often be smaller after compacting tokens
|
||||
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
|
||||
|
||||
// Copies the state to the specified destination address.
|
||||
// Destination needs to have allocated enough memory.
|
||||
// Returns the number of bytes copied
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
||||
|
||||
// Save/load session file
|
||||
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
||||
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
|
||||
// Same as llama_eval, but use float matrix input directly.
|
||||
LLAMA_API int llama_eval_embd(
|
||||
struct llama_context * ctx,
|
||||
const float * embd,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
|
||||
// Export a static computation graph for context of 511 and batch size of 1
|
||||
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
|
||||
// parameters here to keep things simple
|
||||
// IMPORTANT: do not use for anything else other than debugging and testing!
|
||||
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
|
||||
|
||||
// Convert the provided text into tokens.
|
||||
// The tokens pointer must be large enough to hold the resulting tokens.
|
||||
// Returns the number of tokens on success, no more than n_max_tokens
|
||||
// Returns a negative number on failure - the number of tokens that would have been returned
|
||||
// TODO: not sure if correct
|
||||
LLAMA_API int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
||||
|
||||
// Get the vocabulary as output parameters.
|
||||
// Returns number of results.
|
||||
LLAMA_API int llama_get_vocab(
|
||||
const struct llama_context * ctx,
|
||||
const char * * strings,
|
||||
float * scores,
|
||||
int capacity);
|
||||
|
||||
// Token logits obtained from the last call to llama_eval()
|
||||
// The logits for the last token are stored in the last row
|
||||
// Can be mutated in order to change the probabilities of the next token
|
||||
// Rows: n_tokens
|
||||
// Cols: n_vocab
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the input
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl(); // next-line
|
||||
|
||||
// Sampling functions
|
||||
|
||||
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
||||
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
||||
|
||||
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
||||
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
||||
|
||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
||||
/// @param guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
||||
/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
||||
/// @param smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
|
||||
LLAMA_API void llama_sample_classifier_free_guidance(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
struct llama_context * guidance_ctx,
|
||||
float scale,
|
||||
float smooth_factor);
|
||||
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
|
||||
|
||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
|
||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
|
||||
|
||||
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
||||
|
||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
||||
|
||||
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
||||
|
||||
/// @details Selects the token with the highest probability.
|
||||
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities.
|
||||
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
|
||||
// Performance information
|
||||
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif
|
||||
|
||||
#endif // LLAMA_H
|
||||
@@ -1,9 +0,0 @@
|
||||
//go:build cublas
|
||||
// +build cublas
|
||||
|
||||
package llama
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
|
||||
*/
|
||||
import "C"
|
||||
@@ -1,2 +0,0 @@
|
||||
//go:build metal
|
||||
package llama
|
||||
@@ -1,9 +0,0 @@
|
||||
//go:build openblas
|
||||
// +build openblas
|
||||
|
||||
package llama
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lopenblas
|
||||
*/
|
||||
import "C"
|
||||
375
llama/options.go
375
llama/options.go
@@ -1,375 +0,0 @@
|
||||
// MIT License
|
||||
|
||||
// Copyright (c) 2023 go-skynet authors
|
||||
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
package llama
|
||||
|
||||
type ModelOptions struct {
|
||||
ContextSize int
|
||||
Seed int
|
||||
NBatch int
|
||||
F16Memory bool
|
||||
MLock bool
|
||||
MMap bool
|
||||
VocabOnly bool
|
||||
LowVRAM bool
|
||||
Embeddings bool
|
||||
NUMA bool
|
||||
NGPULayers int
|
||||
MainGPU string
|
||||
TensorSplit string
|
||||
}
|
||||
|
||||
type PredictOptions struct {
|
||||
Seed, Threads, Tokens, TopK, Repeat, Batch, NKeep int
|
||||
TopP, Temperature, Penalty float64
|
||||
F16KV bool
|
||||
DebugMode bool
|
||||
StopPrompts []string
|
||||
IgnoreEOS bool
|
||||
|
||||
TailFreeSamplingZ float64
|
||||
TypicalP float64
|
||||
FrequencyPenalty float64
|
||||
PresencePenalty float64
|
||||
Mirostat int
|
||||
MirostatETA float64
|
||||
MirostatTAU float64
|
||||
PenalizeNL bool
|
||||
LogitBias string
|
||||
TokenCallback func(string) bool
|
||||
|
||||
MLock, MMap bool
|
||||
MainGPU string
|
||||
TensorSplit string
|
||||
}
|
||||
|
||||
type PredictOption func(p *PredictOptions)
|
||||
|
||||
type ModelOption func(p *ModelOptions)
|
||||
|
||||
var DefaultModelOptions ModelOptions = ModelOptions{
|
||||
ContextSize: 512,
|
||||
Seed: 0,
|
||||
F16Memory: false,
|
||||
MLock: false,
|
||||
Embeddings: false,
|
||||
MMap: true,
|
||||
LowVRAM: false,
|
||||
}
|
||||
|
||||
var DefaultOptions PredictOptions = PredictOptions{
|
||||
Seed: -1,
|
||||
Threads: 4,
|
||||
Tokens: 128,
|
||||
Penalty: 1.1,
|
||||
Repeat: 64,
|
||||
Batch: 512,
|
||||
NKeep: 64,
|
||||
TopK: 40,
|
||||
TopP: 0.95,
|
||||
TailFreeSamplingZ: 1.0,
|
||||
TypicalP: 1.0,
|
||||
Temperature: 0.8,
|
||||
FrequencyPenalty: 0.0,
|
||||
PresencePenalty: 0.0,
|
||||
Mirostat: 0,
|
||||
MirostatTAU: 5.0,
|
||||
MirostatETA: 0.1,
|
||||
MMap: true,
|
||||
}
|
||||
|
||||
// SetContext sets the context size.
|
||||
func SetContext(c int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.ContextSize = c
|
||||
}
|
||||
}
|
||||
|
||||
func SetModelSeed(c int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.Seed = c
|
||||
}
|
||||
}
|
||||
|
||||
// SetMMap sets the MMap model option.
|
||||
func SetMMap(b bool) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.MMap = b
|
||||
}
|
||||
}
|
||||
|
||||
// SetNBatch sets the n_Batch
|
||||
func SetNBatch(n_batch int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.NBatch = n_batch
|
||||
}
|
||||
}
|
||||
|
||||
// SetTensorSplit sets the tensor split for the GPU
|
||||
func SetTensorSplit(maingpu string) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.TensorSplit = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// SetMainGPU sets the main_gpu
|
||||
func SetMainGPU(maingpu string) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.MainGPU = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// SetPredictionTensorSplit sets the tensor split for the GPU
|
||||
func SetPredictionTensorSplit(maingpu string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TensorSplit = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// SetPredictionMainGPU sets the main_gpu
|
||||
func SetPredictionMainGPU(maingpu string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MainGPU = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
var VocabOnly ModelOption = func(p *ModelOptions) {
|
||||
p.VocabOnly = true
|
||||
}
|
||||
|
||||
var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
|
||||
p.LowVRAM = true
|
||||
}
|
||||
|
||||
var EnableNUMA ModelOption = func(p *ModelOptions) {
|
||||
p.NUMA = true
|
||||
}
|
||||
|
||||
var EnableEmbeddings ModelOption = func(p *ModelOptions) {
|
||||
p.Embeddings = true
|
||||
}
|
||||
|
||||
var EnableF16Memory ModelOption = func(p *ModelOptions) {
|
||||
p.F16Memory = true
|
||||
}
|
||||
|
||||
var EnableF16KV PredictOption = func(p *PredictOptions) {
|
||||
p.F16KV = true
|
||||
}
|
||||
|
||||
var Debug PredictOption = func(p *PredictOptions) {
|
||||
p.DebugMode = true
|
||||
}
|
||||
|
||||
var EnableMLock ModelOption = func(p *ModelOptions) {
|
||||
p.MLock = true
|
||||
}
|
||||
|
||||
// NewModelOptions creates a ModelOptions value from DefaultModelOptions with the given options applied.
|
||||
func NewModelOptions(opts ...ModelOption) ModelOptions {
|
||||
p := DefaultModelOptions
|
||||
for _, opt := range opts {
|
||||
opt(&p)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
var IgnoreEOS PredictOption = func(p *PredictOptions) {
|
||||
p.IgnoreEOS = true
|
||||
}
|
||||
|
||||
// SetMlock sets the memory lock.
|
||||
func SetMlock(b bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MLock = b
|
||||
}
|
||||
}
|
||||
|
||||
// SetMemoryMap sets memory mapping.
|
||||
func SetMemoryMap(b bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MMap = b
|
||||
}
|
||||
}
|
||||
|
||||
// SetGPULayers sets the number of GPU layers to use to offload computation
|
||||
func SetGPULayers(n int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.NGPULayers = n
|
||||
}
|
||||
}
|
||||
|
||||
// SetTokenCallback sets the callback stored in PredictOptions.TokenCallback.
|
||||
func SetTokenCallback(fn func(string) bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TokenCallback = fn
|
||||
}
|
||||
}
|
||||
|
||||
// SetStopWords sets the prompts that will stop predictions.
|
||||
func SetStopWords(stop ...string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.StopPrompts = stop
|
||||
}
|
||||
}
|
||||
|
||||
// SetSeed sets the random seed for sampling text generation.
|
||||
func SetSeed(seed int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Seed = seed
|
||||
}
|
||||
}
|
||||
|
||||
// SetThreads sets the number of threads to use for text generation.
|
||||
func SetThreads(threads int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Threads = threads
|
||||
}
|
||||
}
|
||||
|
||||
// SetTokens sets the number of tokens to generate.
|
||||
func SetTokens(tokens int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Tokens = tokens
|
||||
}
|
||||
}
|
||||
|
||||
// SetTopK sets the value for top-K sampling.
|
||||
func SetTopK(topk int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TopK = topk
|
||||
}
|
||||
}
|
||||
|
||||
// SetTopP sets the value for nucleus sampling.
|
||||
func SetTopP(topp float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TopP = topp
|
||||
}
|
||||
}
|
||||
|
||||
// SetTemperature sets the temperature value for text generation.
|
||||
func SetTemperature(temp float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Temperature = temp
|
||||
}
|
||||
}
|
||||
|
||||
// SetPenalty sets the repetition penalty for text generation.
|
||||
func SetPenalty(penalty float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Penalty = penalty
|
||||
}
|
||||
}
|
||||
|
||||
// SetRepeat sets the number of times to repeat text generation.
|
||||
func SetRepeat(repeat int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Repeat = repeat
|
||||
}
|
||||
}
|
||||
|
||||
// SetBatch sets the batch size.
|
||||
func SetBatch(size int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Batch = size
|
||||
}
|
||||
}
|
||||
|
||||
// SetNKeep sets the number of tokens from initial prompt to keep.
|
||||
func SetNKeep(n int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.NKeep = n
|
||||
}
|
||||
}
|
||||
|
||||
// Create a new PredictOptions object with the given options.
|
||||
func NewPredictOptions(opts ...PredictOption) PredictOptions {
|
||||
p := DefaultOptions
|
||||
for _, opt := range opts {
|
||||
opt(&p)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// SetTailFreeSamplingZ sets the tail free sampling, parameter z.
|
||||
func SetTailFreeSamplingZ(tfz float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TailFreeSamplingZ = tfz
|
||||
}
|
||||
}
|
||||
|
||||
// SetTypicalP sets the typicality parameter, p_typical.
|
||||
func SetTypicalP(tp float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TypicalP = tp
|
||||
}
|
||||
}
|
||||
|
||||
// SetFrequencyPenalty sets the frequency penalty parameter, freq_penalty.
|
||||
func SetFrequencyPenalty(fp float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.FrequencyPenalty = fp
|
||||
}
|
||||
}
|
||||
|
||||
// SetPresencePenalty sets the presence penalty parameter, presence_penalty.
|
||||
func SetPresencePenalty(pp float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.PresencePenalty = pp
|
||||
}
|
||||
}
|
||||
|
||||
// SetMirostat sets the mirostat parameter.
|
||||
func SetMirostat(m int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Mirostat = m
|
||||
}
|
||||
}
|
||||
|
||||
// SetMirostatETA sets the mirostat ETA parameter.
|
||||
func SetMirostatETA(me float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MirostatETA = me
|
||||
}
|
||||
}
|
||||
|
||||
// SetMirostatTAU sets the mirostat TAU parameter.
|
||||
func SetMirostatTAU(mt float64) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MirostatTAU = mt
|
||||
}
|
||||
}
|
||||
|
||||
// SetPenalizeNL sets whether to penalize newlines or not.
|
||||
func SetPenalizeNL(pnl bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.PenalizeNL = pnl
|
||||
}
|
||||
}
|
||||
|
||||
// SetLogitBias sets the logit bias parameter.
|
||||
func SetLogitBias(lb string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.LogitBias = lb
|
||||
}
|
||||
}
|
||||
104
llama/utils.go
Normal file
104
llama/utils.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package llama
|
||||
|
||||
// node is one element of the doubly linked list backing deque.
type node[T any] struct {
	t    T
	next *node[T] // neighbour towards the tail
	prev *node[T] // neighbour towards the head
}

// deque is a doubly linked list that can be pushed to and popped from at both
// ends. When capacity is greater than zero the deque is bounded: pushing onto
// a full deque first evicts one element from the opposite end. A capacity of
// zero (the zero value) means unbounded.
//
// Note the naming: Push and Pop operate on the head, PushLeft and PopLeft
// operate on the tail, and Data lists elements from head to tail.
type deque[T any] struct {
	head     *node[T]
	tail     *node[T]
	size     int
	capacity int
}

// Empty reports whether the deque holds no elements.
func (d *deque[T]) Empty() bool {
	return d.size == 0
}

// Len returns the number of elements currently stored.
func (d *deque[T]) Len() int {
	return d.size
}

// Cap returns the configured capacity; zero means unbounded.
func (d *deque[T]) Cap() int {
	return d.capacity
}

// Push inserts t at the head. If the deque is bounded and already full, the
// tail element is discarded first.
func (d *deque[T]) Push(t T) {
	if d.capacity > 0 && d.size >= d.capacity {
		d.PopLeft()
	}

	n := &node[T]{t: t, next: d.head}
	if d.head != nil {
		d.head.prev = n
	} else {
		// First element: it is both head and tail.
		d.tail = n
	}
	d.head = n

	d.size++
}

// PushLeft inserts t at the tail. If the deque is bounded and already full,
// the head element is discarded first.
func (d *deque[T]) PushLeft(t T) {
	if d.capacity > 0 && d.size >= d.capacity {
		d.Pop()
	}

	n := &node[T]{t: t, prev: d.tail}
	if d.tail != nil {
		d.tail.next = n
	} else {
		// First element: it is both head and tail.
		d.head = n
	}
	d.tail = n

	d.size++
}

// Pop removes the head element and returns a pointer to its value, or nil if
// the deque is empty.
func (d *deque[T]) Pop() *T {
	if d.Empty() {
		return nil
	}

	head := d.head
	d.head = head.next
	if d.head != nil {
		d.head.prev = nil
	} else {
		d.tail = nil
	}

	// Unlink the removed node: the returned pointer refers into it, and a
	// dangling next link would keep the rest of the list reachable for as
	// long as the caller holds that pointer.
	head.next = nil

	d.size--
	return &head.t
}

// PopLeft removes the tail element and returns a pointer to its value, or nil
// if the deque is empty.
func (d *deque[T]) PopLeft() *T {
	if d.Empty() {
		return nil
	}

	tail := d.tail
	d.tail = tail.prev
	if d.tail != nil {
		d.tail.next = nil
	} else {
		d.head = nil
	}

	// Unlink the removed node so the returned pointer does not keep the
	// remaining elements reachable.
	tail.prev = nil

	d.size--
	return &tail.t
}

// Data returns a copy of the stored values ordered from head to tail. It
// returns nil when the deque is empty.
func (d *deque[T]) Data() (data []T) {
	if d.size == 0 {
		return nil
	}

	// The element count is known, so allocate the result once.
	data = make([]T, 0, d.size)
	for n := d.head; n != nil; n = n.next {
		data = append(data, n.t)
	}

	return data
}
|
||||
@@ -10,7 +10,7 @@ fi
|
||||
OS=$(go env GOOS)
|
||||
ARCH=$(go env GOARCH)
|
||||
|
||||
make app
|
||||
go build .
|
||||
|
||||
# Create a new tag if it doesn't exist.
|
||||
if ! git rev-parse v$VERSION >/dev/null 2>&1; then
|
||||
|
||||
101
server/models.go
101
server/models.go
@@ -2,14 +2,13 @@ package server
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"strconv"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
const directoryURL = "https://ollama.ai/api/models"
|
||||
@@ -36,12 +35,12 @@ func (m *Model) FullName() string {
|
||||
return path.Join(home, ".ollama", "models", m.Name+".bin")
|
||||
}
|
||||
|
||||
func pull(model string, progressCh chan<- api.PullProgress) error {
|
||||
remote, err := getRemote(model)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to pull model: %w", err)
|
||||
}
|
||||
return saveModel(remote, progressCh)
|
||||
func (m *Model) TempFile() string {
|
||||
fullName := m.FullName()
|
||||
return path.Join(
|
||||
path.Dir(fullName),
|
||||
fmt.Sprintf(".%s.part", path.Base(fullName)),
|
||||
)
|
||||
}
|
||||
|
||||
func getRemote(model string) (*Model, error) {
|
||||
@@ -68,7 +67,7 @@ func getRemote(model string) (*Model, error) {
|
||||
return nil, fmt.Errorf("model not found in directory: %s", model)
|
||||
}
|
||||
|
||||
func saveModel(model *Model, progressCh chan<- api.PullProgress) error {
|
||||
func saveModel(model *Model, fn func(total, completed int64)) error {
|
||||
// this models cache directory is created by the server on startup
|
||||
|
||||
client := &http.Client{}
|
||||
@@ -76,41 +75,45 @@ func saveModel(model *Model, progressCh chan<- api.PullProgress) error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to download model: %w", err)
|
||||
}
|
||||
// check for resume
|
||||
alreadyDownloaded := int64(0)
|
||||
fileInfo, err := os.Stat(model.FullName())
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to check resume model file: %w", err)
|
||||
}
|
||||
// file doesn't exist, create it now
|
||||
} else {
|
||||
alreadyDownloaded = fileInfo.Size()
|
||||
req.Header.Add("Range", fmt.Sprintf("bytes=%d-", alreadyDownloaded))
|
||||
|
||||
// check if completed file exists
|
||||
fi, err := os.Stat(model.FullName())
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
// noop, file doesn't exist so create it
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat: %w", err)
|
||||
default:
|
||||
fn(fi.Size(), fi.Size())
|
||||
return nil
|
||||
}
|
||||
|
||||
var size int64
|
||||
|
||||
// completed file doesn't exist, check partial file
|
||||
fi, err = os.Stat(model.TempFile())
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
// noop, file doesn't exist so create it
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat: %w", err)
|
||||
default:
|
||||
size = fi.Size()
|
||||
}
|
||||
|
||||
req.Header.Add("Range", fmt.Sprintf("bytes=%d-", size))
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to download model: %w", err)
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusRequestedRangeNotSatisfiable {
|
||||
// already downloaded
|
||||
progressCh <- api.PullProgress{
|
||||
Total: alreadyDownloaded,
|
||||
Completed: alreadyDownloaded,
|
||||
Percent: 100,
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusPartialContent {
|
||||
if resp.StatusCode >= 400 {
|
||||
return fmt.Errorf("failed to download model: %s", resp.Status)
|
||||
}
|
||||
|
||||
out, err := os.OpenFile(model.FullName(), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
|
||||
out, err := os.OpenFile(model.TempFile(), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
@@ -118,37 +121,23 @@ func saveModel(model *Model, progressCh chan<- api.PullProgress) error {
|
||||
|
||||
totalSize, _ := strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)
|
||||
|
||||
buf := make([]byte, 1024)
|
||||
totalBytes := alreadyDownloaded
|
||||
totalSize += alreadyDownloaded
|
||||
totalBytes := size
|
||||
totalSize += size
|
||||
|
||||
for {
|
||||
n, err := resp.Body.Read(buf)
|
||||
if err != nil && err != io.EOF {
|
||||
n, err := io.CopyN(out, resp.Body, 8192)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
break
|
||||
}
|
||||
if _, err := out.Write(buf[:n]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
totalBytes += int64(n)
|
||||
|
||||
// send progress updates
|
||||
progressCh <- api.PullProgress{
|
||||
Total: totalSize,
|
||||
Completed: totalBytes,
|
||||
Percent: float64(totalBytes) / float64(totalSize) * 100,
|
||||
}
|
||||
totalBytes += n
|
||||
fn(totalSize, totalBytes)
|
||||
}
|
||||
|
||||
progressCh <- api.PullProgress{
|
||||
Total: totalSize,
|
||||
Completed: totalSize,
|
||||
Percent: 100,
|
||||
}
|
||||
|
||||
return nil
|
||||
fn(totalSize, totalSize)
|
||||
return os.Rename(model.TempFile(), model.FullName())
|
||||
}
|
||||
|
||||
213
server/routes.go
213
server/routes.go
@@ -4,7 +4,6 @@ import (
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"math"
|
||||
@@ -12,7 +11,6 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"runtime"
|
||||
"strings"
|
||||
"text/template"
|
||||
|
||||
@@ -37,11 +35,12 @@ func cacheDir() string {
|
||||
}
|
||||
|
||||
func generate(c *gin.Context) {
|
||||
var req api.GenerateRequest
|
||||
req.ModelOptions = api.DefaultModelOptions
|
||||
req.PredictOptions = api.DefaultPredictOptions
|
||||
req := api.GenerateRequest{
|
||||
Options: api.DefaultOptions(),
|
||||
}
|
||||
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -50,21 +49,14 @@ func generate(c *gin.Context) {
|
||||
}
|
||||
if _, err := os.Stat(req.Model); err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
req.Model = path.Join(cacheDir(), "models", req.Model+".bin")
|
||||
}
|
||||
|
||||
modelOpts := getModelOpts(req)
|
||||
modelOpts.NGPULayers = 1 // hard-code this for now
|
||||
|
||||
model, err := llama.New(req.Model, modelOpts)
|
||||
if err != nil {
|
||||
fmt.Println("Loading the model failed:", err.Error())
|
||||
return
|
||||
}
|
||||
defer model.Free()
|
||||
ch := make(chan any)
|
||||
go stream(c, ch)
|
||||
|
||||
templateNames := make([]string, 0, len(templates.Templates()))
|
||||
for _, template := range templates.Templates() {
|
||||
@@ -75,51 +67,59 @@ func generate(c *gin.Context) {
|
||||
if template := templates.Lookup(match); template != nil {
|
||||
var sb strings.Builder
|
||||
if err := template.Execute(&sb, req); err != nil {
|
||||
fmt.Println("Prompt template failed:", err.Error())
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
req.Prompt = sb.String()
|
||||
}
|
||||
|
||||
ch := make(chan string)
|
||||
model.SetTokenCallback(func(token string) bool {
|
||||
ch <- token
|
||||
return true
|
||||
})
|
||||
llm, err := llama.New(req.Model, req.Options)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
defer llm.Close()
|
||||
|
||||
predictOpts := getPredictOpts(req)
|
||||
fn := func(s string) {
|
||||
ch <- api.GenerateResponse{Response: s}
|
||||
}
|
||||
|
||||
go func() {
|
||||
defer close(ch)
|
||||
_, err := model.Predict(req.Prompt, predictOpts)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if err := llm.Predict(req.Prompt, fn); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func pull(c *gin.Context) {
|
||||
var req api.PullRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
remote, err := getRemote(req.Model)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
ch := make(chan any)
|
||||
go stream(c, ch)
|
||||
|
||||
fn := func(total, completed int64) {
|
||||
ch <- api.PullProgress{
|
||||
Total: total,
|
||||
Completed: completed,
|
||||
Percent: float64(total) / float64(completed) * 100,
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
c.Stream(func(w io.Writer) bool {
|
||||
token, ok := <-ch
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
resp := api.GenerateResponse{
|
||||
Response: token,
|
||||
}
|
||||
|
||||
bts, err := json.Marshal(resp)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
bts = append(bts, '\n')
|
||||
if _, err := w.Write(bts); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
if err := saveModel(remote, fn); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func Serve(ln net.Listener) error {
|
||||
@@ -129,53 +129,7 @@ func Serve(ln net.Listener) error {
|
||||
c.String(http.StatusOK, "Ollama is running")
|
||||
})
|
||||
|
||||
r.POST("api/pull", func(c *gin.Context) {
|
||||
var req api.PullRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
progressCh := make(chan api.PullProgress)
|
||||
go func() {
|
||||
defer close(progressCh)
|
||||
if err := pull(req.Model, progressCh); err != nil {
|
||||
var opError *net.OpError
|
||||
if errors.As(err, &opError) {
|
||||
result := api.PullProgress{
|
||||
Error: api.Error{
|
||||
Code: http.StatusBadGateway,
|
||||
Message: "failed to get models from directory",
|
||||
},
|
||||
}
|
||||
c.JSON(http.StatusBadGateway, result)
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
||||
c.Stream(func(w io.Writer) bool {
|
||||
progress, ok := <-progressCh
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
bts, err := json.Marshal(progress)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
bts = append(bts, '\n')
|
||||
if _, err := w.Write(bts); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
})
|
||||
|
||||
r.POST("api/pull", pull)
|
||||
r.POST("/api/generate", generate)
|
||||
|
||||
log.Printf("Listening on %s", ln.Addr())
|
||||
@@ -198,52 +152,23 @@ func matchRankOne(source string, targets []string) (bestMatch string, bestRank i
|
||||
return
|
||||
}
|
||||
|
||||
func getModelOpts(req api.GenerateRequest) llama.ModelOptions {
|
||||
var opts llama.ModelOptions
|
||||
opts.ContextSize = req.ModelOptions.ContextSize
|
||||
opts.Seed = req.ModelOptions.Seed
|
||||
opts.F16Memory = req.ModelOptions.F16Memory
|
||||
opts.MLock = req.ModelOptions.MLock
|
||||
opts.Embeddings = req.ModelOptions.Embeddings
|
||||
opts.MMap = req.ModelOptions.MMap
|
||||
opts.LowVRAM = req.ModelOptions.LowVRAM
|
||||
func stream(c *gin.Context, ch chan any) {
|
||||
c.Stream(func(w io.Writer) bool {
|
||||
val, ok := <-ch
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
opts.NBatch = req.ModelOptions.NBatch
|
||||
opts.VocabOnly = req.ModelOptions.VocabOnly
|
||||
opts.NUMA = req.ModelOptions.NUMA
|
||||
opts.NGPULayers = req.ModelOptions.NGPULayers
|
||||
opts.MainGPU = req.ModelOptions.MainGPU
|
||||
opts.TensorSplit = req.ModelOptions.TensorSplit
|
||||
bts, err := json.Marshal(val)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return opts
|
||||
}
|
||||
|
||||
func getPredictOpts(req api.GenerateRequest) llama.PredictOptions {
|
||||
var opts llama.PredictOptions
|
||||
|
||||
if req.PredictOptions.Threads == -1 {
|
||||
opts.Threads = runtime.NumCPU()
|
||||
} else {
|
||||
opts.Threads = req.PredictOptions.Threads
|
||||
}
|
||||
|
||||
opts.Seed = req.PredictOptions.Seed
|
||||
opts.Tokens = req.PredictOptions.Tokens
|
||||
opts.Penalty = req.PredictOptions.Penalty
|
||||
opts.Repeat = req.PredictOptions.Repeat
|
||||
opts.Batch = req.PredictOptions.Batch
|
||||
opts.NKeep = req.PredictOptions.NKeep
|
||||
opts.TopK = req.PredictOptions.TopK
|
||||
opts.TopP = req.PredictOptions.TopP
|
||||
opts.TailFreeSamplingZ = req.PredictOptions.TailFreeSamplingZ
|
||||
opts.TypicalP = req.PredictOptions.TypicalP
|
||||
opts.Temperature = req.PredictOptions.Temperature
|
||||
opts.FrequencyPenalty = req.PredictOptions.FrequencyPenalty
|
||||
opts.PresencePenalty = req.PredictOptions.PresencePenalty
|
||||
opts.Mirostat = req.PredictOptions.Mirostat
|
||||
opts.MirostatTAU = req.PredictOptions.MirostatTAU
|
||||
opts.MirostatETA = req.PredictOptions.MirostatETA
|
||||
opts.MMap = req.PredictOptions.MMap
|
||||
|
||||
return opts
|
||||
bts = append(bts, '\n')
|
||||
if _, err := w.Write(bts); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,44 +1,42 @@
|
||||
import { NextResponse } from 'next/server'
|
||||
import semver from 'semver'
|
||||
import { Octokit } from '@octokit/rest'
|
||||
import { RequestError } from '@octokit/types'
|
||||
|
||||
const octokit = new Octokit()
|
||||
|
||||
export async function GET(req: Request) {
|
||||
const { searchParams } = new URL(req.url)
|
||||
|
||||
const os = searchParams.get('os') || ''
|
||||
const version = searchParams.get('version') || ''
|
||||
const os = searchParams.get('os') || 'darwin'
|
||||
const version = searchParams.get('version') || '0.0.0'
|
||||
|
||||
if (!version) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
try {
|
||||
const { data } = await octokit.repos.getLatestRelease({
|
||||
owner: 'jmorganca',
|
||||
repo: 'ollama',
|
||||
})
|
||||
const res = await fetch('https://api.github.com/repos/jmorganca/ollama/releases', { next: { revalidate: 60 } })
|
||||
const data = await res.json()
|
||||
|
||||
// todo: get the correct asset for the current arch/os
|
||||
const asset = data.assets.find(a => a.name.toLowerCase().includes(os))
|
||||
|
||||
if (!asset) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
if (semver.lt(version, data.tag_name)) {
|
||||
return NextResponse.json({ version: data.tag_name, url: asset.browser_download_url })
|
||||
}
|
||||
|
||||
return new Response('up to date', { status: 204 })
|
||||
} catch (error) {
|
||||
const e = error as RequestError
|
||||
if (e.status === 404) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
return new Response('internal server error', { status: 500 })
|
||||
if (data.length === 0) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
const latest = data[0]
|
||||
const assets = latest.assets || []
|
||||
|
||||
if (assets.length === 0) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
// todo: get the correct asset for the current arch/os
|
||||
const asset = assets.find((a: any) => a.name.toLowerCase().includes(os) && a.name.toLowerCase().includes('.zip'))
|
||||
|
||||
if (!asset) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
console.log(asset)
|
||||
|
||||
if (semver.lt(version, latest.tag_name)) {
|
||||
return NextResponse.json({ version: data.tag_name, url: asset.browser_download_url })
|
||||
}
|
||||
|
||||
return new Response(null, { status: 204 })
|
||||
}
|
||||
|
||||
@@ -1,16 +1,28 @@
|
||||
import { Octokit } from '@octokit/rest'
|
||||
import { redirect } from 'next/navigation'
|
||||
|
||||
const octokit = new Octokit()
|
||||
|
||||
export default async function Download() {
|
||||
const { data } = await octokit.repos.getLatestRelease({
|
||||
owner: 'jmorganca',
|
||||
repo: 'ollama',
|
||||
})
|
||||
const res = await fetch('https://api.github.com/repos/jmorganca/ollama/releases', { next: { revalidate: 60 } })
|
||||
const data = await res.json()
|
||||
|
||||
if (data.length === 0) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
const latest = data[0]
|
||||
const assets = latest.assets || []
|
||||
|
||||
if (assets.length === 0) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
// todo: get the correct asset for the current arch/os
|
||||
const asset = data.assets.find(a => a.name.toLowerCase().includes('darwin') && a.name.toLowerCase().includes('.zip'))
|
||||
const asset = assets.find(
|
||||
(a: any) => a.name.toLowerCase().includes('darwin') && a.name.toLowerCase().includes('.zip')
|
||||
)
|
||||
|
||||
if (!asset) {
|
||||
return new Response('not found', { status: 404 })
|
||||
}
|
||||
|
||||
if (asset) {
|
||||
redirect(asset.browser_download_url)
|
||||
|
||||
88
web/package-lock.json
generated
88
web/package-lock.json
generated
@@ -17,7 +17,7 @@
|
||||
"encoding": "^0.1.13",
|
||||
"eslint": "8.44.0",
|
||||
"eslint-config-next": "13.4.7",
|
||||
"next": "13.4.7",
|
||||
"next": "13.4.9",
|
||||
"postcss": "8.4.24",
|
||||
"react": "18.2.0",
|
||||
"react-dom": "18.2.0",
|
||||
@@ -191,9 +191,9 @@
|
||||
"integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw=="
|
||||
},
|
||||
"node_modules/@next/env": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/env/-/env-13.4.7.tgz",
|
||||
"integrity": "sha512-ZlbiFulnwiFsW9UV1ku1OvX/oyIPLtMk9p/nnvDSwI0s7vSoZdRtxXNsaO+ZXrLv/pMbXVGq4lL8TbY9iuGmVw=="
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/env/-/env-13.4.9.tgz",
|
||||
"integrity": "sha512-vuDRK05BOKfmoBYLNi2cujG2jrYbEod/ubSSyqgmEx9n/W3eZaJQdRNhTfumO+qmq/QTzLurW487n/PM/fHOkw=="
|
||||
},
|
||||
"node_modules/@next/eslint-plugin-next": {
|
||||
"version": "13.4.7",
|
||||
@@ -204,9 +204,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-darwin-arm64": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.7.tgz",
|
||||
"integrity": "sha512-VZTxPv1b59KGiv/pZHTO5Gbsdeoxcj2rU2cqJu03btMhHpn3vwzEK0gUSVC/XW96aeGO67X+cMahhwHzef24/w==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.9.tgz",
|
||||
"integrity": "sha512-TVzGHpZoVBk3iDsTOQA/R6MGmFp0+17SWXMEWd6zG30AfuELmSSMe2SdPqxwXU0gbpWkJL1KgfLzy5ReN0crqQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -219,9 +219,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-darwin-x64": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.7.tgz",
|
||||
"integrity": "sha512-gO2bw+2Ymmga+QYujjvDz9955xvYGrWofmxTq7m70b9pDPvl7aDFABJOZ2a8SRCuSNB5mXU8eTOmVVwyp/nAew==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.9.tgz",
|
||||
"integrity": "sha512-aSfF1fhv28N2e7vrDZ6zOQ+IIthocfaxuMWGReB5GDriF0caTqtHttAvzOMgJgXQtQx6XhyaJMozLTSEXeNN+A==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -234,9 +234,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-arm64-gnu": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.7.tgz",
|
||||
"integrity": "sha512-6cqp3vf1eHxjIDhEOc7Mh/s8z1cwc/l5B6ZNkOofmZVyu1zsbEM5Hmx64s12Rd9AYgGoiCz4OJ4M/oRnkE16/Q==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.9.tgz",
|
||||
"integrity": "sha512-JhKoX5ECzYoTVyIy/7KykeO4Z2lVKq7HGQqvAH+Ip9UFn1MOJkOnkPRB7v4nmzqAoY+Je05Aj5wNABR1N18DMg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -249,9 +249,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-arm64-musl": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.7.tgz",
|
||||
"integrity": "sha512-T1kD2FWOEy5WPidOn1si0rYmWORNch4a/NR52Ghyp4q7KyxOCuiOfZzyhVC5tsLIBDH3+cNdB5DkD9afpNDaOw==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.9.tgz",
|
||||
"integrity": "sha512-OOn6zZBIVkm/4j5gkPdGn4yqQt+gmXaLaSjRSO434WplV8vo2YaBNbSHaTM9wJpZTHVDYyjzuIYVEzy9/5RVZw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -264,9 +264,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-x64-gnu": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.7.tgz",
|
||||
"integrity": "sha512-zaEC+iEiAHNdhl6fuwl0H0shnTzQoAoJiDYBUze8QTntE/GNPfTYpYboxF5LRYIjBwETUatvE0T64W6SKDipvg==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.9.tgz",
|
||||
"integrity": "sha512-iA+fJXFPpW0SwGmx/pivVU+2t4zQHNOOAr5T378PfxPHY6JtjV6/0s1vlAJUdIHeVpX98CLp9k5VuKgxiRHUpg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -279,9 +279,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-x64-musl": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.7.tgz",
|
||||
"integrity": "sha512-X6r12F8d8SKAtYJqLZBBMIwEqcTRvUdVm+xIq+l6pJqlgT2tNsLLf2i5Cl88xSsIytBICGsCNNHd+siD2fbWBA==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.9.tgz",
|
||||
"integrity": "sha512-rlNf2WUtMM+GAQrZ9gMNdSapkVi3koSW3a+dmBVp42lfugWVvnyzca/xJlN48/7AGx8qu62WyO0ya1ikgOxh6A==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -294,9 +294,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-arm64-msvc": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.7.tgz",
|
||||
"integrity": "sha512-NPnmnV+vEIxnu6SUvjnuaWRglZzw4ox5n/MQTxeUhb5iwVWFedolPFebMNwgrWu4AELwvTdGtWjqof53AiWHcw==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.9.tgz",
|
||||
"integrity": "sha512-5T9ybSugXP77nw03vlgKZxD99AFTHaX8eT1ayKYYnGO9nmYhJjRPxcjU5FyYI+TdkQgEpIcH7p/guPLPR0EbKA==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -309,9 +309,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-ia32-msvc": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.7.tgz",
|
||||
"integrity": "sha512-6Hxijm6/a8XqLQpOOf/XuwWRhcuc/g4rBB2oxjgCMuV9Xlr2bLs5+lXyh8w9YbAUMYR3iC9mgOlXbHa79elmXw==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.9.tgz",
|
||||
"integrity": "sha512-ojZTCt1lP2ucgpoiFgrFj07uq4CZsq4crVXpLGgQfoFq00jPKRPgesuGPaz8lg1yLfvafkU3Jd1i8snKwYR3LA==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
@@ -324,9 +324,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-x64-msvc": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.4.7.tgz",
|
||||
"integrity": "sha512-sW9Yt36Db1nXJL+mTr2Wo0y+VkPWeYhygvcHj1FF0srVtV+VoDjxleKtny21QHaG05zdeZnw2fCtf2+dEqgwqA==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.4.9.tgz",
|
||||
"integrity": "sha512-QbT03FXRNdpuL+e9pLnu+XajZdm/TtIXVYY4lA9t+9l0fLZbHXDYEKitAqxrOj37o3Vx5ufxiRAniaIebYDCgw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -2983,11 +2983,11 @@
|
||||
"integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw=="
|
||||
},
|
||||
"node_modules/next": {
|
||||
"version": "13.4.7",
|
||||
"resolved": "https://registry.npmjs.org/next/-/next-13.4.7.tgz",
|
||||
"integrity": "sha512-M8z3k9VmG51SRT6v5uDKdJXcAqLzP3C+vaKfLIAM0Mhx1um1G7MDnO63+m52qPdZfrTFzMZNzfsgvm3ghuVHIQ==",
|
||||
"version": "13.4.9",
|
||||
"resolved": "https://registry.npmjs.org/next/-/next-13.4.9.tgz",
|
||||
"integrity": "sha512-vtefFm/BWIi/eWOqf1GsmKG3cjKw1k3LjuefKRcL3iiLl3zWzFdPG3as6xtxrGO6gwTzzaO1ktL4oiHt/uvTjA==",
|
||||
"dependencies": {
|
||||
"@next/env": "13.4.7",
|
||||
"@next/env": "13.4.9",
|
||||
"@swc/helpers": "0.5.1",
|
||||
"busboy": "1.6.0",
|
||||
"caniuse-lite": "^1.0.30001406",
|
||||
@@ -3003,15 +3003,15 @@
|
||||
"node": ">=16.8.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@next/swc-darwin-arm64": "13.4.7",
|
||||
"@next/swc-darwin-x64": "13.4.7",
|
||||
"@next/swc-linux-arm64-gnu": "13.4.7",
|
||||
"@next/swc-linux-arm64-musl": "13.4.7",
|
||||
"@next/swc-linux-x64-gnu": "13.4.7",
|
||||
"@next/swc-linux-x64-musl": "13.4.7",
|
||||
"@next/swc-win32-arm64-msvc": "13.4.7",
|
||||
"@next/swc-win32-ia32-msvc": "13.4.7",
|
||||
"@next/swc-win32-x64-msvc": "13.4.7"
|
||||
"@next/swc-darwin-arm64": "13.4.9",
|
||||
"@next/swc-darwin-x64": "13.4.9",
|
||||
"@next/swc-linux-arm64-gnu": "13.4.9",
|
||||
"@next/swc-linux-arm64-musl": "13.4.9",
|
||||
"@next/swc-linux-x64-gnu": "13.4.9",
|
||||
"@next/swc-linux-x64-musl": "13.4.9",
|
||||
"@next/swc-win32-arm64-msvc": "13.4.9",
|
||||
"@next/swc-win32-ia32-msvc": "13.4.9",
|
||||
"@next/swc-win32-x64-msvc": "13.4.9"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@opentelemetry/api": "^1.1.0",
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
"encoding": "^0.1.13",
|
||||
"eslint": "8.44.0",
|
||||
"eslint-config-next": "13.4.7",
|
||||
"next": "13.4.7",
|
||||
"next": "13.4.9",
|
||||
"postcss": "8.4.24",
|
||||
"react": "18.2.0",
|
||||
"react-dom": "18.2.0",
|
||||
|
||||
5
web/vercel.json
Normal file
5
web/vercel.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"github": {
|
||||
"silent": true
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user