app: trim server lines before logging

app: bundle real ggml-metal.metal instead of symlink
use go build on publish
2023-07-11 16:43:19 -07:00 · 2023-07-11 16:36:39 -07:00 · 2023-07-11 16:17:45 -07:00 · 2023-07-11 16:16:38 -07:00 · 2023-07-11 15:58:56 -07:00 · 2023-07-11 15:52:22 -07:00
44 changed files with 36166 additions and 1895 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,8 +3,5 @@
 .env
 .venv
 *.spec
-build
 dist
-__pycache__
 ollama
-ggml-metal.metal
--- a/19
+++ b/19
@@ -1,19 +0,0 @@
-default: ollama
-
-.PHONY: llama
-llama:
-	cmake -S llama -B llama/build -DLLAMA_METAL=on
-	cmake --build llama/build
-
-.PHONY: ollama
-ollama: llama
-	go build .
-
-.PHONY: app
-app: ollama
-	npm install --prefix app
-	npm run --prefix app make:sign
-
-clean:
-	go clean
-	rm -rf llama/build
--- a/README.md
+++ b/README.md
@@ -42,12 +42,20 @@ ollama run vicuna "Why is the sky blue?"

 ### 🗺️ Instructions

-Ask questions. Get answers.
+Get a helping hand.

 ```
 ollama run orca "Write an email to my boss."
 ```

+### 🔎 Ask questions about documents
+
+Send the contents of a document and ask questions about it.
+
+```
+ollama run nous-hermes "$(cat input.txt)", please summarize this story
+```
+
 ### 📖 Storytelling

 Venture into the unknown.
@@ -67,7 +75,7 @@ ollama run ~/Downloads/vicuna-7b-v1.3.ggmlv3.q4_1.bin
 ## Building

 ```
-make
+go build .
 ```

 To run it start the server:
--- a/api/client.go
+++ b/api/client.go
@@ -5,11 +5,25 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
-	"io"
+	"fmt"
 	"net/http"
 	"net/url"
 )

+type StatusError struct {
+	StatusCode int
+	Status     string
+	Message    string
+}
+
+func (e StatusError) Error() string {
+	if e.Message != "" {
+		return fmt.Sprintf("%s: %s", e.Status, e.Message)
+	}
+
+	return e.Status
+}
+
 type Client struct {
 	base url.URL
 }
@@ -25,35 +39,18 @@ func NewClient(hosts ...string) *Client {
 	}
 }

-type options struct {
-	requestBody  io.Reader
-	responseFunc func(bts []byte) error
-}
+func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
+	var buf *bytes.Buffer
+	if data != nil {
+		bts, err := json.Marshal(data)
+		if err != nil {
+			return err
+		}

-func OptionRequestBody(data any) func(*options) {
-	bts, err := json.Marshal(data)
-	if err != nil {
-		panic(err)
+		buf = bytes.NewBuffer(bts)
 	}

-	return func(opts *options) {
-		opts.requestBody = bytes.NewReader(bts)
-	}
-}
-
-func OptionResponseFunc(fn func([]byte) error) func(*options) {
-	return func(opts *options) {
-		opts.responseFunc = fn
-	}
-}
-
-func (c *Client) stream(ctx context.Context, method, path string, fns ...func(*options)) error {
-	var opts options
-	for _, fn := range fns {
-		fn(&opts)
-	}
-
-	request, err := http.NewRequestWithContext(ctx, method, c.base.JoinPath(path).String(), opts.requestBody)
+	request, err := http.NewRequestWithContext(ctx, method, c.base.JoinPath(path).String(), buf)
 	if err != nil {
 		return err
 	}
@@ -67,13 +64,28 @@ func (c *Client) stream(ctx context.Context, method, path string, fns ...func(*o
 	}
 	defer response.Body.Close()

-	if opts.responseFunc != nil {
-		scanner := bufio.NewScanner(response.Body)
-		for scanner.Scan() {
-			if err := opts.responseFunc(scanner.Bytes()); err != nil {
-				return err
+	scanner := bufio.NewScanner(response.Body)
+	for scanner.Scan() {
+		var errorResponse struct {
+			Error string `json:"error,omitempty"`
+		}
+
+		bts := scanner.Bytes()
+		if err := json.Unmarshal(bts, &errorResponse); err != nil {
+			return fmt.Errorf("unmarshal: %w", err)
+		}
+
+		if response.StatusCode >= 400 {
+			return StatusError{
+				StatusCode: response.StatusCode,
+				Status:     response.Status,
+				Message:    errorResponse.Error,
 			}
 		}
+
+		if err := fn(bts); err != nil {
+			return err
+		}
 	}

 	return nil
@@ -82,36 +94,25 @@ func (c *Client) stream(ctx context.Context, method, path string, fns ...func(*o
 type GenerateResponseFunc func(GenerateResponse) error

 func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn GenerateResponseFunc) error {
-	return c.stream(ctx, http.MethodPost, "/api/generate",
-		OptionRequestBody(req),
-		OptionResponseFunc(func(bts []byte) error {
-			var resp GenerateResponse
-			if err := json.Unmarshal(bts, &resp); err != nil {
-				return err
-			}
+	return c.stream(ctx, http.MethodPost, "/api/generate", req, func(bts []byte) error {
+		var resp GenerateResponse
+		if err := json.Unmarshal(bts, &resp); err != nil {
+			return err
+		}

-			return fn(resp)
-		}),
-	)
+		return fn(resp)
+	})
 }

 type PullProgressFunc func(PullProgress) error

 func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
-	return c.stream(ctx, http.MethodPost, "/api/pull",
-		OptionRequestBody(req),
-		OptionResponseFunc(func(bts []byte) error {
-			var resp PullProgress
-			if err := json.Unmarshal(bts, &resp); err != nil {
-				return err
-			}
+	return c.stream(ctx, http.MethodPost, "/api/pull", req, func(bts []byte) error {
+		var resp PullProgress
+		if err := json.Unmarshal(bts, &resp); err != nil {
+			return err
+		}

-			if resp.Error.Message != "" {
-				// couldn't pull the model from the directory, proceed anyway
-				return nil
-			}
-
-			return fn(resp)
-		}),
-	)
+		return fn(resp)
+	})
 }
--- a/api/types.go
+++ b/api/types.go
@@ -1,22 +1,6 @@
 package api

-import (
-	"fmt"
-	"net/http"
-	"strings"
-)
-
-type Error struct {
-	Code    int32  `json:"code"`
-	Message string `json:"message"`
-}
-
-func (e Error) Error() string {
-	if e.Message == "" {
-		return fmt.Sprintf("%d %v", e.Code, strings.ToLower(http.StatusText(int(e.Code))))
-	}
-	return e.Message
-}
+import "runtime"

 type PullRequest struct {
 	Model string `json:"model"`
@@ -26,100 +10,82 @@ type PullProgress struct {
 	Total     int64   `json:"total"`
 	Completed int64   `json:"completed"`
 	Percent   float64 `json:"percent"`
-	Error     Error   `json:"error"`
 }

 type GenerateRequest struct {
 	Model  string `json:"model"`
 	Prompt string `json:"prompt"`

-	ModelOptions   `json:"model_opts,omitempty"`
-	PredictOptions `json:"predict_opts,omitempty"`
-}
-
-type ModelOptions struct {
-	ContextSize int    `json:"context_size,omitempty"`
-	Seed        int    `json:"seed,omitempty"`
-	NBatch      int    `json:"n_batch,omitempty"`
-	F16Memory   bool   `json:"memory_f16,omitempty"`
-	MLock       bool   `json:"mlock,omitempty"`
-	MMap        bool   `json:"mmap,omitempty"`
-	VocabOnly   bool   `json:"vocab_only,omitempty"`
-	LowVRAM     bool   `json:"low_vram,omitempty"`
-	Embeddings  bool   `json:"embeddings,omitempty"`
-	NUMA        bool   `json:"numa,omitempty"`
-	NGPULayers  int    `json:"gpu_layers,omitempty"`
-	MainGPU     string `json:"main_gpu,omitempty"`
-	TensorSplit string `json:"tensor_split,omitempty"`
-}
-
-type PredictOptions struct {
-	Seed        int     `json:"seed,omitempty"`
-	Threads     int     `json:"threads,omitempty"`
-	Tokens      int     `json:"tokens,omitempty"`
-	TopK        int     `json:"top_k,omitempty"`
-	Repeat      int     `json:"repeat,omitempty"`
-	Batch       int     `json:"batch,omitempty"`
-	NKeep       int     `json:"nkeep,omitempty"`
-	TopP        float64 `json:"top_p,omitempty"`
-	Temperature float64 `json:"temp,omitempty"`
-	Penalty     float64 `json:"penalty,omitempty"`
-	F16KV       bool
-	DebugMode   bool
-	StopPrompts []string
-	IgnoreEOS   bool `json:"ignore_eos,omitempty"`
-
-	TailFreeSamplingZ float64 `json:"tfs_z,omitempty"`
-	TypicalP          float64 `json:"typical_p,omitempty"`
-	FrequencyPenalty  float64 `json:"freq_penalty,omitempty"`
-	PresencePenalty   float64 `json:"pres_penalty,omitempty"`
-	Mirostat          int     `json:"mirostat,omitempty"`
-	MirostatETA       float64 `json:"mirostat_lr,omitempty"`
-	MirostatTAU       float64 `json:"mirostat_ent,omitempty"`
-	PenalizeNL        bool    `json:"penalize_nl,omitempty"`
-	LogitBias         string  `json:"logit_bias,omitempty"`
-
-	PathPromptCache string
-	MLock           bool `json:"mlock,omitempty"`
-	MMap            bool `json:"mmap,omitempty"`
-	PromptCacheAll  bool
-	PromptCacheRO   bool
-	MainGPU         string
-	TensorSplit     string
-}
-
-var DefaultModelOptions ModelOptions = ModelOptions{
-	ContextSize: 128,
-	Seed:        0,
-	F16Memory:   true,
-	MLock:       false,
-	Embeddings:  true,
-	MMap:        true,
-	LowVRAM:     false,
-}
-
-var DefaultPredictOptions PredictOptions = PredictOptions{
-	Seed:              -1,
-	Threads:           -1,
-	Tokens:            512,
-	Penalty:           1.1,
-	Repeat:            64,
-	Batch:             512,
-	NKeep:             64,
-	TopK:              90,
-	TopP:              0.86,
-	TailFreeSamplingZ: 1.0,
-	TypicalP:          1.0,
-	Temperature:       0.8,
-	FrequencyPenalty:  0.0,
-	PresencePenalty:   0.0,
-	Mirostat:          0,
-	MirostatTAU:       5.0,
-	MirostatETA:       0.1,
-	MMap:              true,
-	StopPrompts:       []string{"llama"},
+	Options `json:"options"`
 }

 type GenerateResponse struct {
 	Response string `json:"response"`
 }
+
+type Options struct {
+	Seed int `json:"seed,omitempty"`
+
+	// Backend options
+	UseNUMA bool `json:"numa,omitempty"`
+
+	// Model options
+	NumCtx        int  `json:"num_ctx,omitempty"`
+	NumBatch      int  `json:"num_batch,omitempty"`
+	NumGPU        int  `json:"num_gpu,omitempty"`
+	MainGPU       int  `json:"main_gpu,omitempty"`
+	LowVRAM       bool `json:"low_vram,omitempty"`
+	F16KV         bool `json:"f16_kv,omitempty"`
+	LogitsAll     bool `json:"logits_all,omitempty"`
+	VocabOnly     bool `json:"vocab_only,omitempty"`
+	UseMMap       bool `json:"use_mmap,omitempty"`
+	UseMLock      bool `json:"use_mlock,omitempty"`
+	EmbeddingOnly bool `json:"embedding_only,omitempty"`
+
+	// Predict options
+	RepeatLastN      int     `json:"repeat_last_n,omitempty"`
+	RepeatPenalty    float32 `json:"repeat_penalty,omitempty"`
+	FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
+	PresencePenalty  float32 `json:"presence_penalty,omitempty"`
+	Temperature      float32 `json:"temperature,omitempty"`
+	TopK             int     `json:"top_k,omitempty"`
+	TopP             float32 `json:"top_p,omitempty"`
+	TFSZ             float32 `json:"tfs_z,omitempty"`
+	TypicalP         float32 `json:"typical_p,omitempty"`
+	Mirostat         int     `json:"mirostat,omitempty"`
+	MirostatTau      float32 `json:"mirostat_tau,omitempty"`
+	MirostatEta      float32 `json:"mirostat_eta,omitempty"`
+
+	NumThread int `json:"num_thread,omitempty"`
+}
+
+func DefaultOptions() Options {
+	return Options{
+		Seed: -1,
+
+		UseNUMA: false,
+
+		NumCtx:   512,
+		NumBatch: 512,
+		NumGPU:   1,
+		LowVRAM:  false,
+		F16KV:    true,
+		UseMMap:  true,
+		UseMLock: false,
+
+		RepeatLastN:      512,
+		RepeatPenalty:    1.1,
+		FrequencyPenalty: 0.0,
+		PresencePenalty:  0.0,
+		Temperature:      0.8,
+		TopK:             40,
+		TopP:             0.9,
+		TFSZ:             1.0,
+		TypicalP:         1.0,
+		Mirostat:         0,
+		MirostatTau:      5.0,
+		MirostatEta:      0.1,
+
+		NumThread: runtime.NumCPU(),
+	}
+}
--- a/app/forge.config.ts
+++ b/app/forge.config.ts
@@ -21,7 +21,7 @@ const config: ForgeConfig = {
      '../ollama',
      path.join(__dirname, './assets/ollama_icon_16x16Template.png'),
      path.join(__dirname, './assets/ollama_icon_16x16Template@2x.png'),
-      ...(process.platform === 'darwin' ? ['../ggml-metal.metal'] : []),
+      ...(process.platform === 'darwin' ? ['../llama/ggml-metal.metal'] : []),
    ],
    ...(process.env.SIGN
      ? {
--- a/app/package-lock.json
+++ b/app/package-lock.json
@@ -15,7 +15,9 @@
        "electron-store": "^8.1.0",
        "react": "^18.2.0",
        "react-dom": "^18.2.0",
-        "uuid": "^9.0.0"
+        "uuid": "^9.0.0",
+        "winston": "^3.10.0",
+        "winston-daily-rotate-file": "^4.7.1"
      },
      "devDependencies": {
        "@babel/core": "^7.22.5",
@@ -610,6 +612,14 @@
        "node": ">=6.9.0"
      }
    },
+    "node_modules/@colors/colors": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
+      "integrity": "sha512-ooWCrlZP11i8GImSjTHYHLkvFDP48nS4+204nGb1RiX/WXYHmJA2III9/e2DWVabCESdW7hBAEzHRqUn9OUVvQ==",
+      "engines": {
+        "node": ">=0.1.90"
+      }
+    },
    "node_modules/@cspotcode/source-map-support": {
      "version": "0.8.1",
      "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz",
@@ -1319,6 +1329,16 @@
        "postcss-selector-parser": "^6.0.10"
      }
    },
+    "node_modules/@dabh/diagnostics": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.3.tgz",
+      "integrity": "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==",
+      "dependencies": {
+        "colorspace": "1.1.x",
+        "enabled": "2.0.x",
+        "kuler": "^2.0.0"
+      }
+    },
    "node_modules/@discoveryjs/json-ext": {
      "version": "0.5.7",
      "resolved": "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz",
@@ -2825,6 +2845,11 @@
        "@types/node": "*"
      }
    },
+    "node_modules/@types/triple-beam": {
+      "version": "1.3.2",
+      "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.2.tgz",
+      "integrity": "sha512-txGIh+0eDFzKGC25zORnswy+br1Ha7hj5cMVwKIU7+s0U2AxxJru/jZSMU6OC9MJWP6+pc/hc6ZjyZShpsyY2g=="
+    },
    "node_modules/@types/uuid": {
      "version": "9.0.2",
      "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.2.tgz",
@@ -3634,6 +3659,11 @@
        "node": ">=8"
      }
    },
+    "node_modules/async": {
+      "version": "3.2.4",
+      "resolved": "https://registry.npmjs.org/async/-/async-3.2.4.tgz",
+      "integrity": "sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ=="
+    },
    "node_modules/at-least-node": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/at-least-node/-/at-least-node-1.0.0.tgz",
@@ -4394,6 +4424,15 @@
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
+    "node_modules/color": {
+      "version": "3.2.1",
+      "resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz",
+      "integrity": "sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==",
+      "dependencies": {
+        "color-convert": "^1.9.3",
+        "color-string": "^1.6.0"
+      }
+    },
    "node_modules/color-convert": {
      "version": "2.0.1",
      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
@@ -4409,8 +4448,16 @@
    "node_modules/color-name": {
      "version": "1.1.4",
      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
-      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
-      "dev": true
+      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA=="
+    },
+    "node_modules/color-string": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz",
+      "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==",
+      "dependencies": {
+        "color-name": "^1.0.0",
+        "simple-swizzle": "^0.2.2"
+      }
    },
    "node_modules/color-support": {
      "version": "1.1.3",
@@ -4421,12 +4468,34 @@
        "color-support": "bin.js"
      }
    },
+    "node_modules/color/node_modules/color-convert": {
+      "version": "1.9.3",
+      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
+      "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
+      "dependencies": {
+        "color-name": "1.1.3"
+      }
+    },
+    "node_modules/color/node_modules/color-name": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
+      "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw=="
+    },
    "node_modules/colorette": {
      "version": "2.0.20",
      "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz",
      "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==",
      "dev": true
    },
+    "node_modules/colorspace": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.4.tgz",
+      "integrity": "sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==",
+      "dependencies": {
+        "color": "^3.1.3",
+        "text-hex": "1.0.x"
+      }
+    },
    "node_modules/commander": {
      "version": "5.1.0",
      "resolved": "https://registry.npmjs.org/commander/-/commander-5.1.0.tgz",
@@ -5893,6 +5962,11 @@
        "node": ">= 4"
      }
    },
+    "node_modules/enabled": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz",
+      "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ=="
+    },
    "node_modules/encodeurl": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
@@ -6654,6 +6728,11 @@
        "pend": "~1.2.0"
      }
    },
+    "node_modules/fecha": {
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz",
+      "integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw=="
+    },
    "node_modules/file-entry-cache": {
      "version": "6.0.1",
      "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz",
@@ -6666,6 +6745,14 @@
        "node": "^10.12.0 || >=12.0.0"
      }
    },
+    "node_modules/file-stream-rotator": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmjs.org/file-stream-rotator/-/file-stream-rotator-0.6.1.tgz",
+      "integrity": "sha512-u+dBid4PvZw17PmDeRcNOtCP9CCK/9lRN2w+r1xIS7yOL9JFrIBKTvrYsxT4P0pGtThYTn++QS5ChHaUov3+zQ==",
+      "dependencies": {
+        "moment": "^2.29.1"
+      }
+    },
    "node_modules/filename-reserved-regex": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/filename-reserved-regex/-/filename-reserved-regex-2.0.0.tgz",
@@ -6834,6 +6921,11 @@
        "node": ">= 4.0.0"
      }
    },
+    "node_modules/fn.name": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz",
+      "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw=="
+    },
    "node_modules/follow-redirects": {
      "version": "1.15.2",
      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz",
@@ -7928,8 +8020,7 @@
    "node_modules/inherits": {
      "version": "2.0.4",
      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
-      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
-      "dev": true
+      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
    },
    "node_modules/ini": {
      "version": "1.3.8",
@@ -8243,7 +8334,6 @@
      "version": "2.0.1",
      "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
      "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
-      "dev": true,
      "engines": {
        "node": ">=8"
      },
@@ -8547,6 +8637,11 @@
        "node": ">=0.10.0"
      }
    },
+    "node_modules/kuler": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
+      "integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A=="
+    },
    "node_modules/launch-editor": {
      "version": "2.6.0",
      "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.6.0.tgz",
@@ -8788,6 +8883,19 @@
        "node": ">=8"
      }
    },
+    "node_modules/logform": {
+      "version": "2.5.1",
+      "resolved": "https://registry.npmjs.org/logform/-/logform-2.5.1.tgz",
+      "integrity": "sha512-9FyqAm9o9NKKfiAKfZoYo9bGXXuwMkxQiQttkT4YjjVtQVIQtK6LmVtlxmCaFswo6N4AfEkHqZTV0taDtPotNg==",
+      "dependencies": {
+        "@colors/colors": "1.5.0",
+        "@types/triple-beam": "^1.3.2",
+        "fecha": "^4.2.0",
+        "ms": "^2.1.1",
+        "safe-stable-stringify": "^2.3.1",
+        "triple-beam": "^1.3.0"
+      }
+    },
    "node_modules/loose-envify": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
@@ -9218,6 +9326,14 @@
        "node": ">=10"
      }
    },
+    "node_modules/moment": {
+      "version": "2.29.4",
+      "resolved": "https://registry.npmjs.org/moment/-/moment-2.29.4.tgz",
+      "integrity": "sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==",
+      "engines": {
+        "node": "*"
+      }
+    },
    "node_modules/ms": {
      "version": "2.1.2",
      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
@@ -9623,6 +9739,14 @@
        "wrappy": "1"
      }
    },
+    "node_modules/one-time": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz",
+      "integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==",
+      "dependencies": {
+        "fn.name": "1.x.x"
+      }
+    },
    "node_modules/onetime": {
      "version": "5.1.2",
      "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
@@ -11366,7 +11490,6 @@
      "version": "3.6.2",
      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
      "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
-      "dev": true,
      "dependencies": {
        "inherits": "^2.0.3",
        "string_decoder": "^1.1.1",
@@ -11664,7 +11787,6 @@
      "version": "5.2.1",
      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
      "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
-      "dev": true,
      "funding": [
        {
          "type": "github",
@@ -11694,6 +11816,14 @@
        "url": "https://github.com/sponsors/ljharb"
      }
    },
+    "node_modules/safe-stable-stringify": {
+      "version": "2.4.3",
+      "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.4.3.tgz",
+      "integrity": "sha512-e2bDA2WJT0wxseVd4lsDP4+3ONX6HpMXQa1ZhFQ7SU+GjvORCmShbCMltrtIDfkYhVHrOcPtj+KhmDBdPdZD1g==",
+      "engines": {
+        "node": ">=10"
+      }
+    },
    "node_modules/safer-buffer": {
      "version": "2.1.2",
      "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
@@ -12023,6 +12153,19 @@
      "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==",
      "dev": true
    },
+    "node_modules/simple-swizzle": {
+      "version": "0.2.2",
+      "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
+      "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==",
+      "dependencies": {
+        "is-arrayish": "^0.3.1"
+      }
+    },
+    "node_modules/simple-swizzle/node_modules/is-arrayish": {
+      "version": "0.3.2",
+      "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
+      "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ=="
+    },
    "node_modules/slash": {
      "version": "3.0.0",
      "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz",
@@ -12212,6 +12355,14 @@
        "node": "^14.17.0 || ^16.13.0 || >=18.0.0"
      }
    },
+    "node_modules/stack-trace": {
+      "version": "0.0.10",
+      "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
+      "integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==",
+      "engines": {
+        "node": "*"
+      }
+    },
    "node_modules/statuses": {
      "version": "2.0.1",
      "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
@@ -12225,7 +12376,6 @@
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
      "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
-      "dev": true,
      "dependencies": {
        "safe-buffer": "~5.2.0"
      }
@@ -12689,6 +12839,11 @@
      "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==",
      "dev": true
    },
+    "node_modules/text-hex": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
+      "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
+    },
    "node_modules/text-table": {
      "version": "0.2.0",
      "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
@@ -12814,6 +12969,14 @@
        "node": ">=0.8.0"
      }
    },
+    "node_modules/triple-beam": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz",
+      "integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==",
+      "engines": {
+        "node": ">= 14.0.0"
+      }
+    },
    "node_modules/ts-interface-checker": {
      "version": "0.1.13",
      "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz",
@@ -13231,8 +13394,7 @@
    "node_modules/util-deprecate": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
-      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
-      "dev": true
+      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="
    },
    "node_modules/utila": {
      "version": "0.4.0",
@@ -13764,6 +13926,65 @@
      "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==",
      "dev": true
    },
+    "node_modules/winston": {
+      "version": "3.10.0",
+      "resolved": "https://registry.npmjs.org/winston/-/winston-3.10.0.tgz",
+      "integrity": "sha512-nT6SIDaE9B7ZRO0u3UvdrimG0HkB7dSTAgInQnNR2SOPJ4bvq5q79+pXLftKmP52lJGW15+H5MCK0nM9D3KB/g==",
+      "dependencies": {
+        "@colors/colors": "1.5.0",
+        "@dabh/diagnostics": "^2.0.2",
+        "async": "^3.2.3",
+        "is-stream": "^2.0.0",
+        "logform": "^2.4.0",
+        "one-time": "^1.0.0",
+        "readable-stream": "^3.4.0",
+        "safe-stable-stringify": "^2.3.1",
+        "stack-trace": "0.0.x",
+        "triple-beam": "^1.3.0",
+        "winston-transport": "^4.5.0"
+      },
+      "engines": {
+        "node": ">= 12.0.0"
+      }
+    },
+    "node_modules/winston-daily-rotate-file": {
+      "version": "4.7.1",
+      "resolved": "https://registry.npmjs.org/winston-daily-rotate-file/-/winston-daily-rotate-file-4.7.1.tgz",
+      "integrity": "sha512-7LGPiYGBPNyGHLn9z33i96zx/bd71pjBn9tqQzO3I4Tayv94WPmBNwKC7CO1wPHdP9uvu+Md/1nr6VSH9h0iaA==",
+      "dependencies": {
+        "file-stream-rotator": "^0.6.1",
+        "object-hash": "^2.0.1",
+        "triple-beam": "^1.3.0",
+        "winston-transport": "^4.4.0"
+      },
+      "engines": {
+        "node": ">=8"
+      },
+      "peerDependencies": {
+        "winston": "^3"
+      }
+    },
+    "node_modules/winston-daily-rotate-file/node_modules/object-hash": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-2.2.0.tgz",
+      "integrity": "sha512-gScRMn0bS5fH+IuwyIFgnh9zBdo4DV+6GhygmWM9HyNJSgS0hScp1f5vjtm7oIIOiT9trXrShAkLFSc2IqKNgw==",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/winston-transport": {
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.5.0.tgz",
+      "integrity": "sha512-YpZzcUzBedhlTAfJg6vJDlyEai/IFMIVcaEZZyl3UXIl4gmqRpU7AE89AHLkbzLUsv0NVmw7ts+iztqKxxPW1Q==",
+      "dependencies": {
+        "logform": "^2.3.2",
+        "readable-stream": "^3.6.0",
+        "triple-beam": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 6.4.0"
+      }
+    },
    "node_modules/word-wrap": {
      "version": "1.2.3",
      "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
--- a/app/package.json
+++ b/app/package.json
@@ -69,6 +69,8 @@
    "electron-store": "^8.1.0",
    "react": "^18.2.0",
    "react-dom": "^18.2.0",
-    "uuid": "^9.0.0"
+    "uuid": "^9.0.0",
+    "winston": "^3.10.0",
+    "winston-daily-rotate-file": "^4.7.1"
  }
 }
--- a/app/src/index.ts
+++ b/app/src/index.ts
@@ -1,6 +1,8 @@
 import { spawn, exec } from 'child_process'
 import { app, autoUpdater, dialog, Tray, Menu } from 'electron'
 import Store from 'electron-store'
+import winston from 'winston'
+import 'winston-daily-rotate-file'
 import * as path from 'path'
 import * as fs from 'fs'

@@ -11,6 +13,18 @@ require('@electron/remote/main').initialize()
 const store = new Store()
 let tray: Tray | null = null

+const logger = winston.createLogger({
+  transports: [
+    new winston.transports.Console(),
+    new winston.transports.File({
+      filename: path.join(app.getPath('home'), '.ollama', 'logs', 'server.log'),
+      maxsize: 1024 * 1024 * 20,
+      maxFiles: 5,
+    }),
+  ],
+  format: winston.format.printf(info => `${info.message}`),
+})
+
 const SingleInstanceLock = app.requestSingleInstanceLock()
 if (!SingleInstanceLock) {
  app.quit()
@@ -31,42 +45,35 @@ const createSystemtray = () => {
  tray.setToolTip('Ollama')
 }

-// Handle creating/removing shortcuts on Windows when installing/uninstalling.
 if (require('electron-squirrel-startup')) {
  app.quit()
 }

 const ollama = path.join(process.resourcesPath, 'ollama')

-// if the app is packaged then run the server
-if (app.isPackaged) {
-  // Start the executable
-  console.log(`Starting server`)
-  const proc = spawn(ollama, ['serve'])
-  proc.stdout.on('data', data => {
-    console.log(`server: ${data}`)
-  })
-  proc.stderr.on('data', data => {
-    console.error(`server: ${data}`)
-  })
-
-  process.on('exit', () => {
-    proc.kill()
-  })
-}
-
 function server() {
  const binary = app.isPackaged
    ? path.join(process.resourcesPath, 'ollama')
-    : path.resolve(__dirname, '..', '..', 'ollama')
+    : path.resolve(process.cwd(), '..', 'ollama')

-  console.log(`Starting server`)
  const proc = spawn(binary, ['serve'])
+
  proc.stdout.on('data', data => {
-    console.log(`server: ${data}`)
+    logger.info(data.toString().trim())
  })
+
  proc.stderr.on('data', data => {
-    console.error(`server: ${data}`)
+    logger.error(data.toString().trim())
+  })
+
+  proc.on('exit', () => {
+    logger.info('Restarting the server...')
+    server()
+  })
+
+  proc.on('disconnect', () => {
+    logger.info('Server disconnected. Reconnecting...')
+    server()
  })

  process.on('exit', () => {
@@ -95,11 +102,12 @@ function installCLI() {
    `
        exec(`osascript -e '${command}'`, (error: Error | null, stdout: string, stderr: string) => {
          if (error) {
-            console.error(`exec error: ${error}`)
+            logger.error(`cli: failed to install cli: ${error.message}`)
            return
          }
-          console.log(`stdout: ${stdout}`)
-          console.error(`stderr: ${stderr}`)
+
+          logger.info(stdout)
+          logger.error(stderr)
        })
      }
    })
@@ -118,44 +126,44 @@ app.on('ready', () => {
      app.setLoginItemSettings({ openAtLogin: app.getLoginItemSettings().openAtLogin })
    }

-    if (!app.isInApplicationsFolder()) {
-      const chosen = dialog.showMessageBoxSync({
-        type: 'question',
-        buttons: ['Move to Applications', 'Do Not Move'],
-        message: 'Ollama works best when run from the Applications directory.',
-        defaultId: 0,
-        cancelId: 1,
-      })
+    if (app.isPackaged) {
+      if (!app.isInApplicationsFolder()) {
+        const chosen = dialog.showMessageBoxSync({
+          type: 'question',
+          buttons: ['Move to Applications', 'Do Not Move'],
+          message: 'Ollama works best when run from the Applications directory.',
+          defaultId: 0,
+          cancelId: 1,
+        })

-      if (chosen === 0) {
-        try {
-          app.moveToApplicationsFolder({
-            conflictHandler: conflictType => {
-              if (conflictType === 'existsAndRunning') {
-                dialog.showMessageBoxSync({
-                  type: 'info',
-                  message: 'Cannot move to Applications directory',
-                  detail:
-                    'Another version of Ollama is currently running from your Applications directory. Close it first and try again.',
-                })
-              }
-              return true
-            },
-          })
-          return
-        } catch (e) {
-          console.error('Failed to move to applications folder')
-          console.error(e)
+        if (chosen === 0) {
+          try {
+            app.moveToApplicationsFolder({
+              conflictHandler: conflictType => {
+                if (conflictType === 'existsAndRunning') {
+                  dialog.showMessageBoxSync({
+                    type: 'info',
+                    message: 'Cannot move to Applications directory',
+                    detail:
+                      'Another version of Ollama is currently running from your Applications directory. Close it first and try again.',
+                  })
+                }
+                return true
+              },
+            })
+            return
+          } catch (e) {
+            logger.error(`[Move to Applications] Failed to move to applications folder - ${e.message}`)
+          }
        }
      }
+
+      installCLI()
    }
  }

  createSystemtray()
-
-  if (app.isPackaged) {
-    installCLI()
-  }
+  server()
 })

 // Quit when all windows are closed, except on macOS. There, it's common
@@ -183,8 +191,6 @@ async function heartbeat() {
  })
 }

-heartbeat()
-
 if (app.isPackaged) {
  heartbeat()
  autoUpdater.checkForUpdates()
@@ -195,7 +201,7 @@ if (app.isPackaged) {
 }

 autoUpdater.on('error', e => {
-  console.error('update check failed', e)
+  logger.error(`update check failed - ${e.message}`)
 })

 autoUpdater.on('update-downloaded', (event, releaseNotes, releaseName) => {
--- a/app/src/telemetry.ts
+++ b/app/src/telemetry.ts
@@ -4,8 +4,6 @@ import Store from 'electron-store'

 const store = new Store()

-console.log(process.env)
-
 export const analytics = new Analytics({ writeKey: process.env.TELEMETRY_WRITE_KEY || '<empty>' })

 export function id(): string {
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"log"
 	"net"
+	"net/http"
 	"os"
 	"path"
 	"strings"
@@ -34,7 +35,14 @@ func RunRun(cmd *cobra.Command, args []string) error {
 	switch {
 	case errors.Is(err, os.ErrNotExist):
 		if err := pull(args[0]); err != nil {
-			return err
+			var apiStatusError api.StatusError
+			if !errors.As(err, &apiStatusError) {
+				return err
+			}
+
+			if apiStatusError.StatusCode != http.StatusBadGateway {
+				return err
+			}
 		}
 	case err != nil:
 		return err
@@ -50,11 +58,12 @@ func pull(model string) error {
 		context.Background(),
 		&api.PullRequest{Model: model},
 		func(progress api.PullProgress) error {
-			if bar == nil && progress.Percent == 100 {
-				// already downloaded
-				return nil
-			}
 			if bar == nil {
+				if progress.Percent == 100 {
+					// already downloaded
+					return nil
+				}
+
 				bar = progressbar.DefaultBytes(progress.Total)
 			}

@@ -64,8 +73,10 @@ func pull(model string) error {
 }

 func RunGenerate(_ *cobra.Command, args []string) error {
+	// join all args after the model name into a single prompt
+	prompt := strings.Join(args[1:], " ")
 	if len(args) > 1 {
-		return generateOneshot(args[0], args[1:]...)
+		return generate(args[0], prompt)
 	}

 	if term.IsTerminal(int(os.Stdin.Fd())) {
@@ -98,28 +109,22 @@ func generate(model, prompt string) error {
 			}
 		}()

-		client.Generate(context.Background(), &api.GenerateRequest{Model: model, Prompt: prompt}, func(resp api.GenerateResponse) error {
+		request := api.GenerateRequest{Model: model, Prompt: prompt}
+		fn := func(resp api.GenerateResponse) error {
 			if !spinner.IsFinished() {
 				spinner.Finish()
 			}

 			fmt.Print(resp.Response)
 			return nil
-		})
+		}

-		fmt.Println()
-		fmt.Println()
-	}
-
-	return nil
-}
-
-func generateOneshot(model string, prompts ...string) error {
-	for _, prompt := range prompts {
-		fmt.Printf(">>> %s\n", prompt)
-		if err := generate(model, prompt); err != nil {
+		if err := client.Generate(context.Background(), &request, fn); err != nil {
 			return err
 		}
+
+		fmt.Println()
+		fmt.Println()
 	}

 	return nil
--- a/examples/python/README.md
+++ b/examples/python/README.md
@@ -0,0 +1,15 @@
+# Python
+
+This is a simple example of calling the Ollama API from a Python app.
+
+First, download a model:
+
+```
+curl -L https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_1.bin -o orca.bin
+```
+
+Then run it using the example script. You'll need to have Ollama running on your machine.
+
+```
+python3 main.py orca.bin
+```
--- a/examples/python/main.py
+++ b/examples/python/main.py
@@ -0,0 +1,32 @@
+import http.client
+import json
+import os
+import sys
+
+if len(sys.argv) < 2:
+    print("Usage: python main.py <model file>")
+    sys.exit(1)
+
+conn = http.client.HTTPConnection('localhost', 11434)
+
+headers = { 'Content-Type': 'application/json' }
+
+# generate text from the model
+conn.request("POST", "/api/generate", json.dumps({
+    'model': os.path.join(os.getcwd(), sys.argv[1]),
+    'prompt': 'write me a short story',
+    'stream': True
+}), headers)
+
+response = conn.getresponse()
+
+def parse_generate(data):
+    for event in data.decode('utf-8').split("\n"):
+        if not event:
+            continue
+        yield event
+
+if response.status == 200:
+    for chunk in response:
+        for event in parse_generate(chunk):
+            print(json.loads(event)['response'], end="", flush=True)
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -0,0 +1 @@
+llama/ggml-metal.metal
--- a/llama/.gitignore
+++ b/llama/.gitignore
@@ -1 +0,0 @@
-build
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -1,23 +0,0 @@
-cmake_minimum_required(VERSION 3.12)
-project(binding)
-
-include(FetchContent)
-
-FetchContent_Declare(
-    llama_cpp
-    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-    GIT_TAG        55dbb91
-)
-
-FetchContent_MakeAvailable(llama_cpp)
-
-add_library(binding ${CMAKE_CURRENT_SOURCE_DIR}/binding/binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp)
-target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR}/examples)
-target_link_libraries(binding llama ggml_static)
-
-if (LLAMA_METAL)
-    configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_CURRENT_BINARY_DIR}/../../ggml-metal.metal COPYONLY)
-endif()
-
-add_custom_target(copy_libllama ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:llama> ${CMAKE_CURRENT_BINARY_DIR})
-add_custom_target(copy_libggml_static ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:ggml_static> ${CMAKE_CURRENT_BINARY_DIR})
--- a/llama/binding/binding.cpp
+++ b/llama/binding/binding.cpp
@@ -1,691 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include "binding.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <regex>
-#include <sstream>
-#include <string>
-#include <vector>
-#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <signal.h>
-#include <windows.h>
-#endif
-
-#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
-    defined(_WIN32)
-void sigint_handler(int signo) {
-  if (signo == SIGINT) {
-    _exit(130);
-  }
-}
-#endif
-
-int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings) {
-  gpt_params *params_p = (gpt_params *)params_ptr;
-  llama_context *ctx = (llama_context *)state_pr;
-  gpt_params params = *params_p;
-
-  if (params.seed <= 0) {
-    params.seed = time(NULL);
-  }
-
-  std::mt19937 rng(params.seed);
-
-  llama_init_backend(params.numa);
-
-  int n_past = 0;
-
-  // Add a space in front of the first character to match OG llama tokenizer
-  // behavior
-  params.prompt.insert(0, 1, ' ');
-
-  // tokenize the prompt
-  auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
-  // determine newline token
-  auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-  if (embd_inp.size() > 0) {
-    if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past,
-                   params.n_threads)) {
-      fprintf(stderr, "%s : failed to eval\n", __func__);
-      return 1;
-    }
-  }
-
-  const int n_embd = llama_n_embd(ctx);
-
-  const auto embeddings = llama_get_embeddings(ctx);
-
-  for (int i = 0; i < n_embd; i++) {
-    res_embeddings[i] = embeddings[i];
-  }
-
-  return 0;
-}
-
-int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
-                         int tokenSize, float *res_embeddings) {
-  gpt_params *params_p = (gpt_params *)params_ptr;
-  llama_context *ctx = (llama_context *)state_pr;
-  gpt_params params = *params_p;
-
-  for (int i = 0; i < tokenSize; i++) {
-    auto token_str = llama_token_to_str(ctx, tokens[i]);
-    if (token_str == nullptr) {
-      continue;
-    }
-    std::vector<std::string> my_vector;
-    std::string str_token(token_str); // create a new std::string from the char*
-    params_p->prompt += str_token;
-  }
-
-  return get_embeddings(params_ptr, state_pr, res_embeddings);
-}
-
-int eval(void *params_ptr, void *state_pr, char *text) {
-  gpt_params *params_p = (gpt_params *)params_ptr;
-  llama_context *ctx = (llama_context *)state_pr;
-
-  auto n_past = 0;
-  auto last_n_tokens_data =
-      std::vector<llama_token>(params_p->repeat_last_n, 0);
-
-  auto tokens = std::vector<llama_token>(params_p->n_ctx);
-  auto n_prompt_tokens =
-      llama_tokenize(ctx, text, tokens.data(), tokens.size(), true);
-
-  if (n_prompt_tokens < 1) {
-    fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
-    return 1;
-  }
-
-  // evaluate prompt
-  return llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past,
-                    params_p->n_threads);
-}
-
-int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug) {
-  gpt_params *params_p = (gpt_params *)params_ptr;
-  llama_context *ctx = (llama_context *)state_pr;
-
-  gpt_params params = *params_p;
-
-  const int n_ctx = llama_n_ctx(ctx);
-
-  if (params.seed <= 0) {
-    params.seed = time(NULL);
-  }
-
-  std::mt19937 rng(params.seed);
-
-  std::string path_session = params.path_prompt_cache;
-  std::vector<llama_token> session_tokens;
-
-  if (!path_session.empty()) {
-    if (debug) {
-      fprintf(stderr, "%s: attempting to load saved session from '%s'\n",
-              __func__, path_session.c_str());
-    }
-    // fopen to check for existing session
-    FILE *fp = std::fopen(path_session.c_str(), "rb");
-    if (fp != NULL) {
-      std::fclose(fp);
-
-      session_tokens.resize(n_ctx);
-      size_t n_token_count_out = 0;
-      if (!llama_load_session_file(
-              ctx, path_session.c_str(), session_tokens.data(),
-              session_tokens.capacity(), &n_token_count_out)) {
-        fprintf(stderr, "%s: error: failed to load session file '%s'\n",
-                __func__, path_session.c_str());
-        return 1;
-      }
-      session_tokens.resize(n_token_count_out);
-      llama_set_rng_seed(ctx, params.seed);
-      if (debug) {
-        fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n",
-                __func__, (int)session_tokens.size());
-      }
-    } else {
-      if (debug) {
-        fprintf(stderr, "%s: session file does not exist, will create\n",
-                __func__);
-      }
-    }
-  }
-
-  std::vector<llama_token> embd_inp;
-  if (!params.prompt.empty() || session_tokens.empty()) {
-    // Add a space in front of the first character to match OG llama tokenizer
-    // behavior
-    params.prompt.insert(0, 1, ' ');
-
-    embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-  } else {
-    embd_inp = session_tokens;
-  }
-
-  // debug message about similarity of saved session, if applicable
-  size_t n_matching_session_tokens = 0;
-  if (session_tokens.size()) {
-    for (llama_token id : session_tokens) {
-      if (n_matching_session_tokens >= embd_inp.size() ||
-          id != embd_inp[n_matching_session_tokens]) {
-        break;
-      }
-      n_matching_session_tokens++;
-    }
-    if (debug) {
-      if (params.prompt.empty() &&
-          n_matching_session_tokens == embd_inp.size()) {
-        fprintf(stderr, "%s: using full prompt from session file\n", __func__);
-      } else if (n_matching_session_tokens >= embd_inp.size()) {
-        fprintf(stderr, "%s: session file has exact match for prompt!\n",
-                __func__);
-      } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-        fprintf(stderr,
-                "%s: warning: session file has low similarity to prompt (%zu / "
-                "%zu tokens); will mostly be reevaluated\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-      } else {
-        fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-      }
-    }
-  }
-  // if we will use the cache for the full prompt without reaching the end of
-  // the cache, force reevaluation of the last token token to recalculate the
-  // cached logits
-  if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
-      session_tokens.size() > embd_inp.size()) {
-    session_tokens.resize(embd_inp.size() - 1);
-  }
-  // number of tokens to keep when resetting context
-  if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) {
-    params.n_keep = (int)embd_inp.size();
-  }
-
-  // determine newline token
-  auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-  // TODO: replace with ring-buffer
-  std::vector<llama_token> last_n_tokens(n_ctx);
-  std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-  bool need_to_save_session =
-      !path_session.empty() && n_matching_session_tokens < embd_inp.size();
-  int n_past = 0;
-  int n_remain = params.n_predict;
-  int n_consumed = 0;
-  int n_session_consumed = 0;
-
-  std::vector<llama_token> embd;
-  std::string res = "";
-
-  // do one empty run to warm up the model
-  {
-    const std::vector<llama_token> tmp = {
-        llama_token_bos(),
-    };
-    llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-    llama_reset_timings(ctx);
-  }
-
-  while (n_remain != 0) {
-    // predict
-    if (embd.size() > 0) {
-      // infinite text generation via context swapping
-      // if we run out of context:
-      // - take the n_keep first tokens from the original prompt (via n_past)
-      // - take half of the last (n_ctx - n_keep) tokens and recompute the
-      // logits in batches
-      if (n_past + (int)embd.size() > n_ctx) {
-        const int n_left = n_past - params.n_keep;
-
-        // always keep the first token - BOS
-        n_past = std::max(1, params.n_keep);
-
-        // insert n_left/2 tokens at the start of embd from last_n_tokens
-        embd.insert(embd.begin(),
-                    last_n_tokens.begin() + n_ctx - n_left / 2 - embd.size(),
-                    last_n_tokens.end() - embd.size());
-
-        // stop saving session if we run out of context
-        path_session.clear();
-
-        // printf("\n---\n");
-        // printf("resetting: '");
-        // for (int i = 0; i < (int) embd.size(); i++) {
-        //     printf("%s", llama_token_to_str(ctx, embd[i]));
-        // }
-        // printf("'\n");
-        // printf("\n---\n");
-      }
-
-      // try to reuse a matching prefix from the loaded session instead of
-      // re-eval (via n_past)
-      if (n_session_consumed < (int)session_tokens.size()) {
-        size_t i = 0;
-        for (; i < embd.size(); i++) {
-          if (embd[i] != session_tokens[n_session_consumed]) {
-            session_tokens.resize(n_session_consumed);
-            break;
-          }
-
-          n_past++;
-          n_session_consumed++;
-
-          if (n_session_consumed >= (int)session_tokens.size()) {
-            ++i;
-            break;
-          }
-        }
-        if (i > 0) {
-          embd.erase(embd.begin(), embd.begin() + i);
-        }
-      }
-
-      // evaluate tokens in batches
-      // embd is typically prepared beforehand to fit within a batch, but not
-      // always
-      for (int i = 0; i < (int)embd.size(); i += params.n_batch) {
-        int n_eval = (int)embd.size() - i;
-        if (n_eval > params.n_batch) {
-          n_eval = params.n_batch;
-        }
-        if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
-          fprintf(stderr, "%s : failed to eval\n", __func__);
-          return 1;
-        }
-        n_past += n_eval;
-      }
-
-      if (embd.size() > 0 && !path_session.empty()) {
-        session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
-        n_session_consumed = session_tokens.size();
-      }
-    }
-
-    embd.clear();
-
-    if ((int)embd_inp.size() <= n_consumed) {
-      // out of user input, sample next token
-      const float temp = params.temp;
-      const int32_t top_k =
-          params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
-      const float top_p = params.top_p;
-      const float tfs_z = params.tfs_z;
-      const float typical_p = params.typical_p;
-      const int32_t repeat_last_n =
-          params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-      const float repeat_penalty = params.repeat_penalty;
-      const float alpha_presence = params.presence_penalty;
-      const float alpha_frequency = params.frequency_penalty;
-      const int mirostat = params.mirostat;
-      const float mirostat_tau = params.mirostat_tau;
-      const float mirostat_eta = params.mirostat_eta;
-      const bool penalize_nl = params.penalize_nl;
-
-      // optionally save the session on first sample (for faster prompt loading
-      // next time)
-      if (!path_session.empty() && need_to_save_session &&
-          !params.prompt_cache_ro) {
-        need_to_save_session = false;
-        llama_save_session_file(ctx, path_session.c_str(),
-                                session_tokens.data(), session_tokens.size());
-      }
-
-      llama_token id = 0;
-
-      {
-        auto logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
-
-        // Apply params.logit_bias map
-        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end();
-             it++) {
-          logits[it->first] += it->second;
-        }
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-          candidates.emplace_back(
-              llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-
-        llama_token_data_array candidates_p = {candidates.data(),
-                                               candidates.size(), false};
-
-        // Apply penalties
-        float nl_logit = logits[llama_token_nl()];
-        auto last_n_repeat =
-            std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        llama_sample_repetition_penalty(
-            ctx, &candidates_p,
-            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-            last_n_repeat, repeat_penalty);
-        llama_sample_frequency_and_presence_penalties(
-            ctx, &candidates_p,
-            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-            last_n_repeat, alpha_frequency, alpha_presence);
-        if (!penalize_nl) {
-          logits[llama_token_nl()] = nl_logit;
-        }
-
-        if (temp <= 0) {
-          // Greedy sampling
-          id = llama_sample_token_greedy(ctx, &candidates_p);
-        } else {
-          if (mirostat == 1) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            const int mirostat_m = 100;
-            llama_sample_temperature(ctx, &candidates_p, temp);
-            id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau,
-                                             mirostat_eta, mirostat_m,
-                                             &mirostat_mu);
-          } else if (mirostat == 2) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            llama_sample_temperature(ctx, &candidates_p, temp);
-            id = llama_sample_token_mirostat_v2(
-                ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-          } else {
-            // Temperature sampling
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-            llama_sample_typical(ctx, &candidates_p, typical_p, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temperature(ctx, &candidates_p, temp);
-            id = llama_sample_token(ctx, &candidates_p);
-          }
-        }
-        // printf("`%d`", candidates_p.size);
-
-        last_n_tokens.erase(last_n_tokens.begin());
-        last_n_tokens.push_back(id);
-      }
-
-      // add it to the context
-      embd.push_back(id);
-
-      // decrement remaining sampling budget
-      --n_remain;
-
-      // call the token callback, no need to check if one is actually
-      // registered, that will be handled on the Go side.
-      auto token_str = llama_token_to_str(ctx, id);
-      if (!tokenCallback(state_pr, (char *)token_str)) {
-        break;
-      }
-    } else {
-      // some user input remains from prompt or interaction, forward it to
-      // processing
-      while ((int)embd_inp.size() > n_consumed) {
-        embd.push_back(embd_inp[n_consumed]);
-        last_n_tokens.erase(last_n_tokens.begin());
-        last_n_tokens.push_back(embd_inp[n_consumed]);
-        ++n_consumed;
-        if ((int)embd.size() >= params.n_batch) {
-          break;
-        }
-      }
-    }
-
-    for (auto id : embd) {
-      res += llama_token_to_str(ctx, id);
-    }
-
-    // check for stop prompt
-    if (params.antiprompt.size()) {
-      std::string last_output;
-      for (auto id : last_n_tokens) {
-        last_output += llama_token_to_str(ctx, id);
-      }
-      // Check if each of the reverse prompts appears at the end of the output.
-      for (std::string &antiprompt : params.antiprompt) {
-        // size_t extra_padding = params.interactive ? 0 : 2;
-        size_t extra_padding = 2;
-        size_t search_start_pos =
-            last_output.length() >
-                    static_cast<size_t>(antiprompt.length() + extra_padding)
-                ? last_output.length() -
-                      static_cast<size_t>(antiprompt.length() + extra_padding)
-                : 0;
-
-        if (last_output.find(antiprompt.c_str(), search_start_pos) !=
-            std::string::npos) {
-          goto end;
-        }
-      }
-    }
-
-    // end of text token
-    if (!embd.empty() && embd.back() == llama_token_eos()) {
-      break;
-    }
-  }
-
-  if (!path_session.empty() && params.prompt_cache_all &&
-      !params.prompt_cache_ro) {
-    if (debug) {
-      fprintf(stderr, "\n%s: saving final output to session file '%s'\n",
-              __func__, path_session.c_str());
-    }
-    llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(),
-                            session_tokens.size());
-  }
-
-end:
-#if defined(_WIN32)
-  signal(SIGINT, SIG_DFL);
-#endif
-
-  if (debug) {
-    llama_print_timings(ctx);
-    llama_reset_timings(ctx);
-  }
-
-  strcpy(result, res.c_str());
-  return 0;
-}
-
-void llama_binding_free_model(void *state_ptr) {
-  llama_context *ctx = (llama_context *)state_ptr;
-  llama_free(ctx);
-}
-
-void llama_free_params(void *params_ptr) {
-  gpt_params *params = (gpt_params *)params_ptr;
-  delete params;
-}
-
-std::vector<std::string> create_vector(const char **strings, int count) {
-  std::vector<std::string> *vec = new std::vector<std::string>;
-  for (int i = 0; i < count; i++) {
-    vec->push_back(std::string(strings[i]));
-  }
-  return *vec;
-}
-
-void delete_vector(std::vector<std::string> *vec) { delete vec; }
-
-int load_state(void *ctx, char *statefile, char *modes) {
-  llama_context *state = (llama_context *)ctx;
-  const llama_context *constState = static_cast<const llama_context *>(state);
-  const size_t state_size = llama_get_state_size(state);
-  uint8_t *state_mem = new uint8_t[state_size];
-
-  {
-    FILE *fp_read = fopen(statefile, modes);
-    if (state_size != llama_get_state_size(constState)) {
-      fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
-      return 1;
-    }
-
-    const size_t ret = fread(state_mem, 1, state_size, fp_read);
-    if (ret != state_size) {
-      fprintf(stderr, "\n%s : failed to read state\n", __func__);
-      return 1;
-    }
-
-    llama_set_state_data(
-        state, state_mem); // could also read directly from memory mapped file
-    fclose(fp_read);
-  }
-
-  return 0;
-}
-
-void save_state(void *ctx, char *dst, char *modes) {
-  llama_context *state = (llama_context *)ctx;
-
-  const size_t state_size = llama_get_state_size(state);
-  uint8_t *state_mem = new uint8_t[state_size];
-
-  // Save state (rng, logits, embedding and kv_cache) to file
-  {
-    FILE *fp_write = fopen(dst, modes);
-    llama_copy_state_data(
-        state, state_mem); // could also copy directly to memory mapped file
-    fwrite(state_mem, 1, state_size, fp_write);
-    fclose(fp_write);
-  }
-}
-
-void *llama_allocate_params(
-    const char *prompt, int seed, int threads, int tokens, int top_k,
-    float top_p, float temp, float repeat_penalty, int repeat_last_n,
-    bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
-    const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
-    float frequency_penalty, float presence_penalty, int mirostat,
-    float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
-    const char *tensorsplit) {
-  gpt_params *params = new gpt_params;
-  params->seed = seed;
-  params->n_threads = threads;
-  params->n_predict = tokens;
-  params->repeat_last_n = repeat_last_n;
-  params->top_k = top_k;
-  params->top_p = top_p;
-  params->memory_f16 = memory_f16;
-  params->temp = temp;
-  params->use_mmap = mmap;
-  params->use_mlock = mlock;
-  params->repeat_penalty = repeat_penalty;
-  params->n_batch = n_batch;
-  params->n_keep = n_keep;
-  if (maingpu[0] != '\0') {
-    params->main_gpu = std::stoi(maingpu);
-  }
-
-  if (tensorsplit[0] != '\0') {
-    std::string arg_next = tensorsplit;
-    // split string by , and /
-    const std::regex regex{R"([,/]+)"};
-    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-    std::vector<std::string> split_arg{it, {}};
-    GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
-    for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
-      if (i < split_arg.size()) {
-        params->tensor_split[i] = std::stof(split_arg[i]);
-      } else {
-        params->tensor_split[i] = 0.0f;
-      }
-    }
-  }
-
-  if (ignore_eos) {
-    params->logit_bias[llama_token_eos()] = -INFINITY;
-  }
-  if (antiprompt_count > 0) {
-    params->antiprompt = create_vector(antiprompt, antiprompt_count);
-  }
-  params->tfs_z = tfs_z;
-  params->typical_p = typical_p;
-  params->presence_penalty = presence_penalty;
-  params->mirostat = mirostat;
-  params->mirostat_eta = mirostat_eta;
-  params->mirostat_tau = mirostat_tau;
-  params->penalize_nl = penalize_nl;
-  std::stringstream ss(logit_bias);
-  llama_token key;
-  char sign;
-  std::string value_str;
-  if (ss >> key && ss >> sign && std::getline(ss, value_str) &&
-      (sign == '+' || sign == '-')) {
-    params->logit_bias[key] =
-        std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-  }
-  params->frequency_penalty = frequency_penalty;
-  params->prompt = prompt;
-
-  return params;
-}
-
-void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
-                 bool mlock, bool embeddings, bool mmap, bool low_vram,
-                 bool vocab_only, int n_gpu_layers, int n_batch,
-                 const char *maingpu, const char *tensorsplit, bool numa) {
-  // load the model
-  auto lparams = llama_context_default_params();
-
-  lparams.n_ctx = n_ctx;
-  lparams.seed = n_seed;
-  lparams.f16_kv = memory_f16;
-  lparams.embedding = embeddings;
-  lparams.use_mlock = mlock;
-  lparams.n_gpu_layers = n_gpu_layers;
-  lparams.use_mmap = mmap;
-  lparams.low_vram = low_vram;
-  lparams.vocab_only = vocab_only;
-
-  if (maingpu[0] != '\0') {
-    lparams.main_gpu = std::stoi(maingpu);
-  }
-
-  if (tensorsplit[0] != '\0') {
-    std::string arg_next = tensorsplit;
-    // split string by , and /
-    const std::regex regex{R"([,/]+)"};
-    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-    std::vector<std::string> split_arg{it, {}};
-    GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
-    for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
-      if (i < split_arg.size()) {
-        lparams.tensor_split[i] = std::stof(split_arg[i]);
-      } else {
-        lparams.tensor_split[i] = 0.0f;
-      }
-    }
-  }
-
-  lparams.n_batch = n_batch;
-
-  llama_init_backend(numa);
-  void *res = nullptr;
-  try {
-    res = llama_init_from_file(fname, lparams);
-  } catch (std::runtime_error &e) {
-    fprintf(stderr, "failed %s", e.what());
-    return res;
-  }
-
-  return res;
-}
--- a/llama/binding/binding.h
+++ b/llama/binding/binding.h
@@ -1,48 +0,0 @@
-#ifdef __cplusplus
-#include <string>
-#include <vector>
-extern "C" {
-#endif
-
-#include <stdbool.h>
-
-extern unsigned char tokenCallback(void *, char *);
-
-int load_state(void *ctx, char *statefile, char *modes);
-
-int eval(void *params_ptr, void *ctx, char *text);
-
-void save_state(void *ctx, char *dst, char *modes);
-
-void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
-                 bool mlock, bool embeddings, bool mmap, bool low_vram,
-                 bool vocab_only, int n_gpu, int n_batch, const char *maingpu,
-                 const char *tensorsplit, bool numa);
-
-int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings);
-
-int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
-                         int tokenSize, float *res_embeddings);
-
-void *llama_allocate_params(
-    const char *prompt, int seed, int threads, int tokens, int top_k,
-    float top_p, float temp, float repeat_penalty, int repeat_last_n,
-    bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
-    const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
-    float frequency_penalty, float presence_penalty, int mirostat,
-    float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
-    const char *tensorsplit);
-
-void llama_free_params(void *params_ptr);
-
-void llama_binding_free_model(void *state);
-
-int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug);
-
-#ifdef __cplusplus
-}
-
-std::vector<std::string> create_vector(const char **strings, int count);
-void delete_vector(std::vector<std::string> *vec);
-#endif
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -0,0 +1,62 @@
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_CUDA_MAX_DEVICES       16
+
+void   ggml_init_cublas(void);
+void   ggml_cuda_set_tensor_split(const float * tensor_split);
+
+void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+// TODO: export these with GGML_API
+void * ggml_cuda_host_malloc(size_t size);
+void   ggml_cuda_host_free(void * ptr);
+
+void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+void   ggml_cuda_set_main_device(int main_device);
+void   ggml_cuda_set_scratch_size(size_t scratch_size);
+void   ggml_cuda_free_scratch(void);
+bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/llama/ggml-metal.h
+++ b/llama/ggml-metal.h
@@ -0,0 +1,97 @@
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// An interface allowing to compute ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_metal_context;
+
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size,
+                           size_t   max_size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/llama/ggml-metal.m
+++ b/llama/ggml-metal.m
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
--- a/llama/ggml.c
+++ b/llama/ggml.c
--- a/llama/ggml.h
+++ b/llama/ggml.h
--- a/llama/k_quants.c
+++ b/llama/k_quants.c
--- a/llama/k_quants.h
+++ b/llama/k_quants.h
@@ -0,0 +1,183 @@
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml.h"
+
+#include <stdint.h>
+#include <assert.h>
+#include <stddef.h>
+
+// Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
+
+//
+// Super-block quantization structures
+//
+
+// 2-bit quantization
+// weight is represented as x = a * q + b
+// 16 blocks of 16 elements each (with the default QK_K = 256)
+// Effectively 2.625 bits per weight
+// (QK_K/16 + QK_K/4 + 2*sizeof(ggml_fp16_t) = 84 bytes per 256 weights, assuming a 2-byte ggml_fp16_t)
+typedef struct {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    ggml_fp16_t d;           // super-block scale for quantized scales
+    ggml_fp16_t dmin;        // super-block scale for quantized mins
+} block_q2_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+// 3-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each (with the default QK_K = 256)
+// Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
+// 64-weight super-block variant: only the scales field differs (2 bytes instead of 12)
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[2];
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[12];        // scales, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
+
+// 4-bit quantization
+// 8 blocks of 32 elements each (with the default QK_K = 256)
+// weight is represented as x = a * q + b
+// Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+// 64-weight super-block variant: two fp16 values and two bytes of 4-bit block scales/mins
+typedef struct {
+    ggml_fp16_t d[2];          // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4-bit quants
+} block_q4_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
+typedef struct {
+    ggml_fp16_t d;             // super-block scale for quantized scales
+    ggml_fp16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+} block_q4_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
+
+// 5-bit quantization
+// 8 blocks of 32 elements each (with the default QK_K = 256)
+// weight is represented as x = a * q + b
+// Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+// 64-weight super-block variant: one 8-bit scale per 16 weights and no separate mins field
+typedef struct {
+    ggml_fp16_t d;               // super-block scale
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+} block_q5_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+    ggml_fp16_t d;               // super-block scale for quantized scales
+    ggml_fp16_t dmin;            // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+} block_q5_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
+
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each (with the default QK_K = 256)
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    ggml_fp16_t d;           // super-block scale
+} block_q6_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
+
+// This is only used for intermediate quantization and dot products
+// One float scale plus QK_K signed 8-bit quants and one 16-bit partial sum per 16 quants.
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK_K];       // quants
+    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
+} block_q8_K;
+// the struct must be exactly the sum of its fields (no compiler-inserted padding)
+static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
+
+
+// Quantization
+void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
+void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
+void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
+void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
+void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
+void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+
+void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+
+// Dequantization
+void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
+void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
+void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
+void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
+void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+
+// Dot product
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+// Quantization with histogram collection
+size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
--- a/llama/llama-util.h
+++ b/llama/llama-util.h
@@ -0,0 +1,530 @@
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
+#endif
+
+// Runtime check: if `x` evaluates to false, print the file, line and the
+// stringified expression to stderr and abort() the process.
+// The macro is not conditional on NDEBUG, so the check stays active in every build.
+// Wrapped in do { ... } while (0) so it behaves as a single statement.
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+// printf-style formatting that returns the result as a std::string.
+//
+// The arguments are formatted twice: first with a NULL buffer to measure the
+// required length, then into a buffer of exactly that length plus the NUL
+// terminator. Aborts via LLAMA_ASSERT if vsnprintf reports an error or the
+// two passes disagree on the length.
+//
+// On GCC-compatible compilers the format attribute asks the compiler to check
+// the format string (argument 1) against the variadic arguments (from argument 2).
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    // the first vsnprintf consumes `ap`, so keep a copy for the second pass
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    // construct from (pointer, length) so the trailing NUL is not included
+    return std::string(buf.data(), size);
+}
+
+// Owns a C FILE *: the constructor opens the file and records its size, the
+// destructor closes it.
+// Open/read/write failures throw std::runtime_error; tell/seek failures abort
+// the process through LLAMA_ASSERT.
+// NOTE(review): copy construction/assignment is not disabled here, so a copy
+// would fclose() the same FILE * twice — confirm no caller copies this struct.
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size; // file size in bytes, measured once in the constructor
+
+    // Opens `fname` with the given fopen() mode and measures the file size by
+    // seeking to the end; the position is reset to the start afterwards.
+    // Throws std::runtime_error (including strerror(errno)) if fopen fails.
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    // Returns the current file position. Uses the 64-bit _ftelli64 on Windows
+    // and ftell (long) elsewhere.
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    // Moves the file position; `whence` is one of SEEK_SET/SEEK_CUR/SEEK_END.
+    // Outside Windows the offset is narrowed to long before calling fseek.
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    // Reads exactly `len` bytes into `ptr`. A zero-length read is a no-op.
+    // Throws std::runtime_error on a stream error or if fewer than `len`
+    // bytes are available.
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        // request a single item of `len` bytes: the result is 1 only on a full read
+        std::size_t ret = std::fread(ptr, len, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    // Reads a 32-bit unsigned integer as raw bytes (no byte-order conversion is performed).
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    // Reads `len` raw bytes and returns them as a std::string of that length.
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    // Writes exactly `len` bytes from `ptr`. A zero-length write is a no-op.
+    // Throws std::runtime_error if the write is incomplete.
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, len, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
+        }
+    }
+
+    // Writes a 32-bit unsigned integer as raw bytes (no byte-order conversion is performed).
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    // Closes the underlying stream if it was opened.
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+// Windows only: converts a Win32 error code (e.g. from GetLastError()) into
+// the system-provided message text.
+// Returns the literal "FormatMessageA failed" when no message can be produced.
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    // FORMAT_MESSAGE_ALLOCATE_BUFFER makes the API allocate `buf` for us;
+    // it is released with LocalFree below.
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+// Maps an entire llama_file into memory for the lifetime of the object and
+// unmaps it in the destructor (on platforms where mapping is supported).
+// SUPPORTED tells callers at compile time whether this platform has an
+// implementation; the fallback constructor always throws.
+// Note that the second constructor parameter differs by platform: a byte
+// count on POSIX, a bool on Windows and in the fallback.
+struct llama_mmap {
+    void * addr; // start of the mapping
+    size_t size; // mapping length in bytes (copied from file->size)
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    // Maps the whole file as a private, readable and writable mapping.
+    // `prefetch` is the number of bytes to ask the kernel to preload (the
+    // default means "everything"); `numa` disables prefetch and requests
+    // random-access behaviour instead.
+    // Throws std::runtime_error if mmap fails; madvise failures only warn.
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_PRIVATE;
+        // prefetch/readahead impairs performance on NUMA systems
+        if (numa) { prefetch = 0; }
+#ifdef __linux__
+        if (prefetch) { flags |= MAP_POPULATE; }
+#endif
+        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
+        }
+
+        if (prefetch > 0) {
+            // Advise the kernel to preload the mapped memory
+            // (clamped so the advice never extends past the end of the mapping)
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+        if (numa) {
+            // advise the kernel not to use readahead
+            // (because the next page might not belong on the same node)
+            if (madvise(addr, file->size, MADV_RANDOM)) {
+                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+    }
+
+    // Releases the mapping; the munmap result is not checked.
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    // Maps the whole file through a read-only file-mapping object viewed with
+    // FILE_MAP_COPY. `numa` is ignored on this platform.
+    // Throws std::runtime_error if the mapping or the view cannot be created;
+    // a prefetch failure only warns.
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) numa;
+
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        // capture the error code immediately, before any other API call can overwrite it
+        DWORD error = GetLastError();
+
+        if (hMapping == NULL) {
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
+        error = GetLastError();
+        // the mapping handle is closed right after the view is created
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
+        }
+
+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+        }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    // Releases the view; a failure is reported on stderr but not propagated.
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    // Fallback for platforms without a mapping implementation: always throws.
+    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+        (void) prefetch;
+        (void) numa;
+
+        throw std::runtime_error(std::string("mmap not supported"));
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+//
+// Usage: call init() once with the base address, then grow_to() with the
+// number of bytes (from that base) that should be locked. The locked range
+// only ever grows. After the first locking failure the object stops trying
+// (failed_already) so the warning is printed only once.
+// SUPPORTED tells callers at compile time whether locking is implemented.
+struct llama_mlock {
+    void * addr = NULL;          // base address of the region, set by init()
+    size_t size = 0;             // number of bytes currently locked, starting at addr
+    bool failed_already = false; // set after the first failed lock attempt
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    // Unlocks whatever has been locked so far.
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    // Sets the base address. May only be called once, before anything is locked.
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
+    }
+
+    // Extends the locked range so that it covers at least `target_size` bytes
+    // from the base address. The target is rounded up to the lock granularity
+    // and only the not-yet-locked tail is passed to raw_lock().
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        // round up to a multiple of the granularity (the mask trick assumes a power of two)
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    // Locking granularity: the system page size.
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    // Hint appended to the mlock failure warning. The resource limit is named
+    // RLIMIT_MEMLOCK, matching the constant passed to getrlimit() in raw_lock().
+    #ifdef __APPLE__
+        #define MLOCK_SUGGESTION \
+            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
+    #else
+        #define MLOCK_SUGGESTION \
+            "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
+    #endif
+
+    // Locks [addr, addr + size) with mlock(). Returns true on success; on
+    // failure prints a warning (optionally with MLOCK_SUGGESTION) and returns false.
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            // capture the message and errno-derived state before getrlimit() can change errno
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            // `size` is the failed request (parameter); `this->size` is what was locked before
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+            return false;
+        }
+    }
+
+    #undef MLOCK_SUGGESTION
+
+    // Unlocks [addr, addr + size); a failure is reported on stderr only.
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    // Locking granularity: the system page size.
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    // Locks [ptr, ptr + len) with VirtualLock(). If the first attempt fails,
+    // the process working-set limits are raised by len + 1 MiB and the lock is
+    // retried once. Returns false (after printing a warning) on failure.
+    bool raw_lock(void * ptr, size_t len) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(ptr, len)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                    len, size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = len + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    // Unlocks [ptr, ptr + len); a failure is reported on stderr only.
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    // No real locking here; a fixed 64 KiB granularity is reported.
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    // Always fails with a warning, which makes grow_to() give up after the first call.
+    bool raw_lock(const void * addr, size_t len) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    // Nothing to unlock on unsupported platforms.
+    void raw_unlock(const void * addr, size_t len) {}
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+// Owns a single heap allocation; copy and move are disabled.
+struct llama_buffer {
+    uint8_t * addr = NULL; // start of the allocation, or NULL if none
+    size_t size = 0;       // size in bytes last requested through resize()
+
+    llama_buffer() = default;
+
+    // Replaces the allocation with a new one of `len` bytes. The previous
+    // contents are freed, not copied.
+    // With GGML_USE_METAL the memory is page-aligned and zero-filled; otherwise
+    // it comes from new[] and is left uninitialized.
+    // NOTE(review): if posix_memalign fails, addr becomes NULL while size is
+    // still set to `len` — callers must check addr in that configuration.
+    void resize(size_t len) {
+#ifdef GGML_USE_METAL
+        free(addr);
+        int result = posix_memalign((void **) &addr, getpagesize(), len);
+        if (result == 0) {
+            memset(addr, 0, len);
+        }
+        else {
+            addr = NULL;
+        }
+#else
+        delete[] addr;
+        addr = new uint8_t[len];
+#endif
+        size = len;
+    }
+
+    // Releases the allocation with the deallocator matching resize().
+    ~llama_buffer() {
+#ifdef GGML_USE_METAL
+        free(addr);
+#else
+        delete[] addr;
+#endif
+        addr = NULL;
+    }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
+};
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+// Buffer that first tries to allocate through ggml_cuda_host_malloc() and
+// falls back to ordinary new[] memory when that returns NULL.
+// Without GGML_USE_CUBLAS this type is simply an alias for llama_buffer.
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL; // start of the allocation, or NULL if none
+    bool is_cuda;          // which allocator produced addr; only meaningful while addr != NULL
+    size_t size = 0;       // size in bytes last requested through resize()
+
+    llama_ctx_buffer() = default;
+
+    // Replaces the allocation with a new one of `size` bytes. The previous
+    // contents are freed, not copied.
+    void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
+        // the parameter shadows the member, hence the explicit this->
+        this->size = size;
+    }
+
+    // Releases the allocation with the deallocator matching how it was
+    // obtained. Safe to call when nothing is allocated. Does not reset `size`.
+    void free() {
+        if (addr) {
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
+        }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
+#endif
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -1,217 +1,234 @@
-// MIT License
-
-// Copyright (c) 2023 go-skynet authors
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
 package llama

-// #cgo LDFLAGS: -Lbuild -lbinding -lllama -lm -lggml_static -lstdc++
-// #cgo CXXFLAGS: -std=c++11
-// #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-// #include "binding/binding.h"
-// #include <stdlib.h>
-import "C"
+/*
+#cgo CPPFLAGS: -O3 -DNDEBUG=1
+#cgo CXXFLAGS: -std=c++11
+#cgo darwin CPPFLAGS: -DGGML_USE_METAL=1 -DGGML_METAL_NDEBUG=1
+#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+#include <stdlib.h>
+#include "llama.h"

+// Sampling settings passed from Go to llama_sample.
+struct llama_sample_options
+{
+	// penalties applied to the candidates before any sampler runs
+	float repeat_penalty;
+	float frequency_penalty;
+	float presence_penalty;
+	// values <= 0 make llama_sample return the greedy token
+	float temperature;
+	// filters used only when mirostat is neither 1 nor 2
+	int32_t top_k;
+	float top_p;
+	float tfs_z;
+	float typical_p;
+	// 1 selects llama_sample_token_mirostat, 2 selects
+	// llama_sample_token_mirostat_v2, any other value the filter chain
+	int mirostat;
+	float mirostat_tau;
+	float mirostat_eta;
+};
+
+// llama_sample applies the repetition, frequency and presence penalties to
+// the candidates and returns one token chosen by the strategy opts selects:
+// greedy when temperature <= 0, Mirostat v1 or v2 when mirostat is 1 or 2,
+// otherwise top-k, tail-free, typical and top-p filtering followed by
+// temperature scaling and llama_sample_token.
+//
+// candidates points to n_candidates entries; last_tokens points to the
+// n_last_tokens tokens that the penalty functions receive.
+llama_token llama_sample(
+		struct llama_context *ctx,
+		struct llama_token_data *candidates,
+		size_t n_candidates,
+		const llama_token *last_tokens,
+		size_t n_last_tokens,
+		struct llama_sample_options *opts)
+{
+	// wrap the caller's array without copying; the third field is `sorted`
+	llama_token_data_array candidates_p = {
+		candidates,
+		n_candidates,
+		false,
+	};
+
+	llama_sample_repetition_penalty(
+		ctx, &candidates_p,
+		last_tokens, n_last_tokens,
+		opts->repeat_penalty);
+
+	llama_sample_frequency_and_presence_penalties(
+		ctx, &candidates_p,
+		last_tokens, n_last_tokens,
+		opts->frequency_penalty, opts->presence_penalty);
+
+	if (opts->temperature <= 0) {
+		return llama_sample_token_greedy(ctx, &candidates_p);
+	}
+
+	// NOTE(review): in both Mirostat branches mirostat_mu is a local that
+	// starts at 2 * tau on every call, so whatever the sampler writes back
+	// through the pointer is discarded when this function returns.
+	if (opts->mirostat == 1) {
+		int mirostat_m = 100;
+		float mirostat_mu = 2.0f * opts->mirostat_tau;
+		llama_sample_temperature(ctx, &candidates_p, opts->temperature);
+		return llama_sample_token_mirostat(
+			ctx, &candidates_p,
+			opts->mirostat_tau, opts->mirostat_eta,
+			mirostat_m, &mirostat_mu);
+	} else if (opts->mirostat == 2) {
+		float mirostat_mu = 2.0f * opts->mirostat_tau;
+		llama_sample_temperature(ctx, &candidates_p, opts->temperature);
+		return llama_sample_token_mirostat_v2(
+			ctx, &candidates_p,
+			opts->mirostat_tau, opts->mirostat_eta,
+			&mirostat_mu);
+	} else {
+		// the trailing 1 in each filter call is its min_keep argument
+		llama_sample_top_k(ctx, &candidates_p, opts->top_k, 1);
+		llama_sample_tail_free(ctx, &candidates_p, opts->tfs_z, 1);
+		llama_sample_typical(ctx, &candidates_p, opts->typical_p, 1);
+		llama_sample_top_p(ctx, &candidates_p, opts->top_p, 1);
+		llama_sample_temperature(ctx, &candidates_p, opts->temperature);
+		return llama_sample_token(ctx, &candidates_p);
+	}
+}
+*/
+import "C"
 import (
-	"fmt"
+	"errors"
+	"io"
+	"os"
 	"strings"
-	"sync"
 	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
 )

-type LLama struct {
-	ctx         unsafe.Pointer
-	embeddings  bool
-	contextSize int
+// llama bundles the llama.cpp handles created by New with the options they
+// were created from. Close releases model and ctx.
+type llama struct {
+	// context parameters that New passed to llama.cpp
+	params *C.struct_llama_context_params
+	// loaded model, released by Close via llama_free_model
+	model  *C.struct_llama_model
+	// inference context, released by Close via llama_free
+	ctx    *C.struct_llama_context
+
+	// embedded so option fields such as NumCtx are reachable directly
+	api.Options
 }

-func New(model string, mo ModelOptions) (*LLama, error) {
-	modelPath := C.CString(model)
-	defer C.free(unsafe.Pointer(modelPath))
-
-	ctx := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA))
-	if ctx == nil {
-		return nil, fmt.Errorf("failed loading model")
+// New loads the model file at path model and creates an inference context
+// for it, both configured from opts.
+//
+// It returns the os.Stat error when the file cannot be accessed, and an
+// error when llama.cpp fails to load the model or to create the context.
+// On success the caller must call Close on the result to release it.
+func New(model string, opts api.Options) (*llama, error) {
+	if _, err := os.Stat(model); err != nil {
+		return nil, err
 	}

-	ll := &LLama{ctx: ctx, contextSize: mo.ContextSize, embeddings: mo.Embeddings}
+	llm := llama{Options: opts}

-	return ll, nil
+	C.llama_backend_init(C.bool(llm.UseNUMA))
+
+	params := C.llama_context_default_params()
+	params.seed = C.uint(llm.Seed)
+	params.n_ctx = C.int(llm.NumCtx)
+	params.n_batch = C.int(llm.NumBatch)
+	params.n_gpu_layers = C.int(llm.NumGPU)
+	params.main_gpu = C.int(llm.MainGPU)
+	params.low_vram = C.bool(llm.LowVRAM)
+	params.f16_kv = C.bool(llm.F16KV)
+	params.logits_all = C.bool(llm.LogitsAll)
+	params.vocab_only = C.bool(llm.VocabOnly)
+	params.use_mmap = C.bool(llm.UseMMap)
+	params.use_mlock = C.bool(llm.UseMLock)
+	params.embedding = C.bool(llm.EmbeddingOnly)
+	llm.params = &params
+
+	cModel := C.CString(model)
+	defer C.free(unsafe.Pointer(cModel))
+
+	// a failed load yields a nil handle; stop here instead of handing nil
+	// to the calls below
+	llm.model = C.llama_load_model_from_file(cModel, params)
+	if llm.model == nil {
+		return nil, errors.New("llama: load model")
+	}
+
+	llm.ctx = C.llama_new_context_with_model(llm.model, params)
+	if llm.ctx == nil {
+		// do not leak the loaded model when no context could be created
+		C.llama_free_model(llm.model)
+		return nil, errors.New("llama: new context")
+	}
+
+	// warm up the model
+	bos := []C.llama_token{C.llama_token_bos()}
+	C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
+	C.llama_reset_timings(llm.ctx)
+
+	return &llm, nil
 }

-func (l *LLama) Free() {
-	C.llama_binding_free_model(l.ctx)
+// Close prints the timings collected for the context and then releases the
+// context and the model.
+func (llm *llama) Close() {
+	// deferred calls run last-in first-out: the context is freed before the
+	// model, and both only after the timings below have been printed
+	defer C.llama_free_model(llm.model)
+	defer C.llama_free(llm.ctx)
+
+	C.llama_print_timings(llm.ctx)
 }

-func (l *LLama) Eval(text string, opts ...PredictOption) error {
-	po := NewPredictOptions(opts...)
-
-	input := C.CString(text)
-	if po.Tokens == 0 {
-		po.Tokens = 99999999
-	}
-	defer C.free(unsafe.Pointer(input))
-
-	reverseCount := len(po.StopPrompts)
-	reversePrompt := make([]*C.char, reverseCount)
-	var pass **C.char
-	for i, s := range po.StopPrompts {
-		cs := C.CString(s)
-		reversePrompt[i] = cs
-		pass = &reversePrompt[0]
-		defer C.free(unsafe.Pointer(cs))
+// Predict tokenizes prompt and generates a continuation, calling fn with the
+// text of each generated token. It returns an error when tokenize yields no
+// tokens, or whatever error generate returns.
+func (llm *llama) Predict(prompt string, fn func(string)) error {
+	if tokens := llm.tokenize(prompt); tokens != nil {
+		return llm.generate(tokens, fn)
 	}

-	cLogitBias := C.CString(po.LogitBias)
-	defer C.free(unsafe.Pointer(cLogitBias))
+	return errors.New("llama: tokenize")
+}

-	cMainGPU := C.CString(po.MainGPU)
-	defer C.free(unsafe.Pointer(cMainGPU))
+// tokenize converts prompt into tokens, asking llama_tokenize to prepend the
+// beginning-of-sentence token. The output buffer holds NumCtx tokens. It
+// returns nil when llama_tokenize reports zero tokens or a negative count,
+// which llama.h documents as failure.
+func (llm *llama) tokenize(prompt string) []C.llama_token {
+	cPrompt := C.CString(prompt)
+	defer C.free(unsafe.Pointer(cPrompt))

-	cTensorSplit := C.CString(po.TensorSplit)
-	defer C.free(unsafe.Pointer(cTensorSplit))
-
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
-		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
-		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
-		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
-		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
-		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
-		C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
-	)
-	defer C.llama_free_params(params)
-
-	ret := C.eval(params, l.ctx, input)
-	if ret != 0 {
-		return fmt.Errorf("inference failed")
+	tokens := make([]C.llama_token, llm.NumCtx)
+	if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
+		return tokens[:n]
 	}

 	return nil
 }

-func (l *LLama) Predict(text string, po PredictOptions) (string, error) {
-	if po.TokenCallback != nil {
-		setCallback(l.ctx, po.TokenCallback)
+// detokenize returns the concatenation, in order, of the strings that
+// llama_token_to_str yields for the given tokens.
+func (llm *llama) detokenize(tokens ...C.llama_token) string {
+	var sb strings.Builder
+	for _, token := range tokens {
+		sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
 	}

-	input := C.CString(text)
-	if po.Tokens == 0 {
-		po.Tokens = 99999999
-	}
-	defer C.free(unsafe.Pointer(input))
-
-	out := make([]byte, po.Tokens)
-
-	reverseCount := len(po.StopPrompts)
-	reversePrompt := make([]*C.char, reverseCount)
-	var pass **C.char
-	for i, s := range po.StopPrompts {
-		cs := C.CString(s)
-		reversePrompt[i] = cs
-		pass = &reversePrompt[0]
-		defer C.free(unsafe.Pointer(cs))
-	}
-
-	cLogitBias := C.CString(po.LogitBias)
-	defer C.free(unsafe.Pointer(cLogitBias))
-
-	cMainGPU := C.CString(po.MainGPU)
-	defer C.free(unsafe.Pointer(cMainGPU))
-
-	cTensorSplit := C.CString(po.TensorSplit)
-	defer C.free(unsafe.Pointer(cTensorSplit))
-
-	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
-		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
-		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
-		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
-		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
-		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
-		C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
-	)
-	defer C.llama_free_params(params)
-
-	ret := C.llama_predict(params, l.ctx, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
-	if ret != 0 {
-		return "", fmt.Errorf("inference failed")
-	}
-	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
-
-	res = strings.TrimPrefix(res, " ")
-	res = strings.TrimPrefix(res, text)
-	res = strings.TrimPrefix(res, "\n")
-
-	for _, s := range po.StopPrompts {
-		res = strings.TrimRight(res, s)
-	}
-
-	if po.TokenCallback != nil {
-		setCallback(l.ctx, nil)
-	}
-
-	return res, nil
+	return sb.String()
 }

-// CGo only allows us to use static calls from C to Go, we can't just dynamically pass in func's.
-// This is the next best thing, we register the callbacks in this map and call tokenCallback from
-// the C code. We also attach a finalizer to LLama, so it will unregister the callback when the
-// garbage collection frees it.
+// generate evaluates tokens and then repeatedly samples one token, passes its
+// text to fn and feeds it back as the next input. It stops and returns nil
+// when sample reports io.EOF (end-of-sentence token) or when the KV cache
+// token count reaches NumCtx; it returns an error when llama_eval fails or
+// sample returns any other error.
+func (llm *llama) generate(tokens []C.llama_token, fn func(string)) error {
+	var opts C.struct_llama_sample_options
+	opts.repeat_penalty = C.float(llm.RepeatPenalty)
+	opts.frequency_penalty = C.float(llm.FrequencyPenalty)
+	opts.presence_penalty = C.float(llm.PresencePenalty)
+	opts.temperature = C.float(llm.Temperature)
+	opts.top_k = C.int(llm.TopK)
+	opts.top_p = C.float(llm.TopP)
+	opts.tfs_z = C.float(llm.TFSZ)
+	opts.typical_p = C.float(llm.TypicalP)
+	opts.mirostat = C.int(llm.Mirostat)
+	opts.mirostat_tau = C.float(llm.MirostatTau)
+	opts.mirostat_eta = C.float(llm.MirostatEta)

-// SetTokenCallback registers a callback for the individual tokens created when running Predict. It
-// will be called once for each token. The callback shall return true as long as the model should
-// continue predicting the next token. When the callback returns false the predictor will return.
-// The tokens are just converted into Go strings, they are not trimmed or otherwise changed. Also
-// the tokens may not be valid UTF-8.
-// Pass in nil to remove a callback.
-//
-// It is save to call this method while a prediction is running.
-func (l *LLama) SetTokenCallback(callback func(token string) bool) {
-	setCallback(l.ctx, callback)
-}
+	// recently generated tokens handed to the penalty samplers
+	pastTokens := deque[C.llama_token]{capacity: llm.RepeatLastN}

-var (
-	m         sync.Mutex
-	callbacks = map[uintptr]func(string) bool{}
-)
+	for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
+		if retval := C.llama_eval(llm.ctx, unsafe.SliceData(tokens), C.int(len(tokens)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
+			return errors.New("llama: eval")
+		}

-//export tokenCallback
-func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool {
-	m.Lock()
-	defer m.Unlock()
+		token, err := llm.sample(pastTokens, &opts)
+		switch {
+		case errors.Is(err, io.EOF):
+			// must be tested before the generic case: io.EOF is non-nil,
+			// but it marks a normal end of generation, not a failure
+			return nil
+		case err != nil:
+			return err
+		}

-	if callback, ok := callbacks[uintptr(statePtr)]; ok {
-		return callback(C.GoString(token))
+		fn(llm.detokenize(token))
+
+		// only the new token needs evaluating on the next iteration
+		tokens = []C.llama_token{token}
+
+		pastTokens.PushLeft(token)
 	}

-	return true
+	return nil
 }

-// setCallback can be used to register a token callback for LLama. Pass in a nil callback to
-// remove the callback.
-func setCallback(statePtr unsafe.Pointer, callback func(string) bool) {
-	m.Lock()
-	defer m.Unlock()
+// sample builds one candidate per vocabulary entry from the logits of the
+// last llama_eval call and asks llama_sample to choose a token using opts and
+// the tokens held in pastTokens. It returns io.EOF instead of the token when
+// the end-of-sentence token is chosen.
+func (llm *llama) sample(pastTokens deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
+	numVocab := int(C.llama_n_vocab(llm.ctx))
+	logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)

-	if callback == nil {
-		delete(callbacks, uintptr(statePtr))
-	} else {
-		callbacks[uintptr(statePtr)] = callback
+	candidates := make([]C.struct_llama_token_data, 0, numVocab)
+	for i := 0; i < numVocab; i++ {
+		candidates = append(candidates, C.llama_token_data{
+			id:    C.int(i),
+			logit: logits[i],
+			p:     0,
+		})
 	}
+
+	// the length parameters are declared size_t; C.size_t matches that on
+	// every platform, whereas C.ulong only does where size_t is unsigned long
+	token := C.llama_sample(
+		llm.ctx,
+		unsafe.SliceData(candidates), C.size_t(len(candidates)),
+		unsafe.SliceData(pastTokens.Data()), C.size_t(pastTokens.Len()),
+		opts)
+	if token != C.llama_token_eos() {
+		return token, nil
+	}
+
+	return 0, io.EOF
 }
--- a/llama/llama.h
+++ b/llama/llama.h
@@ -0,0 +1,410 @@
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LLAMA_H
+#define LLAMA_H
+
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define LLAMA_API __declspec(dllexport)
+#        else
+#            define LLAMA_API __declspec(dllimport)
+#        endif
+#    else
+#        define LLAMA_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define LLAMA_API
+#endif
+
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION        1
+
+#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    //
+    // C interface
+    //
+    // TODO: show sample usage
+    //
+
+    struct llama_model;
+    struct llama_context;
+
+    typedef int llama_token;
+
+    typedef struct llama_token_data {
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
+    } llama_token_data;
+
+    typedef struct llama_token_data_array {
+        llama_token_data * data;
+        size_t size;
+        bool sorted;
+    } llama_token_data_array;
+
+    typedef void (*llama_progress_callback)(float progress, void *ctx);
+
+   struct llama_context_params {
+        uint32_t seed;                         // RNG seed, -1 for random
+        int32_t  n_ctx;                        // text context
+        int32_t  n_batch;                      // prompt processing batch size
+        int32_t  n_gpu_layers;                 // number of layers to store in VRAM
+        int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool f16_kv;     // use fp16 for KV cache
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
+        bool use_mlock;  // force system to keep model in RAM
+        bool embedding;  // embedding mode only
+    };
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+    };
+
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
+    LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
+
+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
+    // Call once at the start of the program
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
+
+    LLAMA_API int64_t llama_time_us();
+
+    LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+            struct llama_context_params   params);
+
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params);
+
+    // Various functions for loading a ggml llama model.
+    // Allocate (almost) all memory needed for the model.
+    // Return NULL on failure
+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
+                             const char * path_model,
+            struct llama_context_params   params),
+            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
+
+    // Frees all allocated memory
+    LLAMA_API void llama_free(struct llama_context * ctx);
+
+    // Returns 0 on success
+    LLAMA_API int llama_model_quantize(
+            const char * fname_inp,
+            const char * fname_out,
+            const llama_model_quantize_params * params);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
+    // Returns the maximum size in bytes of the state (rng, logits, embedding
+    // and kv_cache) - will often be smaller after compacting tokens
+    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+
+    // Save/load session file
+    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
+    // Run the llama inference to obtain the logits and probabilities for the next token.
+    // tokens + n_tokens is the provided batch of new tokens to process
+    // n_past is the number of tokens to use from previous eval calls
+    // Returns 0 on success
+    LLAMA_API int llama_eval(
+            struct llama_context * ctx,
+               const llama_token * tokens,
+                             int   n_tokens,
+                             int   n_past,
+                             int   n_threads);
+
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
+            struct llama_context * ctx,
+                     const float * embd,
+                             int   n_tokens,
+                             int   n_past,
+                             int   n_threads);
+
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
+    // Convert the provided text into tokens.
+    // The tokens pointer must be large enough to hold the resulting tokens.
+    // Returns the number of tokens on success, no more than n_max_tokens
+    // Returns a negative number on failure - the number of tokens that would have been returned
+    // TODO: not sure if correct
+    LLAMA_API int llama_tokenize(
+            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
+
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    // Token Id -> String. Uses the vocabulary in the provided context
+    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();   // next-line
+
+    // Sampling functions
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+              struct llama_context * ctx,
+            llama_token_data_array * candidates,
+              struct llama_context * guidance_ctx,
+                             float   scale,
+                             float   smooth_factor);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+
+    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+    /// @details Selects the token with the highest probability.
+    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Randomly selects a token from the candidates based on their probabilities.
+    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+    LLAMA_API void llama_print_timings(struct llama_context * ctx);
+    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
+
+    // Print system information
+    LLAMA_API const char * llama_print_system_info(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
+#endif // LLAMA_H
--- a/llama/llama_cublas.go
+++ b/llama/llama_cublas.go
@@ -1,9 +0,0 @@
-//go:build cublas
-// +build cublas
-
-package llama
-
-/*
-#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
-*/
-import "C"
--- a/llama/llama_metal.go
+++ b/llama/llama_metal.go
@@ -1,2 +0,0 @@
-//go:build metal
-package llama
--- a/llama/llama_openblas.go
+++ b/llama/llama_openblas.go
@@ -1,9 +0,0 @@
-//go:build openblas
-// +build openblas
-
-package llama
-
-/*
-#cgo LDFLAGS: -lopenblas
-*/
-import "C"
--- a/llama/options.go
+++ b/llama/options.go
@@ -1,375 +0,0 @@
-// MIT License
-
-// Copyright (c) 2023 go-skynet authors
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-package llama
-
-type ModelOptions struct {
-	ContextSize int
-	Seed        int
-	NBatch      int
-	F16Memory   bool
-	MLock       bool
-	MMap        bool
-	VocabOnly   bool
-	LowVRAM     bool
-	Embeddings  bool
-	NUMA        bool
-	NGPULayers  int
-	MainGPU     string
-	TensorSplit string
-}
-
-type PredictOptions struct {
-	Seed, Threads, Tokens, TopK, Repeat, Batch, NKeep int
-	TopP, Temperature, Penalty                        float64
-	F16KV                                             bool
-	DebugMode                                         bool
-	StopPrompts                                       []string
-	IgnoreEOS                                         bool
-
-	TailFreeSamplingZ float64
-	TypicalP          float64
-	FrequencyPenalty  float64
-	PresencePenalty   float64
-	Mirostat          int
-	MirostatETA       float64
-	MirostatTAU       float64
-	PenalizeNL        bool
-	LogitBias         string
-	TokenCallback     func(string) bool
-
-	MLock, MMap bool
-	MainGPU     string
-	TensorSplit string
-}
-
-type PredictOption func(p *PredictOptions)
-
-type ModelOption func(p *ModelOptions)
-
-var DefaultModelOptions ModelOptions = ModelOptions{
-	ContextSize: 512,
-	Seed:        0,
-	F16Memory:   false,
-	MLock:       false,
-	Embeddings:  false,
-	MMap:        true,
-	LowVRAM:     false,
-}
-
-var DefaultOptions PredictOptions = PredictOptions{
-	Seed:              -1,
-	Threads:           4,
-	Tokens:            128,
-	Penalty:           1.1,
-	Repeat:            64,
-	Batch:             512,
-	NKeep:             64,
-	TopK:              40,
-	TopP:              0.95,
-	TailFreeSamplingZ: 1.0,
-	TypicalP:          1.0,
-	Temperature:       0.8,
-	FrequencyPenalty:  0.0,
-	PresencePenalty:   0.0,
-	Mirostat:          0,
-	MirostatTAU:       5.0,
-	MirostatETA:       0.1,
-	MMap:              true,
-}
-
-// SetContext sets the context size.
-func SetContext(c int) ModelOption {
-	return func(p *ModelOptions) {
-		p.ContextSize = c
-	}
-}
-
-func SetModelSeed(c int) ModelOption {
-	return func(p *ModelOptions) {
-		p.Seed = c
-	}
-}
-
-// SetContext sets the context size.
-func SetMMap(b bool) ModelOption {
-	return func(p *ModelOptions) {
-		p.MMap = b
-	}
-}
-
-// SetNBatch sets the  n_Batch
-func SetNBatch(n_batch int) ModelOption {
-	return func(p *ModelOptions) {
-		p.NBatch = n_batch
-	}
-}
-
-// Set sets the tensor split for the GPU
-func SetTensorSplit(maingpu string) ModelOption {
-	return func(p *ModelOptions) {
-		p.TensorSplit = maingpu
-	}
-}
-
-// SetMainGPU sets the main_gpu
-func SetMainGPU(maingpu string) ModelOption {
-	return func(p *ModelOptions) {
-		p.MainGPU = maingpu
-	}
-}
-
-// SetPredictionTensorSplit sets the tensor split for the GPU
-func SetPredictionTensorSplit(maingpu string) PredictOption {
-	return func(p *PredictOptions) {
-		p.TensorSplit = maingpu
-	}
-}
-
-// SetPredictionMainGPU sets the main_gpu
-func SetPredictionMainGPU(maingpu string) PredictOption {
-	return func(p *PredictOptions) {
-		p.MainGPU = maingpu
-	}
-}
-
-var VocabOnly ModelOption = func(p *ModelOptions) {
-	p.VocabOnly = true
-}
-
-var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
-	p.LowVRAM = true
-}
-
-var EnableNUMA ModelOption = func(p *ModelOptions) {
-	p.NUMA = true
-}
-
-var EnableEmbeddings ModelOption = func(p *ModelOptions) {
-	p.Embeddings = true
-}
-
-var EnableF16Memory ModelOption = func(p *ModelOptions) {
-	p.F16Memory = true
-}
-
-var EnableF16KV PredictOption = func(p *PredictOptions) {
-	p.F16KV = true
-}
-
-var Debug PredictOption = func(p *PredictOptions) {
-	p.DebugMode = true
-}
-
-var EnableMLock ModelOption = func(p *ModelOptions) {
-	p.MLock = true
-}
-
-// Create a new PredictOptions object with the given options.
-func NewModelOptions(opts ...ModelOption) ModelOptions {
-	p := DefaultModelOptions
-	for _, opt := range opts {
-		opt(&p)
-	}
-	return p
-}
-
-var IgnoreEOS PredictOption = func(p *PredictOptions) {
-	p.IgnoreEOS = true
-}
-
-// SetMlock sets the memory lock.
-func SetMlock(b bool) PredictOption {
-	return func(p *PredictOptions) {
-		p.MLock = b
-	}
-}
-
-// SetMemoryMap sets memory mapping.
-func SetMemoryMap(b bool) PredictOption {
-	return func(p *PredictOptions) {
-		p.MMap = b
-	}
-}
-
-// SetGPULayers sets the number of GPU layers to use to offload computation
-func SetGPULayers(n int) ModelOption {
-	return func(p *ModelOptions) {
-		p.NGPULayers = n
-	}
-}
-
-// SetTokenCallback sets the prompts that will stop predictions.
-func SetTokenCallback(fn func(string) bool) PredictOption {
-	return func(p *PredictOptions) {
-		p.TokenCallback = fn
-	}
-}
-
-// SetStopWords sets the prompts that will stop predictions.
-func SetStopWords(stop ...string) PredictOption {
-	return func(p *PredictOptions) {
-		p.StopPrompts = stop
-	}
-}
-
-// SetSeed sets the random seed for sampling text generation.
-func SetSeed(seed int) PredictOption {
-	return func(p *PredictOptions) {
-		p.Seed = seed
-	}
-}
-
-// SetThreads sets the number of threads to use for text generation.
-func SetThreads(threads int) PredictOption {
-	return func(p *PredictOptions) {
-		p.Threads = threads
-	}
-}
-
-// SetTokens sets the number of tokens to generate.
-func SetTokens(tokens int) PredictOption {
-	return func(p *PredictOptions) {
-		p.Tokens = tokens
-	}
-}
-
-// SetTopK sets the value for top-K sampling.
-func SetTopK(topk int) PredictOption {
-	return func(p *PredictOptions) {
-		p.TopK = topk
-	}
-}
-
-// SetTopP sets the value for nucleus sampling.
-func SetTopP(topp float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.TopP = topp
-	}
-}
-
-// SetTemperature sets the temperature value for text generation.
-func SetTemperature(temp float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.Temperature = temp
-	}
-}
-
-// SetPenalty sets the repetition penalty for text generation.
-func SetPenalty(penalty float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.Penalty = penalty
-	}
-}
-
-// SetRepeat sets the number of times to repeat text generation.
-func SetRepeat(repeat int) PredictOption {
-	return func(p *PredictOptions) {
-		p.Repeat = repeat
-	}
-}
-
-// SetBatch sets the batch size.
-func SetBatch(size int) PredictOption {
-	return func(p *PredictOptions) {
-		p.Batch = size
-	}
-}
-
-// SetKeep sets the number of tokens from initial prompt to keep.
-func SetNKeep(n int) PredictOption {
-	return func(p *PredictOptions) {
-		p.NKeep = n
-	}
-}
-
-// Create a new PredictOptions object with the given options.
-func NewPredictOptions(opts ...PredictOption) PredictOptions {
-	p := DefaultOptions
-	for _, opt := range opts {
-		opt(&p)
-	}
-	return p
-}
-
-// SetTailFreeSamplingZ sets the tail free sampling, parameter z.
-func SetTailFreeSamplingZ(tfz float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.TailFreeSamplingZ = tfz
-	}
-}
-
-// SetTypicalP sets the typicality parameter, p_typical.
-func SetTypicalP(tp float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.TypicalP = tp
-	}
-}
-
-// SetFrequencyPenalty sets the frequency penalty parameter, freq_penalty.
-func SetFrequencyPenalty(fp float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.FrequencyPenalty = fp
-	}
-}
-
-// SetPresencePenalty sets the presence penalty parameter, presence_penalty.
-func SetPresencePenalty(pp float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.PresencePenalty = pp
-	}
-}
-
-// SetMirostat sets the mirostat parameter.
-func SetMirostat(m int) PredictOption {
-	return func(p *PredictOptions) {
-		p.Mirostat = m
-	}
-}
-
-// SetMirostatETA sets the mirostat ETA parameter.
-func SetMirostatETA(me float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.MirostatETA = me
-	}
-}
-
-// SetMirostatTAU sets the mirostat TAU parameter.
-func SetMirostatTAU(mt float64) PredictOption {
-	return func(p *PredictOptions) {
-		p.MirostatTAU = mt
-	}
-}
-
-// SetPenalizeNL sets whether to penalize newlines or not.
-func SetPenalizeNL(pnl bool) PredictOption {
-	return func(p *PredictOptions) {
-		p.PenalizeNL = pnl
-	}
-}
-
-// SetLogitBias sets the logit bias parameter.
-func SetLogitBias(lb string) PredictOption {
-	return func(p *PredictOptions) {
-		p.LogitBias = lb
-	}
-}
--- a/llama/utils.go
+++ b/llama/utils.go
@@ -0,0 +1,104 @@
+package llama
+
+type node[T any] struct {
+	t    T
+	next *node[T]
+	prev *node[T]
+}
+
+type deque[T any] struct {
+	head     *node[T]
+	tail     *node[T]
+	size     int
+	capacity int
+}
+
+func (d *deque[T]) Empty() bool {
+	return d.size == 0
+}
+
+func (d *deque[T]) Len() int {
+	return d.size
+}
+
+func (d *deque[T]) Cap() int {
+	return d.capacity
+}
+
+func (d *deque[T]) Push(t T) {
+	if d.capacity > 0 && d.size >= d.capacity {
+		d.PopLeft()
+	}
+
+	n := node[T]{t: t}
+	if d.head != nil {
+		n.next = d.head
+		d.head.prev = &n
+		d.head = &n
+	} else {
+		d.head = &n
+		d.tail = &n
+	}
+
+	d.size++
+}
+
+func (d *deque[T]) PushLeft(t T) {
+	if d.capacity > 0 && d.size >= d.capacity {
+		d.Pop()
+	}
+
+	n := node[T]{t: t}
+	if d.tail != nil {
+		n.prev = d.tail
+		d.tail.next = &n
+		d.tail = &n
+	} else {
+		d.head = &n
+		d.tail = &n
+	}
+
+	d.size++
+}
+
+func (d *deque[T]) Pop() *T {
+	if d.Empty() {
+		return nil
+	}
+
+	head := d.head
+	d.head = head.next
+	if d.head != nil {
+		d.head.prev = nil
+	} else {
+		d.tail = nil
+	}
+
+	d.size--
+	return &head.t
+}
+
+func (d *deque[T]) PopLeft() *T {
+	if d.Empty() {
+		return nil
+	}
+
+	tail := d.tail
+	d.tail = tail.prev
+	if d.tail != nil {
+		d.tail.next = nil
+	} else {
+		d.head = nil
+	}
+
+	d.size--
+	return &tail.t
+}
+
+func (d *deque[T]) Data() (data []T) {
+	for n := d.head; n != nil; n = n.next {
+		data = append(data, n.t)
+	}
+
+	return data
+}
--- a/scripts/publish.sh
+++ b/scripts/publish.sh
@@ -10,7 +10,7 @@ fi
 OS=$(go env GOOS)
 ARCH=$(go env GOARCH)

-make app
+go build .

 # Create a new tag if it doesn't exist.
 if ! git rev-parse v$VERSION >/dev/null 2>&1; then
--- a/server/models.go
+++ b/server/models.go
@@ -2,14 +2,13 @@ package server

 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
 	"path"
 	"strconv"
-
-	"github.com/jmorganca/ollama/api"
 )

 const directoryURL = "https://ollama.ai/api/models"
@@ -36,12 +35,12 @@ func (m *Model) FullName() string {
 	return path.Join(home, ".ollama", "models", m.Name+".bin")
 }

-func pull(model string, progressCh chan<- api.PullProgress) error {
-	remote, err := getRemote(model)
-	if err != nil {
-		return fmt.Errorf("failed to pull model: %w", err)
-	}
-	return saveModel(remote, progressCh)
+func (m *Model) TempFile() string {
+	fullName := m.FullName()
+	return path.Join(
+		path.Dir(fullName),
+		fmt.Sprintf(".%s.part", path.Base(fullName)),
+	)
 }

 func getRemote(model string) (*Model, error) {
@@ -68,7 +67,7 @@ func getRemote(model string) (*Model, error) {
 	return nil, fmt.Errorf("model not found in directory: %s", model)
 }

-func saveModel(model *Model, progressCh chan<- api.PullProgress) error {
+func saveModel(model *Model, fn func(total, completed int64)) error {
 	// this models cache directory is created by the server on startup

 	client := &http.Client{}
@@ -76,41 +75,45 @@ func saveModel(model *Model, progressCh chan<- api.PullProgress) error {
 	if err != nil {
 		return fmt.Errorf("failed to download model: %w", err)
 	}
-	// check for resume
-	alreadyDownloaded := int64(0)
-	fileInfo, err := os.Stat(model.FullName())
-	if err != nil {
-		if !os.IsNotExist(err) {
-			return fmt.Errorf("failed to check resume model file: %w", err)
-		}
-		// file doesn't exist, create it now
-	} else {
-		alreadyDownloaded = fileInfo.Size()
-		req.Header.Add("Range", fmt.Sprintf("bytes=%d-", alreadyDownloaded))
+
+	// check if completed file exists
+	fi, err := os.Stat(model.FullName())
+	switch {
+	case errors.Is(err, os.ErrNotExist):
+		// noop, file doesn't exist so create it
+	case err != nil:
+		return fmt.Errorf("stat: %w", err)
+	default:
+		fn(fi.Size(), fi.Size())
+		return nil
 	}

+	var size int64
+
+	// completed file doesn't exist, check partial file
+	fi, err = os.Stat(model.TempFile())
+	switch {
+	case errors.Is(err, os.ErrNotExist):
+		// noop, file doesn't exist so create it
+	case err != nil:
+		return fmt.Errorf("stat: %w", err)
+	default:
+		size = fi.Size()
+	}
+
+	req.Header.Add("Range", fmt.Sprintf("bytes=%d-", size))
+
 	resp, err := client.Do(req)
 	if err != nil {
 		return fmt.Errorf("failed to download model: %w", err)
 	}
-
 	defer resp.Body.Close()

-	if resp.StatusCode == http.StatusRequestedRangeNotSatisfiable {
-		// already downloaded
-		progressCh <- api.PullProgress{
-			Total:     alreadyDownloaded,
-			Completed: alreadyDownloaded,
-			Percent:   100,
-		}
-		return nil
-	}
-
-	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusPartialContent {
+	if resp.StatusCode >= 400 {
 		return fmt.Errorf("failed to download model: %s", resp.Status)
 	}

-	out, err := os.OpenFile(model.FullName(), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
+	out, err := os.OpenFile(model.TempFile(), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
 	if err != nil {
 		panic(err)
 	}
@@ -118,37 +121,23 @@ func saveModel(model *Model, progressCh chan<- api.PullProgress) error {

 	totalSize, _ := strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)

-	buf := make([]byte, 1024)
-	totalBytes := alreadyDownloaded
-	totalSize += alreadyDownloaded
+	totalBytes := size
+	totalSize += size

 	for {
-		n, err := resp.Body.Read(buf)
-		if err != nil && err != io.EOF {
+		n, err := io.CopyN(out, resp.Body, 8192)
+		if err != nil && !errors.Is(err, io.EOF) {
 			return err
 		}
+
 		if n == 0 {
 			break
 		}
-		if _, err := out.Write(buf[:n]); err != nil {
-			return err
-		}

-		totalBytes += int64(n)
-
-		// send progress updates
-		progressCh <- api.PullProgress{
-			Total:     totalSize,
-			Completed: totalBytes,
-			Percent:   float64(totalBytes) / float64(totalSize) * 100,
-		}
+		totalBytes += n
+		fn(totalSize, totalBytes)
 	}

-	progressCh <- api.PullProgress{
-		Total:     totalSize,
-		Completed: totalSize,
-		Percent:   100,
-	}
-
-	return nil
+	fn(totalSize, totalSize)
+	return os.Rename(model.TempFile(), model.FullName())
 }
--- a/server/routes.go
+++ b/server/routes.go
@@ -4,7 +4,6 @@ import (
 	"embed"
 	"encoding/json"
 	"errors"
-	"fmt"
 	"io"
 	"log"
 	"math"
@@ -12,7 +11,6 @@ import (
 	"net/http"
 	"os"
 	"path"
-	"runtime"
 	"strings"
 	"text/template"

@@ -37,11 +35,12 @@ func cacheDir() string {
 }

 func generate(c *gin.Context) {
-	var req api.GenerateRequest
-	req.ModelOptions = api.DefaultModelOptions
-	req.PredictOptions = api.DefaultPredictOptions
+	req := api.GenerateRequest{
+		Options: api.DefaultOptions(),
+	}
+
 	if err := c.ShouldBindJSON(&req); err != nil {
-		c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}

@@ -50,21 +49,14 @@ func generate(c *gin.Context) {
 	}
 	if _, err := os.Stat(req.Model); err != nil {
 		if !errors.Is(err, os.ErrNotExist) {
-			c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
+			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 			return
 		}
 		req.Model = path.Join(cacheDir(), "models", req.Model+".bin")
 	}

-	modelOpts := getModelOpts(req)
-	modelOpts.NGPULayers = 1 // hard-code this for now
-
-	model, err := llama.New(req.Model, modelOpts)
-	if err != nil {
-		fmt.Println("Loading the model failed:", err.Error())
-		return
-	}
-	defer model.Free()
+	ch := make(chan any)
+	go stream(c, ch)

 	templateNames := make([]string, 0, len(templates.Templates()))
 	for _, template := range templates.Templates() {
@@ -75,51 +67,59 @@ func generate(c *gin.Context) {
 	if template := templates.Lookup(match); template != nil {
 		var sb strings.Builder
 		if err := template.Execute(&sb, req); err != nil {
-			fmt.Println("Prompt template failed:", err.Error())
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}

 		req.Prompt = sb.String()
 	}

-	ch := make(chan string)
-	model.SetTokenCallback(func(token string) bool {
-		ch <- token
-		return true
-	})
+	llm, err := llama.New(req.Model, req.Options)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+	defer llm.Close()

-	predictOpts := getPredictOpts(req)
+	fn := func(s string) {
+		ch <- api.GenerateResponse{Response: s}
+	}

-	go func() {
-		defer close(ch)
-		_, err := model.Predict(req.Prompt, predictOpts)
-		if err != nil {
-			panic(err)
+	if err := llm.Predict(req.Prompt, fn); err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+}
+
+func pull(c *gin.Context) {
+	var req api.PullRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	remote, err := getRemote(req.Model)
+	if err != nil {
+		c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
+		return
+	}
+
+	ch := make(chan any)
+	go stream(c, ch)
+
+	fn := func(total, completed int64) {
+		p := api.PullProgress{Total: total, Completed: completed}
+		if total > 0 { // skip the division when total is unknown: NaN/Inf cannot be JSON-encoded
+			p.Percent = float64(completed) / float64(total) * 100
+		}
+		ch <- p
-	}()
+	}

-	c.Stream(func(w io.Writer) bool {
-		token, ok := <-ch
-		if !ok {
-			return false
-		}
-
-		resp := api.GenerateResponse{
-			Response: token,
-		}
-
-		bts, err := json.Marshal(resp)
-		if err != nil {
-			return false
-		}
-
-		bts = append(bts, '\n')
-		if _, err := w.Write(bts); err != nil {
-			return false
-		}
-
-		return true
-	})
+	if err := saveModel(remote, fn); err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
 }

 func Serve(ln net.Listener) error {
@@ -129,53 +129,7 @@ func Serve(ln net.Listener) error {
 		c.String(http.StatusOK, "Ollama is running")
 	})

-	r.POST("api/pull", func(c *gin.Context) {
-		var req api.PullRequest
-		if err := c.ShouldBindJSON(&req); err != nil {
-			c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
-			return
-		}
-
-		progressCh := make(chan api.PullProgress)
-		go func() {
-			defer close(progressCh)
-			if err := pull(req.Model, progressCh); err != nil {
-				var opError *net.OpError
-				if errors.As(err, &opError) {
-					result := api.PullProgress{
-						Error: api.Error{
-							Code:    http.StatusBadGateway,
-							Message: "failed to get models from directory",
-						},
-					}
-					c.JSON(http.StatusBadGateway, result)
-					return
-				}
-				c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
-				return
-			}
-		}()
-
-		c.Stream(func(w io.Writer) bool {
-			progress, ok := <-progressCh
-			if !ok {
-				return false
-			}
-
-			bts, err := json.Marshal(progress)
-			if err != nil {
-				return false
-			}
-
-			bts = append(bts, '\n')
-			if _, err := w.Write(bts); err != nil {
-				return false
-			}
-
-			return true
-		})
-	})
-
+	r.POST("api/pull", pull)
 	r.POST("/api/generate", generate)

 	log.Printf("Listening on %s", ln.Addr())
@@ -198,52 +152,23 @@ func matchRankOne(source string, targets []string) (bestMatch string, bestRank i
 	return
 }

-func getModelOpts(req api.GenerateRequest) llama.ModelOptions {
-	var opts llama.ModelOptions
-	opts.ContextSize = req.ModelOptions.ContextSize
-	opts.Seed = req.ModelOptions.Seed
-	opts.F16Memory = req.ModelOptions.F16Memory
-	opts.MLock = req.ModelOptions.MLock
-	opts.Embeddings = req.ModelOptions.Embeddings
-	opts.MMap = req.ModelOptions.MMap
-	opts.LowVRAM = req.ModelOptions.LowVRAM
+func stream(c *gin.Context, ch chan any) {
+	c.Stream(func(w io.Writer) bool {
+		val, ok := <-ch
+		if !ok {
+			return false
+		}

-	opts.NBatch = req.ModelOptions.NBatch
-	opts.VocabOnly = req.ModelOptions.VocabOnly
-	opts.NUMA = req.ModelOptions.NUMA
-	opts.NGPULayers = req.ModelOptions.NGPULayers
-	opts.MainGPU = req.ModelOptions.MainGPU
-	opts.TensorSplit = req.ModelOptions.TensorSplit
+		bts, err := json.Marshal(val)
+		if err != nil {
+			return false
+		}

-	return opts
-}
-
-func getPredictOpts(req api.GenerateRequest) llama.PredictOptions {
-	var opts llama.PredictOptions
-
-	if req.PredictOptions.Threads == -1 {
-		opts.Threads = runtime.NumCPU()
-	} else {
-		opts.Threads = req.PredictOptions.Threads
-	}
-
-	opts.Seed = req.PredictOptions.Seed
-	opts.Tokens = req.PredictOptions.Tokens
-	opts.Penalty = req.PredictOptions.Penalty
-	opts.Repeat = req.PredictOptions.Repeat
-	opts.Batch = req.PredictOptions.Batch
-	opts.NKeep = req.PredictOptions.NKeep
-	opts.TopK = req.PredictOptions.TopK
-	opts.TopP = req.PredictOptions.TopP
-	opts.TailFreeSamplingZ = req.PredictOptions.TailFreeSamplingZ
-	opts.TypicalP = req.PredictOptions.TypicalP
-	opts.Temperature = req.PredictOptions.Temperature
-	opts.FrequencyPenalty = req.PredictOptions.FrequencyPenalty
-	opts.PresencePenalty = req.PredictOptions.PresencePenalty
-	opts.Mirostat = req.PredictOptions.Mirostat
-	opts.MirostatTAU = req.PredictOptions.MirostatTAU
-	opts.MirostatETA = req.PredictOptions.MirostatETA
-	opts.MMap = req.PredictOptions.MMap
-
-	return opts
+		bts = append(bts, '\n')
+		if _, err := w.Write(bts); err != nil {
+			return false
+		}
+
+		return true
+	})
 }
--- a/web/app/api/update/route.ts
+++ b/web/app/api/update/route.ts
@@ -1,44 +1,42 @@
 import { NextResponse } from 'next/server'
 import semver from 'semver'
-import { Octokit } from '@octokit/rest'
-import { RequestError } from '@octokit/types'
-
-const octokit = new Octokit()

 export async function GET(req: Request) {
  const { searchParams } = new URL(req.url)

-  const os = searchParams.get('os') || ''
-  const version = searchParams.get('version') || ''
+  const os = searchParams.get('os') || 'darwin'
+  const version = searchParams.get('version') || '0.0.0'

  if (!version) {
    return new Response('not found', { status: 404 })
  }

-  try {
-    const { data } = await octokit.repos.getLatestRelease({
-      owner: 'jmorganca',
-      repo: 'ollama',
-    })
+  const res = await fetch('https://api.github.com/repos/jmorganca/ollama/releases', { next: { revalidate: 60 } })
+  const data = await res.json()

-    // todo: get the correct asset for the current arch/os
-    const asset = data.assets.find(a => a.name.toLowerCase().includes(os))
-
-    if (!asset) {
-      return new Response('not found', { status: 404 })
-    }
-
-    if (semver.lt(version, data.tag_name)) {
-      return NextResponse.json({ version: data.tag_name, url: asset.browser_download_url })
-    }
-
-    return new Response('up to date', { status: 204 })
-  } catch (error) {
-    const e = error as RequestError
-    if (e.status === 404) {
-      return new Response('not found', { status: 404 })
-    }
-
-    return new Response('internal server error', { status: 500 })
+  if (data.length === 0) {
+    return new Response('not found', { status: 404 })
  }
+
+  const latest = data[0]
+  const assets = latest.assets || []
+
+  if (assets.length === 0) {
+    return new Response('not found', { status: 404 })
+  }
+
+  // todo: get the correct asset for the current arch/os
+  const asset = assets.find((a: any) => a.name.toLowerCase().includes(os) && a.name.toLowerCase().includes('.zip'))
+
+  if (!asset) {
+    return new Response('not found', { status: 404 })
+  }
+
+  console.log(asset)
+
+  if (semver.lt(version, latest.tag_name)) { // client is behind the newest release
+    return NextResponse.json({ version: latest.tag_name, url: asset.browser_download_url })
+  }
+
+  return new Response(null, { status: 204 })
 }
--- a/web/app/download/page.tsx
+++ b/web/app/download/page.tsx
@@ -1,16 +1,28 @@
-import { Octokit } from '@octokit/rest'
 import { redirect } from 'next/navigation'

-const octokit = new Octokit()
-
 export default async function Download() {
-  const { data } = await octokit.repos.getLatestRelease({
-    owner: 'jmorganca',
-    repo: 'ollama',
-  })
+  const res = await fetch('https://api.github.com/repos/jmorganca/ollama/releases', { next: { revalidate: 60 } })
+  const data = await res.json()
+
+  if (data.length === 0) {
+    return new Response('not found', { status: 404 })
+  }
+
+  const latest = data[0]
+  const assets = latest.assets || []
+
+  if (assets.length === 0) {
+    return new Response('not found', { status: 404 })
+  }

  // todo: get the correct asset for the current arch/os
-  const asset = data.assets.find(a => a.name.toLowerCase().includes('darwin') && a.name.toLowerCase().includes('.zip'))
+  const asset = assets.find(
+    (a: any) => a.name.toLowerCase().includes('darwin') && a.name.toLowerCase().includes('.zip')
+  )
+
+  if (!asset) {
+    return new Response('not found', { status: 404 })
+  }

  if (asset) {
    redirect(asset.browser_download_url)
--- a/web/package-lock.json
+++ b/web/package-lock.json
@@ -17,7 +17,7 @@
        "encoding": "^0.1.13",
        "eslint": "8.44.0",
        "eslint-config-next": "13.4.7",
-        "next": "13.4.7",
+        "next": "13.4.9",
        "postcss": "8.4.24",
        "react": "18.2.0",
        "react-dom": "18.2.0",
@@ -191,9 +191,9 @@
      "integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw=="
    },
    "node_modules/@next/env": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/env/-/env-13.4.7.tgz",
-      "integrity": "sha512-ZlbiFulnwiFsW9UV1ku1OvX/oyIPLtMk9p/nnvDSwI0s7vSoZdRtxXNsaO+ZXrLv/pMbXVGq4lL8TbY9iuGmVw=="
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/env/-/env-13.4.9.tgz",
+      "integrity": "sha512-vuDRK05BOKfmoBYLNi2cujG2jrYbEod/ubSSyqgmEx9n/W3eZaJQdRNhTfumO+qmq/QTzLurW487n/PM/fHOkw=="
    },
    "node_modules/@next/eslint-plugin-next": {
      "version": "13.4.7",
@@ -204,9 +204,9 @@
      }
    },
    "node_modules/@next/swc-darwin-arm64": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.7.tgz",
-      "integrity": "sha512-VZTxPv1b59KGiv/pZHTO5Gbsdeoxcj2rU2cqJu03btMhHpn3vwzEK0gUSVC/XW96aeGO67X+cMahhwHzef24/w==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.9.tgz",
+      "integrity": "sha512-TVzGHpZoVBk3iDsTOQA/R6MGmFp0+17SWXMEWd6zG30AfuELmSSMe2SdPqxwXU0gbpWkJL1KgfLzy5ReN0crqQ==",
      "cpu": [
        "arm64"
      ],
@@ -219,9 +219,9 @@
      }
    },
    "node_modules/@next/swc-darwin-x64": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.7.tgz",
-      "integrity": "sha512-gO2bw+2Ymmga+QYujjvDz9955xvYGrWofmxTq7m70b9pDPvl7aDFABJOZ2a8SRCuSNB5mXU8eTOmVVwyp/nAew==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.9.tgz",
+      "integrity": "sha512-aSfF1fhv28N2e7vrDZ6zOQ+IIthocfaxuMWGReB5GDriF0caTqtHttAvzOMgJgXQtQx6XhyaJMozLTSEXeNN+A==",
      "cpu": [
        "x64"
      ],
@@ -234,9 +234,9 @@
      }
    },
    "node_modules/@next/swc-linux-arm64-gnu": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.7.tgz",
-      "integrity": "sha512-6cqp3vf1eHxjIDhEOc7Mh/s8z1cwc/l5B6ZNkOofmZVyu1zsbEM5Hmx64s12Rd9AYgGoiCz4OJ4M/oRnkE16/Q==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.9.tgz",
+      "integrity": "sha512-JhKoX5ECzYoTVyIy/7KykeO4Z2lVKq7HGQqvAH+Ip9UFn1MOJkOnkPRB7v4nmzqAoY+Je05Aj5wNABR1N18DMg==",
      "cpu": [
        "arm64"
      ],
@@ -249,9 +249,9 @@
      }
    },
    "node_modules/@next/swc-linux-arm64-musl": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.7.tgz",
-      "integrity": "sha512-T1kD2FWOEy5WPidOn1si0rYmWORNch4a/NR52Ghyp4q7KyxOCuiOfZzyhVC5tsLIBDH3+cNdB5DkD9afpNDaOw==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.9.tgz",
+      "integrity": "sha512-OOn6zZBIVkm/4j5gkPdGn4yqQt+gmXaLaSjRSO434WplV8vo2YaBNbSHaTM9wJpZTHVDYyjzuIYVEzy9/5RVZw==",
      "cpu": [
        "arm64"
      ],
@@ -264,9 +264,9 @@
      }
    },
    "node_modules/@next/swc-linux-x64-gnu": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.7.tgz",
-      "integrity": "sha512-zaEC+iEiAHNdhl6fuwl0H0shnTzQoAoJiDYBUze8QTntE/GNPfTYpYboxF5LRYIjBwETUatvE0T64W6SKDipvg==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.9.tgz",
+      "integrity": "sha512-iA+fJXFPpW0SwGmx/pivVU+2t4zQHNOOAr5T378PfxPHY6JtjV6/0s1vlAJUdIHeVpX98CLp9k5VuKgxiRHUpg==",
      "cpu": [
        "x64"
      ],
@@ -279,9 +279,9 @@
      }
    },
    "node_modules/@next/swc-linux-x64-musl": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.7.tgz",
-      "integrity": "sha512-X6r12F8d8SKAtYJqLZBBMIwEqcTRvUdVm+xIq+l6pJqlgT2tNsLLf2i5Cl88xSsIytBICGsCNNHd+siD2fbWBA==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.9.tgz",
+      "integrity": "sha512-rlNf2WUtMM+GAQrZ9gMNdSapkVi3koSW3a+dmBVp42lfugWVvnyzca/xJlN48/7AGx8qu62WyO0ya1ikgOxh6A==",
      "cpu": [
        "x64"
      ],
@@ -294,9 +294,9 @@
      }
    },
    "node_modules/@next/swc-win32-arm64-msvc": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.7.tgz",
-      "integrity": "sha512-NPnmnV+vEIxnu6SUvjnuaWRglZzw4ox5n/MQTxeUhb5iwVWFedolPFebMNwgrWu4AELwvTdGtWjqof53AiWHcw==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.9.tgz",
+      "integrity": "sha512-5T9ybSugXP77nw03vlgKZxD99AFTHaX8eT1ayKYYnGO9nmYhJjRPxcjU5FyYI+TdkQgEpIcH7p/guPLPR0EbKA==",
      "cpu": [
        "arm64"
      ],
@@ -309,9 +309,9 @@
      }
    },
    "node_modules/@next/swc-win32-ia32-msvc": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.7.tgz",
-      "integrity": "sha512-6Hxijm6/a8XqLQpOOf/XuwWRhcuc/g4rBB2oxjgCMuV9Xlr2bLs5+lXyh8w9YbAUMYR3iC9mgOlXbHa79elmXw==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.9.tgz",
+      "integrity": "sha512-ojZTCt1lP2ucgpoiFgrFj07uq4CZsq4crVXpLGgQfoFq00jPKRPgesuGPaz8lg1yLfvafkU3Jd1i8snKwYR3LA==",
      "cpu": [
        "ia32"
      ],
@@ -324,9 +324,9 @@
      }
    },
    "node_modules/@next/swc-win32-x64-msvc": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.4.7.tgz",
-      "integrity": "sha512-sW9Yt36Db1nXJL+mTr2Wo0y+VkPWeYhygvcHj1FF0srVtV+VoDjxleKtny21QHaG05zdeZnw2fCtf2+dEqgwqA==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.4.9.tgz",
+      "integrity": "sha512-QbT03FXRNdpuL+e9pLnu+XajZdm/TtIXVYY4lA9t+9l0fLZbHXDYEKitAqxrOj37o3Vx5ufxiRAniaIebYDCgw==",
      "cpu": [
        "x64"
      ],
@@ -2983,11 +2983,11 @@
      "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw=="
    },
    "node_modules/next": {
-      "version": "13.4.7",
-      "resolved": "https://registry.npmjs.org/next/-/next-13.4.7.tgz",
-      "integrity": "sha512-M8z3k9VmG51SRT6v5uDKdJXcAqLzP3C+vaKfLIAM0Mhx1um1G7MDnO63+m52qPdZfrTFzMZNzfsgvm3ghuVHIQ==",
+      "version": "13.4.9",
+      "resolved": "https://registry.npmjs.org/next/-/next-13.4.9.tgz",
+      "integrity": "sha512-vtefFm/BWIi/eWOqf1GsmKG3cjKw1k3LjuefKRcL3iiLl3zWzFdPG3as6xtxrGO6gwTzzaO1ktL4oiHt/uvTjA==",
      "dependencies": {
-        "@next/env": "13.4.7",
+        "@next/env": "13.4.9",
        "@swc/helpers": "0.5.1",
        "busboy": "1.6.0",
        "caniuse-lite": "^1.0.30001406",
@@ -3003,15 +3003,15 @@
        "node": ">=16.8.0"
      },
      "optionalDependencies": {
-        "@next/swc-darwin-arm64": "13.4.7",
-        "@next/swc-darwin-x64": "13.4.7",
-        "@next/swc-linux-arm64-gnu": "13.4.7",
-        "@next/swc-linux-arm64-musl": "13.4.7",
-        "@next/swc-linux-x64-gnu": "13.4.7",
-        "@next/swc-linux-x64-musl": "13.4.7",
-        "@next/swc-win32-arm64-msvc": "13.4.7",
-        "@next/swc-win32-ia32-msvc": "13.4.7",
-        "@next/swc-win32-x64-msvc": "13.4.7"
+        "@next/swc-darwin-arm64": "13.4.9",
+        "@next/swc-darwin-x64": "13.4.9",
+        "@next/swc-linux-arm64-gnu": "13.4.9",
+        "@next/swc-linux-arm64-musl": "13.4.9",
+        "@next/swc-linux-x64-gnu": "13.4.9",
+        "@next/swc-linux-x64-musl": "13.4.9",
+        "@next/swc-win32-arm64-msvc": "13.4.9",
+        "@next/swc-win32-ia32-msvc": "13.4.9",
+        "@next/swc-win32-x64-msvc": "13.4.9"
      },
      "peerDependencies": {
        "@opentelemetry/api": "^1.1.0",
--- a/web/package.json
+++ b/web/package.json
@@ -17,7 +17,7 @@
    "encoding": "^0.1.13",
    "eslint": "8.44.0",
    "eslint-config-next": "13.4.7",
-    "next": "13.4.7",
+    "next": "13.4.9",
    "postcss": "8.4.24",
    "react": "18.2.0",
    "react-dom": "18.2.0",
--- a/web/vercel.json
+++ b/web/vercel.json
@@ -0,0 +1,5 @@
+{
+    "github": {
+        "silent": true
+    }
+}
Author	SHA1	Message	Date
Jeffrey Morgan	dcb6ba389a	app: trim server lines before logging	2023-07-11 16:43:19 -07:00
Jeffrey Morgan	ed6abba75a	app: bundle real `ggml-metal.metal` instead of symlink	2023-07-11 16:36:39 -07:00
Jeffrey Morgan	b52a400cdf	use `go build` on publish	2023-07-11 16:17:45 -07:00
Jeffrey Morgan	2ed26f0047	tweak logging	2023-07-11 16:16:38 -07:00
Jeffrey Morgan	e64ef69e34	look for ggml-metal in the same directory as the binary	2023-07-11 15:58:56 -07:00
Jeffrey Morgan	3d0a9b477b	log to console as well as file	2023-07-11 15:52:22 -07:00
Michael Yang	7226980fb6	Merge pull request #70 from jmorganca/offline-fixes offline fixes	2023-07-11 15:50:19 -07:00
Michael Yang	a806b03f62	no errgroup	2023-07-11 14:58:10 -07:00
Michael Yang	948323fa78	rename partial file	2023-07-11 13:50:26 -07:00
Michael Yang	e243329e2e	check api status	2023-07-11 13:42:05 -07:00
Michael Yang	2a66a1164a	common stream producer	2023-07-11 13:42:05 -07:00
Michael Yang	62620914e9	Merge pull request #65 from jmorganca/bindings call llama.cpp directly from go	2023-07-11 12:01:03 -07:00
Michael Yang	442dec1c6f	vendor llama.cpp	2023-07-11 11:59:18 -07:00
Michael Yang	fd4792ec56	call llama.cpp directly from go	2023-07-11 11:59:18 -07:00
hoyyeva	abaf7d3bda	Merge pull request #67 from jmorganca/log writing logs to `./ollama/logs`	2023-07-11 14:45:21 -04:00
Eva Ho	7762584fb1	address comments	2023-07-11 14:38:38 -04:00
Jeffrey Morgan	317615fd5c	web: remove unused code	2023-07-11 11:05:45 -07:00
Eva Ho	acc31427dd	add logs to ~/.ollama/logs folder	2023-07-11 13:33:32 -04:00
Jeffrey Morgan	a3ec1ec2a0	consistent error handling for pull and generate	2023-07-10 21:34:15 -07:00
Eva Ho	407a5cabf4	when app is running, server restarts when it exits or disconnects	2023-07-10 17:14:25 -04:00
Michael Yang	0859d50942	Merge pull request #58 from jmorganca/generate-errors return error in generate response	2023-07-10 14:03:47 -07:00
Jeffrey Morgan	66bbf05918	start server in both dev and when packaged	2023-07-10 13:46:31 -07:00
Michael Yang	edba935d67	return error in generate response	2023-07-10 13:30:10 -07:00
Bruce MacDonald	2d49197b3b	increase default model size to 512	2023-07-10 21:24:41 +02:00
Bruce MacDonald	f5e2e150b8	allow overriding default generate options	2023-07-10 20:58:02 +02:00
Jeffrey Morgan	268e362fa7	fix binding build	2023-07-10 11:33:43 -07:00
Bruce MacDonald	07a4c1e3fb	take all args as one prompt	2023-07-10 06:05:09 -04:00
Jeffrey Morgan	20dae6b38f	add `vercel.json` to silence PR comments	2023-07-09 20:11:37 -07:00
Jeffrey Morgan	a18e6b3a40	llama: remove unnecessary std::vector	2023-07-09 10:51:45 -04:00
Jeffrey Morgan	5fb96255dc	llama: remove unused helper functions	2023-07-09 10:25:07 -04:00
Jeffrey Morgan	b43ddd84be	update `README.md` instructions section	2023-07-08 19:19:31 -04:00
Jeffrey Morgan	993cb9fad6	examples: add basic python example	2023-07-08 17:40:05 -04:00
Jeffrey Morgan	a8dc0c9b5f	web: use proper caching for autoupdate endpoint	2023-07-08 16:48:02 -04:00
Jeffrey Morgan	1e97807808	web: revalidate download every minute	2023-07-08 13:45:47 -04:00
Jeffrey Morgan	840f87430a	remove double `heartbeat`	2023-07-08 13:30:27 -04:00
Bruce MacDonald	4d8b0414f7	take all args as one prompt - parse all run arguments into one prompt - do not echo prompt back on one-shot - example of summarizing a document	2023-07-07 16:14:58 -04:00