handle race condition while setting raw mode in windows (#2509 )

Revert "Revert "bump submodule to 6c00a06 (#2479 )"" (#2485 )
This reverts commit 6920964b87.
2024-02-14 21:28:35 -08:00 · 2024-02-13 18:18:41 -08:00 · 2024-02-13 17:48:41 -08:00 · 2024-02-13 17:23:05 -08:00 · 2024-02-13 17:12:42 -08:00 · 2024-02-13 15:40:32 -08:00
50 changed files with 1726 additions and 959 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -34,7 +34,7 @@ jobs:
      matrix:
        cuda-version:
          - '11.8.0'
-    runs-on: ubuntu-latest
+    runs-on: linux
    container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
    steps:
      - run: |
@@ -64,7 +64,7 @@ jobs:
        rocm-version:
          - '5.7.1'
          - '6.0'
-    runs-on: ubuntu-latest
+    runs-on: linux
    container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
    steps:
      - run: |
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Get up and running with large language models locally.

 ### macOS

-[Download](https://ollama.ai/download/Ollama-darwin.zip)
+[Download](https://ollama.com/download/Ollama-darwin.zip)

 ### Windows

@@ -19,7 +19,7 @@ Coming soon! For now, you can install Ollama on Windows via WSL2.
 ### Linux & WSL2

 ```
-curl https://ollama.ai/install.sh | sh
+curl -fsSL https://ollama.com/install.sh | sh
 ```

 [Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)
@@ -35,7 +35,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla

 ## Quickstart

-To run and chat with [Llama 2](https://ollama.ai/library/llama2):
+To run and chat with [Llama 2](https://ollama.com/library/llama2):

 ```
 ollama run llama2
@@ -43,7 +43,7 @@ ollama run llama2

 ## Model library

-Ollama supports a list of open-source models available on [ollama.ai/library](https://ollama.ai/library 'ollama model library')
+Ollama supports a list of open-source models available on [ollama.com/library](https://ollama.com/library 'ollama model library')

 Here are some example open-source models that can be downloaded:

@@ -200,18 +200,21 @@ brew install cmake go
 ```

 Then generate dependencies:
+
 ```
 go generate ./...
 ```
+
 Then build the binary:
+
 ```
 go build .
 ```

 More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)

-
 ### Running local builds
+
 Next, start the server:

 ```
@@ -253,6 +256,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ## Community Integrations

 ### Web & Desktop
+
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
@@ -265,7 +269,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
 - [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
-
+- [MindMac](https://mindmac.app)

 ### Terminal

@@ -278,6 +282,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [gptel Emacs client](https://github.com/karthink/gptel)
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
 - [cmdh](https://github.com/pgibler/cmdh)
+- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).

 ### Database

@@ -304,7 +309,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChainDart](https://github.com/davidmigloz/langchain_dart)
 - [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
 - [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
-
+- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
+- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)

 ### Mobile

@@ -326,3 +332,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
+- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
+- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
--- a/api/types.go
+++ b/api/types.go
@@ -183,12 +183,11 @@ type CopyRequest struct {
 }

 type PullRequest struct {
-	Model         string `json:"model"`
-	Insecure      bool   `json:"insecure,omitempty"`
-	Username      string `json:"username"`
-	Password      string `json:"password"`
-	Stream        *bool  `json:"stream,omitempty"`
-	CurrentDigest string `json:"current_digest,omitempty"`
+	Model    string `json:"model"`
+	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`

 	// Name is deprecated, see Model
 	Name string `json:"name"`
@@ -242,7 +241,6 @@ type GenerateResponse struct {

 type ModelDetails struct {
 	ParentModel       string   `json:"parent_model"`
-	Digest            string   `json:"digest"`
 	Format            string   `json:"format"`
 	Family            string   `json:"family"`
 	Families          []string `json:"families"`
@@ -417,8 +415,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 	switch t := v.(type) {
 	case float64:
 		if t < 0 {
-			t = math.MaxFloat64
-			d.Duration = time.Duration(t)
+			d.Duration = time.Duration(math.MaxInt64)
 		} else {
 			d.Duration = time.Duration(t * float64(time.Second))
 		}
@@ -428,8 +425,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 			return err
 		}
 		if d.Duration < 0 {
-			mf := math.MaxFloat64
-			d.Duration = time.Duration(mf)
+			d.Duration = time.Duration(math.MaxInt64)
 		}
 	}

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -25,6 +25,7 @@ import (
 	"github.com/olekukonko/tablewriter"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
+	"golang.org/x/exp/slices"
 	"golang.org/x/term"

 	"github.com/jmorganca/ollama/api"
@@ -146,19 +147,68 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}

 	name := args[0]
+
 	// check if the model exists on the server
-	_, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+	show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
 	var statusError api.StatusError
 	switch {
 	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
 		if err := PullHandler(cmd, []string{name}); err != nil {
 			return err
 		}
+
+		show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+		if err != nil {
+			return err
+		}
 	case err != nil:
 		return err
 	}

-	return RunGenerate(cmd, args)
+	interactive := true
+
+	opts := runOptions{
+		Model:       args[0],
+		WordWrap:    os.Getenv("TERM") == "xterm-256color",
+		Options:     map[string]interface{}{},
+		MultiModal:  slices.Contains(show.Details.Families, "clip"),
+		ParentModel: show.Details.ParentModel,
+	}
+
+	format, err := cmd.Flags().GetString("format")
+	if err != nil {
+		return err
+	}
+	opts.Format = format
+
+	prompts := args[1:]
+	// prepend stdin to the prompt if provided
+	if !term.IsTerminal(int(os.Stdin.Fd())) {
+		in, err := io.ReadAll(os.Stdin)
+		if err != nil {
+			return err
+		}
+
+		prompts = append([]string{string(in)}, prompts...)
+		opts.WordWrap = false
+		interactive = false
+	}
+	opts.Prompt = strings.Join(prompts, " ")
+	if len(prompts) > 0 {
+		interactive = false
+	}
+
+	nowrap, err := cmd.Flags().GetBool("nowordwrap")
+	if err != nil {
+		return err
+	}
+	opts.WordWrap = !nowrap
+
+	if !interactive {
+		return generate(cmd, opts)
+	}
+
+	return generateInteractive(cmd, opts)
 }

 func PushHandler(cmd *cobra.Command, args []string) error {
@@ -357,42 +407,6 @@ func CopyHandler(cmd *cobra.Command, args []string) error {
 }

 func PullHandler(cmd *cobra.Command, args []string) error {
-	upgradeAll, err := cmd.Flags().GetBool("upgrade-all")
-	if err != nil {
-		return err
-	}
-
-	if !upgradeAll {
-		if len(args) == 0 {
-			return fmt.Errorf("no model specified to pull")
-		}
-		return pull(cmd, args[0], "")
-	}
-
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	models, err := client.List(cmd.Context())
-	if err != nil {
-		return err
-	}
-
-	for _, m := range (*models).Models {
-		err = pull(cmd, m.Name, "sha256:"+m.Digest)
-		if err != nil {
-			if strings.Contains(err.Error(), "file does not exist") {
-				fmt.Printf("model '%s' is no longer available\n", m.Name)
-				continue
-			}
-			return err
-		}
-	}
-	return nil
-}
-
-func pull(cmd *cobra.Command, name string, currentDigest string) error {
 	insecure, err := cmd.Flags().GetBool("insecure")
 	if err != nil {
 		return err
@@ -404,7 +418,7 @@ func pull(cmd *cobra.Command, name string, currentDigest string) error {
 	}

 	p := progress.NewProgress(os.Stderr)
-	defer p.StopWithoutClear()
+	defer p.Stop()

 	bars := make(map[string]*progress.Bar)

@@ -438,7 +452,7 @@ func pull(cmd *cobra.Command, name string, currentDigest string) error {
 		return nil
 	}

-	request := api.PullRequest{Name: name, Insecure: insecure, CurrentDigest: currentDigest}
+	request := api.PullRequest{Name: args[0], Insecure: insecure}
 	if err := client.Pull(cmd.Context(), &request, fn); err != nil {
 		return err
 	}
@@ -446,51 +460,6 @@ func pull(cmd *cobra.Command, name string, currentDigest string) error {
 	return nil
 }

-func RunGenerate(cmd *cobra.Command, args []string) error {
-	interactive := true
-
-	opts := runOptions{
-		Model:    args[0],
-		WordWrap: os.Getenv("TERM") == "xterm-256color",
-		Options:  map[string]interface{}{},
-	}
-
-	format, err := cmd.Flags().GetString("format")
-	if err != nil {
-		return err
-	}
-	opts.Format = format
-
-	prompts := args[1:]
-	// prepend stdin to the prompt if provided
-	if !term.IsTerminal(int(os.Stdin.Fd())) {
-		in, err := io.ReadAll(os.Stdin)
-		if err != nil {
-			return err
-		}
-
-		prompts = append([]string{string(in)}, prompts...)
-		opts.WordWrap = false
-		interactive = false
-	}
-	opts.Prompt = strings.Join(prompts, " ")
-	if len(prompts) > 0 {
-		interactive = false
-	}
-
-	nowrap, err := cmd.Flags().GetBool("nowordwrap")
-	if err != nil {
-		return err
-	}
-	opts.WordWrap = !nowrap
-
-	if !interactive {
-		return generate(cmd, opts)
-	}
-
-	return generateInteractive(cmd, opts)
-}
-
 type generateContextKey string

 type runOptions struct {
@@ -666,10 +635,18 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		return nil
 	}

+	if opts.MultiModal {
+		opts.Prompt, opts.Images, err = extractFileData(opts.Prompt)
+		if err != nil {
+			return err
+		}
+	}
+
 	request := api.GenerateRequest{
 		Model:    opts.Model,
 		Prompt:   opts.Prompt,
 		Context:  generateContext,
+		Images:   opts.Images,
 		Format:   opts.Format,
 		System:   opts.System,
 		Template: opts.Template,
@@ -920,13 +897,12 @@ func NewCLI() *cobra.Command {
 	pullCmd := &cobra.Command{
 		Use:     "pull MODEL",
 		Short:   "Pull a model from a registry",
-		Args:    cobra.RangeArgs(0, 1),
+		Args:    cobra.ExactArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE:    PullHandler,
 	}

 	pullCmd.Flags().Bool("insecure", false, "Use an insecure registry")
-	pullCmd.Flags().Bool("upgrade-all", false, "Upgrade all models if they're out of date")

 	pushCmd := &cobra.Command{
 		Use:     "push MODEL",
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"path/filepath"
 	"regexp"
 	"sort"
 	"strings"
@@ -98,6 +99,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /? shortcuts    Help for keyboard shortcuts")
 		fmt.Fprintln(os.Stderr, "")
 		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
+
+		if opts.MultiModal {
+			fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
+		}
+
 		fmt.Fprintln(os.Stderr, "")
 	}

@@ -207,6 +213,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			switch multiline {
 			case MultilineSystem:
 				opts.System = sb.String()
+				opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
 				fmt.Println("Set system message.")
 				sb.Reset()
 			case MultilineTemplate:
@@ -226,7 +233,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				fmt.Fprintln(&sb)
 				multiline = MultilinePrompt
 				scanner.Prompt.UseAlt = true
-				break
 			}
 		case scanner.Pasting:
 			fmt.Fprintln(&sb, line)
@@ -349,10 +355,13 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 					if args[1] == "system" {
 						opts.System = sb.String()
+						opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
 						fmt.Println("Set system message.")
+						sb.Reset()
 					} else if args[1] == "template" {
 						opts.Template = sb.String()
 						fmt.Println("Set prompt template.")
+						sb.Reset()
 					}

 					sb.Reset()
@@ -487,29 +496,18 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				if err != nil {
 					return err
 				}
-				newMessage.Content = msg

-				// reset the context if we find another image
+				// clear all previous images for better responses
 				if len(images) > 0 {
-					newMessage.Images = append(newMessage.Images, images...)
-					// reset the context for the new image
-					opts.Messages = []api.Message{}
-				} else {
-					if len(opts.Messages) > 1 {
-						newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
+					for i := range opts.Messages {
+						opts.Messages[i].Images = nil
 					}
 				}
-				if len(newMessage.Images) == 0 {
-					fmt.Println("This model requires you to add a jpeg, png, or svg image.")
-					fmt.Println()
-					sb.Reset()
-					continue
-				}
+
+				newMessage.Content = msg
+				newMessage.Images = images
 			}

-			if opts.System != "" {
-				opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
-			}
 			opts.Messages = append(opts.Messages, newMessage)

 			assistant, err := chat(cmd, opts)
@@ -603,10 +601,10 @@ func extractFileData(input string) (string, []api.ImageData, error) {
 			if os.IsNotExist(err) {
 				continue
 			}
-			fmt.Printf("Couldn't process image: %q\n", err)
+			fmt.Fprintf(os.Stderr, "Couldn't process image: %q\n", err)
 			return "", imgs, err
 		}
-		fmt.Printf("Added image '%s'\n", nfp)
+		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
 		input = strings.ReplaceAll(input, fp, "")
 		imgs = append(imgs, data)
 	}
@@ -627,7 +625,7 @@ func getImageData(filePath string) ([]byte, error) {
 	}

 	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
 	if !slices.Contains(allowedTypes, contentType) {
 		return nil, fmt.Errorf("invalid image type: %s", contentType)
 	}
--- a/docs/README.md
+++ b/docs/README.md
@@ -10,7 +10,7 @@ Create new models or modify models already in the library using the Modelfile. L

 Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.

-Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
+Installing on Linux in most cases is easy using the script on [ollama.com/download](https://ollama.com/download). To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.

 Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.

--- a/docs/api.md
+++ b/docs/api.md
@@ -49,7 +49,8 @@ Advanced parameters (optional):
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
+- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

 #### JSON mode

@@ -379,6 +380,7 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

 ### Examples

@@ -542,7 +544,7 @@ curl http://localhost:11434/api/chat -d '{
      "role": "user",
      "content": "what is in this image?",
      "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS
4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrU
wKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
-    },
+    }
  ]
 }'
 ```
@@ -958,6 +960,7 @@ Generate embeddings from a model
 Advanced parameters:

 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

 ### Examples

--- a/docs/development.md
+++ b/docs/development.md
@@ -50,7 +50,8 @@ development and runtime packages.
 Typically the build scripts will auto-detect CUDA, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler.
+libraries, and `CUDACXX` to the location of the nvcc compiler.  You can customize
+the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70").

 Then generate dependencies:

--- a/docs/import.md
+++ b/docs/import.md
@@ -15,7 +15,7 @@ FROM ./mistral-7b-v0.1.Q4_0.gguf
 (Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:

 ```
-FROM ./q4_0.bin
+FROM ./mistral-7b-v0.1.Q4_0.gguf
 TEMPLATE "[INST] {{ .Prompt }} [/INST]"
 ```

@@ -37,55 +37,69 @@ ollama run example "What is your favourite condiment?"

 ## Importing (PyTorch & Safetensors)

-### Supported models
+> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress.

-Ollama supports a set of model architectures, with support for more coming soon:
+### Setup

- Llama & Mistral
- Falcon & RW
- BigCode
+First, clone the `ollama/ollama` repo:

-To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).
+```
+git clone git@github.com:ollama/ollama.git ollama
+cd ollama
+```

-### Step 1: Clone the HuggingFace repository (optional)
+and then fetch its `llama.cpp` submodule:
+
+```shell
+git submodule init
+git submodule update llm/llama.cpp
+```
+
+Next, install the Python dependencies:
+
+```
+python3 -m venv llm/llama.cpp/.venv
+source llm/llama.cpp/.venv/bin/activate
+pip install -r llm/llama.cpp/requirements.txt
+```
+
+Then build the `quantize` tool:
+
+```
+make -C llm/llama.cpp quantize
+```
+
+### Clone the HuggingFace repository (optional)

 If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.

+Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository:
+
 ```
 git lfs install
-git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
-cd Mistral-7B-Instruct-v0.1
+git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model
 ```

-### Step 2: Convert and quantize to a `.bin` file (optional, for PyTorch and Safetensors)
+### Convert the model

-If the model is in PyTorch or Safetensors format, a [Docker image](https://hub.docker.com/r/ollama/quantize) with the tooling required to convert and quantize models is available.
-
-First, Install [Docker](https://www.docker.com/get-started/).
-
-Next, to convert and quantize your model, run:
+> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py`

 ```
-docker run --rm -v .:/model ollama/quantize -q q4_0 /model
+python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin
 ```

-This will output two files into the directory:
+### Quantize the model

- `f16.bin`: the model converted to GGUF
- `q4_0.bin` the model quantized to a 4-bit quantization (Ollama will use this file to create the Ollama model)
+```
+llm/llama.cpp/quantize converted.bin quantized.bin q4_0
+```

 ### Step 3: Write a `Modelfile`

 Next, create a `Modelfile` for your model:

 ```
-FROM ./q4_0.bin
-```
-
-(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
-
-```
-FROM ./q4_0.bin
+FROM quantized.bin
 TEMPLATE "[INST] {{ .Prompt }} [/INST]"
 ```

@@ -109,9 +123,9 @@ ollama run example "What is your favourite condiment?"

 Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:

-1. Create [an account](https://ollama.ai/signup)
+1. Create [an account](https://ollama.com/signup)
 2. Run `cat ~/.ollama/id_ed25519.pub` to view your Ollama public key. Copy this to the clipboard.
-3. Add your public key to your [Ollama account](https://ollama.ai/settings/keys)
+3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)

 Next, copy your model to your username's namespace:

@@ -125,7 +139,7 @@ Then push the model:
 ollama push <your username>/example
 ```

-After publishing, your model will be available at `https://ollama.ai/<your username>/example`.
+After publishing, your model will be available at `https://ollama.com/<your username>/example`.

 ## Quantization reference

@@ -149,47 +163,3 @@ The quantization options are as follow (from highest highest to lowest levels of
 - `q6_K`
 - `q8_0`
 - `f16`
-
-## Manually converting & quantizing models
-
-### Prerequisites
-
-Start by cloning the `llama.cpp` repo to your machine in another directory:
-
-```
-git clone https://github.com/ggerganov/llama.cpp.git
-cd llama.cpp
-```
-
-Next, install the Python dependencies:
-
-```
-pip install -r requirements.txt
-```
-
-Finally, build the `quantize` tool:
-
-```
-make quantize
-```
-
-### Convert the model
-
-Run the correct conversion script for your model architecture:
-
-```shell
-# LlamaForCausalLM or MistralForCausalLM
-python convert.py <path to model directory>
-
-# FalconForCausalLM
-python convert-falcon-hf-to-gguf.py <path to model directory>
-
-# GPTBigCodeForCausalLM
-python convert-starcoder-hf-to-gguf.py <path to model directory>
-```
-
-### Quantize the model
-
-```
-quantize <path to model dir>/ggml-model-f32.bin <path to model dir>/q4_0.bin q4_0
-```
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -3,9 +3,11 @@
 ## Install

 Install Ollama running this one-liner:
+
 >
+
 ```bash
-curl https://ollama.ai/install.sh | sh
+curl -fsSL https://ollama.com/install.sh | sh
 ```

 ## Manual install
@@ -15,7 +17,7 @@ curl https://ollama.ai/install.sh | sh
 Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:

 ```bash
-sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
 sudo chmod +x /usr/bin/ollama
 ```

@@ -75,13 +77,13 @@ sudo systemctl start ollama
 Update ollama by running the install script again:

 ```bash
-curl https://ollama.ai/install.sh | sh
+curl -fsSL https://ollama.com/install.sh | sh
 ```

 Or by downloading the ollama binary:

 ```bash
-sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
 sudo chmod +x /usr/bin/ollama
 ```

@@ -110,6 +112,7 @@ sudo rm $(which ollama)
 ```

 Remove the downloaded models and Ollama service user and group:
+
 ```bash
 sudo rm -r /usr/share/ollama
 sudo userdel ollama
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -67,13 +67,13 @@ To use this:

 More examples are available in the [examples directory](../examples).

-### `Modelfile`s in [ollama.ai/library][1]
+### `Modelfile`s in [ollama.com/library][1]

-There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:
+There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:

 - Option 1: view a details page from a model's tags page:
-  1.  Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
-  2.  Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
+  1.  Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
+  2.  Click on a tag (e.g. https://ollama.com/library/llama2:13b)
  3.  Scroll down to "Layers"
      - Note: if the [`FROM` instruction](#from-required) is not present,
        it means the model was created from a local file
@@ -86,7 +86,7 @@ There are two ways to view `Modelfile`s underlying the models in [ollama.ai/libr
  # FROM llama2:13b

  FROM /root/.ollama/models/blobs/sha256:123abc
-  TEMPLATE """[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>
+  TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>

  {{ end }}{{ .Prompt }} [/INST] """
  SYSTEM """"""
@@ -154,31 +154,23 @@ PARAMETER <parameter> <parametervalue>

 ### TEMPLATE

-`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
+`TEMPLATE` is the full prompt template to be passed into the model. It may include (optionally) a system message, a user's message and the response from the model. Note: syntax may be model specific. Templates use Go [template syntax](https://pkg.go.dev/text/template).

 #### Template Variables

-| Variable          | Description                                                                                                   |
-| ----------------- | ------------------------------------------------------------------------------------------------------------- |
-| `{{ .System }}`   | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
-| `{{ .Prompt }}`   | The incoming prompt, this is not specified in the model file and will be set based on input.                  |
-| `{{ .Response }}` | The response from the LLM, if not specified response is appended to the end of the template.                  |
-| `{{ .First }}`    | A boolean value used to render specific template information for the first generation of a session.           |
+| Variable          | Description                                                                                   |
+| ----------------- | --------------------------------------------------------------------------------------------- |
+| `{{ .System }}`   | The system message used to specify custom behavior.                                           |
+| `{{ .Prompt }}`   | The user prompt message.                                                                      |
+| `{{ .Response }}` | The response from the model. When generating a response, text after this variable is omitted. |

-```modelfile
-TEMPLATE """
-{{- if .First }}
-### System:
-{{ .System }}
-{{- end }}
-
-### User:
-{{ .Prompt }}
-
-### Response:
+```
+TEMPLATE """{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
 """
-
-SYSTEM """<system message>"""
 ```

 ### SYSTEM
@@ -225,4 +217,4 @@ MESSAGE assistant yes
 - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
 - Instructions can be in any order. In the examples, the `FROM` instruction is first to keep it easily readable.

-[1]: https://ollama.ai/library
+[1]: https://ollama.com/library
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -0,0 +1,141 @@
+# OpenAI compatibility
+
+> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).
+
+Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
+
+## Usage
+
+### OpenAI Python library
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://localhost:11434/v1/',
+
+    # required but ignored
+    api_key='ollama',
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            'role': 'user',
+            'content': 'Say this is a test',
+        }
+    ],
+    model='llama2',
+)
+```
+
+### OpenAI JavaScript library
+
+```javascript
+import OpenAI from 'openai'
+
+const openai = new OpenAI({
+  baseURL: 'http://localhost:11434/v1/',
+
+  // required but ignored
+  apiKey: 'ollama',
+})
+
+const chatCompletion = await openai.chat.completions.create({
+  messages: [{ role: 'user', content: 'Say this is a test' }],
+  model: 'llama2',
+})
+```
+
+### `curl`
+
+```
+curl http://localhost:11434/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "llama2",
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant."
+            },
+            {
+                "role": "user",
+                "content": "Hello!"
+            }
+        ]
+    }'
+```
+
+## Endpoints
+
+### `/v1/chat/completions`
+
+#### Supported features
+
+- [x] Chat completions
+- [x] Streaming
+- [x] JSON mode
+- [x] Reproducible outputs
+- [ ] Vision
+- [ ] Function calling
+- [ ] Logprobs
+
+#### Supported request fields
+
+- [x] `model`
+- [x] `messages`
+  - [x] Text `content`
+  - [ ] Array of `content` parts
+- [x] `frequency_penalty`
+- [x] `presence_penalty`
+- [x] `response_format`
+- [x] `seed`
+- [x] `stop`
+- [x] `stream`
+- [x] `temperature`
+- [x] `top_p`
+- [x] `max_tokens`
+- [ ] `logit_bias`
+- [ ] `tools`
+- [ ] `tool_choice`
+- [ ] `user`
+- [ ] `n`
+
+#### Notes
+
+- Setting `seed` will always set `temperature` to `0`
+- `finish_reason` will always be `stop`
+- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
+
+## Models
+
+Before using a model, pull it locally with `ollama pull`:
+
+```shell
+ollama pull llama2
+```
+
+### Default model names
+
+For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
+
+```
+ollama cp llama2 gpt-3.5-turbo
+```
+
+Afterwards, this new model name can be specified in the `model` field:
+
+```shell
+curl http://localhost:11434/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello!"
+            }
+        ]
+    }'
+```
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -12,6 +12,13 @@ On Linux systems with systemd, the logs can be found with this command:
 journalctl -u ollama
 ```

+When you run Ollama in a container, the logs go to stdout/stderr in the container:
+
+```shell
+docker logs <container-name>
+```
+(Use `docker ps` to find the container name)
+
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.

 Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
--- a/docs/tutorials/nvidia-jetson.md
+++ b/docs/tutorials/nvidia-jetson.md
@@ -17,7 +17,7 @@ Prerequisites:

 Here are the steps:

- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.ai/install.sh | sh`
+- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
 - Stop the Ollama service: `sudo systemctl stop ollama`
 - Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson 
 'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
--- a/examples/jupyter-notebook/ollama.ipynb
+++ b/examples/jupyter-notebook/ollama.ipynb
@@ -8,7 +8,7 @@
   "outputs": [],
   "source": [
    "# Download and run the Ollama Linux install script\n",
-    "!curl https://ollama.ai/install.sh | sh\n",
+    "!curl -fsSL https://ollama.com/install.sh | sh\n",
    "!command -v systemctl >/dev/null && sudo systemctl stop ollama"
   ]
  },
--- a/examples/kubernetes/README.md
+++ b/examples/kubernetes/README.md
@@ -2,28 +2,28 @@

 ## Prerequisites

- Ollama: https://ollama.ai/download
+- Ollama: https://ollama.com/download
 - Kubernetes cluster. This example will use Google Kubernetes Engine.

 ## Steps

 1. Create the Ollama namespace, daemon set, and service

-    ```bash
-    kubectl apply -f cpu.yaml
-    ```
+   ```bash
+   kubectl apply -f cpu.yaml
+   ```

 1. Port forward the Ollama service to connect and use it locally

-    ```bash
-    kubectl -n ollama port-forward service/ollama 11434:80
-    ```
+   ```bash
+   kubectl -n ollama port-forward service/ollama 11434:80
+   ```

 1. Pull and run a model, for example `orca-mini:3b`

-    ```bash
-    ollama run orca-mini:3b
-    ```
+   ```bash
+   ollama run orca-mini:3b
+   ```

 ## (Optional) Hardware Acceleration

--- a/examples/langchain-python-rag-websummary/README.md
+++ b/examples/langchain-python-rag-websummary/README.md
@@ -1,6 +1,6 @@
 # LangChain Web Summarization

-This example summarizes the website, [https://ollama.ai/blog/run-llama2-uncensored-locally](https://ollama.ai/blog/run-llama2-uncensored-locally)
+This example summarizes the website, [https://ollama.com/blog/run-llama2-uncensored-locally](https://ollama.com/blog/run-llama2-uncensored-locally)

 ## Running the Example

--- a/examples/langchain-python-rag-websummary/main.py
+++ b/examples/langchain-python-rag-websummary/main.py
@@ -2,7 +2,7 @@ from langchain.llms import Ollama
 from langchain.document_loaders import WebBaseLoader
 from langchain.chains.summarize import load_summarize_chain

-loader = WebBaseLoader("https://ollama.ai/blog/run-llama2-uncensored-locally")
+loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()

 llm = Ollama(model="llama2")
--- a/examples/python-loganalysis/readme.md
+++ b/examples/python-loganalysis/readme.md
@@ -40,13 +40,13 @@ You are a log file analyzer. You will receive a set of lines from a log file for
 """
 ```

-This model is available at https://ollama.ai/mattw/loganalyzer. You can customize it and add to your own namespace using the command `ollama create <namespace/modelname> -f <path-to-modelfile>` then `ollama push <namespace/modelname>`.
+This model is available at https://ollama.com/mattw/loganalyzer. You can customize it and add to your own namespace using the command `ollama create <namespace/modelname> -f <path-to-modelfile>` then `ollama push <namespace/modelname>`.

 Then loganalysis.py scans all the lines in the given log file and searches for the word 'error'. When the word is found, the 10 lines before and after are set as the prompt for a call to the Generate API.

 ```python
 data = {
-  "prompt": "\n".join(error_logs), 
+  "prompt": "\n".join(error_logs),
  "model": "mattw/loganalyzer"
 }
 ```
--- a/examples/typescript-mentors/README.md
+++ b/examples/typescript-mentors/README.md
@@ -29,9 +29,9 @@ You can also add your own character to be chosen at random when you ask a questi
   ```bash
   ollama pull stablebeluga2:70b-q4_K_M
   ```
-  
+
 2. Create a new character:
-  
+
   ```bash
   npm run charactergen "Lorne Greene"
   ```
@@ -41,15 +41,15 @@ You can also add your own character to be chosen at random when you ask a questi
 3. Now you can create a model with this command:

   ```bash
-   ollama create <YourNamespace>/lornegreene -f lornegreene/Modelfile
+   ollama create <username>/lornegreene -f lornegreene/Modelfile
   ```

-   `YourNamespace` is whatever name you set up when you signed up at [https://ollama.ai/signup](https://ollama.ai/signup).
+   `username` is whatever name you set up when you signed up at [https://ollama.com/signup](https://ollama.com/signup).

-4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<YourNamespace>` with the namespace you used above.
+4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<username>` with the username you used above.

   ```bash
-   {ns: "<YourNamespace>", char: "Lorne Greene"}
+   {ns: "<username>", char: "Lorne Greene"}
   ```

 ## Review the Code
--- a/gpu/amd.go
+++ b/gpu/amd.go
@@ -0,0 +1,91 @@
+package gpu
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// TODO - windows vs. non-windows vs darwin
+
+// Discovery logic for AMD/ROCm GPUs
+
+const (
+	DriverVersionFile     = "/sys/module/amdgpu/version"
+	GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
+	// TODO probably break these down per GPU to make the logic simpler
+	GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
+	GPUUsedMemoryFileGlob  = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
+)
+
+func AMDDetected() bool {
+	_, err := AMDDriverVersion()
+	return err == nil
+}
+
+func AMDDriverVersion() (string, error) {
+	_, err := os.Stat(DriverVersionFile)
+	if err != nil {
+		return "", err
+	}
+	fp, err := os.Open(DriverVersionFile)
+	if err != nil {
+		return "", err
+	}
+	defer fp.Close()
+	verString, err := io.ReadAll(fp)
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSpace(string(verString)), nil
+}
+
+func AMDGFXVersions() []Version {
+	res := []Version{}
+	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	for _, match := range matches {
+		fp, err := os.Open(match)
+		if err != nil {
+			slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
+			continue
+		}
+		defer fp.Close()
+
+		scanner := bufio.NewScanner(fp)
+		// optionally, resize scanner's capacity (via scanner.Buffer) for lines over 64K
+		for scanner.Scan() {
+			line := strings.TrimSpace(scanner.Text())
+			if strings.HasPrefix(line, "gfx_target_version") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 || len(ver[1]) < 5 {
+					slog.Debug("malformed " + line)
+					continue
+				}
+				l := len(ver[1])
+				patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
+				minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
+				major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
+				if err1 != nil || err2 != nil || err3 != nil {
+					slog.Debug("malformed int " + line)
+					continue
+				}
+
+				res = append(res, Version{
+					Major: uint(major),
+					Minor: uint(minor),
+					Patch: uint(patch),
+				})
+			}
+		}
+	}
+	return res
+}
+
+func (v Version) ToGFXString() string {
+	return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
+}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -30,8 +30,8 @@ type handles struct {
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil

-// With our current CUDA compile flags, 5.2 and older will not work properly
-const CudaComputeMajorMin = 6
+// With our current CUDA compile flags, older than 5.0 will not work properly
+var CudaComputeMin = [2]C.int{5, 0}

 // Possible locations for the nvidia-ml library
 var CudaLinuxGlobs = []string{
@@ -122,70 +122,90 @@ func GetGPUInfo() GpuInfo {
 		initGPUHandles()
 	}

-	// All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX
+	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
 	cpuVariant := GetCPUVariant()
-	if cpuVariant == "" {
+	if cpuVariant == "" && runtime.GOARCH == "amd64" {
 		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
 	}

 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.cuda != nil && cpuVariant != "" {
+	if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
 		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
 			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
-		} else {
+		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
 			var cc C.cuda_compute_capability_t
 			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
 			if cc.err != nil {
 				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
-			} else if cc.major >= CudaComputeMajorMin {
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 			} else {
 				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
-	} else if gpuHandles.rocm != nil && cpuVariant != "" {
-		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
-		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
-			C.free(unsafe.Pointer(memInfo.err))
-		} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
-			// Only one GPU detected and it appears to be an integrated GPU - skip it
-			slog.Info("ROCm unsupported integrated GPU detected")
-		} else {
-			if memInfo.igpu_index >= 0 {
-				// We have multiple GPUs reported, and one of them is an integrated GPU
-				// so we have to set the env var to bypass it
-				// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
-				val := os.Getenv("ROCR_VISIBLE_DEVICES")
-				if val == "" {
-					devices := []string{}
-					for i := 0; i < int(memInfo.count); i++ {
-						if i == int(memInfo.igpu_index) {
-							continue
+	} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		ver, err := AMDDriverVersion()
+		if err == nil {
+			slog.Info("AMD Driver: " + ver)
+		}
+		gfx := AMDGFXVersions()
+		tooOld := false
+		for _, v := range gfx {
+			if v.Major < 9 {
+				slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
+				tooOld = true
+				break
+			}
+
+			// TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
+			// e.g. gfx1034 works if we map it to gfx1030 at runtime
+
+		}
+		if !tooOld {
+			// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
+			C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+			if memInfo.err != nil {
+				slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
+				C.free(unsafe.Pointer(memInfo.err))
+			} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
+				// Only one GPU detected and it appears to be an integrated GPU - skip it
+				slog.Info("ROCm unsupported integrated GPU detected")
+			} else if memInfo.count > 0 {
+				if memInfo.igpu_index >= 0 {
+					// We have multiple GPUs reported, and one of them is an integrated GPU
+					// so we have to set the env var to bypass it
+					// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
+					val := os.Getenv("ROCR_VISIBLE_DEVICES")
+					if val == "" {
+						devices := []string{}
+						for i := 0; i < int(memInfo.count); i++ {
+							if i == int(memInfo.igpu_index) {
+								continue
+							}
+							devices = append(devices, strconv.Itoa(i))
 						}
-						devices = append(devices, strconv.Itoa(i))
+						val = strings.Join(devices, ",")
+						os.Setenv("ROCR_VISIBLE_DEVICES", val)
 					}
-					val = strings.Join(devices, ",")
-					os.Setenv("ROCR_VISIBLE_DEVICES", val)
+					slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
 				}
-				slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
+				resp.Library = "rocm"
+				var version C.rocm_version_resp_t
+				C.rocm_get_version(*gpuHandles.rocm, &version)
+				verString := C.GoString(version.str)
+				if version.status == 0 {
+					resp.Variant = "v" + verString
+				} else {
+					slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
+				}
+				C.free(unsafe.Pointer(version.str))
 			}
-			resp.Library = "rocm"
-			var version C.rocm_version_resp_t
-			C.rocm_get_version(*gpuHandles.rocm, &version)
-			verString := C.GoString(version.str)
-			if version.status == 0 {
-				resp.Variant = "v" + verString
-			} else {
-				slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
-			}
-			C.free(unsafe.Pointer(version.str))
 		}
 	}
 	if resp.Library == "" {
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -178,7 +178,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  if (h.handle == NULL) {
-    resp->str = strdup("nvml handle not initialized");
+    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }
@@ -195,4 +195,4 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  resp->str = strdup(buf);
 }

-#endif  // __APPLE__
+#endif  // __APPLE__
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -16,3 +16,9 @@ type GpuInfo struct {

 	// TODO add other useful attributes about the card here for discovery information
 }
+
+type Version struct {
+	Major uint
+	Minor uint
+	Patch uint
+}
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -4,7 +4,7 @@ package llm
 #cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
-#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
+#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
 #cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
 #cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
 #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
@@ -161,13 +161,10 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
 	resp := newExtServerResp(128)
 	defer freeExtServerResp(resp)
-	var imageData []ImageData
+
 	if len(predict.Images) > 0 {
-		for cnt, i := range predict.Images {
-			imageData = append(imageData, ImageData{Data: i, ID: cnt})
-		}
+		slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
 	}
-	slog.Info(fmt.Sprintf("loaded %d images", len(imageData)))

 	request := map[string]any{
 		"prompt":            predict.Prompt,
@@ -189,7 +186,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		"penalize_nl":       predict.Options.PenalizeNewline,
 		"seed":              predict.Options.Seed,
 		"stop":              predict.Options.Stop,
-		"image_data":        imageData,
+		"image_data":        predict.Images,
 		"cache_prompt":      true,
 	}

@@ -261,7 +258,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 					})
 				}

-				if p.Stop {
+				if p.Stop || bool(result.stop) {
 					fn(PredictResult{
 						Done:               true,
 						PromptEvalCount:    p.Timings.PromptN,
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -1,4 +1,5 @@
 #include "ext_server.h"
+#include <atomic>

 // Necessary evil since the server types are not defined in a header
 #include "server.cpp"
@@ -26,13 +27,29 @@

 // Expose the llama server as a callable extern "C" API
 llama_server_context *llama = NULL;
-std::atomic<bool> ext_server_running(false);
 std::thread ext_server_thread;
+bool shutting_down = false;
+std::atomic_int recv_counter;

+// RAII wrapper for tracking in-flight recv calls
+class atomicRecv {
+  public:
+    atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
+      ++this->atomic;
+    }
+    ~atomicRecv() {
+      --this->atomic;
+    }
+  private:
+    std::atomic<int> &atomic;
+};
+ 
 void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
+  recv_counter = 0;
  assert(err != NULL && sparams != NULL);
  log_set_target(stderr);
  if (!sparams->verbose_logging) {
+    server_verbose = true;
    log_disable();
  }

@@ -122,18 +139,23 @@ void llama_server_start() {
  assert(llama != NULL);
  // TODO mutex to protect thread creation
  ext_server_thread = std::thread([&]() {
-    ext_server_running = true;
    try {
      LOG_TEE("llama server main loop starting\n");
      ggml_time_init();
-      while (ext_server_running.load()) {
-        if (!llama->update_slots()) {
-          LOG_TEE(
-              "unexpected error in llama server update_slots - exiting main "
-              "loop\n");
-          break;
-        }
-      }
+      llama->queue_tasks.on_new_task(std::bind(
+        &llama_server_context::process_single_task, llama, std::placeholders::_1));
+      llama->queue_tasks.on_finish_multitask(std::bind(
+          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+      llama->queue_tasks.on_all_tasks_finished(std::bind(
+          &llama_server_context::run_on_all_tasks_finished, llama));
+      llama->queue_results.on_multitask_update(std::bind(
+          &llama_server_queue::update_multitask,
+          &llama->queue_tasks,
+          std::placeholders::_1,
+          std::placeholders::_2,
+          std::placeholders::_3
+        ));
+      llama->queue_tasks.start_loop();
    } catch (std::exception &e) {
      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
    } catch (...) {
@@ -146,17 +168,22 @@ void llama_server_start() {

 void llama_server_stop() {
  assert(llama != NULL);
-  // TODO - too verbose, remove once things are solid
-  LOG_TEE("requesting llama server shutdown\n");
-  ext_server_running = false;
+  // Shutdown any in-flight requests and block incoming requests.
+  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
+  shutting_down = true;

-  // unblocks the update_slots() loop so it can clean up and exit
-  llama->request_cancel(0);
+  while (recv_counter.load() > 0) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  }

+  // This may take a while for any pending tasks to drain
+  // TODO - consider a timeout to cancel tasks if it's taking too long
+  llama->queue_tasks.terminate();
  ext_server_thread.join();
  delete llama;
  llama = NULL;
  LOG_TEE("llama server shutdown complete\n");
+  shutting_down = false;
 }

 void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
@@ -164,8 +191,13 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
  resp->id = -1;
  resp->msg[0] = '\0';
  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
    json data = json::parse(json_req);
-    resp->id = llama->request_completion(data, false, false, -1);
+    resp->id = llama->queue_tasks.get_new_id();
+    llama->queue_results.add_waiting_task_id(resp->id);
+    llama->request_completion(resp->id, data, false, false, -1);
  } catch (std::exception &e) {
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
@@ -183,16 +215,28 @@ void llama_server_completion_next_result(const int task_id,
  resp->json_resp = NULL;
  std::string result_json;
  try {
-    task_result result = llama->next_result(task_id);
+    atomicRecv ar(recv_counter);
+    task_result result = llama->queue_results.recv(task_id);
    result_json =
        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
    resp->id = result.id;
    resp->stop = result.stop;
    resp->error = result.error;
    if (result.error) {
+      LOG_TEE("next result cancel on error\n");
      llama->request_cancel(task_id);
+      LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (result.stop) {
+      LOG_TEE("next result cancel on stop\n");
      llama->request_cancel(task_id);
+      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
+    } else if (shutting_down) {
+      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
+      llama->request_cancel(task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
+      resp->stop = true;
    }
  } catch (std::exception &e) {
    resp->error = true;
@@ -223,6 +267,7 @@ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
  err->msg[0] = '\0';
  try {
    llama->request_cancel(task_id);
+    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -240,6 +285,9 @@ void llama_server_tokenize(const char *json_req, char **json_resp,
  err->id = 0;
  err->msg[0] = '\0';
  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
    const json body = json::parse(json_req);
    std::vector<llama_token> tokens;
    if (body.count("content") != 0) {
@@ -273,6 +321,9 @@ void llama_server_detokenize(const char *json_req, char **json_resp,
  err->id = 0;
  err->msg[0] = '\0';
  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
    const json body = json::parse(json_req);
    std::string content;
    if (body.count("tokens") != 0) {
@@ -300,6 +351,9 @@ void llama_server_embedding(const char *json_req, char **json_resp,
  err->id = 0;
  err->msg[0] = '\0';
  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
    const json body = json::parse(json_req);
    json prompt;
    if (body.count("content") != 0) {
@@ -307,13 +361,16 @@ void llama_server_embedding(const char *json_req, char **json_resp,
    } else {
      prompt = "";
    }
-    const int task_id = llama->request_completion(
-        {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
-    task_result result = llama->next_result(task_id);
+    const int task_id = llama->queue_tasks.get_new_id();
+    llama->queue_results.add_waiting_task_id(task_id);
+    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    atomicRecv ar(recv_counter);
+    task_result result = llama->queue_results.recv(task_id);
    std::string result_json = result.result_json.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
+    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -39,6 +39,9 @@ init_vars() {
    *)
        ;;
    esac
+    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then 
+        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
+    fi
 }

 git_module_setup() {
@@ -62,15 +65,17 @@ apply_patches() {
        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
    fi

-    # apply temporary patches until fix is upstream
-    for patch in ../patches/*.diff; do
-        for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
-            (cd ${LLAMACPP_DIR}; git checkout ${file})
+    if [ -n "$(ls -A ../patches/*.diff)" ]; then
+        # apply temporary patches until fix is upstream
+        for patch in ../patches/*.diff; do
+            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
+                (cd ${LLAMACPP_DIR}; git checkout ${file})
+            done
        done
-    done
-    for patch in ../patches/*.diff; do
-        (cd ${LLAMACPP_DIR} && git apply ${patch})
-    done
+        for patch in ../patches/*.diff; do
+            (cd ${LLAMACPP_DIR} && git apply ${patch})
+        done
+    fi

    # Avoid duplicate main symbols when we link into the cgo binary
    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
@@ -109,4 +114,12 @@ compress_libs() {
 # Keep the local tree clean after we're done with the build
 cleanup() {
    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
+
+    # Also restore every file touched by the temporary patches. stderr is
+    # discarded so ls stays quiet when no *.diff exists and the glob is left
+    # unexpanded; the checkout is chained with && so it never runs in the
+    # wrong directory if the cd fails.
+    if [ -n "$(ls -A ../patches/*.diff 2>/dev/null)" ]; then
+        for patch in ../patches/*.diff; do
+            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
+                (cd ${LLAMACPP_DIR} && git checkout ${file})
+            done
+        done
+    fi
 }
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -21,7 +21,6 @@ amdGPUs() {
        return
    fi
    GPU_LIST=(
-        "gfx803"
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
@@ -128,6 +127,11 @@ if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
 fi

+# Allow override in case libcudart is in the wrong place
+if [ -z "${CUDART_LIB_DIR}" ]; then
+    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
+fi
+
 if [ -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
@@ -135,7 +139,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build
@@ -151,6 +155,8 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
+        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
+            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
        else
            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
        fi
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -25,6 +25,11 @@ function init_vars {
    }
    $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
+    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
+        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
+    } else {
+        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
+    }
 }

 function git_module_setup {
@@ -151,7 +156,7 @@ if ($null -ne $script:CUDA_LIB_DIR) {
    }
    init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
+    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
    build
    install
    cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -62,7 +62,7 @@ const maxRetries = 3
 type PredictOpts struct {
 	Prompt  string
 	Format  string
-	Images  []api.ImageData
+	Images  []ImageData
 	Options api.Options
 }

--- a/llm/llm.go
+++ b/llm/llm.go
@@ -120,7 +120,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)

 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	return newLlmServer(info, model, adapters, projectors, opts)
+	return newLlmServer(info, workDir, model, adapters, projectors, opts)
 }

 // Give any native cgo implementations an opportunity to initialize
@@ -128,7 +128,7 @@ func Init(workdir string) error {
 	return nativeInit(workdir)
 }

-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)

 	// Check to see if the user has requested a specific library instead of auto-detecting
@@ -143,6 +143,16 @@ func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []stri
 		}
 	}

+	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
+	_, err := os.Stat(dynLibs[0])
+	if err != nil {
+		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
+		err = nativeInit(workDir)
+		if err != nil {
+			return nil, err
+		}
+	}
+
 	err2 := fmt.Errorf("unable to locate suitable llm library")
 	for _, dynLib := range dynLibs {
 		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,30 +1,21 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 0462fbd2..4fa7b57f 100644
+index d86d7e04..2694e92e 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1857,12 +1857,6 @@ struct llama_server_context
-                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
-                     }
+@@ -901,13 +901,15 @@ struct llama_server_context
+                 slot.sent_count += result.text_to_send.size();
+                 // add the token to slot queue and cache
+             }
+-            slot.add_token_string(result);
+
+             if (slot.params.stream)
+             {
+                 send_partial_response(slot, result);
+             }
+         }
 
-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
-                    slot.cache_tokens = prompt_tokens;
-
-                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
-                     {
-                         // we have to evaluate at least 1 token to generate logits.
-@@ -1870,6 +1864,12 @@ struct llama_server_context
-                         slot.n_past--;
-                     }
- 
-+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+        slot.add_token_string(result);
 +
-+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-+
-+                    slot.cache_tokens = prompt_tokens;
-+
-                     LOG_VERBOSE("prompt ingested", {
-                                                     {"n_past", slot.n_past},
-                                                     {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
+         if (incomplete)
+         {
+             slot.has_next_token = true;
--- a/llm/patches/02-shutdown.diff
+++ b/llm/patches/02-shutdown.diff
@@ -0,0 +1,85 @@
+diff --git a/examples/server/server.cpp b/examples/server/server.cpp
+index 11dd82c3..311495a8 100644
+--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
+@@ -28,6 +28,7 @@
+ #include <chrono>
+ #include <condition_variable>
+ #include <atomic>
+#include <signal.h>
+ 
+ using json = nlohmann::json;
+ 
+@@ -2394,6 +2395,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
+     }
+ }
+ 
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
+ int main(int argc, char **argv)
+ {
+ #if SERVER_VERBOSE != 1
+@@ -3014,8 +3018,14 @@ int main(int argc, char **argv)
+         std::placeholders::_2,
+         std::placeholders::_3
+     ));
+-    llama.queue_tasks.start_loop();
+ 
+    shutdown_handler = [&](int) {
+        llama.queue_tasks.terminate();
+    };
+    signal(SIGTERM, signal_handler);
+    signal(SIGINT, signal_handler);
+    llama.queue_tasks.start_loop();
+    svr.stop();
+     t.join();
+ 
+     llama_backend_free();
+diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
+index 70cce072..9124869a 100644
+--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
+@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
+ struct llama_server_queue {
+     int id = 0;
+     std::mutex mutex_tasks;
+    bool running;
+     // queues
+     std::vector<task_server> queue_tasks;
+     std::vector<task_server> queue_tasks_deferred;
+@@ -248,9 +249,18 @@ struct llama_server_queue {
+         queue_tasks_deferred.clear();
+     }
+ 
+-    // Start the main loop. This call is blocking
+-    [[noreturn]]
+    // end the start_loop routine
+    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            running = false;
+        }
+        condition_tasks.notify_all();
+    }
+
+    // Start the main loop.
+     void start_loop() {
+        running = true;
+         while (true) {
+             // new task arrived
+             LOG_VERBOSE("have new task", {});
+@@ -294,8 +304,12 @@ struct llama_server_queue {
+             {
+                 std::unique_lock<std::mutex> lock(mutex_tasks);
+                 if (queue_tasks.empty()) {
+                    if (!running) {
+                        LOG_VERBOSE("ending start_loop", {});
+                        return;
+                    }
+                     condition_tasks.wait(lock, [&]{
+-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
+                     });
+                 }
+             }
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
 	if len(dynLibs) == 0 {
 		dynLibs = []string{availableDynLibs["cpu"]}
 	}
+	slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
 	return dynLibs
 }

--- a/openai/openai.go
+++ b/openai/openai.go
@@ -0,0 +1,322 @@
+// Package openai provides middleware for partial compatibility with the OpenAI REST API.
+package openai
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"math/rand"
+	"net/http"
+	"time"
+
+	"github.com/gin-gonic/gin"
+	"github.com/jmorganca/ollama/api"
+)
+
+// Error is the error object nested inside an ErrorResponse. NewError fills in
+// only Message and Type; Param and Code keep their zero values and therefore
+// encode as JSON null.
+type Error struct {
+	Message string      `json:"message"`
+	Type    string      `json:"type"`
+	Param   interface{} `json:"param"`
+	Code    *string     `json:"code"`
+}
+
+// ErrorResponse is the JSON envelope for failures: a single Error object
+// stored under the "error" key.
+type ErrorResponse struct {
+	Error Error `json:"error"`
+}
+
+// Message is one chat message: the role of its author and its text content.
+// It is used for request messages, for Choice.Message and for
+// ChunkChoice.Delta.
+type Message struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+// Choice is one entry of ChatCompletion.Choices. FinishReason is a pointer so
+// that it encodes as JSON null when unset; toChatCompletion sets it to "stop"
+// once the response is done.
+type Choice struct {
+	Index        int     `json:"index"`
+	Message      Message `json:"message"`
+	FinishReason *string `json:"finish_reason"`
+}
+
+// ChunkChoice is one entry of ChatCompletionChunk.Choices. Delta carries the
+// content of this chunk; FinishReason encodes as JSON null until toChunk sets
+// it to "stop" on the final chunk.
+type ChunkChoice struct {
+	Index        int     `json:"index"`
+	Delta        Message `json:"delta"`
+	FinishReason *string `json:"finish_reason"`
+}
+
+// Usage reports token counts for a completion. toChatCompletion fills it from
+// the response's PromptEvalCount and EvalCount, with TotalTokens as their sum.
+type Usage struct {
+	PromptTokens     int `json:"prompt_tokens"`
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}
+
+// ResponseFormat selects the output format of a completion. fromRequest acts
+// only on Type == "json_object", which it maps to the "json" format; any other
+// value is ignored.
+type ResponseFormat struct {
+	Type string `json:"type"`
+}
+
+// ChatCompletionRequest is the JSON request body decoded by Middleware.
+// Optional settings are pointers so that an omitted field (nil) can be told
+// apart from an explicit zero value; fromRequest applies defaults for the
+// ones left nil.
+type ChatCompletionRequest struct {
+	Model            string          `json:"model"`
+	Messages         []Message       `json:"messages"`
+	Stream           bool            `json:"stream"`
+	MaxTokens        *int            `json:"max_tokens"`
+	Seed             *int            `json:"seed"`
+	Stop             any             `json:"stop"` // a single string or an array of strings
+	Temperature      *float64        `json:"temperature"`
+	FrequencyPenalty *float64        `json:"frequency_penalty"`
+	PresencePenalty  *float64        `json:"presence_penalty"`
+	TopP             *float64        `json:"top_p"`
+	ResponseFormat   *ResponseFormat `json:"response_format"`
+}
+
+// ChatCompletion is the non-streaming response object; toChatCompletion builds
+// it with Object set to "chat.completion".
+//
+// NOTE(review): Usage is a struct value, and encoding/json's omitempty never
+// treats a struct as empty, so the "usage" key is always emitted.
+type ChatCompletion struct {
+	Id                string   `json:"id"`
+	Object            string   `json:"object"`
+	Created           int64    `json:"created"`
+	Model             string   `json:"model"`
+	SystemFingerprint string   `json:"system_fingerprint"`
+	Choices           []Choice `json:"choices"`
+	Usage             Usage    `json:"usage,omitempty"`
+}
+
+// ChatCompletionChunk is one streamed piece of a completion; toChunk builds it
+// with Object set to "chat.completion.chunk" and writeResponse sends it as a
+// server-sent event.
+type ChatCompletionChunk struct {
+	Id                string        `json:"id"`
+	Object            string        `json:"object"`
+	Created           int64         `json:"created"`
+	Model             string        `json:"model"`
+	SystemFingerprint string        `json:"system_fingerprint"`
+	Choices           []ChunkChoice `json:"choices"`
+}
+
+// NewError builds an ErrorResponse carrying message. The error type is derived
+// from the HTTP status code: 400 maps to "invalid_request_error", 404 to
+// "not_found_error" and every other code to "api_error".
+func NewError(code int, message string) ErrorResponse {
+	kind := "api_error"
+	if code == http.StatusBadRequest {
+		kind = "invalid_request_error"
+	} else if code == http.StatusNotFound {
+		kind = "not_found_error"
+	}
+
+	return ErrorResponse{Error: Error{Message: message, Type: kind}}
+}
+
+// toChatCompletion converts a complete api.ChatResponse into the
+// non-streaming ChatCompletion object identified by id.
+func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
+	// finish_reason stays nil (JSON null) unless the response is marked done.
+	var finish *string
+	if r.Done {
+		stop := "stop"
+		finish = &stop
+	}
+
+	choice := Choice{
+		Index:        0,
+		Message:      Message{Role: r.Message.Role, Content: r.Message.Content},
+		FinishReason: finish,
+	}
+
+	// TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count
+	usage := Usage{
+		PromptTokens:     r.PromptEvalCount,
+		CompletionTokens: r.EvalCount,
+		TotalTokens:      r.PromptEvalCount + r.EvalCount,
+	}
+
+	return ChatCompletion{
+		Id:                id,
+		Object:            "chat.completion",
+		Created:           r.CreatedAt.Unix(),
+		Model:             r.Model,
+		SystemFingerprint: "fp_ollama",
+		Choices:           []Choice{choice},
+		Usage:             usage,
+	}
+}
+
+// toChunk converts one streamed api.ChatResponse into a ChatCompletionChunk
+// identified by id. The delta role is always "assistant" and Created is the
+// time of conversion, not the response's own timestamp.
+func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
+	// finish_reason stays nil (JSON null) unless this is the final chunk.
+	var finish *string
+	if r.Done {
+		stop := "stop"
+		finish = &stop
+	}
+
+	choice := ChunkChoice{
+		Index:        0,
+		Delta:        Message{Role: "assistant", Content: r.Message.Content},
+		FinishReason: finish,
+	}
+
+	return ChatCompletionChunk{
+		Id:                id,
+		Object:            "chat.completion.chunk",
+		Created:           time.Now().Unix(),
+		Model:             r.Model,
+		SystemFingerprint: "fp_ollama",
+		Choices:           []ChunkChoice{choice},
+	}
+}
+
+// fromRequest translates an OpenAI-style ChatCompletionRequest into the
+// equivalent api.ChatRequest.
+//
+// Option mapping:
+//   - stop: a single string or an array; non-string array entries are dropped.
+//   - max_tokens becomes num_predict.
+//   - temperature, frequency_penalty and presence_penalty are multiplied by 2.
+//   - temperature defaults to 1.0 and top_p to 1.0 when omitted.
+//   - a seed forces temperature to 0.0, overriding any requested temperature.
+//   - response_format of type "json_object" selects the "json" format.
+func fromRequest(r ChatCompletionRequest) api.ChatRequest {
+	var msgs []api.Message
+	for _, m := range r.Messages {
+		msgs = append(msgs, api.Message{Role: m.Role, Content: m.Content})
+	}
+
+	opts := map[string]interface{}{}
+
+	switch v := r.Stop.(type) {
+	case string:
+		opts["stop"] = []string{v}
+	case []interface{}:
+		var list []string
+		for _, item := range v {
+			if s, ok := item.(string); ok {
+				list = append(list, s)
+			}
+		}
+		opts["stop"] = list
+	}
+
+	if r.MaxTokens != nil {
+		opts["num_predict"] = *r.MaxTokens
+	}
+
+	switch {
+	case r.Seed != nil:
+		opts["seed"] = *r.Seed
+
+		// temperature=0 is required for reproducible outputs
+		opts["temperature"] = 0.0
+	case r.Temperature != nil:
+		opts["temperature"] = *r.Temperature * 2.0
+	default:
+		opts["temperature"] = 1.0
+	}
+
+	if r.FrequencyPenalty != nil {
+		opts["frequency_penalty"] = *r.FrequencyPenalty * 2.0
+	}
+
+	if r.PresencePenalty != nil {
+		opts["presence_penalty"] = *r.PresencePenalty * 2.0
+	}
+
+	topP := 1.0
+	if r.TopP != nil {
+		topP = *r.TopP
+	}
+	opts["top_p"] = topP
+
+	format := ""
+	if r.ResponseFormat != nil && r.ResponseFormat.Type == "json_object" {
+		format = "json"
+	}
+
+	return api.ChatRequest{
+		Model:    r.Model,
+		Messages: msgs,
+		Format:   format,
+		Options:  opts,
+		Stream:   &r.Stream,
+	}
+}
+
+// writer wraps a gin.ResponseWriter and rewrites everything written through
+// Write into OpenAI-style payloads.
+type writer struct {
+	stream bool   // emit server-sent event chunks instead of a single JSON object
+	id     string // completion id stamped on every response object
+	gin.ResponseWriter
+}
+
+// writeError converts an error body produced by the wrapped handler into an
+// OpenAI-style error response.
+//
+// data is expected to be JSON that decodes into api.StatusError. code is the
+// HTTP status already recorded on the response; it is handed to NewError so
+// that the "type" field reflects the actual status (for example
+// "not_found_error" for a 404).
+//
+// On success it returns len(data), so the caller sees its whole payload as
+// consumed even though different bytes reached the client. If data cannot be
+// decoded, nothing is written and the decode error is returned.
+func (w *writer) writeError(code int, data []byte) (int, error) {
+	var serr api.StatusError
+	if err := json.Unmarshal(data, &serr); err != nil {
+		return 0, err
+	}
+
+	w.ResponseWriter.Header().Set("Content-Type", "application/json")
+	if err := json.NewEncoder(w.ResponseWriter).Encode(NewError(code, serr.Error())); err != nil {
+		return 0, err
+	}
+
+	return len(data), nil
+}
+
+// writeResponse re-encodes one api.ChatResponse JSON document in OpenAI form.
+//
+// In streaming mode the document is sent as a "data: ..." server-sent event
+// holding a chat.completion.chunk, followed by "data: [DONE]" once the
+// response is marked done. Otherwise it is sent as a single chat.completion
+// JSON object. On success the return value is len(data), not the number of
+// bytes actually sent.
+func (w *writer) writeResponse(data []byte) (int, error) {
+	var resp api.ChatResponse
+	if err := json.Unmarshal(data, &resp); err != nil {
+		return 0, err
+	}
+
+	// chat completion
+	if !w.stream {
+		w.ResponseWriter.Header().Set("Content-Type", "application/json")
+		if err := json.NewEncoder(w.ResponseWriter).Encode(toChatCompletion(w.id, resp)); err != nil {
+			return 0, err
+		}
+
+		return len(data), nil
+	}
+
+	// chat chunk
+	chunk, err := json.Marshal(toChunk(w.id, resp))
+	if err != nil {
+		return 0, err
+	}
+
+	w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
+	if _, err := fmt.Fprintf(w.ResponseWriter, "data: %s\n\n", chunk); err != nil {
+		return 0, err
+	}
+
+	if resp.Done {
+		if _, err := w.ResponseWriter.Write([]byte("data: [DONE]\n\n")); err != nil {
+			return 0, err
+		}
+	}
+
+	return len(data), nil
+}
+
+// Write intercepts the wrapped handler's output. Anything written while the
+// recorded status is not 200 is treated as an error body; everything else is
+// treated as a chat response.
+func (w *writer) Write(data []byte) (int, error) {
+	if status := w.ResponseWriter.Status(); status != http.StatusOK {
+		return w.writeError(status, data)
+	}
+
+	return w.writeResponse(data)
+}
+
+// Middleware returns a gin handler that adapts an OpenAI-style chat completion
+// request for the next handler in the chain.
+//
+// It decodes the body as a ChatCompletionRequest and aborts with a 400
+// OpenAI-style error when the JSON is invalid or the message list is empty.
+// Otherwise it replaces the request body with the JSON encoding of the
+// equivalent api.ChatRequest and swaps c.Writer for a writer that translates
+// the downstream output back into OpenAI form.
+func Middleware() gin.HandlerFunc {
+	return func(c *gin.Context) {
+		var req ChatCompletionRequest
+		if err := c.ShouldBindJSON(&req); err != nil {
+			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
+			return
+		}
+
+		if len(req.Messages) == 0 {
+			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "[] is too short - 'messages'"))
+			return
+		}
+
+		// Re-encode the request in the native format for the downstream handler.
+		var body bytes.Buffer
+		if err := json.NewEncoder(&body).Encode(fromRequest(req)); err != nil {
+			c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
+			return
+		}
+		c.Request.Body = io.NopCloser(&body)
+
+		// From here on, downstream writes go through the translating writer.
+		c.Writer = &writer{
+			ResponseWriter: c.Writer,
+			stream:         req.Stream,
+			id:             fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
+		}
+
+		c.Next()
+	}
+}
--- a/progress/progress.go
+++ b/progress/progress.go
@@ -52,10 +52,6 @@ func (p *Progress) Stop() bool {
 	return stopped
 }

-func (p *Progress) StopWithoutClear() bool {
-	return p.stop()
-}
-
 func (p *Progress) StopAndClear() bool {
 	fmt.Fprint(p.w, "\033[?25l")
 	defer fmt.Fprint(p.w, "\033[?25h")
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -32,6 +32,8 @@ func (p *Prompt) placeholder() string {

 type Terminal struct {
 	outchan chan rune
+	rawmode bool // true while stdin is in raw mode
+	termios any  // state returned by SetRawMode, handed back to UnsetRawMode; concrete type is platform-specific
 }

 type Instance struct {
@@ -60,6 +62,16 @@ func New(prompt Prompt) (*Instance, error) {
 }

 func (i *Instance) Readline() (string, error) {
+	if !i.Terminal.rawmode {
+		fd := int(syscall.Stdin)
+		termios, err := SetRawMode(fd)
+		if err != nil {
+			return "", err
+		}
+		i.Terminal.rawmode = true
+		i.Terminal.termios = termios
+	}
+
 	prompt := i.Prompt.prompt()
 	if i.Pasting {
 		// force alt prompt when pasting
@@ -67,13 +79,12 @@ func (i *Instance) Readline() (string, error) {
 	}
 	fmt.Print(prompt)

-	fd := int(syscall.Stdin)
-	termios, err := SetRawMode(fd)
-	if err != nil {
-		return "", err
-	}
-	// nolint: errcheck
-	defer UnsetRawMode(fd, termios)
+	defer func() {
+		fd := int(syscall.Stdin)
+		// nolint: errcheck
+		UnsetRawMode(fd, i.Terminal.termios)
+		i.Terminal.rawmode = false
+	}()

 	buf, _ := NewBuffer(i.Prompt)

@@ -205,7 +216,8 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlW:
 			buf.DeleteWord()
 		case CharCtrlZ:
-			return handleCharCtrlZ(fd, termios)
+			fd := int(syscall.Stdin)
+			return handleCharCtrlZ(fd, i.Terminal.termios)
 		case CharEnter:
 			output := buf.String()
 			if output != "" {
@@ -236,8 +248,16 @@ func (i *Instance) HistoryDisable() {
 }

 func NewTerminal() (*Terminal, error) {
+	fd := int(syscall.Stdin)
+	termios, err := SetRawMode(fd)
+	if err != nil {
+		return nil, err
+	}
+
 	t := &Terminal{
 		outchan: make(chan rune),
+		rawmode: true,
+		termios: termios,
 	}

 	go t.ioloop()
--- a/readline/readline_unix.go
+++ b/readline/readline_unix.go
@@ -6,8 +6,9 @@ import (
 	"syscall"
 )

-func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
-	if err := UnsetRawMode(fd, termios); err != nil {
+func handleCharCtrlZ(fd int, termios any) (string, error) {
+	t := termios.(*Termios)
+	if err := UnsetRawMode(fd, t); err != nil {
 		return "", err
 	}

--- a/readline/readline_windows.go
+++ b/readline/readline_windows.go
@@ -1,6 +1,6 @@
 package readline

-func handleCharCtrlZ(fd int, state *State) (string, error) {
+// handleCharCtrlZ is a no-op in the Windows build: it ignores both arguments
+// and returns an empty string with a nil error.
+func handleCharCtrlZ(fd int, state any) (string, error) {
 	// not supported
 	return "", nil
 }
--- a/readline/term.go
+++ b/readline/term.go
@@ -25,8 +25,9 @@ func SetRawMode(fd int) (*Termios, error) {
 	return termios, setTermios(fd, &newTermios)
 }

-func UnsetRawMode(fd int, termios *Termios) error {
-	return setTermios(fd, termios)
+// UnsetRawMode applies termios to fd through setTermios. termios is declared
+// as any but must hold a *Termios, the type SetRawMode returns; the unchecked
+// type assertion panics for any other value, including nil.
+func UnsetRawMode(fd int, termios any) error {
+	t := termios.(*Termios)
+	return setTermios(fd, t)
 }

 // IsTerminal returns true if the given file descriptor is a terminal.
--- a/readline/term_windows.go
+++ b/readline/term_windows.go
@@ -56,7 +56,8 @@ func SetRawMode(fd int) (*State, error) {
 	return &State{st}, nil
 }

-func UnsetRawMode(fd int, state *State) error {
-	_, _, err := syscall.SyscallN(procSetConsoleMode.Addr(), uintptr(fd), uintptr(state.mode), 0)
-	return err
+// UnsetRawMode sets the console mode of fd back to the mode stored in state.
+//
+// state is declared as any but must hold a *State, the type SetRawMode
+// returns; the unchecked type assertion panics for any other value.
+//
+// It returns nil on success. SetConsoleMode signals failure with a zero
+// return value, and only then is the errno reported. Returning the
+// syscall.Errno unconditionally would hand callers a non-nil error even on
+// success, because Errno(0) stored in an error interface is not nil.
+func UnsetRawMode(fd int, state any) error {
+	s := state.(*State)
+	r, _, errno := syscall.SyscallN(procSetConsoleMode.Addr(), uintptr(fd), uintptr(s.mode), 0)
+	if r == 0 {
+		return errno
+	}
+	return nil
 }
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -61,7 +61,7 @@ if [ -n "$NEEDS" ]; then
 fi

 status "Downloading ollama..."
-curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.ai/download/ollama-linux-$ARCH"
+curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-$ARCH"

 for BINDIR in /usr/local/bin /usr/bin /bin; do
    echo $PATH | grep -q $BINDIR && break || continue
--- a/server/auth.go
+++ b/server/auth.go
@@ -111,8 +111,14 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
 	defer resp.Body.Close()

 	if resp.StatusCode >= http.StatusBadRequest {
-		body, _ := io.ReadAll(resp.Body)
-		return "", fmt.Errorf("on pull registry responded with code %d: %s", resp.StatusCode, body)
+		responseBody, err := io.ReadAll(resp.Body)
+		if err != nil {
+			return "", fmt.Errorf("%d: %v", resp.StatusCode, err)
+		} else if len(responseBody) > 0 {
+			return "", fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
+		}
+
+		return "", fmt.Errorf("%s", resp.Status)
 	}

 	respBody, err := io.ReadAll(resp.Body)
@@ -147,12 +153,7 @@ func (s SignatureData) Bytes() []byte {

 // SignData takes a SignatureData object and signs it with a raw private key
 func (s SignatureData) Sign(rawKey []byte) (string, error) {
-	privateKey, err := ssh.ParseRawPrivateKey(rawKey)
-	if err != nil {
-		return "", err
-	}
-
-	signer, err := ssh.NewSignerFromKey(privateKey)
+	signer, err := ssh.ParsePrivateKey(rawKey)
 	if err != nil {
 		return "", err
 	}
--- a/server/images.go
+++ b/server/images.go
@@ -19,7 +19,6 @@ import (
 	"strconv"
 	"strings"
 	"text/template"
-	"text/template/parse"

 	"golang.org/x/exp/slices"

@@ -58,156 +57,6 @@ type Message struct {
 	Content string `json:"content"`
 }

-type PromptVars struct {
-	System   string
-	Prompt   string
-	Response string
-	First    bool
-}
-
-// extractParts extracts the parts of the template before and after the {{.Response}} node.
-func extractParts(tmplStr string) (pre string, post string, err error) {
-	tmpl, err := template.New("").Parse(tmplStr)
-	if err != nil {
-		return "", "", err
-	}
-
-	var foundResponse bool
-
-	for _, node := range tmpl.Tree.Root.Nodes {
-		if node.Type() == parse.NodeAction && node.String() == "{{.Response}}" {
-			foundResponse = true
-		}
-		if !foundResponse {
-			pre += node.String()
-		} else {
-			post += node.String()
-		}
-	}
-
-	return pre, post, nil
-}
-
-func Prompt(promptTemplate string, p PromptVars) (string, error) {
-	var prompt strings.Builder
-	// Use the "missingkey=zero" option to handle missing variables without panicking
-	tmpl, err := template.New("").Option("missingkey=zero").Parse(promptTemplate)
-	if err != nil {
-		return "", err
-	}
-
-	vars := map[string]any{
-		"System":   p.System,
-		"Prompt":   p.Prompt,
-		"Response": p.Response,
-		"First":    p.First,
-	}
-
-	var sb strings.Builder
-	if err := tmpl.Execute(&sb, vars); err != nil {
-		return "", err
-	}
-	prompt.WriteString(sb.String())
-
-	if !strings.Contains(prompt.String(), p.Response) {
-		// if the response is not in the prompt template, append it to the end
-		prompt.WriteString(p.Response)
-	}
-
-	return prompt.String(), nil
-}
-
-// PreResponsePrompt returns the prompt before the response tag
-func (m *Model) PreResponsePrompt(p PromptVars) (string, error) {
-	if p.System == "" {
-		// use the default system prompt for this model if one is not specified
-		p.System = m.System
-	}
-	pre, _, err := extractParts(m.Template)
-	if err != nil {
-		return "", err
-	}
-
-	return Prompt(pre, p)
-}
-
-// PostResponseTemplate returns the template after the response tag
-func (m *Model) PostResponseTemplate(p PromptVars) (string, error) {
-	if p.System == "" {
-		// use the default system prompt for this model if one is not specified
-		p.System = m.System
-	}
-	_, post, err := extractParts(m.Template)
-	if err != nil {
-		return "", err
-	}
-
-	if post == "" {
-		// if there is no post-response template, return the provided response
-		return p.Response, nil
-	}
-
-	return Prompt(post, p)
-}
-
-func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error) {
-	// build the prompt from the list of messages
-	var prompt strings.Builder
-	var currentImages []api.ImageData
-	currentVars := PromptVars{
-		First:  true,
-		System: m.System,
-	}
-
-	writePrompt := func() error {
-		p, err := Prompt(m.Template, currentVars)
-		if err != nil {
-			return err
-		}
-		prompt.WriteString(p)
-		currentVars = PromptVars{}
-		return nil
-	}
-
-	for _, msg := range msgs {
-		switch strings.ToLower(msg.Role) {
-		case "system":
-			if currentVars.System != "" {
-				if err := writePrompt(); err != nil {
-					return "", nil, err
-				}
-			}
-			currentVars.System = msg.Content
-		case "user":
-			if currentVars.Prompt != "" {
-				if err := writePrompt(); err != nil {
-					return "", nil, err
-				}
-			}
-			currentVars.Prompt = msg.Content
-			currentImages = msg.Images
-		case "assistant":
-			currentVars.Response = msg.Content
-			if err := writePrompt(); err != nil {
-				return "", nil, err
-			}
-		default:
-			return "", nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
-		}
-	}
-
-	// Append the last set of vars if they are non-empty
-	if currentVars.Prompt != "" || currentVars.System != "" {
-		p, err := m.PreResponsePrompt(currentVars)
-		if err != nil {
-			return "", nil, fmt.Errorf("pre-response template: %w", err)
-		}
-		prompt.WriteString(p)
-	}
-
-	return prompt.String(), currentImages, nil
-}
-
 type ManifestV2 struct {
 	SchemaVersion int      `json:"schemaVersion"`
 	MediaType     string   `json:"mediaType"`
@@ -471,7 +320,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 				switch {
 				case errors.Is(err, os.ErrNotExist):
 					fn(api.ProgressResponse{Status: "pulling model"})
-					if err := PullModel(ctx, c.Args, "", &RegistryOptions{}, fn); err != nil {
+					if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil {
 						return err
 					}

@@ -1041,7 +890,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 	return nil
 }

-func PullModel(ctx context.Context, name, currentDigest string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
+func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

 	var manifest *ManifestV2
@@ -1069,23 +918,13 @@ func PullModel(ctx context.Context, name, currentDigest string, regOpts *Registr
 		return fmt.Errorf("insecure protocol http")
 	}

-	if currentDigest == "" {
-		fn(api.ProgressResponse{Status: "pulling manifest"})
-	}
+	fn(api.ProgressResponse{Status: "pulling manifest"})

-	manifest, err = pullModelManifest(ctx, mp, currentDigest, regOpts)
+	manifest, err = pullModelManifest(ctx, mp, regOpts)
 	if err != nil {
 		return fmt.Errorf("pull model manifest: %s", err)
 	}

-	if currentDigest != "" {
-		if manifest == nil {
-			// we already have the model
-			return nil
-		}
-		fn(api.ProgressResponse{Status: "upgrading " + mp.GetShortTagname()})
-	}
-
 	var layers []*Layer
 	layers = append(layers, manifest.Layers...)
 	layers = append(layers, manifest.Config)
@@ -1157,27 +996,17 @@ func PullModel(ctx context.Context, name, currentDigest string, regOpts *Registr
 	return nil
 }

-func pullModelManifest(ctx context.Context, mp ModelPath, currentDigest string, regOpts *RegistryOptions) (*ManifestV2, error) {
+func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *RegistryOptions) (*ManifestV2, error) {
 	requestURL := mp.BaseURL().JoinPath("v2", mp.GetNamespaceRepository(), "manifests", mp.Tag)

 	headers := make(http.Header)
 	headers.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
-
-	if currentDigest != "" {
-		headers.Set("If-None-Match", currentDigest)
-	}
-
 	resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, regOpts)
 	if err != nil {
 		return nil, err
 	}
 	defer resp.Body.Close()

-	// todo we can potentially read the manifest locally and return it here
-	if resp.StatusCode == http.StatusNotModified {
-		return nil, nil
-	}
-
 	var m *ManifestV2
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -1,347 +0,0 @@
-package server
-
-import (
-	"strings"
-	"testing"
-
-	"github.com/jmorganca/ollama/api"
-)
-
-func TestPrompt(t *testing.T) {
-	tests := []struct {
-		name     string
-		template string
-		vars     PromptVars
-		want     string
-		wantErr  bool
-	}{
-		{
-			name:     "System Prompt",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			vars: PromptVars{
-				System: "You are a Wizard.",
-				Prompt: "What are the potion ingredients?",
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
-		},
-		{
-			name:     "System Prompt with Response",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}",
-			vars: PromptVars{
-				System:   "You are a Wizard.",
-				Prompt:   "What are the potion ingredients?",
-				Response: "I don't know.",
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST] I don't know.",
-		},
-		{
-			name:     "Conditional Logic Nodes",
-			template: "[INST] {{if .First}}Hello!{{end}} {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}",
-			vars: PromptVars{
-				First:    true,
-				System:   "You are a Wizard.",
-				Prompt:   "What are the potion ingredients?",
-				Response: "I don't know.",
-			},
-			want: "[INST] Hello! You are a Wizard. What are the potion ingredients? [/INST] I don't know.",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got, err := Prompt(tt.template, tt.vars)
-			if (err != nil) != tt.wantErr {
-				t.Errorf("Prompt() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-			if got != tt.want {
-				t.Errorf("Prompt() got = %v, want %v", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestModel_PreResponsePrompt(t *testing.T) {
-	tests := []struct {
-		name     string
-		template string
-		vars     PromptVars
-		want     string
-		wantErr  bool
-	}{
-		{
-			name:     "No Response in Template",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			vars: PromptVars{
-				System: "You are a Wizard.",
-				Prompt: "What are the potion ingredients?",
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
-		},
-		{
-			name:     "Response in Template",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}",
-			vars: PromptVars{
-				System: "You are a Wizard.",
-				Prompt: "What are the potion ingredients?",
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST] ",
-		},
-		{
-			name:     "Response in Template with Trailing Formatting",
-			template: "<|im_start|>user\n{{ .Prompt }}<|im_end|><|im_start|>assistant\n{{ .Response }}<|im_end|>",
-			vars: PromptVars{
-				Prompt: "What are the potion ingredients?",
-			},
-			want: "<|im_start|>user\nWhat are the potion ingredients?<|im_end|><|im_start|>assistant\n",
-		},
-		{
-			name:     "Response in Template with Alternative Formatting",
-			template: "<|im_start|>user\n{{.Prompt}}<|im_end|><|im_start|>assistant\n{{.Response}}<|im_end|>",
-			vars: PromptVars{
-				Prompt: "What are the potion ingredients?",
-			},
-			want: "<|im_start|>user\nWhat are the potion ingredients?<|im_end|><|im_start|>assistant\n",
-		},
-	}
-
-	for _, tt := range tests {
-		m := Model{Template: tt.template}
-		t.Run(tt.name, func(t *testing.T) {
-			got, err := m.PreResponsePrompt(tt.vars)
-			if (err != nil) != tt.wantErr {
-				t.Errorf("PreResponsePrompt() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-			if got != tt.want {
-				t.Errorf("PreResponsePrompt() got = %v, want %v", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestModel_PostResponsePrompt(t *testing.T) {
-	tests := []struct {
-		name     string
-		template string
-		vars     PromptVars
-		want     string
-		wantErr  bool
-	}{
-		{
-			name:     "No Response in Template",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			vars: PromptVars{
-				Response: "I don't know.",
-			},
-			want: "I don't know.",
-		},
-		{
-			name:     "Response in Template",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}",
-			vars: PromptVars{
-				Response: "I don't know.",
-			},
-			want: "I don't know.",
-		},
-		{
-			name:     "Response in Template with Trailing Formatting",
-			template: "<|im_start|>user\n{{ .Prompt }}<|im_end|><|im_start|>assistant\n{{ .Response }}<|im_end|>",
-			vars: PromptVars{
-				Response: "I don't know.",
-			},
-			want: "I don't know.<|im_end|>",
-		},
-		{
-			name:     "Response in Template with Alternative Formatting",
-			template: "<|im_start|>user\n{{.Prompt}}<|im_end|><|im_start|>assistant\n{{.Response}}<|im_end|>",
-			vars: PromptVars{
-				Response: "I don't know.",
-			},
-			want: "I don't know.<|im_end|>",
-		},
-	}
-
-	for _, tt := range tests {
-		m := Model{Template: tt.template}
-		t.Run(tt.name, func(t *testing.T) {
-			got, err := m.PostResponseTemplate(tt.vars)
-			if (err != nil) != tt.wantErr {
-				t.Errorf("PostResponseTemplate() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-			if got != tt.want {
-				t.Errorf("PostResponseTemplate() got = %v, want %v", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestModel_PreResponsePrompt_PostResponsePrompt(t *testing.T) {
-	tests := []struct {
-		name     string
-		template string
-		preVars  PromptVars
-		postVars PromptVars
-		want     string
-		wantErr  bool
-	}{
-		{
-			name:     "Response in Template",
-			template: "<|im_start|>user\n{{.Prompt}}<|im_end|><|im_start|>assistant\n{{.Response}}<|im_end|>",
-			preVars: PromptVars{
-				Prompt: "What are the potion ingredients?",
-			},
-			postVars: PromptVars{
-				Prompt:   "What are the potion ingredients?",
-				Response: "Sugar.",
-			},
-			want: "<|im_start|>user\nWhat are the potion ingredients?<|im_end|><|im_start|>assistant\nSugar.<|im_end|>",
-		},
-		{
-			name:     "No Response in Template",
-			template: "<|im_start|>user\n{{.Prompt}}<|im_end|><|im_start|>assistant\n",
-			preVars: PromptVars{
-				Prompt: "What are the potion ingredients?",
-			},
-			postVars: PromptVars{
-				Prompt:   "What are the potion ingredients?",
-				Response: "Spice.",
-			},
-			want: "<|im_start|>user\nWhat are the potion ingredients?<|im_end|><|im_start|>assistant\nSpice.",
-		},
-	}
-
-	for _, tt := range tests {
-		m := Model{Template: tt.template}
-		t.Run(tt.name, func(t *testing.T) {
-			pre, err := m.PreResponsePrompt(tt.preVars)
-			if (err != nil) != tt.wantErr {
-				t.Errorf("PreResponsePrompt() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-			post, err := m.PostResponseTemplate(tt.postVars)
-			if err != nil {
-				t.Errorf("PostResponseTemplate() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-			result := pre + post
-			if result != tt.want {
-				t.Errorf("Prompt() got = %v, want %v", result, tt.want)
-			}
-		})
-	}
-}
-
-func TestChat(t *testing.T) {
-	tests := []struct {
-		name     string
-		template string
-		msgs     []api.Message
-		want     string
-		wantErr  string
-	}{
-		{
-			name:     "Single Message",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a Wizard.",
-				},
-				{
-					Role:    "user",
-					Content: "What are the potion ingredients?",
-				},
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
-		},
-		{
-			name:     "First Message",
-			template: "[INST] {{if .First}}Hello!{{end}} {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a Wizard.",
-				},
-				{
-					Role:    "user",
-					Content: "What are the potion ingredients?",
-				},
-				{
-					Role:    "assistant",
-					Content: "eye of newt",
-				},
-				{
-					Role:    "user",
-					Content: "Anything else?",
-				},
-			},
-			want: "[INST] Hello! You are a Wizard. What are the potion ingredients? [/INST]eye of newt[INST]   Anything else? [/INST]",
-		},
-		{
-			name:     "Message History",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a Wizard.",
-				},
-				{
-					Role:    "user",
-					Content: "What are the potion ingredients?",
-				},
-				{
-					Role:    "assistant",
-					Content: "sugar",
-				},
-				{
-					Role:    "user",
-					Content: "Anything else?",
-				},
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]sugar[INST]  Anything else? [/INST]",
-		},
-		{
-			name:     "Assistant Only",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "assistant",
-					Content: "everything nice",
-				},
-			},
-			want: "[INST]   [/INST]everything nice",
-		},
-		{
-			name: "Invalid Role",
-			msgs: []api.Message{
-				{
-					Role:    "not-a-role",
-					Content: "howdy",
-				},
-			},
-			wantErr: "invalid role: not-a-role",
-		},
-	}
-
-	for _, tt := range tests {
-		m := Model{
-			Template: tt.template,
-		}
-		t.Run(tt.name, func(t *testing.T) {
-			got, _, err := m.ChatPrompt(tt.msgs)
-			if tt.wantErr != "" {
-				if err == nil {
-					t.Errorf("ChatPrompt() expected error, got nil")
-				}
-				if !strings.Contains(err.Error(), tt.wantErr) {
-					t.Errorf("ChatPrompt() error = %v, wantErr %v", err, tt.wantErr)
-				}
-			}
-			if got != tt.want {
-				t.Errorf("ChatPrompt() got = %v, want %v", got, tt.want)
-			}
-		})
-	}
-}
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -0,0 +1,224 @@
+package server
+
+import (
+	"fmt"
+	"log/slog"
+	"strings"
+	"text/template"
+	"text/template/parse"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+// isResponseNode checks if the node contains .Response
+func isResponseNode(node *parse.ActionNode) bool {
+	for _, cmd := range node.Pipe.Cmds {
+		for _, arg := range cmd.Args {
+			if fieldNode, ok := arg.(*parse.FieldNode); ok && len(fieldNode.Ident) > 0 {
+				if fieldNode.Ident[0] == "Response" {
+					return true
+				}
+			}
+		}
+	}
+	return false
+}
+
+// formatTemplateForResponse formats the template AST to:
+// 1. remove all nodes after the first .Response (if generate=true)
+// 2. add a .Response node to the end if it doesn't exist
+// TODO(jmorganca): this should recursively cut the template before the first .Response
+func formatTemplateForResponse(tmpl *template.Template, generate bool) {
+	var found bool
+	for i, node := range tmpl.Tree.Root.Nodes {
+		if actionNode, ok := node.(*parse.ActionNode); ok {
+			if isResponseNode(actionNode) {
+				found = true
+				if generate {
+					tmpl.Tree.Root.Nodes = tmpl.Tree.Root.Nodes[:i+1]
+					break
+				}
+			}
+		}
+	}
+
+	if !found {
+		// add the response node if it doesn't exist
+		responseFieldNode := &parse.FieldNode{NodeType: parse.NodeField, Ident: []string{"Response"}}
+		responsePipeNode := &parse.PipeNode{NodeType: parse.NodePipe, Cmds: []*parse.CommandNode{{NodeType: parse.NodeCommand, Args: []parse.Node{responseFieldNode}}}}
+		responseActionNode := &parse.ActionNode{NodeType: parse.NodeAction, Pipe: responsePipeNode}
+		tmpl.Tree.Root.Nodes = append(tmpl.Tree.Root.Nodes, responseActionNode)
+	}
+}
+
+// Prompt renders a prompt from a template. If generate is set to true,
+// the parts of the template following the first .Response are not rendered
+func Prompt(tmpl, system, prompt, response string, generate bool) (string, error) {
+	parsed, err := template.New("").Option("missingkey=zero").Parse(tmpl)
+	if err != nil {
+		return "", err
+	}
+
+	formatTemplateForResponse(parsed, generate)
+
+	vars := map[string]any{
+		"System":   system,
+		"Prompt":   prompt,
+		"Response": response,
+	}
+
+	var sb strings.Builder
+	if err := parsed.Execute(&sb, vars); err != nil {
+		return "", err
+	}
+
+	return sb.String(), nil
+}
+
+func countTokens(tmpl string, system string, prompt string, response string, encode func(string) ([]int, error)) (int, error) {
+	rendered, err := Prompt(tmpl, system, prompt, response, false)
+	if err != nil {
+		return 0, err
+	}
+
+	tokens, err := encode(rendered)
+	if err != nil {
+		slog.Error("failed to encode prompt", "err", err)
+		return 0, err
+	}
+
+	return len(tokens), err
+}
+
+// ChatPrompt builds up a prompt from a series of messages, truncating based on context window size
+func ChatPrompt(tmpl string, system string, messages []api.Message, window int, encode func(string) ([]int, error)) (string, error) {
+	type prompt struct {
+		System   string
+		Prompt   string
+		Response string
+
+		images []int
+		tokens int
+	}
+
+	var p prompt
+
+	// Set the first system prompt to the model's system prompt
+	if system != "" {
+		p.System = system
+	}
+
+	// iterate through messages to build up {system,user,response} prompts
+	var imgId int
+	var prompts []prompt
+	for _, msg := range messages {
+		switch strings.ToLower(msg.Role) {
+		case "system":
+			if p.System != "" || p.Prompt != "" || p.Response != "" {
+				prompts = append(prompts, p)
+				p = prompt{}
+			}
+
+			p.System = msg.Content
+		case "user":
+			if p.Prompt != "" || p.Response != "" {
+				prompts = append(prompts, p)
+				p = prompt{}
+			}
+
+			p.Prompt = msg.Content
+
+			for range msg.Images {
+				p.Prompt += fmt.Sprintf(" [img-%d]", imgId)
+				p.images = append(p.images, imgId)
+				imgId += 1
+			}
+		case "assistant":
+			if p.Response != "" {
+				prompts = append(prompts, p)
+				p = prompt{}
+			}
+
+			p.Response = msg.Content
+		default:
+			return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
+		}
+	}
+
+	// add final prompt
+	if p.System != "" || p.Prompt != "" || p.Response != "" {
+		prompts = append(prompts, p)
+	}
+
+	// calculate token lengths for each prompt, estimating 768 tokens per image
+	for i, p := range prompts {
+		tokens, err := countTokens(tmpl, p.System, p.Prompt, p.Response, encode)
+		if err != nil {
+			return "", err
+		}
+
+		prompts[i].tokens = tokens + len(prompts[i].images)*768
+	}
+
+	// truncate images and prompts starting from the beginning of the list
+	// until either one prompt remains or the total tokens fits the context window
+	// TODO (jmorganca): this doesn't account for the context window room required for the response
+	for {
+		var required int
+		for _, p := range prompts {
+			required += p.tokens
+		}
+
+		required += 1 // for bos token
+
+		if required <= window {
+			slog.Debug("prompt now fits in context window", "required", required, "window", window)
+			break
+		}
+
+		prompt := &prompts[0]
+
+		if len(prompt.images) > 1 {
+			img := prompt.images[0]
+			slog.Debug("prompt longer than context window, removing image", "id", img, "required", required, "window", window)
+			prompt.images = prompt.images[1:]
+			prompt.Prompt = strings.Replace(prompt.Prompt, fmt.Sprintf(" [img-%d]", img), "", 1)
+			prompt.tokens -= 768
+			continue
+		}
+
+		if len(prompts) > 1 {
+			slog.Debug("required tokens longer than context window, removing first prompt", "prompt", prompts[0].tokens, "required", required, "window", window)
+			system := prompt.System
+			prompts = prompts[1:]
+
+			if system != "" && prompts[0].System == "" {
+				prompts[0].System = system
+
+				tokens, err := countTokens(tmpl, prompts[0].System, prompts[0].Prompt, prompts[0].Response, encode)
+				if err != nil {
+					return "", err
+				}
+
+				prompts[0].tokens = tokens + len(prompts[0].images)*768
+			}
+
+			continue
+		}
+
+		// stop truncating if there's only one prompt left
+		break
+	}
+
+	var sb strings.Builder
+	for i, p := range prompts {
+		// the last prompt is cut off right after its response so the model can continue from there (for completion)
+		rendered, err := Prompt(tmpl, p.System, p.Prompt, p.Response, i == len(prompts)-1)
+		if err != nil {
+			return "", err
+		}
+		sb.WriteString(rendered)
+	}
+
+	return sb.String(), nil
+}
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -0,0 +1,234 @@
+package server
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+func TestPrompt(t *testing.T) {
+	tests := []struct {
+		name     string
+		template string
+		system   string
+		prompt   string
+		response string
+		generate bool
+		want     string
+	}{
+		{
+			name:     "simple prompt",
+			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
+			system:   "You are a Wizard.",
+			prompt:   "What are the potion ingredients?",
+			want:     "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
+		},
+		{
+			name:     "implicit response",
+			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
+			system:   "You are a Wizard.",
+			prompt:   "What are the potion ingredients?",
+			response: "I don't know.",
+			want:     "[INST] You are a Wizard. What are the potion ingredients? [/INST]I don't know.",
+		},
+		{
+			name:     "response",
+			template: "[INST] {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}",
+			system:   "You are a Wizard.",
+			prompt:   "What are the potion ingredients?",
+			response: "I don't know.",
+			want:     "[INST] You are a Wizard. What are the potion ingredients? [/INST] I don't know.",
+		},
+		{
+			name:     "cut",
+			template: "<system>{{ .System }}</system><user>{{ .Prompt }}</user><assistant>{{ .Response }}</assistant>",
+			system:   "You are a Wizard.",
+			prompt:   "What are the potion ingredients?",
+			response: "I don't know.",
+			generate: true,
+			want:     "<system>You are a Wizard.</system><user>What are the potion ingredients?</user><assistant>I don't know.",
+		},
+		{
+			name:     "nocut",
+			template: "<system>{{ .System }}</system><user>{{ .Prompt }}</user><assistant>{{ .Response }}</assistant>",
+			system:   "You are a Wizard.",
+			prompt:   "What are the potion ingredients?",
+			response: "I don't know.",
+			want:     "<system>You are a Wizard.</system><user>What are the potion ingredients?</user><assistant>I don't know.</assistant>",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := Prompt(tc.template, tc.system, tc.prompt, tc.response, tc.generate)
+			if err != nil {
+				t.Errorf("error = %v", err)
+			}
+
+			if got != tc.want {
+				t.Errorf("got = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestChatPrompt(t *testing.T) {
+	tests := []struct {
+		name     string
+		template string
+		system   string
+		messages []api.Message
+		window   int
+		want     string
+	}{
+		{
+			name:     "simple prompt",
+			template: "[INST] {{ .Prompt }} [/INST]",
+			messages: []api.Message{
+				{Role: "user", Content: "Hello"},
+			},
+			window: 1024,
+			want:   "[INST] Hello [/INST]",
+		},
+		{
+			name:     "with default system message",
+			system:   "You are a Wizard.",
+			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST]",
+			messages: []api.Message{
+				{Role: "user", Content: "Hello"},
+			},
+			window: 1024,
+			want:   "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST]",
+		},
+		{
+			name:     "with system message",
+			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST]",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "Hello"},
+			},
+			window: 1024,
+			want:   "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST]",
+		},
+		{
+			name:     "with response",
+			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }}",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "Hello"},
+				{Role: "assistant", Content: "I am?"},
+			},
+			window: 1024,
+			want:   "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST] I am?",
+		},
+		{
+			name:     "with implicit response",
+			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST]",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "Hello"},
+				{Role: "assistant", Content: "I am?"},
+			},
+			window: 1024,
+			want:   "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST]I am?",
+		},
+		{
+			name:     "with conversation",
+			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }} ",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "What are the potion ingredients?"},
+				{Role: "assistant", Content: "sugar"},
+				{Role: "user", Content: "Anything else?"},
+			},
+			window: 1024,
+			want:   "[INST] <<SYS>>You are a Wizard.<</SYS>> What are the potion ingredients? [/INST] sugar [INST] Anything else? [/INST] ",
+		},
+		{
+			name:     "with truncation",
+			template: "{{ .System }} {{ .Prompt }} {{ .Response }} ",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "Hello"},
+				{Role: "assistant", Content: "I am?"},
+				{Role: "user", Content: "Why is the sky blue?"},
+				{Role: "assistant", Content: "The sky is blue from rayleigh scattering"},
+			},
+			window: 10,
+			want:   "You are a Wizard. Why is the sky blue? The sky is blue from rayleigh scattering",
+		},
+		{
+			name:     "images",
+			template: "{{ .System }} {{ .Prompt }}",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("base64")}},
+			},
+			window: 1024,
+			want:   "You are a Wizard. Hello [img-0]",
+		},
+		{
+			name:     "images truncated",
+			template: "{{ .System }} {{ .Prompt }}",
+			messages: []api.Message{
+				{Role: "system", Content: "You are a Wizard."},
+				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("img1"), []byte("img2")}},
+			},
+			window: 1024,
+			want:   "You are a Wizard. Hello [img-1]",
+		},
+		{
+			name:     "empty list",
+			template: "{{ .System }} {{ .Prompt }}",
+			messages: []api.Message{},
+			window:   1024,
+			want:     "",
+		},
+		{
+			name:     "empty list default system",
+			system:   "You are a Wizard.",
+			template: "{{ .System }} {{ .Prompt }}",
+			messages: []api.Message{},
+			window:   1024,
+			want:     "You are a Wizard. ",
+		},
+		{
+			name:     "empty user message",
+			system:   "You are a Wizard.",
+			template: "{{ .System }} {{ .Prompt }}",
+			messages: []api.Message{
+				{Role: "user", Content: ""},
+			},
+			window: 1024,
+			want:   "You are a Wizard. ",
+		},
+		{
+			name:     "empty prompt",
+			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }} ",
+			messages: []api.Message{
+				{Role: "user", Content: ""},
+			},
+			window: 1024,
+			want:   "",
+		},
+	}
+
+	encode := func(s string) ([]int, error) {
+		words := strings.Fields(s)
+		return make([]int, len(words)), nil
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := ChatPrompt(tc.template, tc.system, tc.messages, tc.window, encode)
+			if err != nil {
+				t.Errorf("error = %v", err)
+			}
+
+			if got != tc.want {
+				t.Errorf("got = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
--- a/server/routes.go
+++ b/server/routes.go
@@ -22,10 +22,12 @@ import (

 	"github.com/gin-contrib/cors"
 	"github.com/gin-gonic/gin"
+	"golang.org/x/exp/slices"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
 	"github.com/jmorganca/ollama/llm"
+	"github.com/jmorganca/ollama/openai"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/version"
 )
@@ -135,6 +137,12 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options
 	return opts, nil
 }

+func isSupportedImageType(image []byte) bool {
+	contentType := http.DetectContentType(image)
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
+	return slices.Contains(allowedTypes, contentType)
+}
+
 func GenerateHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
@@ -165,6 +173,13 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}

+	for _, img := range req.Images {
+		if !isSupportedImageType(img) {
+			c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "unsupported image format"})
+			return
+		}
+	}
+
 	model, err := GetModel(req.Model)
 	if err != nil {
 		var pErr *fs.PathError
@@ -199,6 +214,8 @@ func GenerateHandler(c *gin.Context) {
 	}

 	// an empty request loads the model
+	// note: for a short while template was used in lieu
+	// of `raw` mode so we need to check for it too
 	if req.Prompt == "" && req.Template == "" && req.System == "" {
 		c.JSON(http.StatusOK, api.GenerateResponse{
 			CreatedAt: time.Now().UTC(),
@@ -211,43 +228,52 @@ func GenerateHandler(c *gin.Context) {
 	checkpointLoaded := time.Now()

 	var prompt string
-	var promptVars PromptVars
 	switch {
 	case req.Raw:
 		prompt = req.Prompt
 	case req.Prompt != "":
-		if req.Template != "" {
-			// override the default model template
-			model.Template = req.Template
+		if req.Template == "" {
+			req.Template = model.Template
 		}

-		var rebuild strings.Builder
+		if req.System == "" {
+			req.System = model.System
+		}
+
+		slog.Debug("generate handler", "prompt", req.Prompt)
+		slog.Debug("generate handler", "template", req.Template)
+		slog.Debug("generate handler", "system", req.System)
+
+		var sb strings.Builder
 		if req.Context != nil {
-			// TODO: context is deprecated, at some point the context logic within this conditional should be removed
-			prevCtx, err := loaded.runner.Decode(c.Request.Context(), req.Context)
+			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
 			}

-			// Remove leading spaces from prevCtx if present
-			prevCtx = strings.TrimPrefix(prevCtx, " ")
-			rebuild.WriteString(prevCtx)
+			sb.WriteString(prev)
 		}
-		promptVars = PromptVars{
-			System: req.System,
-			Prompt: req.Prompt,
-			First:  len(req.Context) == 0,
+
+		// write image tags
+		// TODO: limit the number of images to fit in the context similar to the chat endpoint
+		for i := range req.Images {
+			req.Prompt += fmt.Sprintf(" [img-%d]", i)
 		}
-		p, err := model.PreResponsePrompt(promptVars)
+
+		p, err := Prompt(req.Template, req.System, req.Prompt, "", true)
 		if err != nil {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}
-		rebuild.WriteString(p)
-		prompt = rebuild.String()
+
+		sb.WriteString(p)
+
+		prompt = sb.String()
 	}

+	slog.Debug("generate handler", "prompt", prompt)
+
 	ch := make(chan any)
 	var generated strings.Builder
 	go func() {
@@ -282,30 +308,39 @@ func GenerateHandler(c *gin.Context) {
 				resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)

 				if !req.Raw {
-					// append the generated text to the history and template it if needed
-					promptVars.Response = generated.String()
-					result, err := model.PostResponseTemplate(promptVars)
+					p, err := Prompt(req.Template, req.System, req.Prompt, generated.String(), false)
+					if err != nil {
+						c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+						return
+					}
+
+					// TODO (jmorganca): encode() should not strip special tokens
+					tokens, err := loaded.runner.Encode(c.Request.Context(), p)
 					if err != nil {
 						ch <- gin.H{"error": err.Error()}
 						return
 					}
-					embd, err := loaded.runner.Encode(c.Request.Context(), prompt+result)
-					if err != nil {
-						ch <- gin.H{"error": err.Error()}
-						return
-					}
-					resp.Context = embd
+
+					resp.Context = append(req.Context, tokens...)
 				}
 			}

 			ch <- resp
 		}

+		var images []llm.ImageData
+		for i := range req.Images {
+			images = append(images, llm.ImageData{
+				ID:   i,
+				Data: req.Images[i],
+			})
+		}
+
 		// Start prediction
 		predictReq := llm.PredictOpts{
 			Prompt:  prompt,
 			Format:  req.Format,
-			Images:  req.Images,
+			Images:  images,
 			Options: opts,
 		}
 		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
@@ -451,7 +486,7 @@ func PullModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()

-		if err := PullModel(ctx, model, req.CurrentDigest, regOpts, fn); err != nil {
+		if err := PullModel(ctx, model, regOpts, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -673,7 +708,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {

 	modelDetails := api.ModelDetails{
 		ParentModel:       model.ParentModel,
-		Digest:            "sha256:" + model.Digest,
 		Format:            model.Config.ModelFormat,
 		Family:            model.Config.ModelFamily,
 		Families:          model.Config.ModelFamilies,
@@ -917,6 +951,9 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/blobs/:digest", CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", HeadBlobHandler)

+	// Compatibility endpoints
+	r.POST("/v1/chat/completions", openai.Middleware(), ChatHandler)
+
 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
@@ -932,13 +969,26 @@ func (s *Server) GenerateRoutes() http.Handler {
 }

 func Serve(ln net.Listener) error {
+	level := slog.LevelInfo
 	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
-		var programLevel = new(slog.LevelVar)
-		h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: programLevel, AddSource: true})
-		slog.SetDefault(slog.New(h))
-		programLevel.Set(slog.LevelDebug)
-		slog.Debug("Debug logging enabled")
+		level = slog.LevelDebug
 	}
+
+	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			if attr.Key == slog.SourceKey {
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+
+			return attr
+		},
+	})
+
+	slog.SetDefault(slog.New(handler))
+
 	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
@@ -1041,6 +1091,20 @@ func streamResponse(c *gin.Context, ch chan any) {
 	})
 }

+// chatPrompt builds up a prompt from a series of messages for the currently `loaded` model
+func chatPrompt(ctx context.Context, messages []api.Message) (string, error) {
+	encode := func(s string) ([]int, error) {
+		return loaded.runner.Encode(ctx, s)
+	}
+
+	prompt, err := ChatPrompt(loaded.Model.Template, loaded.Model.System, messages, loaded.Options.NumCtx, encode)
+	if err != nil {
+		return "", err
+	}
+
+	return prompt, nil
+}
+
 func ChatHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
@@ -1101,8 +1165,16 @@ func ChatHandler(c *gin.Context) {
 		return
 	}

+	checkpointLoaded := time.Now()
+
+	prompt, err := chatPrompt(c.Request.Context(), req.Messages)
+	if err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
 	// an empty request loads the model
-	if len(req.Messages) == 0 {
+	if len(req.Messages) == 0 || prompt == "" {
 		resp := api.ChatResponse{
 			CreatedAt: time.Now().UTC(),
 			Model:     req.Model,
@@ -1113,14 +1185,25 @@ func ChatHandler(c *gin.Context) {
 		return
 	}

-	checkpointLoaded := time.Now()
+	// only send images that are in the prompt
+	var i int
+	var images []llm.ImageData
+	for _, m := range req.Messages {
+		for _, img := range m.Images {
+			if !isSupportedImageType(img) {
+				c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "unsupported image format"})
+				return
+			}

-	prompt, images, err := model.ChatPrompt(req.Messages)
-	if err != nil {
-		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
+			if strings.Contains(prompt, fmt.Sprintf("[img-%d]", i)) {
+				images = append(images, llm.ImageData{Data: img, ID: i})
+			}
+			i += 1
+		}
 	}

+	slog.Debug("chat handler", "prompt", prompt, "images", len(images))
+
 	ch := make(chan any)

 	go func() {
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -16,6 +16,7 @@ import (
 	"github.com/stretchr/testify/assert"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/llm"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/version"
 )
@@ -239,3 +240,27 @@ func Test_Routes(t *testing.T) {

 	}
 }
+
+type MockLLM struct {
+	encoding []int
+}
+
+func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
+	return nil
+}
+
+func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
+	return llm.encoding, nil
+}
+
+func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
+	return "", nil
+}
+
+func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
+	return []float64{}, nil
+}
+
+func (llm *MockLLM) Close() {
+	// do nothing
+}
Author	SHA1	Message	Date
Patrick Devine	42e77e2a69	handle race condition while setting raw mode in windows (#2509 )	2024-02-14 21:28:35 -08:00
Jeffrey Morgan	9241a29336	Revert "Revert "bump submodule to `6c00a06` (#2479 )"" (#2485 ) This reverts commit `6920964b87`.	2024-02-13 18:18:41 -08:00
Jeffrey Morgan	f7231ad9ad	set `shutting_down` to `false` once shutdown is complete (#2484 )	2024-02-13 17:48:41 -08:00
Jeffrey Morgan	6920964b87	Revert "bump submodule to `6c00a06` (#2479 )" This reverts commit `2f9ed52bbd`.	2024-02-13 17:23:05 -08:00
Jeffrey Morgan	2f9ed52bbd	bump submodule to `6c00a06` (#2479 )	2024-02-13 17:12:42 -08:00
bnorick	caf2b13c10	Fix infinite keep_alive (#2480 )	2024-02-13 15:40:32 -08:00
lebrunel	1d263449ff	Update README.md to include link to Ollama-ex Elixir library (#2477 )	2024-02-13 11:40:44 -08:00
Jeffrey Morgan	48a273f80b	Fix issues with templating prompt in chat mode (#2460 )	2024-02-12 15:06:57 -08:00
Daniel Hiltgen	939c60473f	Merge pull request #2422 from dhiltgen/better_kill More robust shutdown	2024-02-12 14:05:06 -08:00
Jeffrey Morgan	f76ca04f9e	update submodule to `099afc6` (#2468 )	2024-02-12 14:01:16 -08:00
Daniel Hiltgen	76b8728f0c	Merge pull request #2465 from dhiltgen/block_rocm_pre_9 Detect AMD GPU info via sysfs and block old cards	2024-02-12 12:41:43 -08:00
Jeffrey Morgan	1f9078d6ae	Check image filetype in api handlers (#2467 )	2024-02-12 11:16:20 -08:00
Daniel Hiltgen	6d84f07505	Detect AMD GPU info via sysfs and block old cards This wires up some new logic to start using sysfs to discover AMD GPU information and detects old cards we can't yet support so we can fallback to CPU mode.	2024-02-12 08:19:41 -08:00
Jeffrey Morgan	26b13fc33c	patch: always add token to cache_tokens (#2459 )	2024-02-12 08:10:16 -08:00
Jeffrey Morgan	1c8435ffa9	Update domain name references in docs and install script (#2435 )	2024-02-09 15:19:30 -08:00
Daniel Hiltgen	6680761596	Shutdown faster Make sure that when a shutdown signal comes, we shutdown quickly instead of waiting for a potentially long exchange to wrap up.	2024-02-08 22:22:50 -08:00
Jeffrey Morgan	42b797ed9c	Update openai.md	2024-02-08 15:03:23 -05:00
Jeffrey Morgan	336aa43f3c	Update openai.md	2024-02-08 12:48:28 -05:00
Daniel Hiltgen	69f392c9b7	Merge pull request #2403 from dhiltgen/handle_tmp_cleanup Ensure the libraries are present	2024-02-07 17:55:31 -08:00
Daniel Hiltgen	a1dfab43b9	Ensure the libraries are present When we store our libraries in a temp dir, a reaper might clean them when we are idle, so make sure to check for them before we reload.	2024-02-07 17:27:49 -08:00
Jeffrey Morgan	a0a199b108	Fix hanging issue when sending empty content (#2399 )	2024-02-07 19:30:33 -05:00
Jeffrey Morgan	ab0d37fde4	Update openai.md	2024-02-07 17:25:33 -05:00
Jeffrey Morgan	14e71350c8	Update openai.md	2024-02-07 17:25:24 -05:00
Jeffrey Morgan	453f572f83	Initial OpenAI `/v1/chat/completions` API compatibility (#2376 )	2024-02-07 17:24:29 -05:00
Daniel Hiltgen	c9dfa6e571	Merge pull request #2377 from dhiltgen/bump_llamacpp Bump llama.cpp to b2081	2024-02-07 12:04:38 -08:00
Michael Yang	3dcbcd367d	Merge pull request #2394 from ollama/mxyng/fix-error-response	2024-02-07 11:47:31 -08:00
Michael Yang	e805ac1d59	fix response on token error	2024-02-07 11:05:49 -08:00
Michael Yang	b9229ffca5	Merge pull request #2378 from ollama/mxyng/runners runners	2024-02-06 13:49:58 -08:00
Michael Yang	46c847c4ad	enable rocm builds	2024-02-06 13:36:13 -08:00
Michael Yang	92b1a21f79	use linux runners	2024-02-06 13:36:04 -08:00
Daniel Hiltgen	de76b95dd4	Bump llama.cpp to b2081	2024-02-06 12:06:43 -08:00
Michael Yang	59ec837ef6	Merge pull request #2374 from ollama/mxyng/rocm-builds disable rocm builds	2024-02-06 09:41:02 -08:00
Michael Yang	f06b99a461	disable rocm builds	2024-02-06 09:29:42 -08:00
Bruce MacDonald	128fce5495	docs: keep_alive (#2258 )	2024-02-06 11:00:05 -05:00
Daniel Hiltgen	27aa2d4a19	Merge pull request #1849 from mraiser/main Accomodate split cuda lib dir	2024-02-05 16:01:16 -08:00
Jeffrey Morgan	b9f91a0b36	Update import instructions to use convert and quantize tooling from llama.cpp submodule (#2247 )	2024-02-05 00:50:44 -05:00
Erik S	b538dc3858	Add llm-ollama plugin for Datasette's LLM CLI to README (#2340 ) Co-authored-by: Erik Sp <git@aschwa.com>	2024-02-03 15:40:50 -08:00
Jeffrey Morgan	f0e9496c85	Update api.md	2024-02-02 12:17:24 -08:00
Jeffrey Morgan	09a6f76f4c	fix error on `ollama run` with a non-existent model	2024-02-01 23:11:52 -08:00
Jeffrey Morgan	e135167484	Add multimodel support to `ollama run` in noninteractive mopde (#2317 )	2024-02-01 21:33:06 -08:00
Jeffrey Morgan	38296ab352	clear previous images when submitting an image to `ollama run` (#2316 )	2024-02-01 21:30:26 -08:00
Daniel Hiltgen	f43dea68d1	Merge pull request #2318 from dhiltgen/more_clean Harden generate patching model	2024-02-01 20:41:29 -08:00
Daniel Hiltgen	e1f50377f4	Harden generate patching model Only apply patches if we have any, and make sure to cleanup every file we patched at the end to leave the tree clean	2024-02-01 19:34:36 -08:00
Jeffrey Morgan	7913104527	Improvements to `ollama run` for multimodal models (#2300 )	2024-02-01 17:09:51 -08:00
Michael Yang	bfbf2f7cf7	Merge pull request #2296 from ollama/mxyng/img-tags append image tags to user content	2024-02-01 13:16:59 -08:00
Michael Yang	fe3cbd014f	Merge pull request #2298 from ollama/mxyng/debug-prompt structured debug prompt	2024-02-01 13:16:49 -08:00
Michael Yang	3d6f48507a	structured debug prompt	2024-02-01 11:56:28 -08:00
Michael Yang	f3761405c8	use image id	2024-02-01 11:52:42 -08:00
Michael Yang	e49dc9f3d8	fix tests	2024-02-01 11:48:11 -08:00
Michael Yang	d125510b4b	remove image tags	2024-02-01 11:32:51 -08:00
Russell Canfield	1ca386aa9e	Feature - Add Wingman Extension (#2313 )	2024-02-01 11:16:24 -08:00
Michael Yang	fb56988014	account for image projection in token count	2024-02-01 09:50:48 -08:00
Michael Yang	d046bee790	use llm.ImageData for chat	2024-01-31 19:18:25 -08:00
Jeffrey Morgan	f11bf0740b	use `llm.ImageData`	2024-01-31 19:13:48 -08:00
Michael Yang	8450bf66e6	trim images	2024-01-31 19:13:47 -08:00
Michael Yang	b4e11be8ef	append image tags to user content	2024-01-31 19:13:10 -08:00
Bruce MacDonald	a896079705	preserve last system message from modelfile (#2289 )	2024-01-31 21:45:01 -05:00
Michael Yang	583950c828	Merge pull request #2294 from ollama/mxyng/slog-source update slog handler options	2024-01-31 15:29:11 -08:00
Michael Yang	8ac08a0eec	update slog handler options - consistent format by using text handler for debug and non-debug - truncate source file to just the file name	2024-01-31 15:15:00 -08:00
Michael Yang	60f47be64c	Merge pull request #2284 from ollama/mxyng/parse-raw remove unnecessary parse raw	2024-01-31 09:40:48 -08:00
Daniel Hiltgen	6e56077ada	Merge pull request #2263 from dhiltgen/bump_llamacpp Bump llama.cpp to b1999	2024-01-31 08:39:41 -08:00
Hoang Nguyen	98ae9467bb	Added MindMac to Community Integrations -> Web & Desktop section (#1957 )	2024-01-31 07:48:37 -08:00
Richard Macarthy	b7a24af083	Add twinny vscode extension to Extensions and Plugins (#1950 )	2024-01-31 06:25:06 -08:00
Michael Yang	c8b1f2369e	remove unnecessary parse raw	2024-01-30 17:00:53 -08:00
Daniel Hiltgen	72b12c3be7	Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch.	2024-01-30 16:52:12 -08:00
Bruce MacDonald	0632dff3f8	trim chat prompt based on llm context size (#1963 )	2024-01-30 15:59:29 -05:00
Maximilian Weber	509e2dec8a	Update README.md (#2252 ) Added - [Ollama for R - rollama](https://github.com/JBGruber/rollama) in Libraries in README.md	2024-01-30 11:56:51 -08:00
Daniel Hiltgen	78a48de804	Merge pull request #2256 from dhiltgen/container_logs Add container hints for troubleshooting	2024-01-30 08:12:48 -08:00
Daniel Hiltgen	e7dbb00331	Add container hints for troubleshooting Some users are new to containers and unsure where the server logs go	2024-01-29 08:53:41 -08:00
Marc Raiser	c3f9538636	remove default.nix	2024-01-29 00:05:07 -05:00
Jeffrey Morgan	2e06ed01d5	remove unknown `CPPFLAGS` option	2024-01-28 17:51:23 -08:00
Daniel Hiltgen	4072b5879b	Merge pull request #2246 from dhiltgen/reject_cuda_without_avx Don't disable GPUs on arm without AVX	2024-01-28 16:26:55 -08:00
Daniel Hiltgen	15562e887d	Don't disable GPUs on arm without AVX AVX is an x86 feature, so ARM should be excluded from the check.	2024-01-28 15:22:38 -08:00
Jeffrey Morgan	f2245c7c77	print prompt with `OLLAMA_DEBUG=1` (#2245 )	2024-01-28 15:22:35 -08:00
Jeffrey Morgan	e4b9b72f2a	Do not repeat system prompt for chat templating (#2241 )	2024-01-28 14:15:56 -08:00
Daniel Hiltgen	311f8e0c3f	Merge pull request #2243 from dhiltgen/harden_zero_gpus Harden for zero detected GPUs	2024-01-28 13:30:44 -08:00
Daniel Hiltgen	f07f8b7a9e	Harden for zero detected GPUs At least with the ROCm libraries, its possible to have the library present with zero GPUs. This fix avoids a divide by zero bug in llm.go when we try to calculate GPU memory with zero GPUs.	2024-01-28 13:13:10 -08:00
mraiser	4c4c730a0a	Merge branch 'ollama:main' into main	2024-01-27 21:56:11 -05:00
Daniel Hiltgen	e02ecfb6c8	Merge pull request #2116 from dhiltgen/cc_50_80 Add support for CUDA 5.0 cards	2024-01-27 10:28:38 -08:00
Daniel Hiltgen	c8059b4dcf	Merge pull request #2224 from jaglinux/fix_rocm_get_version_message ROCm: Correct the response string in rocm_get_version function	2024-01-27 07:29:32 -08:00
Jagadish Krishnamoorthy	59d87127f5	Update gpu_info_rocm.c	2024-01-26 22:08:27 -08:00
Marc Raiser	6eb3cddcb6	To build on NixOS: nix-shell --run 'go generate ./... && go build .'	2024-01-25 10:17:22 -05:00
mraiser	a4564232a4	Update gen_linux.sh to find libcudart in separate directory	2024-01-25 09:49:35 -05:00
Daniel Hiltgen	a447a083f2	Add compute capability 5.0, 7.5, and 8.0	2024-01-20 14:24:05 -08:00
Daniel Hiltgen	681a914990	Add support for CUDA 5.2 cards	2024-01-20 10:48:43 -08:00