Compare commits


3 Commits

Author          SHA1        Message                                                         Date
Patrick Devine  9a483dc7b7  refactor to use client.List instead of walking the filesystem  2024-01-26 18:34:42 -08:00
Patrick Devine  366b38460f  fix linter                                                      2024-01-26 18:34:42 -08:00
Patrick Devine  021b1bdc4a  add --upgrade-all flag to refresh any stale models              2024-01-26 18:34:40 -08:00
25 changed files with 456 additions and 937 deletions

View File

@@ -200,21 +200,18 @@ brew install cmake go
```
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
### Running local builds
Next, start the server:
```
@@ -256,7 +253,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
## Community Integrations
### Web & Desktop
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
@@ -269,7 +265,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
- [MindMac](https://mindmac.app)
### Terminal
@@ -282,7 +278,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
### Database
@@ -309,7 +304,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
### Mobile
@@ -331,5 +326,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)

View File

@@ -183,11 +183,12 @@ type CopyRequest struct {
}
type PullRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
CurrentDigest string `json:"current_digest,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
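
The new `CurrentDigest` field lets a client report which manifest digest it already has, so the server can skip the pull when the registry still serves that digest. A minimal client-side sketch, assuming the `api` package from this repo (the model name and digest are placeholders):

```go
package main

import (
	"context"
	"fmt"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	req := &api.PullRequest{
		Model:         "llama2",           // placeholder model name
		CurrentDigest: "sha256:abc123...", // placeholder digest of the locally cached manifest
	}
	// The server streams progress responses; an unchanged digest ends the pull early.
	err = client.Pull(context.Background(), req, func(p api.ProgressResponse) error {
		fmt.Println(p.Status)
		return nil
	})
	if err != nil {
		panic(err)
	}
}
```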
@@ -241,6 +242,7 @@ type GenerateResponse struct {
type ModelDetails struct {
ParentModel string `json:"parent_model"`
Digest string `json:"digest"`
Format string `json:"format"`
Family string `json:"family"`
Families []string `json:"families"`
@@ -279,20 +281,85 @@ func (m *Metrics) Summary() {
var ErrInvalidOpts = fmt.Errorf("invalid options")
func (opts *Options) FromMap(m map[string]interface{}) error {
data, err := json.Marshal(m)
if err != nil {
return err
}
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
typeOpts := reflect.TypeOf(opts).Elem() // types of the fields in the options struct
err = json.Unmarshal(data, opts)
if err != nil {
// Custom error handling
if jsonErr, ok := err.(*json.UnmarshalTypeError); ok {
return fmt.Errorf("invalid type for option '%v': expected %v, got %v", jsonErr.Field, jsonErr.Type, jsonErr.Value)
// build map of json struct tags to their types
jsonOpts := make(map[string]reflect.StructField)
for _, field := range reflect.VisibleFields(typeOpts) {
jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
if jsonTag != "" {
jsonOpts[jsonTag] = field
}
return err
}
invalidOpts := []string{}
for key, val := range m {
if opt, ok := jsonOpts[key]; ok {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if val == nil {
continue
}
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
case int64:
field.SetInt(t)
case float64:
// when JSON unmarshals numbers, it uses float64, not int
field.SetInt(int64(t))
default:
return fmt.Errorf("option %q must be of type integer", key)
}
case reflect.Bool:
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
field.SetBool(val)
case reflect.Float32:
// JSON unmarshals to float64
val, ok := val.(float64)
if !ok {
return fmt.Errorf("option %q must be of type float32", key)
}
field.SetFloat(val)
case reflect.String:
val, ok := val.(string)
if !ok {
return fmt.Errorf("option %q must be of type string", key)
}
field.SetString(val)
case reflect.Slice:
// JSON unmarshals to []interface{}, not []string
val, ok := val.([]interface{})
if !ok {
return fmt.Errorf("option %q must be of type array", key)
}
// convert []interface{} to []string
slice := make([]string, len(val))
for i, item := range val {
str, ok := item.(string)
if !ok {
return fmt.Errorf("option %q must be of an array of strings", key)
}
slice[i] = str
}
field.Set(reflect.ValueOf(slice))
default:
return fmt.Errorf("unknown type loading config params: %v", field.Kind())
}
}
} else {
invalidOpts = append(invalidOpts, key)
}
}
if len(invalidOpts) > 0 {
return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
}
return nil
}
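
Since `encoding/json` decodes every JSON number into `float64`, the integer and float cases above convert from `float64` instead of asserting the target type directly. A small usage sketch of `FromMap`, assuming the usual `api.Options` field tags (`num_ctx`, `temperature`, `stop`):

```go
package main

import (
	"errors"
	"fmt"

	"github.com/jmorganca/ollama/api"
)

func main() {
	var opts api.Options
	err := opts.FromMap(map[string]interface{}{
		"temperature": 0.8,                   // float64 fills a float32 field
		"num_ctx":     float64(4096),         // JSON numbers arrive as float64 and are converted to int
		"stop":        []interface{}{"</s>"}, // []interface{} is converted to []string
		"bogus":       true,                  // unknown keys are collected, not silently dropped
	})
	fmt.Println(errors.Is(err, api.ErrInvalidOpts)) // true: "bogus" is not a known option
}
```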

View File

@@ -25,7 +25,6 @@ import (
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
"golang.org/x/exp/slices"
"golang.org/x/term"
"github.com/jmorganca/ollama/api"
@@ -147,68 +146,19 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
_, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompts = append([]string{string(in)}, prompts...)
opts.WordWrap = false
interactive = false
}
opts.Prompt = strings.Join(prompts, " ")
if len(prompts) > 0 {
interactive = false
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
}
return generateInteractive(cmd, opts)
return RunGenerate(cmd, args)
}
func PushHandler(cmd *cobra.Command, args []string) error {
@@ -407,6 +357,42 @@ func CopyHandler(cmd *cobra.Command, args []string) error {
}
func PullHandler(cmd *cobra.Command, args []string) error {
upgradeAll, err := cmd.Flags().GetBool("upgrade-all")
if err != nil {
return err
}
if !upgradeAll {
if len(args) == 0 {
return fmt.Errorf("no model specified to pull")
}
return pull(cmd, args[0], "")
}
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
models, err := client.List(cmd.Context())
if err != nil {
return err
}
for _, m := range (*models).Models {
err = pull(cmd, m.Name, "sha256:"+m.Digest)
if err != nil {
if strings.Contains(err.Error(), "file does not exist") {
fmt.Printf("model '%s' is no longer available\n", m.Name)
continue
}
return err
}
}
return nil
}
func pull(cmd *cobra.Command, name string, currentDigest string) error {
insecure, err := cmd.Flags().GetBool("insecure")
if err != nil {
return err
@@ -418,7 +404,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
}
p := progress.NewProgress(os.Stderr)
defer p.Stop()
defer p.StopWithoutClear()
bars := make(map[string]*progress.Bar)
@@ -452,7 +438,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
return nil
}
request := api.PullRequest{Name: args[0], Insecure: insecure}
request := api.PullRequest{Name: name, Insecure: insecure, CurrentDigest: currentDigest}
if err := client.Pull(cmd.Context(), &request, fn); err != nil {
return err
}
@@ -460,6 +446,51 @@ func PullHandler(cmd *cobra.Command, args []string) error {
return nil
}
func RunGenerate(cmd *cobra.Command, args []string) error {
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompts = append([]string{string(in)}, prompts...)
opts.WordWrap = false
interactive = false
}
opts.Prompt = strings.Join(prompts, " ")
if len(prompts) > 0 {
interactive = false
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
}
return generateInteractive(cmd, opts)
}
type generateContextKey string
type runOptions struct {
@@ -635,18 +666,10 @@ func generate(cmd *cobra.Command, opts runOptions) error {
return nil
}
if opts.MultiModal {
opts.Prompt, opts.Images, err = extractFileData(opts.Prompt)
if err != nil {
return err
}
}
request := api.GenerateRequest{
Model: opts.Model,
Prompt: opts.Prompt,
Context: generateContext,
Images: opts.Images,
Format: opts.Format,
System: opts.System,
Template: opts.Template,
@@ -897,12 +920,13 @@ func NewCLI() *cobra.Command {
pullCmd := &cobra.Command{
Use: "pull MODEL",
Short: "Pull a model from a registry",
Args: cobra.ExactArgs(1),
Args: cobra.RangeArgs(0, 1),
PreRunE: checkServerHeartbeat,
RunE: PullHandler,
}
pullCmd.Flags().Bool("insecure", false, "Use an insecure registry")
pullCmd.Flags().Bool("upgrade-all", false, "Upgrade all models if they're out of date")
pushCmd := &cobra.Command{
Use: "push MODEL",

View File

@@ -6,7 +6,6 @@ import (
"io"
"net/http"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
@@ -99,11 +98,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
if opts.MultiModal {
fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
}
fmt.Fprintln(os.Stderr, "")
}
@@ -213,7 +207,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch multiline {
case MultilineSystem:
opts.System = sb.String()
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
fmt.Println("Set system message.")
sb.Reset()
case MultilineTemplate:
@@ -233,6 +226,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(&sb)
multiline = MultilinePrompt
scanner.Prompt.UseAlt = true
break
}
case scanner.Pasting:
fmt.Fprintln(&sb, line)
@@ -355,13 +349,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if args[1] == "system" {
opts.System = sb.String()
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
fmt.Println("Set system message.")
sb.Reset()
} else if args[1] == "template" {
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
sb.Reset()
@@ -496,18 +487,29 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if err != nil {
return err
}
newMessage.Content = msg
// clear all previous images for better responses
// reset the context if we find another image
if len(images) > 0 {
for i := range opts.Messages {
opts.Messages[i].Images = nil
newMessage.Images = append(newMessage.Images, images...)
// reset the context for the new image
opts.Messages = []api.Message{}
} else {
if len(opts.Messages) > 1 {
newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
}
}
newMessage.Content = msg
newMessage.Images = images
if len(newMessage.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
sb.Reset()
continue
}
}
if opts.System != "" {
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
}
opts.Messages = append(opts.Messages, newMessage)
assistant, err := chat(cmd, opts)
@@ -601,10 +603,10 @@ func extractFileData(input string) (string, []api.ImageData, error) {
if os.IsNotExist(err) {
continue
}
fmt.Fprintf(os.Stderr, "Couldn't process image: %q\n", err)
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}

View File

@@ -542,7 +542,7 @@ curl http://localhost:11434/api/chat -d '{
"role": "user",
"content": "what is in this image?",
"images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF
169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
}
},
]
}'
```

View File

@@ -50,8 +50,7 @@ development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
setting the environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
libraries, and `CUDACXX` to the location of the nvcc compiler.
Then generate dependencies:

View File

@@ -15,7 +15,7 @@ FROM ./mistral-7b-v0.1.Q4_0.gguf
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
```
FROM ./mistral-7b-v0.1.Q4_0.gguf
FROM ./q4_0.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
@@ -37,69 +37,55 @@ ollama run example "What is your favourite condiment?"
## Importing (PyTorch & Safetensors)
> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress.
### Supported models
### Setup
Ollama supports a set of model architectures, with support for more coming soon:
First, clone the `ollama/ollama` repo:
- Llama & Mistral
- Falcon & RW
- BigCode
```
git clone git@github.com:ollama/ollama.git ollama
cd ollama
```
To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).
and then fetch its `llama.cpp` submodule:
```shell
git submodule init
git submodule update llm/llama.cpp
```
Next, install the Python dependencies:
```
python3 -m venv llm/llama.cpp/.venv
source llm/llama.cpp/.venv/bin/activate
pip install -r llm/llama.cpp/requirements.txt
```
Then build the `quantize` tool:
```
make -C llm/llama.cpp quantize
```
### Clone the HuggingFace repository (optional)
### Step 1: Clone the HuggingFace repository (optional)
If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.
Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository:
```
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
cd Mistral-7B-Instruct-v0.1
```
### Convert the model
### Step 2: Convert and quantize to a `.bin` file (optional, for PyTorch and Safetensors)
> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py`
If the model is in PyTorch or Safetensors format, a [Docker image](https://hub.docker.com/r/ollama/quantize) with the tooling required to convert and quantize models is available.
First, install [Docker](https://www.docker.com/get-started/).
Next, to convert and quantize your model, run:
```
python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin
docker run --rm -v .:/model ollama/quantize -q q4_0 /model
```
### Quantize the model
This will output two files into the directory:
```
llm/llama.cpp/quantize converted.bin quantized.bin q4_0
```
- `f16.bin`: the model converted to GGUF
- `q4_0.bin`: the model quantized to 4 bits (Ollama will use this file to create the Ollama model)
### Step 3: Write a `Modelfile`
Next, create a `Modelfile` for your model:
```
FROM quantized.bin
FROM ./q4_0.bin
```
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
```
FROM ./q4_0.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
@@ -163,3 +149,47 @@ The quantization options are as follows (from highest to lowest levels of
- `q6_K`
- `q8_0`
- `f16`
## Manually converting & quantizing models
### Prerequisites
Start by cloning the `llama.cpp` repo to your machine in another directory:
```
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
```
Next, install the Python dependencies:
```
pip install -r requirements.txt
```
Finally, build the `quantize` tool:
```
make quantize
```
### Convert the model
Run the correct conversion script for your model architecture:
```shell
# LlamaForCausalLM or MistralForCausalLM
python convert.py <path to model directory>
# FalconForCausalLM
python convert-falcon-hf-to-gguf.py <path to model directory>
# GPTBigCodeForCausalLM
python convert-starcoder-hf-to-gguf.py <path to model directory>
```
### Quantize the model
```
quantize <path to model dir>/ggml-model-f32.bin <path to model dir>/q4_0.bin q4_0
```

View File

@@ -12,13 +12,6 @@ On Linux systems with systemd, the logs can be found with this command:
journalctl -u ollama
```
When you run Ollama in a container, the logs go to stdout/stderr in the container:
```shell
docker logs <container-name>
```
(Use `docker ps` to find the container name)
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.

View File

@@ -30,8 +30,8 @@ type handles struct {
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
// With our current CUDA compile flags, 5.2 and older will not work properly
const CudaComputeMajorMin = 6
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
@@ -122,34 +122,34 @@ func GetGPUInfo() GpuInfo {
initGPUHandles()
}
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
// All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" && runtime.GOARCH == "amd64" {
if cpuVariant == "" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
if gpuHandles.cuda != nil && cpuVariant != "" {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 {
} else {
// Verify minimum compute capability
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
} else if cc.major >= CudaComputeMajorMin {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
} else if gpuHandles.rocm != nil && cpuVariant != "" {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
@@ -157,7 +157,7 @@ func GetGPUInfo() GpuInfo {
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
} else {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
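
The two versions of this check encode the minimum compute capability differently: one as a (major, minor) pair compared lexicographically, the other as a major-only threshold. A standalone sketch of the pair comparison used with `CudaComputeMin`:

```go
package main

import "fmt"

// meetsMinimum reports whether a detected compute capability (major, minor)
// is at least the required minimum, mirroring the CudaComputeMin comparison.
func meetsMinimum(major, minor, minMajor, minMinor int) bool {
	return major > minMajor || (major == minMajor && minor >= minMinor)
}

func main() {
	fmt.Println(meetsMinimum(5, 2, 5, 0)) // true: 5.2 satisfies a 5.0 minimum
	fmt.Println(meetsMinimum(3, 7, 5, 0)) // false: 3.7 is too old
}
```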

View File

@@ -178,7 +178,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("rocm handle not initialized");
resp->str = strdup("nvml handle not initialized");
resp->status = 1;
return;
}
@@ -195,4 +195,4 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
resp->str = strdup(buf);
}
#endif // __APPLE__
#endif // __APPLE__

View File

@@ -4,7 +4,7 @@ package llm
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
@@ -161,10 +161,13 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
if len(predict.Images) > 0 {
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
slog.Info(fmt.Sprintf("loaded %d images", len(imageData)))
request := map[string]any{
"prompt": predict.Prompt,
@@ -186,7 +189,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": predict.Images,
"image_data": imageData,
"cache_prompt": true,
}

View File

@@ -26,13 +26,13 @@
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::atomic<bool> ext_server_running(false);
std::thread ext_server_thread;
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
@@ -122,23 +122,18 @@ void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
ext_server_running = true;
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_all_tasks_finished(std::bind(
&llama_server_context::run_on_all_tasks_finished, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
while (ext_server_running.load()) {
if (!llama->update_slots()) {
LOG_TEE(
"unexpected error in llama server update_slots - exiting main "
"loop\n");
break;
}
}
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
@@ -151,10 +146,13 @@ void llama_server_start() {
void llama_server_stop() {
assert(llama != NULL);
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
// TODO - too verbose, remove once things are solid
LOG_TEE("requesting llama server shutdown\n");
ext_server_running = false;
// unblocks the update_slots() loop so it can clean up and exit
llama->request_cancel(0);
ext_server_thread.join();
delete llama;
llama = NULL;
@@ -167,9 +165,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
resp->msg[0] = '\0';
try {
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
resp->id = llama->request_completion(data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
@@ -187,22 +183,16 @@ void llama_server_completion_next_result(const int task_id,
resp->json_resp = NULL;
std::string result_json;
try {
task_result result = llama->queue_results.recv(task_id);
task_result result = llama->next_result(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
}
} catch (std::exception &e) {
resp->error = true;
@@ -233,7 +223,6 @@ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -318,15 +307,13 @@ void llama_server_embedding(const char *json_req, char **json_resp,
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
task_result result = llama->queue_results.recv(task_id);
const int task_id = llama->request_completion(
{{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
task_result result = llama->next_result(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());

View File

@@ -39,9 +39,6 @@ init_vars() {
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
git_module_setup() {
@@ -65,17 +62,15 @@ apply_patches() {
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
@@ -114,12 +109,4 @@ compress_libs() {
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
if [ -n "$(ls -A ../patches/*.diff)" ]; then
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
fi
}

View File

@@ -128,11 +128,6 @@ if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
@@ -140,7 +135,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
@@ -156,8 +151,6 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi

View File

@@ -25,11 +25,6 @@ function init_vars {
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
}
function git_module_setup {
@@ -156,7 +151,7 @@ if ($null -ne $script:CUDA_LIB_DIR) {
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
build
install
cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"

View File

@@ -62,7 +62,7 @@ const maxRetries = 3
type PredictOpts struct {
Prompt string
Format string
Images []ImageData
Images []api.ImageData
Options api.Options
}

View File

@@ -1,8 +1,8 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a48582ad..9fffffd8 100644
index 0462fbd2..4fa7b57f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1564,12 +1564,6 @@ struct llama_server_context
@@ -1857,12 +1857,6 @@ struct llama_server_context
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
}
@@ -15,8 +15,8 @@ index a48582ad..9fffffd8 100644
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
@@ -1581,6 +1575,12 @@ struct llama_server_context
}
@@ -1870,6 +1864,12 @@ struct llama_server_context
slot.n_past--;
}
+ LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);

View File

@@ -1,90 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 11dd82c3..311495a8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
+#include <signal.h>
using json = nlohmann::json;
@@ -2394,6 +2395,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3014,8 +3018,14 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
- llama.queue_tasks.start_loop();
+ shutdown_handler = [&](int) {
+ llama.queue_tasks.terminate();
+ };
+ signal(SIGTERM, signal_handler);
+ signal(SIGINT, signal_handler);
+ llama.queue_tasks.start_loop();
+ svr.stop();
t.join();
llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 70cce072..2acb1eab 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -6,6 +6,7 @@
#include <mutex>
#include <condition_variable>
#include <unordered_map>
+#include <atomic>
#include "json.hpp"
@@ -190,6 +191,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
+ std::atomic<bool> running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -248,9 +250,15 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
- // Start the main loop. This call is blocking
- [[noreturn]]
+ // end the start_loop routine
+ void terminate() {
+ running = false;
+ condition_tasks.notify_all();
+ }
+
+ // Start the main loop.
void start_loop() {
+ running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -294,8 +302,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
+ if (!running.load()) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
condition_tasks.wait(lock, [&]{
- return !queue_tasks.empty();
+ return (!queue_tasks.empty() || !running.load());
});
}
}
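
The patch above gives `start_loop` a clean exit: `terminate()` clears a `running` flag and notifies the condition variable, and the wait predicate checks the flag alongside the queue so a blocked loop can wake up and return. A hedged Go analogue of that wake-and-check pattern, using `sync.Cond` (all names are illustrative):

```go
package main

import (
	"fmt"
	"sync"
)

type taskQueue struct {
	mu      sync.Mutex
	cond    *sync.Cond
	tasks   []string
	running bool
}

func newTaskQueue() *taskQueue {
	q := &taskQueue{running: true}
	q.cond = sync.NewCond(&q.mu)
	return q
}

func (q *taskQueue) add(t string) {
	q.mu.Lock()
	q.tasks = append(q.tasks, t)
	q.mu.Unlock()
	q.cond.Signal()
}

// terminate mirrors the patch: clear the flag, then wake the loop so it
// observes the flag instead of blocking forever on an empty queue.
func (q *taskQueue) terminate() {
	q.mu.Lock()
	q.running = false
	q.mu.Unlock()
	q.cond.Broadcast()
}

func (q *taskQueue) loop() {
	for {
		q.mu.Lock()
		// wait for either a new task or a shutdown request
		for len(q.tasks) == 0 && q.running {
			q.cond.Wait()
		}
		if len(q.tasks) == 0 && !q.running {
			q.mu.Unlock()
			return // the Go equivalent of "ending start_loop"
		}
		t := q.tasks[0]
		q.tasks = q.tasks[1:]
		q.mu.Unlock()
		fmt.Println("processed", t)
	}
}

func main() {
	q := newTaskQueue()
	done := make(chan struct{})
	go func() { q.loop(); close(done) }()
	q.add("hello")
	q.terminate()
	<-done
}
```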

View File

@@ -52,6 +52,10 @@ func (p *Progress) Stop() bool {
return stopped
}
func (p *Progress) StopWithoutClear() bool {
return p.stop()
}
func (p *Progress) StopAndClear() bool {
fmt.Fprint(p.w, "\033[?25l")
defer fmt.Fprint(p.w, "\033[?25h")

View File

@@ -147,7 +147,12 @@ func (s SignatureData) Bytes() []byte {
// SignData takes a SignatureData object and signs it with a raw private key
func (s SignatureData) Sign(rawKey []byte) (string, error) {
signer, err := ssh.ParsePrivateKey(rawKey)
privateKey, err := ssh.ParseRawPrivateKey(rawKey)
if err != nil {
return "", err
}
signer, err := ssh.NewSignerFromKey(privateKey)
if err != nil {
return "", err
}
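
Both paths use `golang.org/x/crypto/ssh`: `ParsePrivateKey` returns an `ssh.Signer` in one step, while the replacement parses the raw key first and wraps it with `NewSignerFromKey`. A minimal sketch of the two-step path (the key path and payload are placeholders):

```go
package main

import (
	"crypto/rand"
	"fmt"
	"os"

	"golang.org/x/crypto/ssh"
)

func main() {
	pemBytes, err := os.ReadFile("/path/to/id_ed25519") // placeholder key path
	if err != nil {
		panic(err)
	}
	privateKey, err := ssh.ParseRawPrivateKey(pemBytes) // a raw crypto private key, not yet a Signer
	if err != nil {
		panic(err)
	}
	signer, err := ssh.NewSignerFromKey(privateKey) // wrap the raw key in an ssh.Signer
	if err != nil {
		panic(err)
	}
	sig, err := signer.Sign(rand.Reader, []byte("payload to sign"))
	if err != nil {
		panic(err)
	}
	fmt.Println(sig.Format)
}
```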

View File

@@ -63,7 +63,6 @@ type PromptVars struct {
Prompt string
Response string
First bool
Images []llm.ImageData
}
// extractParts extracts the parts of the template before and after the {{.Response}} node.
@@ -120,6 +119,10 @@ func Prompt(promptTemplate string, p PromptVars) (string, error) {
// PreResponsePrompt returns the prompt before the response tag
func (m *Model) PreResponsePrompt(p PromptVars) (string, error) {
if p.System == "" {
// use the default system prompt for this model if one is not specified
p.System = m.System
}
pre, _, err := extractParts(m.Template)
if err != nil {
return "", err
@@ -147,68 +150,62 @@ func (m *Model) PostResponseTemplate(p PromptVars) (string, error) {
return Prompt(post, p)
}
type ChatHistory struct {
Prompts []PromptVars
LastSystem string
}
// ChatPrompts returns a list of formatted chat prompts from a list of messages
func (m *Model) ChatPrompts(msgs []api.Message) (*ChatHistory, error) {
func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error) {
// build the prompt from the list of messages
lastSystem := m.System
var prompt strings.Builder
var currentImages []api.ImageData
currentVars := PromptVars{
First: true,
System: m.System,
}
prompts := []PromptVars{}
var images []llm.ImageData
writePrompt := func() error {
p, err := Prompt(m.Template, currentVars)
if err != nil {
return err
}
prompt.WriteString(p)
currentVars = PromptVars{}
return nil
}
for _, msg := range msgs {
switch strings.ToLower(msg.Role) {
case "system":
// if this is the first message it overrides the system prompt in the modelfile
if !currentVars.First && currentVars.System != "" {
prompts = append(prompts, currentVars)
currentVars = PromptVars{}
if currentVars.System != "" {
if err := writePrompt(); err != nil {
return "", nil, err
}
}
currentVars.System = msg.Content
lastSystem = msg.Content
case "user":
if currentVars.Prompt != "" {
prompts = append(prompts, currentVars)
currentVars = PromptVars{}
if err := writePrompt(); err != nil {
return "", nil, err
}
}
currentVars.Prompt = msg.Content
for i := range msg.Images {
id := len(images) + i
currentVars.Prompt += fmt.Sprintf(" [img-%d]", id)
currentVars.Images = append(currentVars.Images, llm.ImageData{
ID: id,
Data: msg.Images[i],
})
}
images = append(images, currentVars.Images...)
currentImages = msg.Images
case "assistant":
currentVars.Response = msg.Content
prompts = append(prompts, currentVars)
currentVars = PromptVars{}
if err := writePrompt(); err != nil {
return "", nil, err
}
default:
return nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
return "", nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
}
}
// Append the last set of vars if they are non-empty
if currentVars.Prompt != "" || currentVars.System != "" {
prompts = append(prompts, currentVars)
p, err := m.PreResponsePrompt(currentVars)
if err != nil {
return "", nil, fmt.Errorf("pre-response template: %w", err)
}
prompt.WriteString(p)
}
return &ChatHistory{
Prompts: prompts,
LastSystem: lastSystem,
}, nil
return prompt.String(), currentImages, nil
}
type ManifestV2 struct {
@@ -474,7 +471,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
switch {
case errors.Is(err, os.ErrNotExist):
fn(api.ProgressResponse{Status: "pulling model"})
if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil {
if err := PullModel(ctx, c.Args, "", &RegistryOptions{}, fn); err != nil {
return err
}
@@ -1044,7 +1041,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return nil
}
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
func PullModel(ctx context.Context, name, currentDigest string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
var manifest *ManifestV2
@@ -1072,13 +1069,23 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return fmt.Errorf("insecure protocol http")
}
fn(api.ProgressResponse{Status: "pulling manifest"})
if currentDigest == "" {
fn(api.ProgressResponse{Status: "pulling manifest"})
}
manifest, err = pullModelManifest(ctx, mp, regOpts)
manifest, err = pullModelManifest(ctx, mp, currentDigest, regOpts)
if err != nil {
return fmt.Errorf("pull model manifest: %s", err)
}
if currentDigest != "" {
if manifest == nil {
// we already have the model
return nil
}
fn(api.ProgressResponse{Status: "upgrading " + mp.GetShortTagname()})
}
var layers []*Layer
layers = append(layers, manifest.Layers...)
layers = append(layers, manifest.Config)
@@ -1150,17 +1157,27 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return nil
}
func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *RegistryOptions) (*ManifestV2, error) {
func pullModelManifest(ctx context.Context, mp ModelPath, currentDigest string, regOpts *RegistryOptions) (*ManifestV2, error) {
requestURL := mp.BaseURL().JoinPath("v2", mp.GetNamespaceRepository(), "manifests", mp.Tag)
headers := make(http.Header)
headers.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
if currentDigest != "" {
headers.Set("If-None-Match", currentDigest)
}
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, regOpts)
if err != nil {
return nil, err
}
defer resp.Body.Close()
// todo we can potentially read the manifest locally and return it here
if resp.StatusCode == http.StatusNotModified {
return nil, nil
}
var m *ManifestV2
if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
return nil, err
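
Sending the local digest as `If-None-Match` makes the manifest fetch conditional: a registry that still serves that digest answers `304 Not Modified`, which the caller above maps to "nothing to pull". A generic sketch of the same pattern with plain `net/http` (the URL and digest are placeholders):

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	url := "https://registry.example.com/v2/library/llama2/manifests/latest" // placeholder
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
	req.Header.Set("If-None-Match", "sha256:abc123...") // placeholder digest of the cached manifest
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotModified {
		fmt.Println("manifest unchanged; nothing to pull")
		return
	}
	fmt.Println("manifest changed; decode resp.Body and fetch layers")
}
```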

View File

@@ -1,7 +1,6 @@
package server
import (
"bytes"
"strings"
"testing"
@@ -234,58 +233,17 @@ func TestModel_PreResponsePrompt_PostResponsePrompt(t *testing.T) {
}
}
func chatHistoryEqual(a, b ChatHistory) bool {
if len(a.Prompts) != len(b.Prompts) {
return false
}
for i, v := range a.Prompts {
if v.First != b.Prompts[i].First {
return false
}
if v.Response != b.Prompts[i].Response {
return false
}
if v.Prompt != b.Prompts[i].Prompt {
return false
}
if v.System != b.Prompts[i].System {
return false
}
if len(v.Images) != len(b.Prompts[i].Images) {
return false
}
for j, img := range v.Images {
if img.ID != b.Prompts[i].Images[j].ID {
return false
}
if !bytes.Equal(img.Data, b.Prompts[i].Images[j].Data) {
return false
}
}
}
return a.LastSystem == b.LastSystem
}
func TestChat(t *testing.T) {
tests := []struct {
name string
model Model
msgs []api.Message
want ChatHistory
wantErr string
name string
template string
msgs []api.Message
want string
wantErr string
}{
{
name: "Single Message",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
},
name: "Single Message",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "system",
@@ -296,22 +254,34 @@ func TestChat(t *testing.T) {
Content: "What are the potion ingredients?",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
First: true,
},
},
LastSystem: "You are a Wizard.",
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
},
{
name: "Message History",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
name: "First Message",
template: "[INST] {{if .First}}Hello!{{end}} {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "system",
Content: "You are a Wizard.",
},
{
Role: "user",
Content: "What are the potion ingredients?",
},
{
Role: "assistant",
Content: "eye of newt",
},
{
Role: "user",
Content: "Anything else?",
},
},
want: "[INST] Hello! You are a Wizard. What are the potion ingredients? [/INST]eye of newt[INST] Anything else? [/INST]",
},
{
name: "Message History",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "system",
@@ -330,85 +300,18 @@ func TestChat(t *testing.T) {
Content: "Anything else?",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
Response: "sugar",
First: true,
},
{
Prompt: "Anything else?",
},
},
LastSystem: "You are a Wizard.",
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]sugar[INST] Anything else? [/INST]",
},
{
name: "Assistant Only",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
},
name: "Assistant Only",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
msgs: []api.Message{
{
Role: "assistant",
Content: "everything nice",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
Response: "everything nice",
First: true,
},
},
},
},
{
name: "Last system message is preserved from modelfile",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
System: "You are Mojo Jojo.",
},
msgs: []api.Message{
{
Role: "user",
Content: "hi",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are Mojo Jojo.",
Prompt: "hi",
First: true,
},
},
LastSystem: "You are Mojo Jojo.",
},
},
{
name: "Last system message is preserved from messages",
model: Model{
Template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
System: "You are Mojo Jojo.",
},
msgs: []api.Message{
{
Role: "system",
Content: "You are Professor Utonium.",
},
},
want: ChatHistory{
Prompts: []PromptVars{
{
System: "You are Professor Utonium.",
First: true,
},
},
LastSystem: "You are Professor Utonium.",
},
want: "[INST] [/INST]everything nice",
},
{
name: "Invalid Role",
@@ -423,8 +326,11 @@ func TestChat(t *testing.T) {
}
for _, tt := range tests {
m := Model{
Template: tt.template,
}
t.Run(tt.name, func(t *testing.T) {
got, err := tt.model.ChatPrompts(tt.msgs)
got, _, err := m.ChatPrompt(tt.msgs)
if tt.wantErr != "" {
if err == nil {
t.Errorf("ChatPrompt() expected error, got nil")
@@ -432,10 +338,9 @@ func TestChat(t *testing.T) {
if !strings.Contains(err.Error(), tt.wantErr) {
t.Errorf("ChatPrompt() error = %v, wantErr %v", err, tt.wantErr)
}
return
}
if !chatHistoryEqual(*got, tt.want) {
t.Errorf("ChatPrompt() got = %#v, want %#v", got, tt.want)
if got != tt.want {
t.Errorf("ChatPrompt() got = %v, want %v", got, tt.want)
}
})
}

View File

@@ -178,7 +178,11 @@ func GenerateHandler(c *gin.Context) {
opts, err := modelOptions(model, req.Options)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
if errors.Is(err, api.ErrInvalidOpts) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -235,15 +239,6 @@ func GenerateHandler(c *gin.Context) {
Prompt: req.Prompt,
First: len(req.Context) == 0,
}
if promptVars.System == "" {
promptVars.System = model.System
}
for i := range req.Images {
promptVars.Prompt += fmt.Sprintf(" [img-%d]", i)
}
p, err := model.PreResponsePrompt(promptVars)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -253,8 +248,6 @@ func GenerateHandler(c *gin.Context) {
prompt = rebuild.String()
}
slog.Debug("generate handler", "prompt", prompt)
ch := make(chan any)
var generated strings.Builder
go func() {
@@ -308,19 +301,11 @@ func GenerateHandler(c *gin.Context) {
ch <- resp
}
var images []llm.ImageData
for i := range req.Images {
images = append(images, llm.ImageData{
ID: i,
Data: req.Images[i],
})
}
// Start prediction
predictReq := llm.PredictOpts{
Prompt: prompt,
Format: req.Format,
Images: images,
Images: req.Images,
Options: opts,
}
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
@@ -392,7 +377,11 @@ func EmbeddingHandler(c *gin.Context) {
opts, err := modelOptions(model, req.Options)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
if errors.Is(err, api.ErrInvalidOpts) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -462,7 +451,7 @@ func PullModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
if err := PullModel(ctx, model, regOpts, fn); err != nil {
if err := PullModel(ctx, model, req.CurrentDigest, regOpts, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -684,6 +673,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
modelDetails := api.ModelDetails{
ParentModel: model.ParentModel,
Digest: "sha256:" + model.Digest,
Format: model.Config.ModelFormat,
Family: model.Config.ModelFamily,
Families: model.Config.ModelFamilies,
@@ -942,26 +932,13 @@ func (s *Server) GenerateRoutes() http.Handler {
}
func Serve(ln net.Listener) error {
level := slog.LevelInfo
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
level = slog.LevelDebug
var programLevel = new(slog.LevelVar)
h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: programLevel, AddSource: true})
slog.SetDefault(slog.New(h))
programLevel.Set(slog.LevelDebug)
slog.Debug("Debug logging enabled")
}
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})
slog.SetDefault(slog.New(handler))
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
@@ -1104,7 +1081,11 @@ func ChatHandler(c *gin.Context) {
opts, err := modelOptions(model, req.Options)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
if errors.Is(err, api.ErrInvalidOpts) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -1134,20 +1115,12 @@ func ChatHandler(c *gin.Context) {
checkpointLoaded := time.Now()
chat, err := model.ChatPrompts(req.Messages)
prompt, images, err := model.ChatPrompt(req.Messages)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
prompt, images, err := trimmedPrompt(c.Request.Context(), chat, model)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
slog.Debug("chat handler", "prompt", prompt)
ch := make(chan any)
go func() {
@@ -1221,115 +1194,3 @@ func ChatHandler(c *gin.Context) {
streamResponse(c, ch)
}
// promptInfo stores the variables used to template a prompt, and the token length of the resulting template for some model
type promptInfo struct {
vars PromptVars
tokenLen int
}
// trimmedPrompt builds a prompt to send to a running model. It ensures the prompt fits within the max context length,
// while preserving the most recent system message.
func trimmedPrompt(ctx context.Context, chat *ChatHistory, model *Model) (string, []llm.ImageData, error) {
if len(chat.Prompts) == 0 {
return "", nil, nil
}
var promptsToAdd []promptInfo
var totalTokenLength int
var systemPromptIncluded bool
var images []llm.ImageData
// reverse iterate through the prompts to build the prompt string in a way that fits the max context length
for i := len(chat.Prompts) - 1; i >= 0; i-- {
prompt := chat.Prompts[i]
promptText, err := promptString(model, prompt, i == len(chat.Prompts)-1)
if err != nil {
return "", nil, err
}
encodedTokens, err := loaded.runner.Encode(ctx, promptText)
if err != nil {
return "", nil, err
}
if totalTokenLength+len(encodedTokens) > loaded.NumCtx && i != len(chat.Prompts)-1 {
break // reached max context length, stop adding more prompts
}
for j := range prompt.Images {
if totalTokenLength+768 > loaded.NumCtx {
// this decreases the token length but overestimating is fine
prompt.Prompt = strings.ReplaceAll(prompt.Prompt, fmt.Sprintf(" [img-%d]", prompt.Images[j].ID), "")
continue
}
totalTokenLength += 768
images = append(images, prompt.Images[j])
}
totalTokenLength += len(encodedTokens)
systemPromptIncluded = systemPromptIncluded || prompt.System != ""
promptsToAdd = append(promptsToAdd, promptInfo{vars: prompt, tokenLen: len(encodedTokens)})
}
// ensure the system prompt is included, if not already
if chat.LastSystem != "" && !systemPromptIncluded {
var err error
promptsToAdd, err = includeSystemPrompt(ctx, chat.LastSystem, totalTokenLength, promptsToAdd)
if err != nil {
return "", nil, err
}
}
promptsToAdd[len(promptsToAdd)-1].vars.First = true
// construct the final prompt string from the prompts which fit within the context window
var result string
for i, prompt := range promptsToAdd {
promptText, err := promptString(model, prompt.vars, i == 0)
if err != nil {
return "", nil, err
}
result = promptText + result
}
return result, images, nil
}
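trimmedPrompt (removed here in favour of model.ChatPrompt) walks the history newest-first, stops once the encoded length would exceed NumCtx, always keeps the most recent turn, and budgets a flat 768 tokens per image. A stripped-down sketch of that budgeting idea, using word counts in place of the runner's Encode call:

```
package main

import (
	"fmt"
	"strings"
)

// truncate keeps the newest messages whose combined "token" counts fit the
// budget, mirroring the reverse iteration in trimmedPrompt. Words stand in
// for tokens here; the real code asks the loaded runner to encode each turn.
func truncate(messages []string, budget int) []string {
	total, keep := 0, 0
	for i := len(messages) - 1; i >= 0; i-- {
		n := len(strings.Fields(messages[i]))
		// the most recent message is always kept, even if it alone exceeds the budget
		if total+n > budget && i != len(messages)-1 {
			break
		}
		total += n
		keep++
	}
	return messages[len(messages)-keep:]
}

func main() {
	history := []string{
		"What are the potion ingredients?",
		"Anything else?",
		"... and?",
	}
	fmt.Println(truncate(history, 4)) // a 4-"token" budget keeps only the two newest turns
}
```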
// promptString applies the model template to the prompt
func promptString(model *Model, vars PromptVars, isMostRecent bool) (string, error) {
if isMostRecent {
p, err := model.PreResponsePrompt(vars)
if err != nil {
return "", fmt.Errorf("pre-response template: %w", err)
}
return p, nil
}
p, err := Prompt(model.Template, vars)
if err != nil {
return "", err
}
return p, nil
}
// includeSystemPrompt adjusts the prompts to include the system prompt.
func includeSystemPrompt(ctx context.Context, systemPrompt string, totalTokenLength int, promptsToAdd []promptInfo) ([]promptInfo, error) {
systemTokens, err := loaded.runner.Encode(ctx, systemPrompt)
if err != nil {
return nil, err
}
for i := len(promptsToAdd) - 1; i >= 0; i-- {
if totalTokenLength+len(systemTokens) <= loaded.NumCtx {
promptsToAdd[i].vars.System = systemPrompt
return promptsToAdd[:i+1], nil
}
totalTokenLength -= promptsToAdd[i].tokenLen
}
// if we get here, the system prompt did not fit anywhere, so return only the most recent prompt with the system message set
recent := promptsToAdd[len(promptsToAdd)-1]
recent.vars.System = systemPrompt
return []promptInfo{recent}, nil
}
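includeSystemPrompt covers the case where truncation dropped the turn carrying the system message: it keeps shedding the oldest retained turns until the system prompt fits, and otherwise keeps a single turn with the system message attached. A simplified sketch of that shedding logic over plain token counts (ordered oldest-first for readability):

```
package main

import "fmt"

// turn is one retained chat turn and its token count.
type turn struct{ tokens int }

// fitSystem drops the oldest turns until the system prompt's sys tokens fit
// inside numCtx; if it never fits, only the newest turn is kept.
func fitSystem(turns []turn, total, sys, numCtx int) []turn {
	for len(turns) > 1 && total+sys > numCtx {
		total -= turns[0].tokens
		turns = turns[1:]
	}
	return turns
}

func main() {
	turns := []turn{{tokens: 40}, {tokens: 30}, {tokens: 20}}
	kept := fitSystem(turns, 90, 25, 100)
	fmt.Println(len(kept)) // 2: dropping the oldest 40-token turn makes room for the system prompt
}
```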

View File

@@ -16,7 +16,6 @@ import (
"github.com/stretchr/testify/assert"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/version"
)
@@ -240,258 +239,3 @@ func Test_Routes(t *testing.T) {
}
}
func Test_ChatPrompt(t *testing.T) {
tests := []struct {
name string
template string
chat *ChatHistory
numCtx int
runner MockLLM
want string
wantErr string
}{
{
name: "Single Message",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
First: true,
},
},
LastSystem: "You are a Wizard.",
},
numCtx: 1,
runner: MockLLM{
encoding: []int{1}, // fit the ctxLen
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
},
{
name: "First Message",
template: "[INST] {{if .First}}Hello!{{end}} {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
Response: "eye of newt",
First: true,
},
{
Prompt: "Anything else?",
},
},
LastSystem: "You are a Wizard.",
},
numCtx: 2,
runner: MockLLM{
encoding: []int{1}, // fit the ctxLen
},
want: "[INST] Hello! You are a Wizard. What are the potion ingredients? [/INST]eye of newt[INST] Anything else? [/INST]",
},
{
name: "Message History",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
System: "You are a Wizard.",
Prompt: "What are the potion ingredients?",
Response: "sugar",
First: true,
},
{
Prompt: "Anything else?",
},
},
LastSystem: "You are a Wizard.",
},
numCtx: 4,
runner: MockLLM{
encoding: []int{1}, // fit the ctxLen, 1 for each message
},
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]sugar[INST] Anything else? [/INST]",
},
{
name: "Assistant Only",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Response: "everything nice",
First: true,
},
},
},
numCtx: 1,
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] [/INST]everything nice",
},
{
name: "Message History Truncated, No System",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What are the potion ingredients?",
Response: "sugar",
First: true,
},
{
Prompt: "Anything else?",
Response: "spice",
},
{
Prompt: "... and?",
},
},
},
numCtx: 2, // room for the most recent message plus one turn of history
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] Anything else? [/INST]spice[INST] ... and? [/INST]",
},
{
name: "System is Preserved when Truncated",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What are the magic words?",
Response: "abracadabra",
},
{
Prompt: "What is the spell for invisibility?",
},
},
LastSystem: "You are a wizard.",
},
numCtx: 2,
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] You are a wizard. What is the spell for invisibility? [/INST]",
},
{
name: "System is Preserved when Length Exceeded",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What are the magic words?",
Response: "abracadabra",
},
{
Prompt: "What is the spell for invisibility?",
},
},
LastSystem: "You are a wizard.",
},
numCtx: 1,
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] You are a wizard. What is the spell for invisibility? [/INST]",
},
{
name: "First is Preserved when Truncated",
template: "[INST] {{ if .First }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
// first message omitted for test
{
Prompt: "Do you have a magic hat?",
Response: "Of course.",
},
{
Prompt: "What is the spell for invisibility?",
},
},
LastSystem: "You are a wizard.",
},
numCtx: 3, // two most recent messages and room for system message
runner: MockLLM{
encoding: []int{1},
},
want: "[INST] You are a wizard. Do you have a magic hat? [/INST]Of course.[INST] What is the spell for invisibility? [/INST]",
},
{
name: "Most recent message is returned when longer than ctxLen",
template: "[INST] {{ .Prompt }} [/INST]",
chat: &ChatHistory{
Prompts: []PromptVars{
{
Prompt: "What is the spell for invisibility?",
First: true,
},
},
},
numCtx: 1, // smaller than the 2-token encoding of the only message
runner: MockLLM{
encoding: []int{1, 2},
},
want: "[INST] What is the spell for invisibility? [/INST]",
},
}
for _, testCase := range tests {
tt := testCase
m := &Model{
Template: tt.template,
}
t.Run(tt.name, func(t *testing.T) {
loaded.runner = &tt.runner
loaded.Options = &api.Options{
Runner: api.Runner{
NumCtx: tt.numCtx,
},
}
// TODO: add tests for trimming images
got, _, err := trimmedPrompt(context.Background(), tt.chat, m)
if tt.wantErr != "" {
if err == nil {
t.Errorf("ChatPrompt() expected error, got nil")
}
if !strings.Contains(err.Error(), tt.wantErr) {
t.Errorf("ChatPrompt() error = %v, wantErr %v", err, tt.wantErr)
}
}
if got != tt.want {
t.Errorf("ChatPrompt() got = %v, want %v", got, tt.want)
}
})
}
}
type MockLLM struct {
encoding []int
}
func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
return nil
}
func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
return llm.encoding, nil
}
func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
return "", nil
}
func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
return []float64{}, nil
}
func (llm *MockLLM) Close() {
// do nothing
}
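MockLLM only needs to satisfy whatever interface loaded.runner expects. If that interface is llm.LLM — an assumption, since its definition is not part of this diff — a compile-time assertion in the test file would catch signature drift:

```
// hypothetical compile-time check; assumes the runner interface is llm.LLM
var _ llm.LLM = (*MockLLM)(nil)
```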