restore model load duration on generate response (#1524 )

* restore model load duration on generate response - set model load duration on generate and chat done response - calculate createAt time when response created * remove checkpoints predict opts * Update routes.go
Update runner to support mixtral and mixture of experts (MoE) (#1475 )
2023-12-14 12:15:50 -05:00 · 2023-12-13 17:15:10 -05:00 · 2023-12-13 14:42:30 -05:00 · 2023-12-13 14:38:47 -05:00 · 2023-12-13 13:59:33 -05:00 · 2023-12-13 11:21:23 -05:00
18 changed files with 582 additions and 298 deletions
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ Here are some example open-source models that can be downloaded:
 | Llama 2 70B        | 70B        | 39GB  | `ollama run llama2:70b`        |
 | Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
 | Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
+| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |

 > Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.

@@ -104,7 +105,7 @@ FROM llama2
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1

-# set the system prompt
+# set the system message
 SYSTEM """
 You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
 """
@@ -158,6 +159,13 @@ For multiline input, you can wrap text with `"""`:
 I'm a basic program that prints the famous "Hello, world!" message to the console.
 ```

+### Multimodal models
+
+```
+>>> What's in this image? /Users/jmorgan/Desktop/smile.png
+The image features a yellow smiley face, which is likely the central focus of the picture.
+```
+
 ### Pass in prompt as arguments

 ```
@@ -205,7 +213,8 @@ Finally, in a separate shell, run a model:
 ## REST API

 Ollama has a REST API for running and managing models.
-For example, to generate text from a model:
+
+### Generate a response

 ```
 curl http://localhost:11434/api/generate -d '{
@@ -214,7 +223,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-Or send a chat message (coming in 0.1.14):
+### Chat with a model

 ```
 curl http://localhost:11434/api/chat -d '{
@@ -253,6 +262,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [gptel Emacs client](https://github.com/karthink/gptel)
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)

+### Database
+
+- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)
+
 ### Package managers

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
--- a/api/types.go
+++ b/api/types.go
@@ -31,15 +31,18 @@ func (e StatusError) Error() string {
 	}
 }

+type ImageData []byte
+
 type GenerateRequest struct {
-	Model    string `json:"model"`
-	Prompt   string `json:"prompt"`
-	System   string `json:"system"`
-	Template string `json:"template"`
-	Context  []int  `json:"context,omitempty"`
-	Stream   *bool  `json:"stream,omitempty"`
-	Raw      bool   `json:"raw,omitempty"`
-	Format   string `json:"format"`
+	Model    string      `json:"model"`
+	Prompt   string      `json:"prompt"`
+	System   string      `json:"system"`
+	Template string      `json:"template"`
+	Context  []int       `json:"context,omitempty"`
+	Stream   *bool       `json:"stream,omitempty"`
+	Raw      bool        `json:"raw,omitempty"`
+	Format   string      `json:"format"`
+	Images   []ImageData `json:"images,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }
@@ -54,8 +57,9 @@ type ChatRequest struct {
 }

 type Message struct {
-	Role    string `json:"role"` // one of ["system", "user", "assistant"]
-	Content string `json:"content"`
+	Role    string      `json:"role"` // one of ["system", "user", "assistant"]
+	Content string      `json:"content"`
+	Images  []ImageData `json:"images,omitempty"` // optional; left out of the JSON when empty
 }

 type ChatResponse struct {
@@ -148,11 +152,12 @@ type ShowRequest struct {
 }

 type ShowResponse struct {
-	License    string `json:"license,omitempty"`
-	Modelfile  string `json:"modelfile,omitempty"`
-	Parameters string `json:"parameters,omitempty"`
-	Template   string `json:"template,omitempty"`
-	System     string `json:"system,omitempty"`
+	License    string       `json:"license,omitempty"`
+	Modelfile  string       `json:"modelfile,omitempty"`
+	Parameters string       `json:"parameters,omitempty"`
+	Template   string       `json:"template,omitempty"`
+	System     string       `json:"system,omitempty"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 type CopyRequest struct {
@@ -188,10 +193,11 @@ type ListResponse struct {
 }

 type ModelResponse struct {
-	Name       string    `json:"name"`
-	ModifiedAt time.Time `json:"modified_at"`
-	Size       int64     `json:"size"`
-	Digest     string    `json:"digest"`
+	Name       string       `json:"name"`
+	ModifiedAt time.Time    `json:"modified_at"`
+	Size       int64        `json:"size"`
+	Digest     string       `json:"digest"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 type TokenResponse struct {
@@ -203,20 +209,18 @@ type GenerateResponse struct {
 	CreatedAt time.Time `json:"created_at"`
 	Response  string    `json:"response"`

-	ModelConfiguration ModelConfiguration `json:"model_configuration"`
-
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`

 	Metrics
 }

-type ModelConfiguration struct {
-	ModelFormat   string   `json:"model_format"`
-	ModelFamily   string   `json:"model_family"`
-	ModelFamilies []string `json:"model_families"`
-	ModelType     string   `json:"model_type"`
-	FileType      string   `json:"file_type"`
+type ModelDetails struct {
+	Format            string   `json:"format"`
+	Family            string   `json:"family"`
+	Families          []string `json:"families"`
+	ParameterSize     string   `json:"parameter_size"`
+	QuantizationLevel string   `json:"quantization_level"`
 }

 func (m *Metrics) Summary() {
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -17,7 +17,9 @@ import (
 	"os/exec"
 	"os/signal"
 	"path/filepath"
+	"regexp"
 	"runtime"
+	"slices"
 	"strings"
 	"syscall"
 	"time"
@@ -36,6 +38,8 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

+type ImageData []byte
+
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -418,6 +422,7 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 		Model:    args[0],
 		WordWrap: os.Getenv("TERM") == "xterm-256color",
 		Options:  map[string]interface{}{},
+		Images:   []ImageData{},
 	}

 	format, err := cmd.Flags().GetString("format")
@@ -427,7 +432,6 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 	opts.Format = format

 	prompts := args[1:]
-
 	// prepend stdin to the prompt if provided
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
 		in, err := io.ReadAll(os.Stdin)
@@ -466,6 +470,7 @@ type generateOptions struct {
 	Format   string
 	System   string
 	Template string
+	Images   []ImageData
 	Options  map[string]interface{}
 }

@@ -551,6 +556,10 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		return nil
 	}

+	images := make([]api.ImageData, 0)
+	for _, i := range opts.Images {
+		images = append(images, api.ImageData(i))
+	}
 	request := api.GenerateRequest{
 		Model:    opts.Model,
 		Prompt:   opts.Prompt,
@@ -559,6 +568,7 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		System:   opts.System,
 		Template: opts.Template,
 		Options:  opts.Options,
+		Images:   images,
 	}

 	if err := client.Generate(ctx, &request, fn); err != nil {
@@ -585,7 +595,9 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		latest.Summary()
 	}

-	cmd.SetContext(context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context))
+	ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)
+
 	return nil
 }

@@ -598,11 +610,31 @@ const (
 	MultilineTemplate
 )

+// modelIsMultiModal reports whether the named model lists "clip" among its
+// families in the server's show response. It returns false when the client
+// cannot be created (after printing an error) or when the show request fails.
+func modelIsMultiModal(cmd *cobra.Command, name string) bool {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		fmt.Println("error: couldn't connect to ollama server")
+		return false
+	}
+
+	shown, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+	if err != nil {
+		return false
+	}
+	return slices.Contains(shown.Details.Families, "clip")
+}
+
 func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
+	multiModal := modelIsMultiModal(cmd, opts.Model)
+
 	// load the model
 	loadOpts := generateOptions{
 		Model:  opts.Model,
 		Prompt: "",
+		Images: []ImageData{},
 	}
 	if err := generate(cmd, loadOpts); err != nil {
 		return err
@@ -622,7 +654,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 	usageSet := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
 		fmt.Fprintln(os.Stderr, "  /set parameter ...     Set a parameter")
-		fmt.Fprintln(os.Stderr, "  /set system <string>   Set system prompt")
+		fmt.Fprintln(os.Stderr, "  /set system <string>   Set system message")
 		fmt.Fprintln(os.Stderr, "  /set template <string> Set prompt template")
 		fmt.Fprintln(os.Stderr, "  /set history           Enable history")
 		fmt.Fprintln(os.Stderr, "  /set nohistory         Disable history")
@@ -640,7 +672,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 		fmt.Fprintln(os.Stderr, "  /show license      Show model license")
 		fmt.Fprintln(os.Stderr, "  /show modelfile    Show Modelfile for this model")
 		fmt.Fprintln(os.Stderr, "  /show parameters   Show parameters for this model")
-		fmt.Fprintln(os.Stderr, "  /show system       Show system prompt")
+		fmt.Fprintln(os.Stderr, "  /show system       Show system message")
 		fmt.Fprintln(os.Stderr, "  /show template     Show prompt template")
 		fmt.Fprintln(os.Stderr, "")
 	}
@@ -701,9 +733,10 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			// if the prompt so far starts with """ then we're in multiline mode
 			// and we need to keep reading until we find a line that ends with """
 			cut, found := strings.CutSuffix(line, `"""`)
-			prompt += cut + "\n"
+			prompt += cut

 			if !found {
+				prompt += "\n"
 				continue
 			}

@@ -714,11 +747,11 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			case MultilineSystem:
 				opts.System = prompt
 				prompt = ""
-				fmt.Println("Set system template.")
+				fmt.Println("Set system message.")
 			case MultilineTemplate:
 				opts.Template = prompt
 				prompt = ""
-				fmt.Println("Set model template.")
+				fmt.Println("Set prompt template.")
 			}
 			multiline = MultilineNone
 		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
@@ -789,17 +822,18 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					line = strings.TrimPrefix(line, `"""`)
 					if strings.HasPrefix(args[2], `"""`) {
 						cut, found := strings.CutSuffix(line, `"""`)
-						prompt += cut + "\n"
+						prompt += cut
 						if found {
-							opts.System = prompt
 							if args[1] == "system" {
-								fmt.Println("Set system template.")
+								opts.System = prompt
+								fmt.Println("Set system message.")
 							} else {
+								opts.Template = prompt
 								fmt.Println("Set prompt template.")
 							}
 							prompt = ""
 						} else {
-							prompt = `"""` + prompt
+							prompt = `"""` + prompt + "\n"
 							if args[1] == "system" {
 								multiline = MultilineSystem
 							} else {
@@ -809,7 +843,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 						}
 					} else {
 						opts.System = line
-						fmt.Println("Set system template.")
+						fmt.Println("Set system message.")
 					}
 				default:
 					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
@@ -861,7 +895,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					case resp.System != "":
 						fmt.Println(resp.System + "\n")
 					default:
-						fmt.Print("No system prompt was specified for this model.\n\n")
+						fmt.Print("No system message was specified for this model.\n\n")
 					}
 				case "template":
 					switch {
@@ -902,6 +936,27 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {

 		if len(prompt) > 0 && multiline == MultilineNone {
 			opts.Prompt = prompt
+			if multiModal {
+				newPrompt, images, err := extractFileNames(prompt)
+				if err != nil {
+					return err
+				}
+				opts.Prompt = newPrompt
+
+				// reset the context if we find another image
+				if len(images) > 0 {
+					opts.Images = images
+					ctx := cmd.Context()
+					ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
+					cmd.SetContext(ctx)
+				}
+				if len(opts.Images) == 0 {
+					fmt.Println("This model requires you to add a jpeg, png, or svg image.")
+					fmt.Println()
+					prompt = ""
+					continue
+				}
+			}
 			if err := generate(cmd, opts); err != nil {
 				return err
 			}
@@ -911,6 +966,57 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 	}
 }

+// normalizeFilePath strips the backslash from shell-escaped characters in fp
+// (for example `\ ` becomes a space) and returns the resulting path.
+//
+// The replacements run in a single left-to-right pass over fp, so the result
+// is deterministic. Applying them one at a time while ranging over a map used
+// a random order and could turn `\\ ` into either `\ ` or a bare space.
+func normalizeFilePath(fp string) string {
+	return strings.NewReplacer(
+		"\\ ", " ", // Escaped space
+		"\\(", "(", // Escaped left parenthesis
+		"\\)", ")", // Escaped right parenthesis
+		"\\[", "[", // Escaped left square bracket
+		"\\]", "]", // Escaped right square bracket
+		"\\{", "{", // Escaped left curly brace
+		"\\}", "}", // Escaped right curly brace
+		"\\$", "$", // Escaped dollar sign
+		"\\&", "&", // Escaped ampersand
+		"\\;", ";", // Escaped semicolon
+		"\\'", "'", // Escaped single quote
+		"\\\\", "\\", // Escaped backslash
+		"\\*", "*", // Escaped asterisk
+		"\\?", "?", // Escaped question mark
+	).Replace(fp)
+}
+
+// extractFileNames finds image file paths in input, loads each file that
+// exists, and returns input with those paths removed plus the loaded images.
+// Paths that do not exist are left in the text; any other load error is
+// printed and returned.
+func extractFileNames(input string) (string, []ImageData, error) {
+	// A path starts with "/" or "./", may contain non-space characters,
+	// backslashes and spaces, and ends in a jpg, jpeg, png or svg extension.
+	pathRe := regexp.MustCompile(`(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`)
+
+	var imgs []ImageData
+	for _, match := range pathRe.FindAllString(input, -1) {
+		cleaned := normalizeFilePath(match)
+		data, err := getImageData(cleaned)
+		switch {
+		case err == nil:
+			fmt.Printf("Added image '%s'\n", cleaned)
+			input = strings.ReplaceAll(input, match, "")
+			imgs = append(imgs, data)
+		case !os.IsNotExist(err):
+			fmt.Printf("Couldn't process image: %q\n", err)
+			return "", imgs, err
+		}
+	}
+	return input, imgs, nil
+}
+
 func RunServer(cmd *cobra.Command, _ []string) error {
 	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
@@ -937,6 +1043,50 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 	return server.Serve(ln, origins)
 }

+// getImageData reads the file at filePath and returns its full contents.
+// It returns an error if the file cannot be opened or read, if the sniffed
+// content type is not an allowed image type, or if it is larger than 100MB.
+func getImageData(filePath string) ([]byte, error) {
+	file, err := os.Open(filePath)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	// Sniff only the bytes read; short files leave a zero-filled tail in buf.
+	buf := make([]byte, 512)
+	n, err := file.Read(buf)
+	if err != nil {
+		return nil, err
+	}
+
+	contentType := http.DetectContentType(buf[:n])
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
+	if !slices.Contains(allowedTypes, contentType) {
+		return nil, fmt.Errorf("invalid image type: %s", contentType)
+	}
+
+	info, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	// Check if the file size exceeds 100MB
+	var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
+	if info.Size() > maxSize {
+		return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
+	}
+
+	buf = make([]byte, info.Size())
+	if _, err = file.Seek(0, 0); err != nil {
+		return nil, err
+	}
+	if _, err = io.ReadFull(file, buf); err != nil {
+		return nil, err
+	}
+	return buf, nil
+}
+
 func initializeKeypair() error {
 	home, err := os.UserHomeDir()
 	if err != nil {
@@ -1103,7 +1253,7 @@ func NewCLI() *cobra.Command {
 	showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
 	showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
 	showCmd.Flags().Bool("template", false, "Show template of a model")
-	showCmd.Flags().Bool("system", false, "Show system prompt of a model")
+	showCmd.Flags().Bool("system", false, "Show system message of a model")

 	runCmd := &cobra.Command{
 		Use:     "run MODEL [PROMPT]",
--- a/docs/api.md
+++ b/docs/api.md
@@ -3,6 +3,7 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
+- [Generate a chat completion](#generate-a-chat-completion)
 - [Create a Model](#create-a-model)
 - [List Local Models](#list-local-models)
 - [Show Model Information](#show-model-information)
@@ -38,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin

 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `images`: a list of base64-encoded images (for multimodal models such as `llava`)

 Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system prompt to (overrides what is defined in the `Modelfile`)
+- `system`: system message to use (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -146,6 +148,37 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```

+#### Request (with images)
+
+To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llava",
+  "prompt":"What is in this picture?",
+  "stream": false,
+  "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jD
SsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKP
qsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+}'
+```
+
+#### Response
+
+```
+{
+  "model": "llava",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": "A happy cartoon character, which is cute and cheerful.",
+  "context": [1, 2, 3],
+  "done": true,
+  "total_duration": 14648695333,
+  "load_duration": 3302671417,
+  "prompt_eval_count": 14,
+  "prompt_eval_duration": 286243000,
+  "eval_count": 129,
+  "eval_duration": 10931424000
+}
+```
+
 #### Request (Raw Mode)

 In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting.
@@ -252,7 +285,7 @@ curl http://localhost:11434/api/generate -d '{
    "penalize_newline": true,
    "stop": ["\n", "user:"],
    "numa": false,
-    "num_ctx": 4,
+    "num_ctx": 1024,
    "num_batch": 2,
    "num_gqa": 1,
    "num_gpu": 1,
@@ -267,7 +300,7 @@ curl http://localhost:11434/api/generate -d '{
    "rope_frequency_base": 1.1,
    "rope_frequency_scale": 0.8,
    "num_thread": 8
-    }
+  }
 }'
 ```

@@ -290,7 +323,7 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```

-## Send Chat Messages (coming in 0.1.14)
+## Generate a chat completion

 ```shell
 POST /api/chat
@@ -303,6 +336,12 @@ Generate the next message in a chat with a provided model. This is a streaming e
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory

+The `message` object has the following fields:
+
+- `role`: the role of the message, either `system`, `user` or `assistant`
+- `content`: the content of the message
+- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+
 Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
@@ -420,6 +459,23 @@ Final response:
 }
 ```

+#### Request (with images)
+
+To send a chat message with images, provide a list of base64-encoded `images` in the message:
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llava",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is in this image?",
+      "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQn
S4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jr
UwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+    }
+  ]
+}'
+```
+
 ## Create a Model

 ```shell
@@ -547,7 +603,7 @@ A single JSON object will be returned.
 POST /api/show
 ```

-Show details about a model including modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, and system message.

 ### Parameters

@@ -567,10 +623,16 @@ curl http://localhost:11434/api/show -d '{

 ```json
 {
-  "license": "<contents of license block>",
-  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
-  "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
-  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM mike/llava:latest\nTEMPLATE \"\"\"\nUSER:{{ .Prompt }}\nASSISTANT:\n\"\"\"\nPARAMETER num_ctx 4096",
+  "parameters": "num_ctx                        4096",
+  "template": "\nUSER:{{ .Prompt }}\nASSISTANT:\n",
+  "license": "<license>",
+  "details": {
+    "format": "gguf",
+    "families": ["llama", "clip"],
+    "parameter_size": "7B",
+    "quantization_level": "Q4_0"
+  }
 }
 ```

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -95,10 +95,6 @@ The manifest lists all the layers used in this model. You will see a `media type

 To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.

-### I downloaded most of a model yesterday, but it's gone today. What happened?
-
-When the Ollama server starts, it looks for fragments of models that still exist on the system and cleans them out. If you have an Internet connection that can't complete a model download all at once, this can be frustrating. Adding the OLLAMA_NOPRUNE environment variable will prevent the server from pruning incomplete files.
-
 ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?

 No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -30,14 +30,14 @@ The format of the `Modelfile`:
 INSTRUCTION arguments
 ```

-| Instruction                         | Description                                                   |
-| ----------------------------------- | ------------------------------------------------------------- |
-| [`FROM`](#from-required) (required) | Defines the base model to use.                                |
-| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.        |
-| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.             |
-| [`SYSTEM`](#system)                 | Specifies the system prompt that will be set in the template. |
-| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.           |
-| [`LICENSE`](#license)               | Specifies the legal license.                                  |
+| Instruction                         | Description                                                    |
+| ----------------------------------- | -------------------------------------------------------------- |
+| [`FROM`](#from-required) (required) | Defines the base model to use.                                 |
+| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.         |
+| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.              |
+| [`SYSTEM`](#system)                 | Specifies the system message that will be set in the template. |
+| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.            |
+| [`LICENSE`](#license)               | Specifies the legal license.                                   |

 ## Examples

@@ -52,7 +52,7 @@ PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
 PARAMETER num_ctx 4096

-# sets a custom system prompt to specify the behavior of the chat assistant
+# sets a custom system message to specify the behavior of the chat assistant
 SYSTEM You are Mario from super mario bros, acting as an assistant.
 ```

@@ -70,9 +70,9 @@ More examples are available in the [examples directory](../examples).
 There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:

 - Option 1: view a details page from a model's tags page:
-   1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
-   2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
-   3. Scroll down to "Layers"
+  1.  Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
+  2.  Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
+  3.  Scroll down to "Layers"
      - Note: if the [`FROM` instruction](#from-required) is not present,
        it means the model was created from a local file
 - Option 2: use `ollama show` to print the `Modelfile` like so:
@@ -152,15 +152,15 @@ PARAMETER <parameter> <parametervalue>

 ### TEMPLATE

-`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
+`TEMPLATE` defines the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.

 #### Template Variables

-| Variable        | Description                                                                                                  |
-| --------------- | ------------------------------------------------------------------------------------------------------------ |
-| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
-| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                 |
-| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.          |
+| Variable        | Description                                                                                                   |
+| --------------- | ------------------------------------------------------------------------------------------------------------- |
+| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
+| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                  |
+| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.           |

 ```modelfile
 TEMPLATE """
@@ -180,7 +180,7 @@ SYSTEM """<system message>"""

 ### SYSTEM

-The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.
+The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.

 ```modelfile
 SYSTEM """<system message>"""
--- a/llm/falcon.go
+++ b/llm/falcon.go
@@ -1,20 +0,0 @@
-package llm
-
-const (
-	falconModelType7B   = 32
-	falconModelType40B  = 60
-	falconModelType180B = 80
-)
-
-func falconModelType(numLayer uint32) string {
-	switch numLayer {
-	case 32:
-		return "7B"
-	case 60:
-		return "40B"
-	case 80:
-		return "180B"
-	default:
-		return "unknown"
-	}
-}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -93,6 +93,8 @@ func (c *containerGGML) Name() string {
 }

 func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
+	// file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
 	return nil, nil
 }

@@ -115,6 +117,10 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
 	}

 	c.version = version
+
+	// remaining file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
+
 	return nil, nil
 }

@@ -141,6 +147,10 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
 	// different model types may have different layouts for hyperparameters
 	var llama llamaModel
 	binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
+
+	// remaining file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
+
 	return &llama, nil
 }

@@ -163,6 +173,10 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
 	}

 	c.version = version
+
+	// remaining file contents aren't decoded
+	ro.Seek(0, io.SeekEnd)
+
 	return nil, nil
 }

--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -120,27 +120,6 @@ func (llm *ggufModel) ModelType() string {
 		return format.HumanNumber(llm.parameters)
 	}

-	switch llm.ModelFamily() {
-	case "llama":
-		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
-			heads, headsOK := llm.kv["llama.head_count"].(uint32)
-			headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
-			if headsOK && headsKVsOK && heads/headKVs == 8 {
-				return "70B"
-			}
-
-			return llamaModelType(blocks)
-		}
-	case "falcon":
-		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
-			return falconModelType(blocks)
-		}
-	case "starcoder":
-		if blocks, ok := llm.kv["starcoder.block_count"].(uint32); ok {
-			return starCoderModelType(blocks)
-		}
-	}
-
 	return "unknown"
 }

--- a/llm/llama.cpp/gguf
+++ b/llm/llama.cpp/gguf
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -59,6 +59,7 @@ ws ::= ([ \t\n] ws)?
 var llamaCppEmbed embed.FS

 type ModelRunner struct {
+	Type        string // "gguf" or "ggml"
 	Path        string // path to the model runner executable
 	Accelerated bool
 }
@@ -72,25 +73,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	switch runtime.GOOS {
 	case "darwin":
 		if runtime.GOARCH == "arm64" {
-			runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
 		} else {
-			runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
 		}
 	case "linux":
 		runners = []ModelRunner{
-			{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
-			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
+			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
-			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
-			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
+			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
+			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
 		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
 		runners = []ModelRunner{
-			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	}

@@ -148,6 +149,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	for _, r := range runners {
 		// clean the ModelRunner paths so that they match the OS we are running on
 		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
+			Type:        r.Type,
 			Path:        filepath.Clean(path.Join(workDir, r.Path)),
 			Accelerated: r.Accelerated,
 		})
@@ -221,8 +223,14 @@ type Running struct {
 	*StatusWriter            // captures error messages from the llama runner process
 }

+type ImageData struct {
+	Data []byte `json:"data"`
+	ID   int    `json:"id"`
+}
+
 type llama struct {
 	api.Options
+	ImageData []ImageData
 	Running
 }

@@ -402,11 +410,13 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 		}

 		port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+		params := append(params, "--port", strconv.Itoa(port))
+
 		ctx, cancel := context.WithCancel(context.Background())
 		cmd := exec.CommandContext(
 			ctx,
 			runner.Path,
-			append(params, "--port", strconv.Itoa(port))...,
+			params...,
 		)

 		var libraryPaths []string
@@ -535,20 +545,15 @@ type prediction struct {
 }

 const maxBufferSize = 512 * format.KiloByte
+const maxRetries = 6

 type PredictOpts struct {
-	Model            string
-	Prompt           string
-	Format           string
-	CheckpointStart  time.Time
-	CheckpointLoaded time.Time
+	Prompt string
+	Format string
+	Images []api.ImageData
 }

 type PredictResult struct {
-	Model              string
-	CreatedAt          time.Time
-	TotalDuration      time.Duration
-	LoadDuration       time.Duration
 	Content            string
 	Done               bool
 	PromptEvalCount    int
@@ -557,7 +562,20 @@ type PredictResult struct {
 	EvalDuration       time.Duration
 }

+// isRetryable checks if the line matches a condition that can be retried
+func isRetryable(line []byte) bool {
+	return bytes.Contains(line, []byte("slot unavailable"))
+}
+
 func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	imageData := llm.ImageData
+	if len(predict.Images) > 0 {
+		for cnt, i := range predict.Images {
+			imageData = append(imageData, ImageData{Data: i, ID: cnt})
+		}
+	}
+	log.Printf("loaded %d images", len(imageData))
+
 	request := map[string]any{
 		"prompt":            predict.Prompt,
 		"stream":            true,
@@ -579,59 +597,78 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
 		"penalize_nl":       llm.PenalizeNewline,
 		"seed":              llm.Seed,
 		"stop":              llm.Stop,
+		"image_data":        imageData,
 	}

 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
 	}

-	// Handling JSON marshaling with special characters unescaped.
-	buffer := &bytes.Buffer{}
-	enc := json.NewEncoder(buffer)
-	enc.SetEscapeHTML(false)
-
-	if err := enc.Encode(request); err != nil {
-		return fmt.Errorf("failed to marshal data: %v", err)
-	}
-
-	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
-	if err != nil {
-		return fmt.Errorf("error creating POST request: %v", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return fmt.Errorf("POST predict: %v", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode >= 400 {
-		bodyBytes, err := io.ReadAll(resp.Body)
-		if err != nil {
-			return fmt.Errorf("failed reading llm error response: %w", err)
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
 		}
-		log.Printf("llm predict error: %s", bodyBytes)
-		return fmt.Errorf("%s", bodyBytes)
-	}

-	scanner := bufio.NewScanner(resp.Body)
-	// increase the buffer size to avoid running out of space
-	buf := make([]byte, 0, maxBufferSize)
-	scanner.Buffer(buf, maxBufferSize)
-	for scanner.Scan() {
-		select {
-		case <-ctx.Done():
-			// This handles the request cancellation
-			return ctx.Err()
-		default:
-			line := scanner.Bytes()
-			if len(line) == 0 {
-				continue
+		// Handling JSON marshaling with special characters unescaped.
+		buffer := &bytes.Buffer{}
+		enc := json.NewEncoder(buffer)
+		enc.SetEscapeHTML(false)
+
+		if err := enc.Encode(request); err != nil {
+			return fmt.Errorf("failed to marshal data: %v", err)
+		}
+
+		endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
+		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
+		if err != nil {
+			return fmt.Errorf("error creating POST request: %v", err)
+		}
+		req.Header.Set("Content-Type", "application/json")
+
+		resp, err := http.DefaultClient.Do(req)
+		if err != nil {
+			return fmt.Errorf("POST predict: %v", err)
+		}
+		defer resp.Body.Close()
+
+		if resp.StatusCode >= 400 {
+			bodyBytes, err := io.ReadAll(resp.Body)
+			if err != nil {
+				return fmt.Errorf("failed reading llm error response: %w", err)
 			}
+			log.Printf("llm predict error: %s", bodyBytes)
+			return fmt.Errorf("%s", bodyBytes)
+		}
+
+		scanner := bufio.NewScanner(resp.Body)
+		// increase the buffer size to avoid running out of space
+		buf := make([]byte, 0, maxBufferSize)
+		scanner.Buffer(buf, maxBufferSize)
+
+		retryNeeded := false
+		for scanner.Scan() {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				return ctx.Err()
+			default:
+				line := scanner.Bytes()
+				if len(line) == 0 {
+					continue
+				}
+
+				if isRetryable(line) {
+					retryNeeded = true
+					break
+				}
+
+				evt, ok := bytes.CutPrefix(line, []byte("data: "))
+				if !ok {
+					return fmt.Errorf("error parsing llm response stream: %s", line)
+				}

-			if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
 				var p prediction
 				if err := json.Unmarshal(evt, &p); err != nil {
 					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
@@ -639,18 +676,12 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred

 				if p.Content != "" {
 					fn(PredictResult{
-						Model:     predict.Model,
-						CreatedAt: time.Now().UTC(),
-						Content:   p.Content,
+						Content: p.Content,
 					})
 				}

 				if p.Stop {
 					fn(PredictResult{
-						Model:         predict.Model,
-						CreatedAt:     time.Now().UTC(),
-						TotalDuration: time.Since(predict.CheckpointStart),
-
 						Done:               true,
 						PromptEvalCount:    p.Timings.PromptN,
 						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
@@ -661,21 +692,26 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
 				}
 			}
 		}
-	}

-	if err := scanner.Err(); err != nil {
-		if strings.Contains(err.Error(), "unexpected EOF") {
-			// this means the llama runner subprocess crashed
-			llm.Close()
-			if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
-				return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
+		if err := scanner.Err(); err != nil {
+			if strings.Contains(err.Error(), "unexpected EOF") {
+				// this means the llama runner subprocess crashed
+				llm.Close()
+				if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
+					return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
+				}
+				return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
 			}
-			return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
+			return fmt.Errorf("error reading llm response: %v", err)
+		}
+
+		if !retryNeeded {
+			return nil // success
 		}
-		return fmt.Errorf("error reading llm response: %v", err)
 	}

-	return nil
+	// reached only when every attempt ended in a retryable condition
+	return fmt.Errorf("max retries exceeded")
 }

 type TokenizeRequest struct {
--- a/llm/starcoder.go
+++ b/llm/starcoder.go
@@ -1,23 +0,0 @@
-package llm
-
-const (
-	starCoderModelType1B  = 24
-	starCoderModelType3B  = 36
-	starCoderModelType7B  = 42
-	starCoderModelType15B = 40
-)
-
-func starCoderModelType(numLayer uint32) string {
-	switch numLayer {
-	case 24:
-		return "1B"
-	case 36:
-		return "3B"
-	case 42:
-		return "7B"
-	case 40:
-		return "15B"
-	default:
-		return "unknown"
-	}
-}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -192,14 +192,7 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlW:
 			buf.DeleteWord()
 		case CharCtrlZ:
-			if err := UnsetRawMode(fd, termios); err != nil {
-				return "", err
-			}
-
-			syscall.Kill(0, syscall.SIGSTOP)
-
-			// on resume...
-			return "", nil
+			return handleCharCtrlZ(fd, termios)
 		case CharEnter:
 			output := buf.String()
 			if output != "" {
--- a/readline/readline_unix.go
+++ b/readline/readline_unix.go
@@ -0,0 +1,18 @@
+//go:build !windows
+
+package readline
+
+import (
+	"syscall"
+)
+
+func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
+	if err := UnsetRawMode(fd, termios); err != nil {
+		return "", err
+	}
+
+	syscall.Kill(0, syscall.SIGSTOP)
+
+	// on resume...
+	return "", nil
+}
--- a/readline/readline_windows.go
+++ b/readline/readline_windows.go
@@ -0,0 +1,6 @@
+package readline
+
+func handleCharCtrlZ(fd int, state *State) (string, error) {
+	// not supported
+	return "", nil
+}
--- a/server/images.go
+++ b/server/images.go
@@ -46,6 +46,7 @@ type Model struct {
 	System         string
 	License        []string
 	Digest         string
+	Size           int64
 	Options        map[string]interface{}
 }

@@ -65,7 +66,7 @@ func (m *Model) Prompt(p PromptVars) (string, error) {
 	}

 	if p.System == "" {
-		// use the default system prompt for this model if one is not specified
+		// use the default system message for this model if one is not specified
 		p.System = m.System
 	}

@@ -85,9 +86,10 @@ func (m *Model) Prompt(p PromptVars) (string, error) {
 	return prompt.String(), nil
 }

-func (m *Model) ChatPrompt(msgs []api.Message) (string, error) {
+func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error) {
 	// build the prompt from the list of messages
 	var prompt strings.Builder
+	var currentImages []api.ImageData
 	currentVars := PromptVars{
 		First: true,
 	}
@@ -107,35 +109,36 @@ func (m *Model) ChatPrompt(msgs []api.Message) (string, error) {
 		case "system":
 			if currentVars.System != "" {
 				if err := writePrompt(); err != nil {
-					return "", err
+					return "", nil, err
 				}
 			}
 			currentVars.System = msg.Content
 		case "user":
 			if currentVars.Prompt != "" {
 				if err := writePrompt(); err != nil {
-					return "", err
+					return "", nil, err
 				}
 			}
 			currentVars.Prompt = msg.Content
+			currentImages = msg.Images
 		case "assistant":
 			currentVars.Response = msg.Content
 			if err := writePrompt(); err != nil {
-				return "", err
+				return "", nil, err
 			}
 		default:
-			return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
+			return "", nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
 		}
 	}

 	// Append the last set of vars if they are non-empty
 	if currentVars.Prompt != "" || currentVars.System != "" {
 		if err := writePrompt(); err != nil {
-			return "", err
+			return "", nil, err
 		}
 	}

-	return prompt.String(), nil
+	return prompt.String(), currentImages, nil
 }

 type ManifestV2 struct {
@@ -146,12 +149,16 @@ type ManifestV2 struct {
 }

 type ConfigV2 struct {
+	ModelFormat   string   `json:"model_format"`
+	ModelFamily   string   `json:"model_family"`
+	ModelFamilies []string `json:"model_families"`
+	ModelType     string   `json:"model_type"`
+	FileType      string   `json:"file_type"`
+
 	// required by spec
 	Architecture string `json:"architecture"`
 	OS           string `json:"os"`
 	RootFS       RootFS `json:"rootfs"`
-
-	api.ModelConfiguration
 }

 func (c *ConfigV2) SetModelFormat(format string) {
@@ -238,6 +245,7 @@ func GetModel(name string) (*Model, error) {
 		Digest:    digest,
 		Template:  "{{ .Prompt }}",
 		License:   []string{},
+		Size:      manifest.GetTotalSize(),
 	}

 	filename, err := GetBlobsPath(manifest.Config.Digest)
@@ -541,6 +549,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 		}

+		// xxx - can this be removed?
 		if config.ModelType == "65B" {
 			if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
 				config.ModelType = "70B"
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -81,7 +81,7 @@ func TestChat(t *testing.T) {
 			Template: tt.template,
 		}
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := m.ChatPrompt(tt.msgs)
+			got, _, err := m.ChatPrompt(tt.msgs)
 			if tt.wantErr != "" {
 				if err == nil {
 					t.Errorf("ChatPrompt() expected error, got nil")
--- a/server/routes.go
+++ b/server/routes.go
@@ -156,9 +156,9 @@ func GenerateHandler(c *gin.Context) {
 	defer loaded.mu.Unlock()

 	checkpointStart := time.Now()
-
 	var req api.GenerateRequest
 	err := c.ShouldBindJSON(&req)
+
 	switch {
 	case errors.Is(err, io.EOF):
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -199,10 +199,9 @@ func GenerateHandler(c *gin.Context) {
 	// an empty request loads the model
 	if req.Prompt == "" && req.Template == "" && req.System == "" {
 		c.JSON(http.StatusOK, api.GenerateResponse{
-			CreatedAt:          time.Now().UTC(),
-			Model:              req.Model,
-			ModelConfiguration: model.Config.ModelConfiguration,
-			Done:               true})
+			CreatedAt: time.Now().UTC(),
+			Model:     req.Model,
+			Done:      true})
 		return
 	}

@@ -261,14 +260,11 @@ func GenerateHandler(c *gin.Context) {
 			}

 			resp := api.GenerateResponse{
-				Model:              r.Model,
-				ModelConfiguration: model.Config.ModelConfiguration,
-				CreatedAt:          r.CreatedAt,
-				Done:               r.Done,
-				Response:           r.Content,
+				Model:     req.Model,
+				CreatedAt: time.Now().UTC(),
+				Done:      r.Done,
+				Response:  r.Content,
 				Metrics: api.Metrics{
-					TotalDuration:      r.TotalDuration,
-					LoadDuration:       r.LoadDuration,
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
 					EvalCount:          r.EvalCount,
@@ -276,13 +272,18 @@ func GenerateHandler(c *gin.Context) {
 				},
 			}

-			if r.Done && !req.Raw {
-				embd, err := loaded.runner.Encode(c.Request.Context(), prompt+generated.String())
-				if err != nil {
-					ch <- gin.H{"error": err.Error()}
-					return
+			if r.Done {
+				resp.TotalDuration = time.Since(checkpointStart)
+				resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
+
+				if !req.Raw {
+					embd, err := loaded.runner.Encode(c.Request.Context(), prompt+generated.String())
+					if err != nil {
+						ch <- gin.H{"error": err.Error()}
+						return
+					}
+					resp.Context = embd
 				}
-				resp.Context = embd
 			}

 			ch <- resp
@@ -290,11 +291,9 @@ func GenerateHandler(c *gin.Context) {

 		// Start prediction
 		predictReq := llm.PredictOpts{
-			Model:            model.Name,
-			Prompt:           prompt,
-			Format:           req.Format,
-			CheckpointStart:  checkpointStart,
-			CheckpointLoaded: checkpointLoaded,
+			Prompt: prompt,
+			Format: req.Format,
+			Images: req.Images,
 		}
 		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
@@ -302,19 +301,30 @@ func GenerateHandler(c *gin.Context) {
 	}()

 	if req.Stream != nil && !*req.Stream {
-		// Wait for the channel to close
-		var r api.GenerateResponse
+		// Accumulate responses into the final response
+		var final api.GenerateResponse
 		var sb strings.Builder
 		for resp := range ch {
-			var ok bool
-			if r, ok = resp.(api.GenerateResponse); !ok {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			switch r := resp.(type) {
+			case api.GenerateResponse:
+				sb.WriteString(r.Response)
+				final = r
+			case gin.H:
+				if errorMsg, ok := r["error"].(string); ok {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+					return
+				} else {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
+					return
+				}
+			default:
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
 				return
 			}
-			sb.WriteString(r.Response)
 		}
-		r.Response = sb.String()
-		c.JSON(http.StatusOK, r)
+
+		final.Response = sb.String()
+		c.JSON(http.StatusOK, final)
 		return
 	}

@@ -606,10 +616,19 @@ func GetModelInfo(name string) (*api.ShowResponse, error) {
 		return nil, err
 	}

+	modelDetails := api.ModelDetails{
+		Format:            model.Config.ModelFormat,
+		Family:            model.Config.ModelFamily,
+		Families:          model.Config.ModelFamilies,
+		ParameterSize:     model.Config.ModelType,
+		QuantizationLevel: model.Config.FileType,
+	}
+
 	resp := &api.ShowResponse{
 		License:  strings.Join(model.License, "\n"),
 		System:   model.System,
 		Template: model.Template,
+		Details:  modelDetails,
 	}

 	mf, err := ShowModelfile(model)
@@ -659,25 +678,42 @@ func ListModelsHandler(c *gin.Context) {
 		return
 	}

+	modelResponse := func(modelName string) (api.ModelResponse, error) {
+		model, err := GetModel(modelName)
+		if err != nil {
+			return api.ModelResponse{}, err
+		}
+
+		modelDetails := api.ModelDetails{
+			Format:            model.Config.ModelFormat,
+			Family:            model.Config.ModelFamily,
+			Families:          model.Config.ModelFamilies,
+			ParameterSize:     model.Config.ModelType,
+			QuantizationLevel: model.Config.FileType,
+		}
+
+		return api.ModelResponse{
+			Name:    model.ShortName,
+			Size:    model.Size,
+			Digest:  model.Digest,
+			Details: modelDetails,
+		}, nil
+	}
+
 	walkFunc := func(path string, info os.FileInfo, _ error) error {
 		if !info.IsDir() {
 			dir, file := filepath.Split(path)
 			dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
 			tag := strings.Join([]string{dir, file}, ":")

-			mp := ParseModelPath(tag)
-			manifest, digest, err := GetManifest(mp)
+			resp, err := modelResponse(tag)
 			if err != nil {
 				log.Printf("skipping file: %s", fp)
 				return nil
 			}

-			models = append(models, api.ModelResponse{
-				Name:       mp.GetShortTagname(),
-				Size:       manifest.GetTotalSize(),
-				Digest:     digest,
-				ModifiedAt: info.ModTime(),
-			})
+			resp.ModifiedAt = info.ModTime()
+			models = append(models, resp)
 		}

 		return nil
@@ -852,7 +888,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	if runtime.GOOS == "linux" {
 		// check compatibility to log warnings
 		if _, err := llm.CheckVRAM(); err != nil {
-			log.Printf(err.Error())
+			log.Print(err.Error())
 		}
 	}

@@ -959,7 +995,7 @@ func ChatHandler(c *gin.Context) {

 	checkpointLoaded := time.Now()

-	prompt, err := model.ChatPrompt(req.Messages)
+	prompt, images, err := model.ChatPrompt(req.Messages)
 	if err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
@@ -976,12 +1012,10 @@ func ChatHandler(c *gin.Context) {
 			loaded.expireTimer.Reset(sessionDuration)

 			resp := api.ChatResponse{
-				Model:     r.Model,
-				CreatedAt: r.CreatedAt,
+				Model:     req.Model,
+				CreatedAt: time.Now().UTC(),
 				Done:      r.Done,
 				Metrics: api.Metrics{
-					TotalDuration:      r.TotalDuration,
-					LoadDuration:       r.LoadDuration,
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
 					EvalCount:          r.EvalCount,
@@ -989,7 +1023,10 @@ func ChatHandler(c *gin.Context) {
 				},
 			}

-			if !r.Done {
+			if r.Done {
+				resp.TotalDuration = time.Since(checkpointStart)
+				resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
+			} else {
 				resp.Message = &api.Message{Role: "assistant", Content: r.Content}
 			}

@@ -998,11 +1035,9 @@ func ChatHandler(c *gin.Context) {

 		// Start prediction
 		predictReq := llm.PredictOpts{
-			Model:            model.Name,
-			Prompt:           prompt,
-			Format:           req.Format,
-			CheckpointStart:  checkpointStart,
-			CheckpointLoaded: checkpointLoaded,
+			Prompt: prompt,
+			Format: req.Format,
+			Images: images,
 		}
 		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
@@ -1010,21 +1045,33 @@ func ChatHandler(c *gin.Context) {
 	}()

 	if req.Stream != nil && !*req.Stream {
-		// Wait for the channel to close
-		var r api.ChatResponse
+		// Accumulate responses into the final response
+		var final api.ChatResponse
 		var sb strings.Builder
 		for resp := range ch {
-			var ok bool
-			if r, ok = resp.(api.ChatResponse); !ok {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			switch r := resp.(type) {
+			case api.ChatResponse:
+				if r.Message != nil {
+					sb.WriteString(r.Message.Content)
+				}
+
+				final = r
+			case gin.H:
+				if errorMsg, ok := r["error"].(string); ok {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+					return
+				} else {
+					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
+					return
+				}
+			default:
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
 				return
 			}
-			if r.Message != nil {
-				sb.WriteString(r.Message.Content)
-			}
 		}
-		r.Message = &api.Message{Role: "assistant", Content: sb.String()}
-		c.JSON(http.StatusOK, r)
+
+		final.Message = &api.Message{Role: "assistant", Content: sb.String()}
+		c.JSON(http.StatusOK, final)
 		return
 	}
Author	SHA1	Message	Date
Bruce MacDonald	6ee8c80199	restore model load duration on generate response (#1524 ) * restore model load duration on generate response - set model load duration on generate and chat done response - calculate createAt time when response created * remove checkpoints predict opts * Update routes.go	2023-12-14 12:15:50 -05:00
Jeffrey Morgan	31f0551dab	Update runner to support mixtral and mixture of experts (MoE) (#1475 )	2023-12-13 17:15:10 -05:00
Jeffrey Morgan	4a1abfe4fa	fix tests	2023-12-13 14:42:30 -05:00
Jeffrey Morgan	bbd41494bf	add multimodal to `README.md`	2023-12-13 14:38:47 -05:00
Jeffrey Morgan	fedba24a63	Docs for multimodal support (#1485 ) * add multimodal docs * add chat api docs * consistency between `/api/generate` and `/api/chat` * simplify docs	2023-12-13 13:59:33 -05:00
pepperoni21	e3b090dbc5	Added message format for chat api (#1488 )	2023-12-13 11:21:23 -05:00
Patrick Devine	d9e60f634b	add image support to the chat api (#1490 )	2023-12-12 13:28:58 -08:00
Michael Yang	4251b342de	Merge pull request #1469 from jmorganca/mxyng/model-types remove per-model types	2023-12-12 12:27:03 -08:00
Jeffrey Morgan	0a9d348023	Fix issues with `/set template` and `/set system` (#1486 )	2023-12-12 14:43:19 -05:00
Bruce MacDonald	3144e2a439	exponential back-off (#1484 )	2023-12-12 12:33:02 -05:00
Bruce MacDonald	c0960e29b5	retry on concurrent request failure (#1483 ) - remove parallel	2023-12-12 12:14:35 -05:00
ruecat	5314fc9b63	Fix Readme "Database -> MindsDB" link (#1479 )	2023-12-12 10:26:13 -05:00
Jorge Torres	a36b5fef3b	Update README.md (#1412 )	2023-12-11 18:05:10 -05:00
Patrick Devine	910e9401d0	Multimodal support (#1216 ) --------- Co-authored-by: Matt Apperson <mattapperson@Matts-MacBook-Pro.local>	2023-12-11 13:56:22 -08:00
Michael Yang	56ffc3023a	remove per-model types mostly replaced by decoding tensors except ggml models which only support llama	2023-12-11 09:40:21 -08:00
Bruce MacDonald	7a1b37ac64	os specific ctrl-z (#1420 )	2023-12-11 10:48:14 -05:00
Jeffrey Morgan	5d4d2e2c60	update docs with chat completion api	2023-12-10 13:53:36 -05:00
Jeffrey Morgan	7db5bcf73b	fix `go-staticcheck` warning	2023-12-10 11:44:27 -05:00
Jeffrey Morgan	fa2f095bd9	fix model name returned by `/api/generate` being different than the model name provided	2023-12-10 11:42:15 -05:00
Jeffrey Morgan	045b855db9	fix error on accumulating final chat response	2023-12-10 11:24:39 -05:00
Jeffrey Morgan	32064a0646	fix empty response when receiving runner error	2023-12-10 10:53:38 -05:00
Jeffrey Morgan	d9a250e9b5	seek to end of file when decoding older model formats	2023-12-09 21:14:35 -05:00
Jeffrey Morgan	944519ed16	seek to eof for older model binaries	2023-12-09 20:48:57 -05:00
Jeffrey Morgan	2dd040d04c	do not use `--parallel 2` for old runners	2023-12-09 20:17:33 -05:00
Bruce MacDonald	bbe41ce41a	fix: parallel queueing race condition caused silent failure (#1445 ) * fix: queued request failures - increase parallel requests to 2 to complete queued request, queueing is managed in ollama * log stream errors	2023-12-09 14:14:02 -05:00
Jeffrey Morgan	9e1406e4ed	Don't expose model information in `/api/generate`	2023-12-09 02:05:43 -08:00