Compare commits

..

1 commit

Author: Matt Williams
SHA1: 0d4fa34aee
Message: Added mention of the NOPRUNE env var (Signed-off-by: Matt Williams <m@technovangelist.com>)
Date: 2023-12-08 17:37:12 -08:00
16 changed files with 259 additions and 482 deletions

View File

@@ -104,7 +104,7 @@ FROM llama2
# set the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# set the system message
# set the system prompt
SYSTEM """
You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
"""
@@ -205,8 +205,7 @@ Finally, in a separate shell, run a model:
## REST API
Ollama has a REST API for running and managing models.
### Generate a response
For example, to generate text from a model:
```
curl http://localhost:11434/api/generate -d '{
@@ -215,7 +214,7 @@ curl http://localhost:11434/api/generate -d '{
}'
```
### Chat with a model
Or send a chat message (coming in 0.1.14):
```
curl http://localhost:11434/api/chat -d '{
@@ -254,10 +253,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
### Database
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)
### Package managers
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)

View File

@@ -31,18 +31,15 @@ func (e StatusError) Error() string {
}
}
type ImageData []byte
type GenerateRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
Images []ImageData `json:"images,omitempty"`
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
Options map[string]interface{} `json:"options"`
}
@@ -57,9 +54,8 @@ type ChatRequest struct {
}
type Message struct {
Role string `json:"role"` // one of ["system", "user", "assistant"]
Content string `json:"content"`
Images []ImageData `json:"images, omitempty"`
Role string `json:"role"` // one of ["system", "user", "assistant"]
Content string `json:"content"`
}
type ChatResponse struct {
@@ -152,12 +148,11 @@ type ShowRequest struct {
}
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
}
type CopyRequest struct {
@@ -193,11 +188,10 @@ type ListResponse struct {
}
type ModelResponse struct {
Name string `json:"name"`
ModifiedAt time.Time `json:"modified_at"`
Size int64 `json:"size"`
Digest string `json:"digest"`
Details ModelDetails `json:"details,omitempty"`
Name string `json:"name"`
ModifiedAt time.Time `json:"modified_at"`
Size int64 `json:"size"`
Digest string `json:"digest"`
}
type TokenResponse struct {
@@ -209,18 +203,20 @@ type GenerateResponse struct {
CreatedAt time.Time `json:"created_at"`
Response string `json:"response"`
ModelConfiguration ModelConfiguration `json:"model_configuration"`
Done bool `json:"done"`
Context []int `json:"context,omitempty"`
Metrics
}
type ModelDetails struct {
Format string `json:"format"`
Family string `json:"family"`
Families []string `json:"families"`
ParameterSize string `json:"parameter_size"`
QuantizationLevel string `json:"quantization_level"`
type ModelConfiguration struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
}
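
For illustration, a minimal sketch of how a caller could build the chat types above and POST them to the `/api/chat` endpoint described elsewhere in this changeset. Plain `net/http` and newline-delimited JSON decoding are assumptions here; the field names (`Model`, `Messages`, `Role`, `Content`, `Message`) come from the structs in this diff.

```go
package main

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"

	"github.com/jmorganca/ollama/api"
)

func main() {
	req := api.ChatRequest{
		Model: "llama2",
		Messages: []api.Message{
			{Role: "system", Content: "You are Mario from Super Mario Bros."},
			{Role: "user", Content: "Why is the sky blue?"},
		},
	}

	body, err := json.Marshal(req)
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// With streaming enabled (the default), each line of the body is one
	// JSON-encoded api.ChatResponse.
	dec := json.NewDecoder(resp.Body)
	for {
		var cr api.ChatResponse
		if err := dec.Decode(&cr); errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			log.Fatal(err)
		}
		if cr.Message != nil {
			fmt.Print(cr.Message.Content)
		}
	}
	fmt.Println()
}
```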
func (m *Metrics) Summary() {

View File

@@ -17,9 +17,7 @@ import (
"os/exec"
"os/signal"
"path/filepath"
"regexp"
"runtime"
"slices"
"strings"
"syscall"
"time"
@@ -38,8 +36,6 @@ import (
"github.com/jmorganca/ollama/version"
)
type ImageData []byte
func CreateHandler(cmd *cobra.Command, args []string) error {
filename, _ := cmd.Flags().GetString("file")
filename, err := filepath.Abs(filename)
@@ -422,7 +418,6 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
Images: []ImageData{},
}
format, err := cmd.Flags().GetString("format")
@@ -432,6 +427,7 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
@@ -470,7 +466,6 @@ type generateOptions struct {
Format string
System string
Template string
Images []ImageData
Options map[string]interface{}
}
@@ -556,10 +551,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
return nil
}
images := make([]api.ImageData, 0)
for _, i := range opts.Images {
images = append(images, api.ImageData(i))
}
request := api.GenerateRequest{
Model: opts.Model,
Prompt: opts.Prompt,
@@ -568,7 +559,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
System: opts.System,
Template: opts.Template,
Options: opts.Options,
Images: images,
}
if err := client.Generate(ctx, &request, fn); err != nil {
@@ -595,9 +585,7 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
latest.Summary()
}
ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
cmd.SetContext(ctx)
cmd.SetContext(context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context))
return nil
}
@@ -610,31 +598,11 @@ const (
MultilineTemplate
)
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
// get model details
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return false
}
req := api.ShowRequest{Name: name}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return false
}
return slices.Contains(resp.Details.Families, "clip")
}
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model)
// load the model
loadOpts := generateOptions{
Model: opts.Model,
Prompt: "",
Images: []ImageData{},
}
if err := generate(cmd, loadOpts); err != nil {
return err
@@ -654,7 +622,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set system <string> Set system prompt")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
@@ -672,7 +640,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show system Show system prompt")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
@@ -733,10 +701,9 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
// if the prompt so far starts with """ then we're in multiline mode
// and we need to keep reading until we find a line that ends with """
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
prompt += cut + "\n"
if !found {
prompt += "\n"
continue
}
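
The hunk above changes how the interactive prompt accumulates multiline input: each line between `"""` delimiters now gets its trailing newline appended unconditionally, and reading continues until a line ends with the closing delimiter. A small standalone sketch of that pattern follows; it is a hypothetical helper for illustration, not the actual cmd package code.

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

// readMultiline collects input until a line ends with the closing """
// delimiter, appending a newline after every line as the change above does.
func readMultiline(sc *bufio.Scanner) (string, bool) {
	var prompt string
	for sc.Scan() {
		cut, found := strings.CutSuffix(sc.Text(), `"""`)
		prompt += cut + "\n"
		if found {
			return prompt, true
		}
	}
	return prompt, false // input ended before the closing delimiter
}

func main() {
	input := "You are Mario from Super Mario Bros.\nAnswer as Mario, the assistant, only.\"\"\"\n"
	if text, ok := readMultiline(bufio.NewScanner(strings.NewReader(input))); ok {
		fmt.Print(text)
	}
}
```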
@@ -747,11 +714,11 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
case MultilineSystem:
opts.System = prompt
prompt = ""
fmt.Println("Set system message.")
fmt.Println("Set system template.")
case MultilineTemplate:
opts.Template = prompt
prompt = ""
fmt.Println("Set prompt template.")
fmt.Println("Set model template.")
}
multiline = MultilineNone
case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
@@ -822,18 +789,17 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
line = strings.TrimPrefix(line, `"""`)
if strings.HasPrefix(args[2], `"""`) {
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
prompt += cut + "\n"
if found {
opts.System = prompt
if args[1] == "system" {
opts.System = prompt
fmt.Println("Set system message.")
fmt.Println("Set system template.")
} else {
opts.Template = prompt
fmt.Println("Set prompt template.")
}
prompt = ""
} else {
prompt = `"""` + prompt + "\n"
prompt = `"""` + prompt
if args[1] == "system" {
multiline = MultilineSystem
} else {
@@ -843,7 +809,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
}
} else {
opts.System = line
fmt.Println("Set system message.")
fmt.Println("Set system template.")
}
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
@@ -895,7 +861,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Print("No system message was specified for this model.\n\n")
fmt.Print("No system prompt was specified for this model.\n\n")
}
case "template":
switch {
@@ -936,26 +902,6 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
if len(prompt) > 0 && multiline == MultilineNone {
opts.Prompt = prompt
if multiModal {
newPrompt, images, err := extractFileNames(prompt)
if err != nil {
return err
}
opts.Prompt = newPrompt
// reset the context if we find another image
if len(images) > 0 {
opts.Images = images
ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
cmd.SetContext(ctx)
}
if len(opts.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.\n")
prompt = ""
continue
}
}
if err := generate(cmd, opts); err != nil {
return err
}
@@ -965,57 +911,6 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
}
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
func extractFileNames(input string) (string, []ImageData, error) {
// Regex to match file paths starting with / or ./ and include escaped spaces (\ or %20)
// and followed by more characters and a file extension
regexPattern := `(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
filePaths := re.FindAllString(input, -1)
var imgs []ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
func RunServer(cmd *cobra.Command, _ []string) error {
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
if err != nil {
@@ -1042,50 +937,6 @@ func RunServer(cmd *cobra.Command, _ []string) error {
return server.Serve(ln, origins)
}
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB).")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, 0)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
}
func initializeKeypair() error {
home, err := os.UserHomeDir()
if err != nil {
@@ -1252,7 +1103,7 @@ func NewCLI() *cobra.Command {
showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
showCmd.Flags().Bool("template", false, "Show template of a model")
showCmd.Flags().Bool("system", false, "Show system message of a model")
showCmd.Flags().Bool("system", false, "Show system prompt of a model")
runCmd := &cobra.Command{
Use: "run MODEL [PROMPT]",

View File

@@ -3,7 +3,6 @@
## Endpoints
- [Generate a completion](#generate-a-completion)
- [Generate a chat completion](#generate-a-chat-completion)
- [Create a Model](#create-a-model)
- [List Local Models](#list-local-models)
- [Show Model Information](#show-model-information)
@@ -44,7 +43,7 @@ Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system message to (overrides what is defined in the `Modelfile`)
- `system`: system prompt to (overrides what is defined in the `Modelfile`)
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -253,7 +252,7 @@ curl http://localhost:11434/api/generate -d '{
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
"num_ctx": 1024,
"num_ctx": 4,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
@@ -268,7 +267,7 @@ curl http://localhost:11434/api/generate -d '{
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}
}'
```
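
The same request can be issued from Go with the repository's `api` client, whose `Generate` method is used by the CLI elsewhere in this diff. A minimal sketch; only a few of the options above are carried over, and error handling is kept short.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
)

func main() {
	// ClientFromEnvironment honors OLLAMA_HOST and falls back to the default
	// 127.0.0.1:11434 address used by the curl examples above.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama2",
		Prompt: "Why is the sky blue?",
		Options: map[string]interface{}{
			"num_ctx":     4096,
			"temperature": 0.8,
			"stop":        []string{"\n", "user:"},
		},
	}

	// The callback runs once per streamed response object.
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println()
}
```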
@@ -291,7 +290,7 @@ curl http://localhost:11434/api/generate -d '{
}
```
## Generate a chat completion
## Send Chat Messages (coming in 0.1.14)
```shell
POST /api/chat
@@ -548,7 +547,7 @@ A single JSON object will be returned.
POST /api/show
```
Show details about a model including modelfile, template, parameters, license, and system message.
Show details about a model including modelfile, template, parameters, license, and system prompt.
### Parameters
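
A matching Go call through the client used by the CLI in this changeset; `client.Show` and the `ShowRequest`/`ShowResponse` fields appear in the diffs above, and the model name is just an example.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	resp, err := client.Show(context.Background(), &api.ShowRequest{Name: "llama2"})
	if err != nil {
		log.Fatal(err)
	}

	// Fields as defined on ShowResponse in api/types.go in this changeset.
	fmt.Println("system prompt:", resp.System)
	fmt.Println("template:", resp.Template)
	fmt.Println("parameters:", resp.Parameters)
}
```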

View File

@@ -95,6 +95,10 @@ The manifest lists all the layers used in this model. You will see a `media type
To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
### I downloaded most of a model yesterday, but it's gone today. What happened?
When the Ollama server starts, it looks for fragments of models that still exist on the system and cleans them out. If you have an Internet connection that can't complete a model download all at once, this can be frustrating. Adding the OLLAMA_NOPRUNE environment variable will prevent the server from pruning incomplete files.
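
The server-side handling of `OLLAMA_NOPRUNE` is not shown in this diff; purely as an illustration, a startup guard along these lines (hypothetical function name) is one way such a variable could be honored.

```go
package main

import (
	"log"
	"os"
)

// pruneIncompleteBlobs stands in for the cleanup pass that runs at startup;
// it is a placeholder, not the repository's actual pruning code.
func pruneIncompleteBlobs() error {
	log.Println("pruning incomplete model downloads")
	return nil
}

func main() {
	// If OLLAMA_NOPRUNE is set, skip pruning so partially downloaded model
	// layers survive a server restart and the download can resume later.
	if os.Getenv("OLLAMA_NOPRUNE") != "" {
		log.Println("OLLAMA_NOPRUNE set; keeping incomplete downloads")
		return
	}
	if err := pruneIncompleteBlobs(); err != nil {
		log.Fatal(err)
	}
}
```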
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.

View File

@@ -30,14 +30,14 @@ The format of the `Modelfile`:
INSTRUCTION arguments
```
| Instruction | Description |
| ----------------------------------- | -------------------------------------------------------------- |
| [`FROM`](#from-required) (required) | Defines the base model to use. |
| [`PARAMETER`](#parameter) | Sets the parameters for how Ollama will run the model. |
| [`TEMPLATE`](#template) | The full prompt template to be sent to the model. |
| [`SYSTEM`](#system) | Specifies the system message that will be set in the template. |
| [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
| [`LICENSE`](#license) | Specifies the legal license. |
| Instruction | Description |
| ----------------------------------- | ------------------------------------------------------------- |
| [`FROM`](#from-required) (required) | Defines the base model to use. |
| [`PARAMETER`](#parameter) | Sets the parameters for how Ollama will run the model. |
| [`TEMPLATE`](#template) | The full prompt template to be sent to the model. |
| [`SYSTEM`](#system) | Specifies the system prompt that will be set in the template. |
| [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
| [`LICENSE`](#license) | Specifies the legal license. |
## Examples
@@ -52,7 +52,7 @@ PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
PARAMETER num_ctx 4096
# sets a custom system message to specify the behavior of the chat assistant
# sets a custom system prompt to specify the behavior of the chat assistant
SYSTEM You are Mario from super mario bros, acting as an assistant.
```
@@ -70,9 +70,9 @@ More examples are available in the [examples directory](../examples).
There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:
- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
3. Scroll down to "Layers"
1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` like so:
@@ -152,15 +152,15 @@ PARAMETER <parameter> <parametervalue>
### TEMPLATE
`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
#### Template Variables
| Variable | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------- |
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
| Variable | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------ |
| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
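
The `{{ .System }}`, `{{ .Prompt }}`, and `{{ .First }}` syntax matches Go's text/template package; a minimal rendering sketch with those variables is below. This is illustrative only and is not the repository's own template-rendering code.

```go
package main

import (
	"os"
	"text/template"
)

// promptVars mirrors the variables listed in the table above.
type promptVars struct {
	System string
	Prompt string
	First  bool
}

func main() {
	const tmpl = `{{ if .First }}### System:
{{ .System }}
{{ end }}### User:
{{ .Prompt }}
### Response:
`
	t := template.Must(template.New("prompt").Parse(tmpl))
	vars := promptVars{
		System: "You are Mario from super mario bros, acting as an assistant.",
		Prompt: "Why is the sky blue?",
		First:  true,
	}
	if err := t.Execute(os.Stdout, vars); err != nil {
		panic(err)
	}
}
```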
```modelfile
TEMPLATE """
@@ -180,7 +180,7 @@ SYSTEM """<system message>"""
### SYSTEM
The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.
The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.
```modelfile
SYSTEM """<system message>"""

llm/falcon.go (new file, 20 lines)
View File

@@ -0,0 +1,20 @@
package llm
const (
falconModelType7B = 32
falconModelType40B = 60
falconModelType180B = 80
)
func falconModelType(numLayer uint32) string {
switch numLayer {
case 32:
return "7B"
case 60:
return "40B"
case 80:
return "180B"
default:
return "unknown"
}
}

View File

@@ -93,8 +93,6 @@ func (c *containerGGML) Name() string {
}
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
// file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -117,10 +115,6 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -147,10 +141,6 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return &llama, nil
}
@@ -173,10 +163,6 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}

View File

@@ -120,6 +120,27 @@ func (llm *ggufModel) ModelType() string {
return format.HumanNumber(llm.parameters)
}
switch llm.ModelFamily() {
case "llama":
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
heads, headsOK := llm.kv["llama.head_count"].(uint32)
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
if headsOK && headsKVsOK && heads/headKVs == 8 {
return "70B"
}
return llamaModelType(blocks)
}
case "falcon":
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
return falconModelType(blocks)
}
case "starcoder":
if blocks, ok := llm.kv["starcoder.block_count"].(uint32); ok {
return starCoderModelType(blocks)
}
}
return "unknown"
}
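
The llama branch above reads the parameter size out of GGUF metadata: when `llama.head_count` divided by `llama.head_count_kv` is 8 (grouped-query attention with 8 KV groups), the model is treated as the 70B variant; otherwise the layer count in `llama.block_count` decides via `llamaModelType`, which is defined elsewhere in the repository. A condensed sketch of that decision against a plain key/value map, with an abbreviated block-count table standing in for the real lookup:

```go
package main

import "fmt"

// llamaSizeFromKV mirrors the heuristic in ggufModel.ModelType for the
// "llama" family: a grouped-query-attention ratio of 8 marks the 70B model,
// otherwise the layer (block) count decides.
func llamaSizeFromKV(kv map[string]uint32) string {
	heads, okH := kv["llama.head_count"]
	headKVs, okKV := kv["llama.head_count_kv"]
	if okH && okKV && headKVs != 0 && heads/headKVs == 8 {
		return "70B"
	}
	switch kv["llama.block_count"] {
	case 32:
		return "7B"
	case 40:
		return "13B"
	case 80:
		return "70B"
	default:
		return "unknown"
	}
}

func main() {
	// Llama 2 70B: 80 layers, 64 attention heads, 8 KV heads.
	fmt.Println(llamaSizeFromKV(map[string]uint32{
		"llama.block_count":   80,
		"llama.head_count":    64,
		"llama.head_count_kv": 8,
	}))
}
```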

View File

@@ -59,7 +59,6 @@ ws ::= ([ \t\n] ws)?
var llamaCppEmbed embed.FS
type ModelRunner struct {
Type string // "gguf" or "ggml"
Path string // path to the model runner executable
Accelerated bool
}
@@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
switch runtime.GOOS {
case "darwin":
if runtime.GOARCH == "arm64" {
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
} else {
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
}
case "linux":
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []ModelRunner{
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
}
}
@@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Type: r.Type,
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
@@ -223,14 +221,8 @@ type Running struct {
*StatusWriter // captures error messages from the llama runner process
}
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
}
type llama struct {
api.Options
ImageData []ImageData
Running
}
@@ -410,13 +402,11 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
params := append(params, "--port", strconv.Itoa(port))
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(
ctx,
runner.Path,
params...,
append(params, "--port", strconv.Itoa(port))...,
)
var libraryPaths []string
@@ -545,17 +535,17 @@ type prediction struct {
}
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 6
type PredictOpts struct {
Model string
Prompt string
Format string
Images []api.ImageData
CheckpointStart time.Time
CheckpointLoaded time.Time
}
type PredictResult struct {
Model string
CreatedAt time.Time
TotalDuration time.Duration
LoadDuration time.Duration
@@ -567,20 +557,7 @@ type PredictResult struct {
EvalDuration time.Duration
}
// IsRetryable checks if the line matches a condition that can be retried
func isRetryable(line []byte) bool {
return bytes.Contains(line, []byte("slot unavailable"))
}
func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
imageData := llm.ImageData
if len(predict.Images) > 0 {
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
log.Printf("loaded %d images", len(imageData))
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
@@ -602,78 +579,59 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
"penalize_nl": llm.PenalizeNewline,
"seed": llm.Seed,
"stop": llm.Stop,
"image_data": imageData,
}
if predict.Format == "json" {
request["grammar"] = jsonGrammar
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
if err != nil {
return fmt.Errorf("error creating POST request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("error creating POST request: %v", err)
return fmt.Errorf("failed reading llm error response: %w", err)
}
req.Header.Set("Content-Type", "application/json")
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed reading llm error response: %w", err)
scanner := bufio.NewScanner(resp.Body)
// increase the buffer size to avoid running out of space
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
}
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
scanner := bufio.NewScanner(resp.Body)
// increase the buffer size to avoid running out of space
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
retryNeeded := false
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
}
if isRetryable(line) {
retryNeeded = true
break
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
var p prediction
if err := json.Unmarshal(evt, &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
@@ -681,6 +639,7 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
if p.Content != "" {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
Content: p.Content,
})
@@ -688,6 +647,7 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
if p.Stop {
fn(PredictResult{
Model: predict.Model,
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
@@ -701,26 +661,21 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
}
}
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
// this means the llama runner subprocess crashed
llm.Close()
if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
}
return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
}
return fmt.Errorf("error reading llm response: %v", err)
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
// this means the llama runner subprocess crashed
llm.Close()
if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
}
return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
}
return fmt.Errorf("error reading llm response: %v", err)
}
return nil
}
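
The restructured Predict loop reads the llama.cpp runner's streamed `/completion` output line by line, keeps only lines carrying a `data: ` prefix, and unmarshals each payload. A standalone sketch of just that parsing step, run over a fixed string instead of a live HTTP body; the `prediction` struct here carries only the two fields the sketch uses.

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"strings"
)

type prediction struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
}

func main() {
	// Two events in the newline-delimited, "data: "-prefixed shape the
	// runner streams back.
	stream := "data: {\"content\":\"Hello\",\"stop\":false}\n" +
		"data: {\"content\":\"!\",\"stop\":true}\n"

	scanner := bufio.NewScanner(strings.NewReader(stream))
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 {
			continue
		}
		// Only lines prefixed with "data: " carry a prediction payload.
		evt, ok := bytes.CutPrefix(line, []byte("data: "))
		if !ok {
			continue
		}
		var p prediction
		if err := json.Unmarshal(evt, &p); err != nil {
			log.Fatalf("unmarshal prediction: %v", err)
		}
		fmt.Printf("content=%q stop=%v\n", p.Content, p.Stop)
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
```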
type TokenizeRequest struct {

llm/starcoder.go (new file, 23 lines)
View File

@@ -0,0 +1,23 @@
package llm
const (
starCoderModelType1B = 24
starCoderModelType3B = 36
starCoderModelType7B = 42
starCoderModelType15B = 40
)
func starCoderModelType(numLayer uint32) string {
switch numLayer {
case 24:
return "1B"
case 36:
return "3B"
case 42:
return "7B"
case 40:
return "15B"
default:
return "unknown"
}
}

View File

@@ -192,7 +192,14 @@ func (i *Instance) Readline() (string, error) {
case CharCtrlW:
buf.DeleteWord()
case CharCtrlZ:
return handleCharCtrlZ(fd, termios)
if err := UnsetRawMode(fd, termios); err != nil {
return "", err
}
syscall.Kill(0, syscall.SIGSTOP)
// on resume...
return "", nil
case CharEnter:
output := buf.String()
if output != "" {

View File

@@ -1,18 +0,0 @@
//go:build !windows
package readline
import (
"syscall"
)
func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
if err := UnsetRawMode(fd, termios); err != nil {
return "", err
}
syscall.Kill(0, syscall.SIGSTOP)
// on resume...
return "", nil
}

View File

@@ -1,6 +0,0 @@
package readline
func handleCharCtrlZ(fd int, state *State) (string, error) {
// not supported
return "", nil
}

View File

@@ -46,7 +46,6 @@ type Model struct {
System string
License []string
Digest string
Size int64
Options map[string]interface{}
}
@@ -66,7 +65,7 @@ func (m *Model) Prompt(p PromptVars) (string, error) {
}
if p.System == "" {
// use the default system message for this model if one is not specified
// use the default system prompt for this model if one is not specified
p.System = m.System
}
@@ -86,10 +85,9 @@ func (m *Model) Prompt(p PromptVars) (string, error) {
return prompt.String(), nil
}
func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error) {
func (m *Model) ChatPrompt(msgs []api.Message) (string, error) {
// build the prompt from the list of messages
var prompt strings.Builder
var currentImages []api.ImageData
currentVars := PromptVars{
First: true,
}
@@ -109,36 +107,35 @@ func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error)
case "system":
if currentVars.System != "" {
if err := writePrompt(); err != nil {
return "", nil, err
return "", err
}
}
currentVars.System = msg.Content
case "user":
if currentVars.Prompt != "" {
if err := writePrompt(); err != nil {
return "", nil, err
return "", err
}
}
currentVars.Prompt = msg.Content
currentImages = msg.Images
case "assistant":
currentVars.Response = msg.Content
if err := writePrompt(); err != nil {
return "", nil, err
return "", err
}
default:
return "", nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
}
}
// Append the last set of vars if they are non-empty
if currentVars.Prompt != "" || currentVars.System != "" {
if err := writePrompt(); err != nil {
return "", nil, err
return "", err
}
}
return prompt.String(), currentImages, nil
return prompt.String(), nil
}
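
For context, a sketch of how the new single-return `ChatPrompt` signature is consumed; the model value is assumed to come from `GetModel`, as it does in the route handlers, and a locally installed model is required for this to actually run.

```go
package main

import (
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/server"
)

func main() {
	model, err := server.GetModel("llama2")
	if err != nil {
		log.Fatal(err)
	}

	msgs := []api.Message{
		{Role: "system", Content: "You are a terse assistant."},
		{Role: "user", Content: "Why is the sky blue?"},
	}

	// ChatPrompt now returns only the flattened prompt and an error; the
	// image-data return value was removed in this change.
	prompt, err := model.ChatPrompt(msgs)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(prompt)
}
```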
type ManifestV2 struct {
@@ -149,16 +146,12 @@ type ManifestV2 struct {
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
ModelFamilies []string `json:"model_families"`
ModelType string `json:"model_type"`
FileType string `json:"file_type"`
// required by spec
Architecture string `json:"architecture"`
OS string `json:"os"`
RootFS RootFS `json:"rootfs"`
api.ModelConfiguration
}
func (c *ConfigV2) SetModelFormat(format string) {
@@ -245,7 +238,6 @@ func GetModel(name string) (*Model, error) {
Digest: digest,
Template: "{{ .Prompt }}",
License: []string{},
Size: manifest.GetTotalSize(),
}
filename, err := GetBlobsPath(manifest.Config.Digest)
@@ -549,7 +541,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
}
}
// xxx - can this be removed?
if config.ModelType == "65B" {
if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
config.ModelType = "70B"

View File

@@ -156,9 +156,9 @@ func GenerateHandler(c *gin.Context) {
defer loaded.mu.Unlock()
checkpointStart := time.Now()
var req api.GenerateRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -199,9 +199,10 @@ func GenerateHandler(c *gin.Context) {
// an empty request loads the model
if req.Prompt == "" && req.Template == "" && req.System == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
CreatedAt: time.Now().UTC(),
Model: req.Model,
Done: true})
CreatedAt: time.Now().UTC(),
Model: req.Model,
ModelConfiguration: model.Config.ModelConfiguration,
Done: true})
return
}
@@ -260,10 +261,11 @@ func GenerateHandler(c *gin.Context) {
}
resp := api.GenerateResponse{
Model: req.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Model: r.Model,
ModelConfiguration: model.Config.ModelConfiguration,
CreatedAt: r.CreatedAt,
Done: r.Done,
Response: r.Content,
Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
@@ -288,11 +290,11 @@ func GenerateHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
CheckpointLoaded: checkpointLoaded,
Images: req.Images,
}
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
ch <- gin.H{"error": err.Error()}
@@ -300,30 +302,19 @@ func GenerateHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Accumulate responses into the final response
var final api.GenerateResponse
// Wait for the channel to close
var r api.GenerateResponse
var sb strings.Builder
for resp := range ch {
switch r := resp.(type) {
case api.GenerateResponse:
sb.WriteString(r.Response)
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
var ok bool
if r, ok = resp.(api.GenerateResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
sb.WriteString(r.Response)
}
final.Response = sb.String()
c.JSON(http.StatusOK, final)
r.Response = sb.String()
c.JSON(http.StatusOK, r)
return
}
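
The non-streaming branch above drains the channel of streamed responses and concatenates their text into one `GenerateResponse`. The same accumulation pattern in isolation, over a channel of already-typed responses rather than the handler's untyped channel:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/jmorganca/ollama/api"
)

// accumulate collapses a stream of partial responses into a single one,
// keeping the metadata of the last (final) response it sees.
func accumulate(ch <-chan api.GenerateResponse) api.GenerateResponse {
	var last api.GenerateResponse
	var sb strings.Builder
	for resp := range ch {
		sb.WriteString(resp.Response)
		last = resp
	}
	last.Response = sb.String()
	return last
}

func main() {
	ch := make(chan api.GenerateResponse, 3)
	ch <- api.GenerateResponse{Response: "The sky "}
	ch <- api.GenerateResponse{Response: "is blue "}
	ch <- api.GenerateResponse{Response: "because of Rayleigh scattering.", Done: true}
	close(ch)

	final := accumulate(ch)
	fmt.Println(final.Response, final.Done)
}
```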
@@ -615,19 +606,10 @@ func GetModelInfo(name string) (*api.ShowResponse, error) {
return nil, err
}
modelDetails := api.ModelDetails{
Format: model.Config.ModelFormat,
Family: model.Config.ModelFamily,
Families: model.Config.ModelFamilies,
ParameterSize: model.Config.ModelType,
QuantizationLevel: model.Config.FileType,
}
resp := &api.ShowResponse{
License: strings.Join(model.License, "\n"),
System: model.System,
Template: model.Template,
Details: modelDetails,
}
mf, err := ShowModelfile(model)
@@ -677,42 +659,25 @@ func ListModelsHandler(c *gin.Context) {
return
}
modelResponse := func(modelName string) (api.ModelResponse, error) {
model, err := GetModel(modelName)
if err != nil {
return api.ModelResponse{}, err
}
modelDetails := api.ModelDetails{
Format: model.Config.ModelFormat,
Family: model.Config.ModelFamily,
Families: model.Config.ModelFamilies,
ParameterSize: model.Config.ModelType,
QuantizationLevel: model.Config.FileType,
}
return api.ModelResponse{
Name: model.ShortName,
Size: model.Size,
Digest: model.Digest,
Details: modelDetails,
}, nil
}
walkFunc := func(path string, info os.FileInfo, _ error) error {
if !info.IsDir() {
dir, file := filepath.Split(path)
dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
tag := strings.Join([]string{dir, file}, ":")
resp, err := modelResponse(tag)
mp := ParseModelPath(tag)
manifest, digest, err := GetManifest(mp)
if err != nil {
log.Printf("skipping file: %s", fp)
return nil
}
resp.ModifiedAt = info.ModTime()
models = append(models, resp)
models = append(models, api.ModelResponse{
Name: mp.GetShortTagname(),
Size: manifest.GetTotalSize(),
Digest: digest,
ModifiedAt: info.ModTime(),
})
}
return nil
@@ -887,7 +852,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Print(err.Error())
log.Printf(err.Error())
}
}
@@ -994,7 +959,7 @@ func ChatHandler(c *gin.Context) {
checkpointLoaded := time.Now()
prompt, images, err := model.ChatPrompt(req.Messages)
prompt, err := model.ChatPrompt(req.Messages)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
@@ -1011,7 +976,7 @@ func ChatHandler(c *gin.Context) {
loaded.expireTimer.Reset(sessionDuration)
resp := api.ChatResponse{
Model: req.Model,
Model: r.Model,
CreatedAt: r.CreatedAt,
Done: r.Done,
Metrics: api.Metrics{
@@ -1033,11 +998,11 @@ func ChatHandler(c *gin.Context) {
// Start prediction
predictReq := llm.PredictOpts{
Model: model.Name,
Prompt: prompt,
Format: req.Format,
CheckpointStart: checkpointStart,
CheckpointLoaded: checkpointLoaded,
Images: images,
}
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
ch <- gin.H{"error": err.Error()}
@@ -1045,33 +1010,21 @@ func ChatHandler(c *gin.Context) {
}()
if req.Stream != nil && !*req.Stream {
// Accumulate responses into the final response
var final api.ChatResponse
// Wait for the channel to close
var r api.ChatResponse
var sb strings.Builder
for resp := range ch {
switch r := resp.(type) {
case api.ChatResponse:
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
final = r
case gin.H:
if errorMsg, ok := r["error"].(string); ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
return
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
return
}
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
var ok bool
if r, ok = resp.(api.ChatResponse); !ok {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if r.Message != nil {
sb.WriteString(r.Message.Content)
}
}
final.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, final)
r.Message = &api.Message{Role: "assistant", Content: sb.String()}
c.JSON(http.StatusOK, r)
return
}