Compare commits: v0.1.24...jmorganca/
1 commit: a859f037da

.github/workflows/test.yaml (vendored): 4 lines changed
```diff
@@ -34,7 +34,7 @@ jobs:
       matrix:
         cuda-version:
           - '11.8.0'
-    runs-on: linux
+    runs-on: ubuntu-latest
     container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
     steps:
       - run: |
@@ -64,7 +64,7 @@ jobs:
         rocm-version:
           - '5.7.1'
           - '6.0'
-    runs-on: linux
+    runs-on: ubuntu-latest
     container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
     steps:
       - run: |
```
api/types.go: 85 lines changed

```diff
@@ -279,85 +279,20 @@ func (m *Metrics) Summary() {
-var ErrInvalidOpts = fmt.Errorf("invalid options")
-
 func (opts *Options) FromMap(m map[string]interface{}) error {
-	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
-	typeOpts := reflect.TypeOf(opts).Elem()   // types of the fields in the options struct
+	data, err := json.Marshal(m)
+	if err != nil {
+		return err
+	}
 
-	// build map of json struct tags to their types
-	jsonOpts := make(map[string]reflect.StructField)
-	for _, field := range reflect.VisibleFields(typeOpts) {
-		jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
-		if jsonTag != "" {
-			jsonOpts[jsonTag] = field
-		}
-	}
+	err = json.Unmarshal(data, opts)
+	if err != nil {
+		// Custom error handling
+		if jsonErr, ok := err.(*json.UnmarshalTypeError); ok {
+			return fmt.Errorf("invalid type for option '%v': expected %v, got %v", jsonErr.Field, jsonErr.Type, jsonErr.Value)
+		}
+		return err
+	}
 
-	invalidOpts := []string{}
-	for key, val := range m {
-		if opt, ok := jsonOpts[key]; ok {
-			field := valueOpts.FieldByName(opt.Name)
-			if field.IsValid() && field.CanSet() {
-				if val == nil {
-					continue
-				}
-
-				switch field.Kind() {
-				case reflect.Int:
-					switch t := val.(type) {
-					case int64:
-						field.SetInt(t)
-					case float64:
-						// when JSON unmarshals numbers, it uses float64, not int
-						field.SetInt(int64(t))
-					default:
-						return fmt.Errorf("option %q must be of type integer", key)
-					}
-				case reflect.Bool:
-					val, ok := val.(bool)
-					if !ok {
-						return fmt.Errorf("option %q must be of type boolean", key)
-					}
-					field.SetBool(val)
-				case reflect.Float32:
-					// JSON unmarshals to float64
-					val, ok := val.(float64)
-					if !ok {
-						return fmt.Errorf("option %q must be of type float32", key)
-					}
-					field.SetFloat(val)
-				case reflect.String:
-					val, ok := val.(string)
-					if !ok {
-						return fmt.Errorf("option %q must be of type string", key)
-					}
-					field.SetString(val)
-				case reflect.Slice:
-					// JSON unmarshals to []interface{}, not []string
-					val, ok := val.([]interface{})
-					if !ok {
-						return fmt.Errorf("option %q must be of type array", key)
-					}
-					// convert []interface{} to []string
-					slice := make([]string, len(val))
-					for i, item := range val {
-						str, ok := item.(string)
-						if !ok {
-							return fmt.Errorf("option %q must be of an array of strings", key)
-						}
-						slice[i] = str
-					}
-					field.Set(reflect.ValueOf(slice))
-				default:
-					return fmt.Errorf("unknown type loading config params: %v", field.Kind())
-				}
-			}
-		} else {
-			invalidOpts = append(invalidOpts, key)
-		}
-	}
-
-	if len(invalidOpts) > 0 {
-		return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
-	}
 	return nil
 }
```
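The right-hand side of this hunk swaps per-field reflection for a JSON round trip, turning `json.UnmarshalTypeError` into a readable message. A minimal standalone sketch of that approach, using a hypothetical two-field `Options` struct rather than the full `api.Options`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Options stands in for api.Options; only two fields for the demo.
type Options struct {
	NumCtx      int     `json:"num_ctx"`
	Temperature float32 `json:"temperature"`
}

// fromMap round-trips the map through JSON and converts type
// mismatches into a friendly per-option error, as in the hunk above.
func fromMap(opts *Options, m map[string]interface{}) error {
	data, err := json.Marshal(m)
	if err != nil {
		return err
	}
	if err := json.Unmarshal(data, opts); err != nil {
		if jsonErr, ok := err.(*json.UnmarshalTypeError); ok {
			return fmt.Errorf("invalid type for option '%v': expected %v, got %v",
				jsonErr.Field, jsonErr.Type, jsonErr.Value)
		}
		return err
	}
	return nil
}

func main() {
	var opts Options
	// A wrong value type surfaces as a readable message instead of a raw JSON error.
	err := fromMap(&opts, map[string]interface{}{"num_ctx": "not-a-number"})
	fmt.Println(err) // invalid type for option 'num_ctx': expected int, got string
}
```

One tradeoff visible in the hunk: the reflection version collected unknown keys into `ErrInvalidOpts`, while the round-trip version silently ignores them.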
docs/api.md:

```diff
@@ -49,8 +49,7 @@ Advanced parameters (optional):
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
-- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
 
 #### JSON mode
@@ -380,7 +379,6 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
 ### Examples
@@ -960,7 +958,6 @@ Generate embeddings from a model
 Advanced parameters:
 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
 ### Examples
```
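The documented `keep_alive` parameter accepts a duration string, with `"5m"` as the default. A minimal sketch of setting it on a `/api/generate` call, assuming a local server on the default port:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// keep_alive controls how long the model stays loaded after the request.
	body, _ := json.Marshal(map[string]interface{}{
		"model":      "llama2",
		"prompt":     "Why is the sky blue?",
		"stream":     false,
		"keep_alive": "10m",
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```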
docs/openai.md: 142 lines changed
@@ -1,142 +0,0 @@

# OpenAI compatibility

> **Note:** OpenAI compatibility is now part of the `main` branch and will be available in an upcoming release of Ollama.

> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).

Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.

## Usage

### OpenAI Python library

```python
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',

    # required but ignored
    api_key='ollama',
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            'role': 'user',
            'content': 'Say this is a test',
        }
    ],
    model='llama2',
)
```

### OpenAI JavaScript library

```javascript
import OpenAI from 'openai'

const openai = new OpenAI({
  baseURL: 'http://localhost:11434/v1/',

  // required but ignored
  apiKey: 'ollama',
})

const chatCompletion = await openai.chat.completions.create({
  messages: [{ role: 'user', content: 'Say this is a test' }],
  model: 'llama2',
})
```

### `curl`

```
curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "llama2",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "Hello!"
            }
        ]
    }'
```

## Endpoints

### `/v1/chat/completions`

#### Supported features

- [x] Chat completions
- [x] Streaming
- [x] JSON mode
- [x] Reproducible outputs
- [ ] Vision
- [ ] Function calling
- [ ] Logprobs

#### Supported request fields

- [x] `model`
- [x] `messages`
  - [x] Text `content`
  - [ ] Array of `content` parts
- [x] `frequency_penalty`
- [x] `presence_penalty`
- [x] `response_format`
- [x] `seed`
- [x] `stop`
- [x] `stream`
- [x] `temperature`
- [x] `top_p`
- [x] `max_tokens`
- [ ] `logit_bias`
- [ ] `tools`
- [ ] `tool_choice`
- [ ] `user`

#### Notes

- Setting `seed` will always set `temperature` to `0`
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached

## Models

Before using a model, pull it locally with `ollama pull`:

```shell
ollama pull llama2
```

### Default model names

For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:

```
ollama cp llama2 gpt-3.5-turbo
```

Afterwards, this new model name can be specified in the `model` field:

```shell
curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": "Hello!"
            }
        ]
    }'
```
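The file above shows Python, JavaScript, and curl clients. For completeness, a minimal Go sketch against the same endpoint, using only the standard library (no official Go client is assumed):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	payload, _ := json.Marshal(map[string]interface{}{
		"model": "llama2",
		"messages": []map[string]string{
			{"role": "user", "content": "Say this is a test"},
		},
	})

	// Same endpoint as the curl example; no API key is required locally.
	resp, err := http.Post("http://localhost:11434/v1/chat/completions", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}
```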
Submodule llm/llama.cpp updated: f57fadc009...d2f650cb5b

llm/llm.go: 14 lines changed
```diff
@@ -120,7 +120,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	return newLlmServer(info, workDir, model, adapters, projectors, opts)
+	return newLlmServer(info, model, adapters, projectors, opts)
 }
 
 // Give any native cgo implementations an opportunity to initialize
@@ -128,7 +128,7 @@ func Init(workdir string) error {
 	return nativeInit(workdir)
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
@@ -143,16 +143,6 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto
 		}
 	}
 
-	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
-	_, err := os.Stat(dynLibs[0])
-	if err != nil {
-		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
-		err = nativeInit(workDir)
-		if err != nil {
-			return nil, err
-		}
-	}
-
 	err2 := fmt.Errorf("unable to locate suitable llm library")
 	for _, dynLib := range dynLibs {
 		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
```
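The hunk above keeps the fallback loop that tries each candidate dynamic library until one loads. A toy sketch of that pattern, with `loadServer` as a hypothetical stand-in for `newDynExtServer`:

```go
package main

import (
	"errors"
	"fmt"
)

// loadServer pretends to load an LLM server backend from a dynamic library.
func loadServer(lib string) (string, error) {
	if lib == "cpu" {
		return "server(cpu)", nil
	}
	return "", errors.New("failed to load " + lib)
}

// pickServer tries each candidate in order and returns the first success,
// accumulating failures into one error as newLlmServer does with err2.
func pickServer(dynLibs []string) (string, error) {
	err := errors.New("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, loadErr := loadServer(dynLib)
		if loadErr == nil {
			return srv, nil
		}
		err = fmt.Errorf("%w: %v", err, loadErr)
	}
	return "", err
}

func main() {
	srv, err := pickServer([]string{"cuda", "rocm", "cpu"})
	fmt.Println(srv, err)
}
```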
llm/patches (llama.cpp server patch):

```diff
@@ -1,8 +1,8 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..7d71c766 100644
+index a48582ad..9fffffd8 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1598,12 +1598,6 @@ struct llama_server_context
+@@ -1564,12 +1564,6 @@ struct llama_server_context
              LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
          }
@@ -15,7 +15,7 @@ index d86d7e04..7d71c766 100644
          if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
          {
              // we have to evaluate at least 1 token to generate logits.
-@@ -1615,6 +1609,12 @@ struct llama_server_context
+@@ -1581,6 +1575,12 @@ struct llama_server_context
          }
      }
@@ -26,5 +26,5 @@ index d86d7e04..7d71c766 100644
 +    slot.cache_tokens = prompt_tokens;
 +
      LOG_VERBOSE("prompt ingested", {
         {"n_past", slot.n_past},
         {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
@@ -37,18 +37,26 @@ index 11dd82c3..311495a8 100644
 
      llama_backend_free();
 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 70cce072..9124869a 100644
+index 70cce072..2acb1eab 100644
 --- a/examples/server/utils.hpp
 +++ b/examples/server/utils.hpp
-@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
+@@ -6,6 +6,7 @@
+ #include <mutex>
+ #include <condition_variable>
+ #include <unordered_map>
++#include <atomic>
+
+ #include "json.hpp"
+
+@@ -190,6 +191,7 @@ inline std::string format_chatml(std::vector<json> messages)
  struct llama_server_queue {
      int id = 0;
      std::mutex mutex_tasks;
-+    bool running;
++    std::atomic<bool> running;
      // queues
      std::vector<task_server> queue_tasks;
      std::vector<task_server> queue_tasks_deferred;
-@@ -248,9 +249,18 @@ struct llama_server_queue {
+@@ -248,9 +250,15 @@ struct llama_server_queue {
      queue_tasks_deferred.clear();
  }
@@ -56,10 +64,7 @@ index 70cce072..9124869a 100644
 -    [[noreturn]]
 +    // end the start_loop routine
 +    void terminate() {
-+        {
-+            std::unique_lock<std::mutex> lock(mutex_tasks);
-+            running = false;
-+        }
++        running = false;
 +        condition_tasks.notify_all();
 +    }
 +
@@ -69,17 +74,17 @@ index 70cce072..9124869a 100644
      while (true) {
          // new task arrived
          LOG_VERBOSE("have new task", {});
-@@ -294,8 +304,12 @@ struct llama_server_queue {
+@@ -294,8 +302,12 @@ struct llama_server_queue {
      {
          std::unique_lock<std::mutex> lock(mutex_tasks);
          if (queue_tasks.empty()) {
-+            if (!running) {
++            if (!running.load()) {
 +                LOG_VERBOSE("ending start_loop", {});
 +                return;
 +            }
              condition_tasks.wait(lock, [&]{
 -                return !queue_tasks.empty();
-+                return (!queue_tasks.empty() || !running);
++                return (!queue_tasks.empty() || !running.load());
              });
          }
      }
```
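The patch makes the queue's `running` flag a `std::atomic<bool>` and adds a `terminate()` that wakes the wait loop so `start_loop` can return instead of blocking forever. A rough Go analogue of that shutdown pattern, illustrative only (all names here are hypothetical):

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// taskQueue mirrors the patched llama_server_queue: a mutex-guarded
// task list plus an atomic running flag checked by the wait loop.
type taskQueue struct {
	mu      sync.Mutex
	cond    *sync.Cond
	tasks   []string
	running atomic.Bool
}

func newTaskQueue() *taskQueue {
	q := &taskQueue{}
	q.cond = sync.NewCond(&q.mu)
	q.running.Store(true)
	return q
}

func (q *taskQueue) post(t string) {
	q.mu.Lock()
	q.tasks = append(q.tasks, t)
	q.mu.Unlock()
	q.cond.Signal()
}

// terminate ends startLoop, like the terminate() added by the patch.
func (q *taskQueue) terminate() {
	q.running.Store(false)
	q.cond.Broadcast()
}

// startLoop drains tasks and exits once terminated, matching the
// patched wait condition: !queue_tasks.empty() || !running.load().
func (q *taskQueue) startLoop() {
	for {
		q.mu.Lock()
		for len(q.tasks) == 0 && q.running.Load() {
			q.cond.Wait()
		}
		if len(q.tasks) == 0 && !q.running.Load() {
			q.mu.Unlock()
			return // ending start_loop
		}
		t := q.tasks[0]
		q.tasks = q.tasks[1:]
		q.mu.Unlock()
		fmt.Println("processing", t)
	}
}

func main() {
	q := newTaskQueue()
	done := make(chan struct{})
	go func() { q.startLoop(); close(done) }()

	q.post("tokenize")
	q.post("decode")
	q.terminate()
	<-done
}
```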
openai/openai.go: 322 lines changed

@@ -1,322 +0,0 @@

```go
// openai package provides middleware for partial compatibility with the OpenAI REST API
package openai

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/jmorganca/ollama/api"
)

type Error struct {
	Message string      `json:"message"`
	Type    string      `json:"type"`
	Param   interface{} `json:"param"`
	Code    *string     `json:"code"`
}

type ErrorResponse struct {
	Error Error `json:"error"`
}

type Message struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

type Choice struct {
	Index        int     `json:"index"`
	Message      Message `json:"message"`
	FinishReason *string `json:"finish_reason"`
}

type ChunkChoice struct {
	Index        int     `json:"index"`
	Delta        Message `json:"delta"`
	FinishReason *string `json:"finish_reason"`
}

type Usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

type ResponseFormat struct {
	Type string `json:"type"`
}

type ChatCompletionRequest struct {
	Model            string          `json:"model"`
	Messages         []Message       `json:"messages"`
	Stream           bool            `json:"stream"`
	MaxTokens        *int            `json:"max_tokens"`
	Seed             *int            `json:"seed"`
	Stop             any             `json:"stop"`
	Temperature      *float64        `json:"temperature"`
	FrequencyPenalty *float64        `json:"frequency_penalty"`
	PresencePenalty  *float64        `json:"presence_penalty_penalty"`
	TopP             *float64        `json:"top_p"`
	ResponseFormat   *ResponseFormat `json:"response_format"`
}

type ChatCompletion struct {
	Id                string   `json:"id"`
	Object            string   `json:"object"`
	Created           int64    `json:"created"`
	Model             string   `json:"model"`
	SystemFingerprint string   `json:"system_fingerprint"`
	Choices           []Choice `json:"choices"`
	Usage             Usage    `json:"usage,omitempty"`
}

type ChatCompletionChunk struct {
	Id                string        `json:"id"`
	Object            string        `json:"object"`
	Created           int64         `json:"created"`
	Model             string        `json:"model"`
	SystemFingerprint string        `json:"system_fingerprint"`
	Choices           []ChunkChoice `json:"choices"`
}

func NewError(code int, message string) ErrorResponse {
	var etype string
	switch code {
	case http.StatusBadRequest:
		etype = "invalid_request_error"
	case http.StatusNotFound:
		etype = "not_found_error"
	default:
		etype = "api_error"
	}

	return ErrorResponse{Error{Type: etype, Message: message}}
}

func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
	return ChatCompletion{
		Id:                id,
		Object:            "chat.completion",
		Created:           r.CreatedAt.Unix(),
		Model:             r.Model,
		SystemFingerprint: "fp_ollama",
		Choices: []Choice{{
			Index:   0,
			Message: Message{Role: r.Message.Role, Content: r.Message.Content},
			FinishReason: func(done bool) *string {
				if done {
					reason := "stop"
					return &reason
				}
				return nil
			}(r.Done),
		}},
		Usage: Usage{
			// TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count
			PromptTokens:     r.PromptEvalCount,
			CompletionTokens: r.EvalCount,
			TotalTokens:      r.PromptEvalCount + r.EvalCount,
		},
	}
}

func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
	return ChatCompletionChunk{
		Id:                id,
		Object:            "chat.completion.chunk",
		Created:           time.Now().Unix(),
		Model:             r.Model,
		SystemFingerprint: "fp_ollama",
		Choices: []ChunkChoice{
			{
				Index: 0,
				Delta: Message{Role: "assistant", Content: r.Message.Content},
				FinishReason: func(done bool) *string {
					if done {
						reason := "stop"
						return &reason
					}
					return nil
				}(r.Done),
			},
		},
	}
}

func fromRequest(r ChatCompletionRequest) api.ChatRequest {
	var messages []api.Message
	for _, msg := range r.Messages {
		messages = append(messages, api.Message{Role: msg.Role, Content: msg.Content})
	}

	options := make(map[string]interface{})

	switch stop := r.Stop.(type) {
	case string:
		options["stop"] = []string{stop}
	case []interface{}:
		var stops []string
		for _, s := range stop {
			if str, ok := s.(string); ok {
				stops = append(stops, str)
			}
		}
		options["stop"] = stops
	}

	if r.MaxTokens != nil {
		options["num_predict"] = *r.MaxTokens
	}

	if r.Temperature != nil {
		options["temperature"] = *r.Temperature * 2.0
	} else {
		options["temperature"] = 1.0
	}

	if r.Seed != nil {
		options["seed"] = *r.Seed

		// temperature=0 is required for reproducible outputs
		options["temperature"] = 0.0
	}

	if r.FrequencyPenalty != nil {
		options["frequency_penalty"] = *r.FrequencyPenalty * 2.0
	}

	if r.PresencePenalty != nil {
		options["presence_penalty"] = *r.PresencePenalty * 2.0
	}

	if r.TopP != nil {
		options["top_p"] = *r.TopP
	} else {
		options["top_p"] = 1.0
	}

	var format string
	if r.ResponseFormat != nil && r.ResponseFormat.Type == "json_object" {
		format = "json"
	}

	return api.ChatRequest{
		Model:    r.Model,
		Messages: messages,
		Format:   format,
		Options:  options,
		Stream:   &r.Stream,
	}
}

type writer struct {
	stream bool
	id     string
	gin.ResponseWriter
}

func (w *writer) writeError(code int, data []byte) (int, error) {
	var serr api.StatusError
	err := json.Unmarshal(data, &serr)
	if err != nil {
		return 0, err
	}

	w.ResponseWriter.Header().Set("Content-Type", "application/json")
	err = json.NewEncoder(w.ResponseWriter).Encode(NewError(http.StatusInternalServerError, serr.Error()))
	if err != nil {
		return 0, err
	}

	return len(data), nil
}

func (w *writer) writeResponse(data []byte) (int, error) {
	var chatResponse api.ChatResponse
	err := json.Unmarshal(data, &chatResponse)
	if err != nil {
		return 0, err
	}

	// chat chunk
	if w.stream {
		d, err := json.Marshal(toChunk(w.id, chatResponse))
		if err != nil {
			return 0, err
		}

		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
		_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
		if err != nil {
			return 0, err
		}

		if chatResponse.Done {
			_, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
			if err != nil {
				return 0, err
			}
		}

		return len(data), nil
	}

	// chat completion
	w.ResponseWriter.Header().Set("Content-Type", "application/json")
	err = json.NewEncoder(w.ResponseWriter).Encode(toChatCompletion(w.id, chatResponse))
	if err != nil {
		return 0, err
	}

	return len(data), nil
}

func (w *writer) Write(data []byte) (int, error) {
	code := w.ResponseWriter.Status()
	if code != http.StatusOK {
		return w.writeError(code, data)
	}

	return w.writeResponse(data)
}

func Middleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		var req ChatCompletionRequest
		err := c.ShouldBindJSON(&req)
		if err != nil {
			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
			return
		}

		if len(req.Messages) == 0 {
			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "[] is too short - 'messages'"))
			return
		}

		var b bytes.Buffer
		if err := json.NewEncoder(&b).Encode(fromRequest(req)); err != nil {
			c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
			return
		}

		c.Request.Body = io.NopCloser(&b)

		w := &writer{
			ResponseWriter: c.Writer,
			stream:         req.Stream,
			id:             fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
		}

		c.Writer = w

		c.Next()
	}
}
```
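The routes hunk further down registers this middleware on `/v1/chat/completions`. As a self-contained sketch of the same wiring, where the inline handler is a hypothetical stand-in for ollama's `ChatHandler`:

```go
package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
	"github.com/jmorganca/ollama/openai"
)

func main() {
	r := gin.Default()

	// Middleware translates the OpenAI-style request body into an
	// ollama ChatRequest and wraps the writer to translate responses back.
	r.POST("/v1/chat/completions", openai.Middleware(), func(c *gin.Context) {
		// stand-in for ChatHandler, which would stream api.ChatResponse JSON
		c.JSON(http.StatusOK, gin.H{"status": "ok"})
	})

	r.Run(":11434")
}
```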
server/*.go:

```diff
@@ -111,14 +111,8 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
 	defer resp.Body.Close()
 
 	if resp.StatusCode >= http.StatusBadRequest {
-		responseBody, err := io.ReadAll(resp.Body)
-		if err != nil {
-			return "", fmt.Errorf("%d: %v", resp.StatusCode, err)
-		} else if len(responseBody) > 0 {
-			return "", fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
-		}
-
-		return "", fmt.Errorf("%s", resp.Status)
+		body, _ := io.ReadAll(resp.Body)
+		return "", fmt.Errorf("on pull registry responded with code %d: %s", resp.StatusCode, body)
 	}
 
 	respBody, err := io.ReadAll(resp.Body)
@@ -181,19 +181,16 @@ func (m *Model) ChatPrompts(msgs []api.Message) (*ChatHistory, error) {
 			}
 
 			currentVars.Prompt = msg.Content
-
-			if len(m.ProjectorPaths) > 0 {
-				for i := range msg.Images {
-					id := len(images) + i
-					currentVars.Prompt += fmt.Sprintf(" [img-%d]", id)
-					currentVars.Images = append(currentVars.Images, llm.ImageData{
-						ID:   id,
-						Data: msg.Images[i],
-					})
-				}
-
-				images = append(images, currentVars.Images...)
-			}
+			for i := range msg.Images {
+				id := len(images) + i
+				currentVars.Prompt += fmt.Sprintf(" [img-%d]", id)
+				currentVars.Images = append(currentVars.Images, llm.ImageData{
+					ID:   id,
+					Data: msg.Images[i],
+				})
+			}
+
+			images = append(images, currentVars.Images...)
 		case "assistant":
 			currentVars.Response = msg.Content
 			prompts = append(prompts, currentVars)
@@ -26,7 +26,6 @@ import (
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
 	"github.com/jmorganca/ollama/llm"
-	"github.com/jmorganca/ollama/openai"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/version"
 )
@@ -179,11 +178,7 @@ func GenerateHandler(c *gin.Context) {
 
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
@@ -397,11 +392,7 @@ func EmbeddingHandler(c *gin.Context) {
 
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
@@ -936,9 +927,6 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/blobs/:digest", CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", HeadBlobHandler)
 
-	// Compatibility endpoints
-	r.POST("/v1/chat/completions", openai.Middleware(), ChatHandler)
-
 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
@@ -1116,11 +1104,7 @@ func ChatHandler(c *gin.Context) {
 
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
@@ -1136,6 +1120,18 @@ func ChatHandler(c *gin.Context) {
 		return
 	}
 
+	// an empty request loads the model
+	if len(req.Messages) == 0 {
+		resp := api.ChatResponse{
+			CreatedAt: time.Now().UTC(),
+			Model:     req.Model,
+			Done:      true,
+			Message:   api.Message{Role: "assistant"},
+		}
+		c.JSON(http.StatusOK, resp)
+		return
+	}
+
 	checkpointLoaded := time.Now()
 
 	chat, err := model.ChatPrompts(req.Messages)
@@ -1150,18 +1146,6 @@ func ChatHandler(c *gin.Context) {
 		return
 	}
 
-	// an empty request loads the model
-	if len(prompt) == 0 {
-		resp := api.ChatResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true,
-			Message:   api.Message{Role: "assistant"},
-		}
-		c.JSON(http.StatusOK, resp)
-		return
-	}
-
 	slog.Debug("chat handler", "prompt", prompt)
 
 	ch := make(chan any)
```
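Per the relocated block above, a `/api/chat` request with an empty `messages` list simply loads the model and returns a done response. A quick sketch of exercising that against a local server (model name is illustrative):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// An empty messages list just loads the model, per the handler change above.
	body := []byte(`{"model": "llama2", "messages": []}`)
	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // expect done: true with an empty assistant message
}
```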