Compare commits: v0.1.24...jmorganca/
1 commit: a859f037da

.github/workflows/test.yaml (vendored): 4 lines changed
```diff
@@ -34,7 +34,7 @@ jobs:
       matrix:
         cuda-version:
           - '11.8.0'
-    runs-on: linux
+    runs-on: ubuntu-latest
     container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
     steps:
       - run: |
@@ -64,7 +64,7 @@ jobs:
         rocm-version:
           - '5.7.1'
           - '6.0'
-    runs-on: linux
+    runs-on: ubuntu-latest
     container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
     steps:
       - run: |
```
api/types.go: 85 lines changed

```diff
@@ -279,85 +279,20 @@ func (m *Metrics) Summary() {
-var ErrInvalidOpts = fmt.Errorf("invalid options")
-
 func (opts *Options) FromMap(m map[string]interface{}) error {
-	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
-	typeOpts := reflect.TypeOf(opts).Elem()   // types of the fields in the options struct
+	data, err := json.Marshal(m)
+	if err != nil {
+		return err
+	}
 
-	// build map of json struct tags to their types
-	jsonOpts := make(map[string]reflect.StructField)
-	for _, field := range reflect.VisibleFields(typeOpts) {
-		jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
-		if jsonTag != "" {
-			jsonOpts[jsonTag] = field
-		}
-	}
+	err = json.Unmarshal(data, opts)
+	if err != nil {
+		// Custom error handling
+		if jsonErr, ok := err.(*json.UnmarshalTypeError); ok {
+			return fmt.Errorf("invalid type for option '%v': expected %v, got %v", jsonErr.Field, jsonErr.Type, jsonErr.Value)
+		}
+		return err
+	}
 
-	invalidOpts := []string{}
-	for key, val := range m {
-		if opt, ok := jsonOpts[key]; ok {
-			field := valueOpts.FieldByName(opt.Name)
-			if field.IsValid() && field.CanSet() {
-				if val == nil {
-					continue
-				}
-
-				switch field.Kind() {
-				case reflect.Int:
-					switch t := val.(type) {
-					case int64:
-						field.SetInt(t)
-					case float64:
-						// when JSON unmarshals numbers, it uses float64, not int
-						field.SetInt(int64(t))
-					default:
-						return fmt.Errorf("option %q must be of type integer", key)
-					}
-				case reflect.Bool:
-					val, ok := val.(bool)
-					if !ok {
-						return fmt.Errorf("option %q must be of type boolean", key)
-					}
-					field.SetBool(val)
-				case reflect.Float32:
-					// JSON unmarshals to float64
-					val, ok := val.(float64)
-					if !ok {
-						return fmt.Errorf("option %q must be of type float32", key)
-					}
-					field.SetFloat(val)
-				case reflect.String:
-					val, ok := val.(string)
-					if !ok {
-						return fmt.Errorf("option %q must be of type string", key)
-					}
-					field.SetString(val)
-				case reflect.Slice:
-					// JSON unmarshals to []interface{}, not []string
-					val, ok := val.([]interface{})
-					if !ok {
-						return fmt.Errorf("option %q must be of type array", key)
-					}
-					// convert []interface{} to []string
-					slice := make([]string, len(val))
-					for i, item := range val {
-						str, ok := item.(string)
-						if !ok {
-							return fmt.Errorf("option %q must be of an array of strings", key)
-						}
-						slice[i] = str
-					}
-					field.Set(reflect.ValueOf(slice))
-				default:
-					return fmt.Errorf("unknown type loading config params: %v", field.Kind())
-				}
-			}
-		} else {
-			invalidOpts = append(invalidOpts, key)
-		}
-	}
-
-	if len(invalidOpts) > 0 {
-		return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
-	}
 	return nil
 }
```
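The right-hand side of this hunk swaps per-field reflection for a JSON round trip, turning `json.UnmarshalTypeError` into a readable message. A minimal standalone sketch of that approach, using a hypothetical two-field `Options` struct rather than the full `api.Options`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Options stands in for api.Options; only two fields for the demo.
type Options struct {
	NumCtx      int     `json:"num_ctx"`
	Temperature float32 `json:"temperature"`
}

// fromMap round-trips the map through JSON and converts type
// mismatches into a friendly per-option error, as in the hunk above.
func fromMap(opts *Options, m map[string]interface{}) error {
	data, err := json.Marshal(m)
	if err != nil {
		return err
	}
	if err := json.Unmarshal(data, opts); err != nil {
		if jsonErr, ok := err.(*json.UnmarshalTypeError); ok {
			return fmt.Errorf("invalid type for option '%v': expected %v, got %v",
				jsonErr.Field, jsonErr.Type, jsonErr.Value)
		}
		return err
	}
	return nil
}

func main() {
	var opts Options
	// A wrong value type surfaces as a readable message instead of a raw JSON error.
	err := fromMap(&opts, map[string]interface{}{"num_ctx": "not-a-number"})
	fmt.Println(err) // invalid type for option 'num_ctx': expected int, got string
}
```

One tradeoff visible in the hunk: the reflection version collected unknown keys into `ErrInvalidOpts`, while the round-trip version silently ignores them.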
docs/api.md:

```diff
@@ -49,8 +49,7 @@ Advanced parameters (optional):
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
-- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
 
 #### JSON mode
@@ -380,7 +379,6 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
 ### Examples
@@ -960,7 +958,6 @@ Generate embeddings from a model
 Advanced parameters:
 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
 ### Examples
```
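The documented `keep_alive` parameter accepts a duration string, with `"5m"` as the default. A minimal sketch of setting it on a `/api/generate` call, assuming a local server on the default port:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// keep_alive controls how long the model stays loaded after the request.
	body, _ := json.Marshal(map[string]interface{}{
		"model":      "llama2",
		"prompt":     "Why is the sky blue?",
		"stream":     false,
		"keep_alive": "10m",
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```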
docs/openai.md: 142 lines changed
@@ -1,142 +0,0 @@

# OpenAI compatibility

> **Note:** OpenAI compatibility is now part of the `main` branch and will be available in an upcoming release of Ollama.

> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).

Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.

## Usage

### OpenAI Python library

```python
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',

    # required but ignored
    api_key='ollama',
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            'role': 'user',
            'content': 'Say this is a test',
        }
    ],
    model='llama2',
)
```

### OpenAI JavaScript library

```javascript
import OpenAI from 'openai'

const openai = new OpenAI({
  baseURL: 'http://localhost:11434/v1/',

  // required but ignored
  apiKey: 'ollama',
})

const chatCompletion = await openai.chat.completions.create({
  messages: [{ role: 'user', content: 'Say this is a test' }],
  model: 'llama2',
})
```

### `curl`

```
curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "llama2",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "Hello!"
            }
        ]
    }'
```

## Endpoints

### `/v1/chat/completions`

#### Supported features

- [x] Chat completions
- [x] Streaming
- [x] JSON mode
- [x] Reproducible outputs
- [ ] Vision
- [ ] Function calling
- [ ] Logprobs

#### Supported request fields

- [x] `model`
- [x] `messages`
  - [x] Text `content`
  - [ ] Array of `content` parts
- [x] `frequency_penalty`
- [x] `presence_penalty`
- [x] `response_format`
- [x] `seed`
- [x] `stop`
- [x] `stream`
- [x] `temperature`
- [x] `top_p`
- [x] `max_tokens`
- [ ] `logit_bias`
- [ ] `tools`
- [ ] `tool_choice`
- [ ] `user`

#### Notes

- Setting `seed` will always set `temperature` to `0`
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached

## Models

Before using a model, pull it locally with `ollama pull`:

```shell
ollama pull llama2
```

### Default model names

For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:

```
ollama cp llama2 gpt-3.5-turbo
```

Afterwards, this new model name can be specified in the `model` field:

```shell
curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": "Hello!"
            }
        ]
    }'
```
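The file above shows Python, JavaScript, and curl clients. For completeness, a minimal Go sketch against the same endpoint, using only the standard library (no official Go client is assumed):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	payload, _ := json.Marshal(map[string]interface{}{
		"model": "llama2",
		"messages": []map[string]string{
			{"role": "user", "content": "Say this is a test"},
		},
	})

	// Same endpoint as the curl example; no API key is required locally.
	resp, err := http.Post("http://localhost:11434/v1/chat/completions", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}
```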
Submodule llm/llama.cpp updated: f57fadc009...d2f650cb5b

llm/llm.go: 14 lines changed
```diff
@@ -120,7 +120,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	return newLlmServer(info, workDir, model, adapters, projectors, opts)
+	return newLlmServer(info, model, adapters, projectors, opts)
 }
 
 // Give any native cgo implementations an opportunity to initialize
@@ -128,7 +128,7 @@ func Init(workdir string) error {
 	return nativeInit(workdir)
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
@@ -143,16 +143,6 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto
 		}
 	}
 
-	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
-	_, err := os.Stat(dynLibs[0])
-	if err != nil {
-		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
-		err = nativeInit(workDir)
-		if err != nil {
-			return nil, err
-		}
-	}
-
 	err2 := fmt.Errorf("unable to locate suitable llm library")
 	for _, dynLib := range dynLibs {
 		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
```
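The hunk above keeps the fallback loop that tries each candidate dynamic library until one loads. A toy sketch of that pattern, with `loadServer` as a hypothetical stand-in for `newDynExtServer`:

```go
package main

import (
	"errors"
	"fmt"
)

// loadServer pretends to load an LLM server backend from a dynamic library.
func loadServer(lib string) (string, error) {
	if lib == "cpu" {
		return "server(cpu)", nil
	}
	return "", errors.New("failed to load " + lib)
}

// pickServer tries each candidate in order and returns the first success,
// accumulating failures into one error as newLlmServer does with err2.
func pickServer(dynLibs []string) (string, error) {
	err := errors.New("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, loadErr := loadServer(dynLib)
		if loadErr == nil {
			return srv, nil
		}
		err = fmt.Errorf("%w: %v", err, loadErr)
	}
	return "", err
}

func main() {
	srv, err := pickServer([]string{"cuda", "rocm", "cpu"})
	fmt.Println(srv, err)
}
```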
llm/patches (llama.cpp server patch):

```diff
@@ -1,8 +1,8 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..7d71c766 100644
+index a48582ad..9fffffd8 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1598,12 +1598,6 @@ struct llama_server_context
+@@ -1564,12 +1564,6 @@ struct llama_server_context
              LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
          }
@@ -15,7 +15,7 @@ index d86d7e04..7d71c766 100644
          if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
          {
              // we have to evaluate at least 1 token to generate logits.
-@@ -1615,6 +1609,12 @@ struct llama_server_context
+@@ -1581,6 +1575,12 @@ struct llama_server_context
          }
      }
@@ -26,5 +26,5 @@ index d86d7e04..7d71c766 100644
 +    slot.cache_tokens = prompt_tokens;
 +
      LOG_VERBOSE("prompt ingested", {
         {"n_past", slot.n_past},
         {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
@@ -37,18 +37,26 @@ index 11dd82c3..311495a8 100644
 
      llama_backend_free();
 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 70cce072..9124869a 100644
+index 70cce072..2acb1eab 100644
 --- a/examples/server/utils.hpp
 +++ b/examples/server/utils.hpp
-@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
+@@ -6,6 +6,7 @@
+ #include <mutex>
+ #include <condition_variable>
+ #include <unordered_map>
++#include <atomic>
+
+ #include "json.hpp"
+
+@@ -190,6 +191,7 @@ inline std::string format_chatml(std::vector<json> messages)
  struct llama_server_queue {
      int id = 0;
      std::mutex mutex_tasks;
-+    bool running;
++    std::atomic<bool> running;
      // queues
      std::vector<task_server> queue_tasks;
      std::vector<task_server> queue_tasks_deferred;
-@@ -248,9 +249,18 @@ struct llama_server_queue {
+@@ -248,9 +250,15 @@ struct llama_server_queue {
      queue_tasks_deferred.clear();
  }
@@ -56,10 +64,7 @@ index 70cce072..9124869a 100644
 -    [[noreturn]]
 +    // end the start_loop routine
 +    void terminate() {
-+        {
-+            std::unique_lock<std::mutex> lock(mutex_tasks);
-+            running = false;
-+        }
++        running = false;
 +        condition_tasks.notify_all();
 +    }
 +
@@ -69,17 +74,17 @@ index 70cce072..9124869a 100644
      while (true) {
          // new task arrived
          LOG_VERBOSE("have new task", {});
-@@ -294,8 +304,12 @@ struct llama_server_queue {
+@@ -294,8 +302,12 @@ struct llama_server_queue {
      {
          std::unique_lock<std::mutex> lock(mutex_tasks);
          if (queue_tasks.empty()) {
-+            if (!running) {
++            if (!running.load()) {
 +                LOG_VERBOSE("ending start_loop", {});
 +                return;
 +            }
              condition_tasks.wait(lock, [&]{
 -                return !queue_tasks.empty();
-+                return (!queue_tasks.empty() || !running);
++                return (!queue_tasks.empty() || !running.load());
              });
          }
      }
```
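The patch makes the queue's `running` flag a `std::atomic<bool>` and adds a `terminate()` that wakes the wait loop so `start_loop` can return instead of blocking forever. A rough Go analogue of that shutdown pattern, illustrative only (all names here are hypothetical):

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// taskQueue mirrors the patched llama_server_queue: a mutex-guarded
// task list plus an atomic running flag checked by the wait loop.
type taskQueue struct {
	mu      sync.Mutex
	cond    *sync.Cond
	tasks   []string
	running atomic.Bool
}

func newTaskQueue() *taskQueue {
	q := &taskQueue{}
	q.cond = sync.NewCond(&q.mu)
	q.running.Store(true)
	return q
}

func (q *taskQueue) post(t string) {
	q.mu.Lock()
	q.tasks = append(q.tasks, t)
	q.mu.Unlock()
	q.cond.Signal()
}

// terminate ends startLoop, like the terminate() added by the patch.
func (q *taskQueue) terminate() {
	q.running.Store(false)
	q.cond.Broadcast()
}

// startLoop drains tasks and exits once terminated, matching the
// patched wait condition: !queue_tasks.empty() || !running.load().
func (q *taskQueue) startLoop() {
	for {
		q.mu.Lock()
		for len(q.tasks) == 0 && q.running.Load() {
			q.cond.Wait()
		}
		if len(q.tasks) == 0 && !q.running.Load() {
			q.mu.Unlock()
			return // ending start_loop
		}
		t := q.tasks[0]
		q.tasks = q.tasks[1:]
		q.mu.Unlock()
		fmt.Println("processing", t)
	}
}

func main() {
	q := newTaskQueue()
	done := make(chan struct{})
	go func() { q.startLoop(); close(done) }()

	q.post("tokenize")
	q.post("decode")
	q.terminate()
	<-done
}
```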
openai/openai.go: 322 lines changed

@@ -1,322 +0,0 @@

```go
// openai package provides middleware for partial compatibility with the OpenAI REST API
package openai

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/jmorganca/ollama/api"
)

type Error struct {
	Message string      `json:"message"`
	Type    string      `json:"type"`
	Param   interface{} `json:"param"`
	Code    *string     `json:"code"`
}

type ErrorResponse struct {
	Error Error `json:"error"`
}

type Message struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

type Choice struct {
	Index        int     `json:"index"`
	Message      Message `json:"message"`
	FinishReason *string `json:"finish_reason"`
}

type ChunkChoice struct {
	Index        int     `json:"index"`
	Delta        Message `json:"delta"`
	FinishReason *string `json:"finish_reason"`
}

type Usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

type ResponseFormat struct {
	Type string `json:"type"`
}

type ChatCompletionRequest struct {
	Model            string          `json:"model"`
	Messages         []Message       `json:"messages"`
	Stream           bool            `json:"stream"`
	MaxTokens        *int            `json:"max_tokens"`
	Seed             *int            `json:"seed"`
	Stop             any             `json:"stop"`
	Temperature      *float64        `json:"temperature"`
	FrequencyPenalty *float64        `json:"frequency_penalty"`
	PresencePenalty  *float64        `json:"presence_penalty_penalty"`
	TopP             *float64        `json:"top_p"`
	ResponseFormat   *ResponseFormat `json:"response_format"`
}

type ChatCompletion struct {
	Id                string   `json:"id"`
	Object            string   `json:"object"`
	Created           int64    `json:"created"`
	Model             string   `json:"model"`
	SystemFingerprint string   `json:"system_fingerprint"`
	Choices           []Choice `json:"choices"`
	Usage             Usage    `json:"usage,omitempty"`
}

type ChatCompletionChunk struct {
	Id                string        `json:"id"`
	Object            string        `json:"object"`
	Created           int64         `json:"created"`
	Model             string        `json:"model"`
	SystemFingerprint string        `json:"system_fingerprint"`
	Choices           []ChunkChoice `json:"choices"`
}

func NewError(code int, message string) ErrorResponse {
	var etype string
	switch code {
	case http.StatusBadRequest:
		etype = "invalid_request_error"
	case http.StatusNotFound:
		etype = "not_found_error"
	default:
		etype = "api_error"
	}

	return ErrorResponse{Error{Type: etype, Message: message}}
}

func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
	return ChatCompletion{
		Id:                id,
		Object:            "chat.completion",
		Created:           r.CreatedAt.Unix(),
		Model:             r.Model,
		SystemFingerprint: "fp_ollama",
		Choices: []Choice{{
			Index:   0,
			Message: Message{Role: r.Message.Role, Content: r.Message.Content},
			FinishReason: func(done bool) *string {
				if done {
					reason := "stop"
					return &reason
				}
				return nil
			}(r.Done),
		}},
		Usage: Usage{
			// TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count
			PromptTokens:     r.PromptEvalCount,
			CompletionTokens: r.EvalCount,
			TotalTokens:      r.PromptEvalCount + r.EvalCount,
		},
	}
}

func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
	return ChatCompletionChunk{
		Id:                id,
		Object:            "chat.completion.chunk",
		Created:           time.Now().Unix(),
		Model:             r.Model,
		SystemFingerprint: "fp_ollama",
		Choices: []ChunkChoice{
			{
				Index: 0,
				Delta: Message{Role: "assistant", Content: r.Message.Content},
				FinishReason: func(done bool) *string {
					if done {
						reason := "stop"
						return &reason
					}
					return nil
				}(r.Done),
			},
		},
	}
}

func fromRequest(r ChatCompletionRequest) api.ChatRequest {
	var messages []api.Message
	for _, msg := range r.Messages {
		messages = append(messages, api.Message{Role: msg.Role, Content: msg.Content})
	}

	options := make(map[string]interface{})

	switch stop := r.Stop.(type) {
	case string:
		options["stop"] = []string{stop}
	case []interface{}:
		var stops []string
		for _, s := range stop {
			if str, ok := s.(string); ok {
				stops = append(stops, str)
			}
		}
		options["stop"] = stops
	}

	if r.MaxTokens != nil {
		options["num_predict"] = *r.MaxTokens
	}

	if r.Temperature != nil {
		options["temperature"] = *r.Temperature * 2.0
	} else {
		options["temperature"] = 1.0
	}

	if r.Seed != nil {
		options["seed"] = *r.Seed

		// temperature=0 is required for reproducible outputs
		options["temperature"] = 0.0
	}

	if r.FrequencyPenalty != nil {
		options["frequency_penalty"] = *r.FrequencyPenalty * 2.0
	}

	if r.PresencePenalty != nil {
		options["presence_penalty"] = *r.PresencePenalty * 2.0
	}

	if r.TopP != nil {
		options["top_p"] = *r.TopP
	} else {
		options["top_p"] = 1.0
	}

	var format string
	if r.ResponseFormat != nil && r.ResponseFormat.Type == "json_object" {
		format = "json"
	}

	return api.ChatRequest{
		Model:    r.Model,
		Messages: messages,
		Format:   format,
		Options:  options,
		Stream:   &r.Stream,
	}
}

type writer struct {
	stream bool
	id     string
	gin.ResponseWriter
}

func (w *writer) writeError(code int, data []byte) (int, error) {
	var serr api.StatusError
	err := json.Unmarshal(data, &serr)
	if err != nil {
		return 0, err
	}

	w.ResponseWriter.Header().Set("Content-Type", "application/json")
	err = json.NewEncoder(w.ResponseWriter).Encode(NewError(http.StatusInternalServerError, serr.Error()))
	if err != nil {
		return 0, err
	}

	return len(data), nil
}

func (w *writer) writeResponse(data []byte) (int, error) {
	var chatResponse api.ChatResponse
	err := json.Unmarshal(data, &chatResponse)
	if err != nil {
		return 0, err
	}

	// chat chunk
	if w.stream {
		d, err := json.Marshal(toChunk(w.id, chatResponse))
		if err != nil {
			return 0, err
		}

		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
		_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
		if err != nil {
			return 0, err
		}

		if chatResponse.Done {
			_, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
			if err != nil {
				return 0, err
			}
		}

		return len(data), nil
	}

	// chat completion
	w.ResponseWriter.Header().Set("Content-Type", "application/json")
	err = json.NewEncoder(w.ResponseWriter).Encode(toChatCompletion(w.id, chatResponse))
	if err != nil {
		return 0, err
	}

	return len(data), nil
}

func (w *writer) Write(data []byte) (int, error) {
	code := w.ResponseWriter.Status()
	if code != http.StatusOK {
		return w.writeError(code, data)
	}

	return w.writeResponse(data)
}

func Middleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		var req ChatCompletionRequest
		err := c.ShouldBindJSON(&req)
		if err != nil {
			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
			return
		}

		if len(req.Messages) == 0 {
			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "[] is too short - 'messages'"))
			return
		}

		var b bytes.Buffer
		if err := json.NewEncoder(&b).Encode(fromRequest(req)); err != nil {
			c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
			return
		}

		c.Request.Body = io.NopCloser(&b)

		w := &writer{
			ResponseWriter: c.Writer,
			stream:         req.Stream,
			id:             fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
		}

		c.Writer = w

		c.Next()
	}
}
```
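The routes hunk further down registers this middleware on `/v1/chat/completions`. As a self-contained sketch of the same wiring, where the inline handler is a hypothetical stand-in for ollama's `ChatHandler`:

```go
package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
	"github.com/jmorganca/ollama/openai"
)

func main() {
	r := gin.Default()

	// Middleware translates the OpenAI-style request body into an
	// ollama ChatRequest and wraps the writer to translate responses back.
	r.POST("/v1/chat/completions", openai.Middleware(), func(c *gin.Context) {
		// stand-in for ChatHandler, which would stream api.ChatResponse JSON
		c.JSON(http.StatusOK, gin.H{"status": "ok"})
	})

	r.Run(":11434")
}
```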
server/*.go:

```diff
@@ -111,14 +111,8 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
 	defer resp.Body.Close()
 
 	if resp.StatusCode >= http.StatusBadRequest {
-		responseBody, err := io.ReadAll(resp.Body)
-		if err != nil {
-			return "", fmt.Errorf("%d: %v", resp.StatusCode, err)
-		} else if len(responseBody) > 0 {
-			return "", fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
-		}
-
-		return "", fmt.Errorf("%s", resp.Status)
+		body, _ := io.ReadAll(resp.Body)
+		return "", fmt.Errorf("on pull registry responded with code %d: %s", resp.StatusCode, body)
 	}
 
 	respBody, err := io.ReadAll(resp.Body)
@@ -181,19 +181,16 @@ func (m *Model) ChatPrompts(msgs []api.Message) (*ChatHistory, error) {
 			}
 
 			currentVars.Prompt = msg.Content
-
-			if len(m.ProjectorPaths) > 0 {
-				for i := range msg.Images {
-					id := len(images) + i
-					currentVars.Prompt += fmt.Sprintf(" [img-%d]", id)
-					currentVars.Images = append(currentVars.Images, llm.ImageData{
-						ID:   id,
-						Data: msg.Images[i],
-					})
-				}
-
-				images = append(images, currentVars.Images...)
-			}
+			for i := range msg.Images {
+				id := len(images) + i
+				currentVars.Prompt += fmt.Sprintf(" [img-%d]", id)
+				currentVars.Images = append(currentVars.Images, llm.ImageData{
+					ID:   id,
+					Data: msg.Images[i],
+				})
+			}
+
+			images = append(images, currentVars.Images...)
 		case "assistant":
 			currentVars.Response = msg.Content
 			prompts = append(prompts, currentVars)
@@ -26,7 +26,6 @@ import (
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
 	"github.com/jmorganca/ollama/llm"
-	"github.com/jmorganca/ollama/openai"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/version"
 )
@@ -179,11 +178,7 @@ func GenerateHandler(c *gin.Context) {
 
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
@@ -397,11 +392,7 @@ func EmbeddingHandler(c *gin.Context) {
 
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
@@ -936,9 +927,6 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/blobs/:digest", CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", HeadBlobHandler)
 
-	// Compatibility endpoints
-	r.POST("/v1/chat/completions", openai.Middleware(), ChatHandler)
-
 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
@@ -1116,11 +1104,7 @@ func ChatHandler(c *gin.Context) {
 
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
@@ -1136,6 +1120,18 @@ func ChatHandler(c *gin.Context) {
 		return
 	}
 
+	// an empty request loads the model
+	if len(req.Messages) == 0 {
+		resp := api.ChatResponse{
+			CreatedAt: time.Now().UTC(),
+			Model:     req.Model,
+			Done:      true,
+			Message:   api.Message{Role: "assistant"},
+		}
+		c.JSON(http.StatusOK, resp)
+		return
+	}
+
 	checkpointLoaded := time.Now()
 
 	chat, err := model.ChatPrompts(req.Messages)
@@ -1150,18 +1146,6 @@ func ChatHandler(c *gin.Context) {
 		return
 	}
 
-	// an empty request loads the model
-	if len(prompt) == 0 {
-		resp := api.ChatResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true,
-			Message:   api.Message{Role: "assistant"},
-		}
-		c.JSON(http.StatusOK, resp)
-		return
-	}
-
 	slog.Debug("chat handler", "prompt", prompt)
 
 	ch := make(chan any)
```
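Per the relocated block above, a `/api/chat` request with an empty `messages` list simply loads the model and returns a done response. A quick sketch of exercising that against a local server (model name is illustrative):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// An empty messages list just loads the model, per the handler change above.
	body := []byte(`{"model": "llama2", "messages": []}`)
	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // expect done: true with an empty assistant message
}
```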