Compare commits

..

12 Commits

Author SHA1 Message Date
ParthSareen
6556540655 User interface prototype 2024-12-19 16:43:36 -08:00
ParthSareen
3f60fd57e3 Remove /template API 2024-12-19 14:47:51 -08:00
ParthSareen
38cd80d52c Add dry run option for chat request 2024-12-19 14:17:29 -08:00
ParthSareen
c9a46140e6 Warn user on truncation - ollama logs 2024-12-19 13:48:25 -08:00
ParthSareen
1d529d8b7b Add /template endpoint 2024-12-18 15:23:27 -08:00
Jeffrey Morgan
a72f2dce45 scripts: sign renamed macOS binary (#8131) 2024-12-17 18:03:49 -08:00
Jesse Gross
08a832b482 llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.

Fixes #7949
2024-12-17 14:01:19 -08:00
Blake Mizerany
2ddc32d5c5 llm: do not error on "null" format (#8139)
This fixes another regression in the previous commit that fixed other
known bugs.
2024-12-17 09:49:37 -08:00
Jascha Beste
2cde4b8817 readme: change getting started guide link for pgai (#8119) 2024-12-16 22:13:23 -08:00
Blake Mizerany
87f0a49fe6 llm: do not silently fail for supplied, but invalid formats (#8130)
Changes in #8002 introduced fixes for bugs with mangling JSON Schemas.
It also fixed a bug where the server would silently fail when clients
requested invalid formats. It also, unfortunately, introduced a bug
where the server would reject requests with an empty format, which
should be allowed.

The change in #8127 updated the code to allow the empty format, but also
reintroduced the regression where the server would silently fail when
the format was set, but invalid.

This commit fixes both regressions. The server does not reject the empty
format, but it does reject invalid formats. It also adds tests to help
us catch regressions in the future.

Also, the updated code provides a more detailed error message when a
client sends a non-empty, but invalid format, echoing the invalid format
in the response.

This commit also takes the opportunity to remove superfluous linter
checks.
2024-12-16 21:57:49 -08:00
Jeffrey Morgan
0f06a6daa7 llm: loosen format check to default to no format (#8127) 2024-12-16 18:45:46 -08:00
Daniel Hiltgen
8f805dd74b darwin: restore multiple runners for x86 (#8125)
In 0.5.2 we simplified packaging to have avx only for macos x86.  It looks like
there may still be some non-AVX systems out there, so this puts back the prior
logic of building no-AVX for the primary binary, and now 2 runners for avx and avx2.
These will be packaged in the App bundle only, so the stand-alone binary will now be
without AVX support on macos.  On arm, we'll also see these runners reported
as available in the log, but they're dormant and will never be used at runtime.
2024-12-16 18:45:02 -08:00
14 changed files with 474 additions and 116 deletions

View File

@@ -8,8 +8,6 @@ linters:
- containedctx
- contextcheck
- errcheck
- exportloopref
- gci
- gocheckcompilerdirectives
- gofmt
- gofumpt
@@ -30,8 +28,6 @@ linters:
- wastedassign
- whitespace
linters-settings:
gci:
sections: [standard, default, localmodule]
staticcheck:
checks:
- all

View File

@@ -8,11 +8,9 @@ include make/cuda-v12-defs.make
include make/rocm-defs.make
ifeq ($(CUSTOM_CPU_FLAGS),)
ifneq ($(OS),darwin)
ifeq ($(ARCH),amd64)
RUNNER_TARGETS=cpu
endif
endif
# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
ifneq ($(CUDA_11_COMPILER),)

View File

@@ -407,8 +407,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Database
- [PostgreSQL extension pgai](https://github.com/timescale/pgai) (Create and search embeddings from Ollama models using pgvector)
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/ollama.md)
- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
- [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
- [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)

View File

@@ -103,10 +103,18 @@ type ChatRequest struct {
// Tools is an optional list of tools the model has access to.
Tools `json:"tools,omitempty"`
Debug *Debug `json:"debug,omitempty"`
Dry bool `json:"dry,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
// Debug carries the optional debugging settings of a chat request.
type Debug struct {
// Include lists the names of the debug items the caller wants echoed back
// in the response; the chat handler checks it for "prompt".
Include []string `json:"include,omitempty"`
}
type Tools []Tool
func (t Tools) String() string {
@@ -190,6 +198,8 @@ type ChatResponse struct {
Message Message `json:"message"`
DoneReason string `json:"done_reason,omitempty"`
Debug map[string]any `json:"debug,omitempty"`
Done bool `json:"done"`
Metrics

99
llama/llama.cpp vendored
View File

@@ -3051,6 +3051,13 @@ struct llama_kv_cache {
}
};
// block of KV slots to move when defragging
// (a run of `len` consecutive cells copied from cell `src` to cell `dst`)
struct llama_kv_defrag_move {
uint32_t src; // index of the first cell to move
uint32_t dst; // index of the first destination cell
uint32_t len; // number of consecutive cells in the block
};
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
@@ -10828,35 +10835,23 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
if (i == id || id == ids.size()) {
continue;
}
uint32_t nm = 1;
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
nm++;
}
for (const auto & move : moves) {
for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm,
n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm,
n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -10864,31 +10859,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
n_embd_v_gqa, nm,
n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
n_embd_v_gqa, nm,
n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
nm, n_embd_v_gqa,
move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
ggml_row_size(kv_self.v_l[il]->type, i));
ggml_row_size(kv_self.v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
nm, n_embd_v_gqa,
move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
ggml_row_size(kv_self.v_l[il]->type, id));
ggml_row_size(kv_self.v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17351,7 +17344,7 @@ struct llm_build_context {
}
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -17361,7 +17354,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
struct ggml_cgraph * result = llm.build_defrag(ids);
struct ggml_cgraph * result = llm.build_defrag(moves);
llm.free();
@@ -18377,7 +18370,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
if (!slot) {
llama_kv_cache_defrag(kv_self);
llama_kv_cache_update(&lctx);
slot = llama_kv_cache_find_slot(kv_self, ubatch);
}
if (!slot) {
return 1;
}
@@ -18782,8 +18780,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
// number of cells moved
uint32_t n_moves = 0;
// groups of cells moved
std::vector<struct llama_kv_defrag_move> moves;
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -18847,19 +18845,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
// should we stop searching for the next move?
bool stop = false;
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
if (n_moves == max_moves) {
stop = true;
break;
}
cont = false;
continue;
}
@@ -18875,8 +18865,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
n_moves++;
moves.push_back({i1, i0 + nf, 1});
cont = true;
} else {
moves.back().len++;
}
nf++;
@@ -18886,22 +18878,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
}
}
if (stop || n_moves == max_moves) {
break;
}
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
if (n_moves == 0) {
if (moves.size() == 0) {
return;
}
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
#if 0
// CPU defrag
@@ -18976,11 +18962,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
#else
// ggml_graph defrag
ggml_backend_sched_reset(lctx.sched.get());
for (std::size_t i = 0; i < moves.size(); i += max_moves) {
std::vector<struct llama_kv_defrag_move> chunk;
auto end = std::min(i + max_moves, moves.size());
chunk.assign(moves.begin() + i, moves.begin() + end);
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
ggml_backend_sched_reset(lctx.sched.get());
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
}
#endif
//const int64_t t_end = ggml_time_us();

View File

@@ -0,0 +1,242 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 4778a9ed..654e32bc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
}
};
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
@@ -10802,35 +10809,23 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
+ for (const auto & move : moves) {
for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -10838,31 +10833,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, i));
+ ggml_row_size(kv_self.v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, id));
+ ggml_row_size(kv_self.v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17325,7 +17318,7 @@ struct llm_build_context {
}
};
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
- struct ggml_cgraph * result = llm.build_defrag(ids);
+ struct ggml_cgraph * result = llm.build_defrag(moves);
llm.free();
@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
- // number of cells moved
- uint32_t n_moves = 0;
+ // groups of cells moved
+ std::vector<struct llama_kv_defrag_move> moves;
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
- // should we stop searching for the next move?
- bool stop = false;
-
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
cont = false;
continue;
}
@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
- n_moves++;
+ moves.push_back({i1, i0 + nf, 1});
cont = true;
+ } else {
+ moves.back().len++;
}
nf++;
@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
}
}
- if (stop || n_moves == max_moves) {
- break;
- }
-
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
- if (n_moves == 0) {
+ if (moves.size() == 0) {
return;
}
- //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
- //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
#if 0
// CPU defrag
@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
#else
// ggml_graph defrag
- ggml_backend_sched_reset(lctx.sched.get());
+ for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, moves.size());
+ chunk.assign(moves.begin() + i, moves.begin() + end);
- ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+ ggml_backend_sched_reset(lctx.sched.get());
+
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ }
#endif
//const int64_t t_end = ggml_time_us();

View File

@@ -433,14 +433,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
err := s.lc.Decode(batch)
if err != nil {
if errors.Is(err, llama.ErrKvCacheFull) {
slog.Debug("defragmenting kv cache")
s.cache.lc.KvCacheDefrag()
err = s.lc.Decode(batch)
}
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
}
return fmt.Errorf("failed to decode batch: %w", err)
}
if crossAttention {

View File

@@ -674,21 +674,6 @@ type CompletionResponse struct {
}
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting completion request due to client closing the connection")
} else {
slog.Error("Failed to acquire semaphore", "error", err)
}
return err
}
defer s.sem.Release(1)
// put an upper limit on num_predict to avoid the model running on forever
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
}
request := map[string]any{
"prompt": req.Prompt,
"stream": true,
@@ -714,30 +699,51 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
"cache_prompt": true,
}
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
if err != nil {
return err
} else if status != ServerStatusReady {
return fmt.Errorf("unexpected server status: %s", status.ToString())
}
if len(req.Format) > 0 {
switch {
case bytes.Equal(req.Format, []byte(`"json"`)):
switch string(req.Format) {
case `null`, `""`:
// Field was set, but "missing" a value. We accept
// these as "not set".
break
case `"json"`:
request["grammar"] = grammarJSON
case bytes.HasPrefix(req.Format, []byte("{")):
default:
if req.Format[0] != '{' {
return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
}
// User provided a JSON schema
g := llama.SchemaToGrammar(req.Format)
if g == nil {
return fmt.Errorf("invalid JSON schema in format")
}
request["grammar"] = string(g)
default:
slog.Warn("invalid format: expected \"json\" or a JSON schema")
}
}
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting completion request due to client closing the connection")
} else {
slog.Error("Failed to acquire semaphore", "error", err)
}
return err
}
defer s.sem.Release(1)
// put an upper limit on num_predict to avoid the model running on forever
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
}
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
if err != nil {
return err
} else if status != ServerStatusReady {
return fmt.Errorf("unexpected server status: %s", status.ToString())
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)

72
llm/server_test.go Normal file
View File

@@ -0,0 +1,72 @@
package llm
import (
"context"
"errors"
"fmt"
"strings"
"testing"
"github.com/ollama/ollama/api"
"golang.org/x/sync/semaphore"
)
// TestLLMServerCompletionFormat verifies how Completion treats the request's
// Format field: invalid formats are rejected with a descriptive error, while
// missing, empty, "json" and JSON Schema object formats pass the format check.
func TestLLMServerCompletionFormat(t *testing.T) {
	// This test was written to cover an already deployed regression. It is
	// a bit of a mess, but it is good enough until the Completion method
	// can be refactored to be more testable.

	ctx, cancel := context.WithCancel(context.Background())
	srv := &llmServer{
		sem: semaphore.NewWeighted(1), // required to prevent nil panic
	}

	// complete sends a Completion request that carries the given raw format.
	complete := func(raw []byte) error {
		return srv.Completion(ctx, CompletionRequest{
			Options: new(api.Options),
			Format:  raw,
		}, nil)
	}

	// mustReject asserts that the format is refused by the format check.
	mustReject := func(format string) {
		t.Helper()

		want := fmt.Sprintf("invalid format: %q; expected \"json\" or a valid JSON Schema", format)
		err := complete([]byte(format))
		if err != nil && strings.Contains(err.Error(), want) {
			return
		}
		t.Fatalf("err = %v; want %q", err, want)
	}

	// mustPass asserts that the request got past the format check, which is
	// observable as the canceled context stopping it afterwards.
	mustPass := func(err error) {
		t.Helper()

		if errors.Is(err, context.Canceled) {
			return
		}
		t.Fatalf("Completion: err = %v; expected context.Canceled", err)
	}

	mustReject("X")   // invalid format
	mustReject(`"X"`) // invalid JSON Schema

	// Prevent further processing if a request makes it past the format check.
	cancel()

	accepted := []string{
		// "missing"
		``,
		`""`,
		`null`,

		// JSON
		`"json"`,
		`{"type":"object"}`,
	}
	for i := range accepted {
		mustPass(complete([]byte(accepted[i])))
	}

	// A nil format means the field was not supplied at all.
	mustPass(complete(nil))
}

View File

@@ -19,6 +19,7 @@ const config: ForgeConfig = {
icon: './assets/icon.icns',
extraResource: [
'../dist/ollama',
'../dist/darwin-amd64/lib',
path.join(__dirname, './assets/iconTemplate.png'),
path.join(__dirname, './assets/iconTemplate@2x.png'),
path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -42,7 +43,7 @@ const config: ForgeConfig = {
}
: {}),
osxUniversal: {
x64ArchFiles: '**/ollama',
x64ArchFiles: '**/ollama*',
},
},
rebuildConfig: {},

View File

@@ -72,6 +72,7 @@ func locateRunnersOnce() {
paths := []string{
filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
filepath.Join(filepath.Dir(exe), "lib", "ollama", "runners"),
}
for _, path := range paths {
if _, err := os.Stat(path); err == nil {

View File

@@ -15,19 +15,35 @@ export CGO_CXXFLAGS=-mmacosx-version-min=11.3
export CGO_LDFLAGS=-mmacosx-version-min=11.3
rm -rf llama/build dist/darwin-*
# Generate the universal ollama binary for stand-alone usage: metal + avx
echo "Building binary"
echo "Building darwin arm64"
GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
echo "Building darwin amd64 with AVX enabled"
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
else
echo "Skipping code signing - set APPLE_IDENTITY"
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
chmod +x dist/ollama
ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
rm -f dist/temp.zip
# Build the app bundle
echo "Building app"
echo "Building darwin amd64 with runners"
rm dist/darwin-amd64/bin/ollama
GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
# Generate the universal ollama binary for the app bundle: metal + no-avx
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
# build and optionally sign the mac app
npm install --prefix macapp
@@ -38,15 +54,3 @@ else
fi
cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
else
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
mv dist/ollama dist/ollama-darwin
rm -f dist/temp.zip

View File

@@ -82,6 +82,10 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
}
currMsgIdx := n
// Warn user if messages are truncated from the input
if numTruncatedMessages := len(msgs[0:currMsgIdx]); numTruncatedMessages > 0 {
slog.Warn("truncated first messages from input", "num_truncated", numTruncatedMessages)
}
for cnt, msg := range msgs[currMsgIdx:] {
prefix := ""

View File

@@ -1539,6 +1539,34 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}
if req.Dry {
var debug map[string]any
if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
debug = map[string]any{"prompt": prompt}
}
tokens, err := r.Tokenize(c.Request.Context(), prompt)
if err != nil {
slog.Error("tokenize error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusOK, api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Message: api.Message{Role: "assistant", Content: ""},
Done: true,
DoneReason: "dry_run",
Debug: debug,
Metrics: api.Metrics{
PromptEvalCount: len(tokens),
PromptEvalDuration: 0,
EvalCount: 0,
EvalDuration: 0,
},
})
return
}
slog.Debug("chat request", "images", len(images), "prompt", prompt)
ch := make(chan any)
@@ -1571,6 +1599,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
}
if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
res.Debug = map[string]any{"prompt": prompt}
if req.Stream != nil && !*req.Stream {
tempMsg := res.Message
res.Message = api.Message{Role: "assistant", Content: ""}
ch <- res
res.Message = tempMsg
}
}
// TODO: tool call checking and filtering should be moved outside of this callback once streaming
// however this was a simple change for now without reworking streaming logic of this (and other)
// handlers