User interface prototype

Remove /template API
Add dry run option for chat request
2024-12-19 16:43:36 -08:00 · 2024-12-19 14:47:51 -08:00 · 2024-12-19 14:17:29 -08:00 · 2024-12-19 13:48:25 -08:00 · 2024-12-18 15:23:27 -08:00 · 2024-12-17 18:03:49 -08:00
14 changed files with 474 additions and 116 deletions
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -8,8 +8,6 @@ linters:
    - containedctx
    - contextcheck
    - errcheck
-    - exportloopref
-    - gci
    - gocheckcompilerdirectives
    - gofmt
    - gofumpt
@@ -30,8 +28,6 @@ linters:
    - wastedassign
    - whitespace
 linters-settings:
-  gci:
-    sections: [standard, default, localmodule]
  staticcheck:
    checks:
      - all
--- a/2
+++ b/2
@@ -8,11 +8,9 @@ include make/cuda-v12-defs.make
 include make/rocm-defs.make

 ifeq ($(CUSTOM_CPU_FLAGS),)
-ifneq ($(OS),darwin)
 ifeq ($(ARCH),amd64)
 	RUNNER_TARGETS=cpu
 endif
-endif
 # Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
 ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
 ifneq ($(CUDA_11_COMPILER),)
--- a/README.md
+++ b/README.md
@@ -407,8 +407,8 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Database

- [PostgreSQL extension pgai](https://github.com/timescale/pgai) (Create and search embeddings from Ollama models using pgvector)
-   - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/ollama.md)
+- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
+   - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
 - [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
--- a/api/types.go
+++ b/api/types.go
@@ -103,10 +103,18 @@ type ChatRequest struct {
 	// Tools is an optional list of tools the model has access to.
 	Tools `json:"tools,omitempty"`

+	Debug *Debug `json:"debug,omitempty"`
+
+	Dry bool `json:"dry,omitempty"`
+
 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }

+type Debug struct {
+	Include []string `json:"include,omitempty"`
+}
+
 type Tools []Tool

 func (t Tools) String() string {
@@ -190,6 +198,8 @@ type ChatResponse struct {
 	Message    Message   `json:"message"`
 	DoneReason string    `json:"done_reason,omitempty"`

+	Debug map[string]any `json:"debug,omitempty"`
+
 	Done bool `json:"done"`

 	Metrics
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@@ -3051,6 +3051,13 @@ struct llama_kv_cache {
    }
 };

+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llama_control_vector {
    std::vector<struct ggml_tensor *> tensors; // per layer
    std::vector<ggml_context_ptr> ctxs;
@@ -10828,35 +10835,23 @@ struct llm_build_context {
        return gf;
    }

-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
            for (int il = 0; il < n_layer; ++il) {
                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));

                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));

                ggml_tensor * view_v_src;
                ggml_tensor * view_v_dst;
@@ -10864,31 +10859,29 @@ struct llm_build_context {
                if (flash_attn) {
                    // NOTE: the V cache is not transposed when using flash attention
                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));

                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                } else {
                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));

                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                }

                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
            }
-
-            i += nm - 1;
        }

        //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17351,7 +17344,7 @@ struct llm_build_context {
    }
 };

-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
    llama_ubatch dummy = {};
    dummy.equal_seqs = true;

@@ -17361,7 +17354,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const

    llm.init();

-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);

    llm.free();

@@ -18377,7 +18370,12 @@ static int llama_decode_internal(
                kv_self.head = 0;
            }

-            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            if (!slot) {
+                llama_kv_cache_defrag(kv_self);
+                llama_kv_cache_update(&lctx);
+                slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            }
            if (!slot) {
                return 1;
            }
@@ -18782,8 +18780,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {

    //const int64_t t_start = ggml_time_us();

-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;

    // each move requires 6*n_layer tensors (see build_defrag)
    //   - source view, destination view, copy operation
@@ -18847,19 +18845,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
        // are we moving a continuous block of memory?
        bool cont = false;

-        // should we stop searching for the next move?
-        bool stop = false;
-
        // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
            auto & cell1 = kv_self.cells[i1];

            if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                cont = false;
                continue;
            }
@@ -18875,8 +18865,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            kv_self.head = n_used;

            if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                cont = true;
+            } else {
+                moves.back().len++;
            }

            nf++;
@@ -18886,22 +18878,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            }
        }

-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

        i0 += nh - 1;
    }

-    if (n_moves == 0) {
+    if (moves.size() == 0) {
        return;
    }

-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n",  moves.size());

 #if 0
    // CPU defrag
@@ -18976,11 +18962,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
    // ggml_graph defrag

-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);

-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());

-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
+
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif

    //const int64_t t_end = ggml_time_us();
--- a/llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch
@@ -0,0 +1,242 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Fri, 13 Dec 2024 16:11:59 -0800
+Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
+
+Sometimes the KV cache requires defragmentation even without
+triggering the threshold heuristic. In this case, decoding
+will not be able to find a KV cache slot. This is particularly
+difficult for the caller to handle if it happens in between
+ubatches. To avoid this, we should immediately trigger a defrag.
+
+In addition, a heavily fragmented cache can require more than
+max_moves to defragment. Currently, we stop when we hit the limit
+but this can leave a cache that still does not have adequate space
+even after defragmentation is triggered. Instead, we should do
+multiple batches of processing until everything is complete.
+---
+ src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
+ 1 file changed, 46 insertions(+), 53 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 4778a9ed..654e32bc 100644
+--- a/src/llama.cpp
+++ b/src/llama.cpp
+@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
+     }
+ };
+ 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
+ struct llama_control_vector {
+     std::vector<struct ggml_tensor *> tensors; // per layer
+     std::vector<ggml_context_ptr> ctxs;
+@@ -10802,35 +10809,23 @@ struct llm_build_context {
+         return gf;
+     }
+ 
+-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
+         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ 
+-        for (uint32_t i = 0; i < ids.size(); ++i) {
+-            const uint32_t id = ids[i];
+-
+-            if (i == id || id == ids.size()) {
+-                continue;
+-            }
+-
+-            uint32_t nm = 1;
+-
+-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+-                nm++;
+-            }
+-
+        for (const auto & move : moves) {
+             for (int il = 0; il < n_layer; ++il) {
+                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ 
+                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
+                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
+ 
+                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
+                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
+ 
+                 ggml_tensor * view_v_src;
+                 ggml_tensor * view_v_dst;
+@@ -10838,31 +10833,29 @@ struct llm_build_context {
+                 if (flash_attn) {
+                     // NOTE: the V cache is not transposed when using flash attention
+                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
+                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
+ 
+                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
+                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
+                 } else {
+                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
+                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
+ 
+                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
+                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
+                 }
+ 
+                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+             }
+-
+-            i += nm - 1;
+         }
+ 
+         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+@@ -17325,7 +17318,7 @@ struct llm_build_context {
+     }
+ };
+ 
+-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
+     llama_ubatch dummy = {};
+     dummy.equal_seqs = true;
+ 
+@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
+ 
+     llm.init();
+ 
+-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
+ 
+     llm.free();
+ 
+@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
+                 kv_self.head = 0;
+             }
+ 
+-            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            if (!slot) {
+                llama_kv_cache_defrag(kv_self);
+                llama_kv_cache_update(&lctx);
+                slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            }
+             if (!slot) {
+                 return 1;
+             }
+@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+ 
+     //const int64_t t_start = ggml_time_us();
+ 
+-    // number of cells moved
+-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;
+ 
+     // each move requires 6*n_layer tensors (see build_defrag)
+     //   - source view, destination view, copy operation
+@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+         // are we moving a continuous block of memory?
+         bool cont = false;
+ 
+-        // should we stop searching for the next move?
+-        bool stop = false;
+-
+         // go back and move the nf cells to the hole
+         for (; i1 < n_kv; ++i1) {
+             auto & cell1 = kv_self.cells[i1];
+ 
+             if (cell1.is_empty() || ids[i1] != n_kv) {
+-                if (n_moves == max_moves) {
+-                    stop = true;
+-                    break;
+-                }
+-
+                 cont = false;
+                 continue;
+             }
+@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+             kv_self.head = n_used;
+ 
+             if (!cont) {
+-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
+                 cont = true;
+            } else {
+                moves.back().len++;
+             }
+ 
+             nf++;
+@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+             }
+         }
+ 
+-        if (stop || n_moves == max_moves) {
+-            break;
+-        }
+-
+         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+ 
+         i0 += nh - 1;
+     }
+ 
+-    if (n_moves == 0) {
+    if (moves.size() == 0) {
+         return;
+     }
+ 
+-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
+-
+-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n",  moves.size());
+ 
+ #if 0
+     // CPU defrag
+@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+ #else
+     // ggml_graph defrag
+ 
+-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
+ 
+-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
+ 
+-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
+ #endif
+ 
+     //const int64_t t_end = ggml_time_us();
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -433,14 +433,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

 	err := s.lc.Decode(batch)
 	if err != nil {
-		if errors.Is(err, llama.ErrKvCacheFull) {
-			slog.Debug("defragmenting kv cache")
-			s.cache.lc.KvCacheDefrag()
-			err = s.lc.Decode(batch)
-		}
-		if err != nil {
-			return fmt.Errorf("failed to decode batch: %w", err)
-		}
+		return fmt.Errorf("failed to decode batch: %w", err)
 	}

 	if crossAttention {
--- a/llm/server.go
+++ b/llm/server.go
@@ -674,21 +674,6 @@ type CompletionResponse struct {
 }

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
-	if err := s.sem.Acquire(ctx, 1); err != nil {
-		if errors.Is(err, context.Canceled) {
-			slog.Info("aborting completion request due to client closing the connection")
-		} else {
-			slog.Error("Failed to acquire semaphore", "error", err)
-		}
-		return err
-	}
-	defer s.sem.Release(1)
-
-	// put an upper limit on num_predict to avoid the model running on forever
-	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
-		req.Options.NumPredict = 10 * s.options.NumCtx
-	}
-
 	request := map[string]any{
 		"prompt":            req.Prompt,
 		"stream":            true,
@@ -714,30 +699,51 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"cache_prompt":      true,
 	}

-	// Make sure the server is ready
-	status, err := s.getServerStatusRetry(ctx)
-	if err != nil {
-		return err
-	} else if status != ServerStatusReady {
-		return fmt.Errorf("unexpected server status: %s", status.ToString())
-	}
-
 	if len(req.Format) > 0 {
-		switch {
-		case bytes.Equal(req.Format, []byte(`"json"`)):
+		switch string(req.Format) {
+		case `null`, `""`:
+			// Field was set, but "missing" a value. We accept
+			// these as "not set".
+			break
+		case `"json"`:
 			request["grammar"] = grammarJSON
-		case bytes.HasPrefix(req.Format, []byte("{")):
+		default:
+			if req.Format[0] != '{' {
+				return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
+			}
+
 			// User provided a JSON schema
 			g := llama.SchemaToGrammar(req.Format)
 			if g == nil {
 				return fmt.Errorf("invalid JSON schema in format")
 			}
 			request["grammar"] = string(g)
-		default:
-			slog.Warn("invalid format: expected \"json\" or a JSON schema")
 		}
 	}

+	if err := s.sem.Acquire(ctx, 1); err != nil {
+		if errors.Is(err, context.Canceled) {
+			slog.Info("aborting completion request due to client closing the connection")
+		} else {
+			slog.Error("Failed to acquire semaphore", "error", err)
+		}
+		return err
+	}
+	defer s.sem.Release(1)
+
+	// put an upper limit on num_predict to avoid the model running on forever
+	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
+		req.Options.NumPredict = 10 * s.options.NumCtx
+	}
+
+	// Make sure the server is ready
+	status, err := s.getServerStatusRetry(ctx)
+	if err != nil {
+		return err
+	} else if status != ServerStatusReady {
+		return fmt.Errorf("unexpected server status: %s", status.ToString())
+	}
+
 	// Handling JSON marshaling with special characters unescaped.
 	buffer := &bytes.Buffer{}
 	enc := json.NewEncoder(buffer)
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -0,0 +1,72 @@
+package llm
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"golang.org/x/sync/semaphore"
+)
+
+func TestLLMServerCompletionFormat(t *testing.T) {
+	// This test was written to fix an already deployed issue. It is a bit
+	// of a mess, but it's good enough until we can refactor the
+	// Completion method to be more testable.
+
+	ctx, cancel := context.WithCancel(context.Background())
+	s := &llmServer{
+		sem: semaphore.NewWeighted(1), // required to prevent nil panic
+	}
+
+	checkInvalid := func(format string) {
+		t.Helper()
+		err := s.Completion(ctx, CompletionRequest{
+			Options: new(api.Options),
+			Format:  []byte(format),
+		}, nil)
+
+		want := fmt.Sprintf("invalid format: %q; expected \"json\" or a valid JSON Schema", format)
+		if err == nil || !strings.Contains(err.Error(), want) {
+			t.Fatalf("err = %v; want %q", err, want)
+		}
+	}
+
+	checkInvalid("X")   // invalid format
+	checkInvalid(`"X"`) // invalid JSON Schema
+
+	cancel() // prevent further processing if request makes it past the format check
+
+	checkValid := func(err error) {
+		t.Helper()
+		if !errors.Is(err, context.Canceled) {
+			t.Fatalf("Completion: err = %v; expected context.Canceled", err)
+		}
+	}
+
+	valids := []string{
+		// "missing"
+		``,
+		`""`,
+		`null`,
+
+		// JSON
+		`"json"`,
+		`{"type":"object"}`,
+	}
+	for _, valid := range valids {
+		err := s.Completion(ctx, CompletionRequest{
+			Options: new(api.Options),
+			Format:  []byte(valid),
+		}, nil)
+		checkValid(err)
+	}
+
+	err := s.Completion(ctx, CompletionRequest{
+		Options: new(api.Options),
+		Format:  nil, // missing format
+	}, nil)
+	checkValid(err)
+}
--- a/macapp/forge.config.ts
+++ b/macapp/forge.config.ts
@@ -19,6 +19,7 @@ const config: ForgeConfig = {
    icon: './assets/icon.icns',
    extraResource: [
      '../dist/ollama',
+      '../dist/darwin-amd64/lib',
      path.join(__dirname, './assets/iconTemplate.png'),
      path.join(__dirname, './assets/iconTemplate@2x.png'),
      path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -42,7 +43,7 @@ const config: ForgeConfig = {
        }
      : {}),
    osxUniversal: {
-      x64ArchFiles: '**/ollama',
+      x64ArchFiles: '**/ollama*',
    },
  },
  rebuildConfig: {},
--- a/runners/common.go
+++ b/runners/common.go
@@ -72,6 +72,7 @@ func locateRunnersOnce() {
 	paths := []string{
 		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
 		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
+		filepath.Join(filepath.Dir(exe), "lib", "ollama", "runners"),
 	}
 	for _, path := range paths {
 		if _, err := os.Stat(path); err == nil {
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -15,19 +15,35 @@ export CGO_CXXFLAGS=-mmacosx-version-min=11.3
 export CGO_LDFLAGS=-mmacosx-version-min=11.3

 rm -rf llama/build dist/darwin-*
+
+# Generate the universal ollama binary for stand-alone usage: metal + avx
+echo "Building binary"
 echo "Building darwin arm64"
 GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
 echo "Building darwin amd64 with AVX enabled"
-GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
+GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
+lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama

-
-lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
+# sign the binary and rename it
 if [ -n "$APPLE_IDENTITY" ]; then
-    codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
+    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
 else
-    echo "Skipping code signing - set APPLE_IDENTITY"
+    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
 fi
-chmod +x dist/ollama
+ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
+if [ -n "$APPLE_IDENTITY" ]; then
+    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+fi
+rm -f dist/temp.zip
+
+# Build the app bundle
+echo "Building app"
+echo "Building darwin amd64 with runners"
+rm dist/darwin-amd64/bin/ollama
+GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
+
+# Generate the universal ollama binary for the app bundle: metal + no-avx
+lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama

 # build and optionally sign the mac app
 npm install --prefix macapp
@@ -38,15 +54,3 @@ else
 fi
 cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip

-# sign the binary and rename it
-if [ -n "$APPLE_IDENTITY" ]; then
-    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
-else
-    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
-fi
-ditto -c -k --keepParent dist/ollama dist/temp.zip
-if [ -n "$APPLE_IDENTITY" ]; then
-    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
-fi
-mv dist/ollama dist/ollama-darwin
-rm -f dist/temp.zip
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -82,6 +82,10 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	}

 	currMsgIdx := n
+	// Warn user if messages are truncated from the input
+	if numTruncatedMessages := len(msgs[0:currMsgIdx]); numTruncatedMessages > 0 {
+		slog.Warn("truncated first messages from input", "num_truncated", numTruncatedMessages)
+	}

 	for cnt, msg := range msgs[currMsgIdx:] {
 		prefix := ""
--- a/server/routes.go
+++ b/server/routes.go
@@ -1539,6 +1539,34 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

+	if req.Dry {
+		var debug map[string]any
+		if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
+			debug = map[string]any{"prompt": prompt}
+		}
+		tokens, err := r.Tokenize(c.Request.Context(), prompt)
+		if err != nil {
+			slog.Error("tokenize error", "error", err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+		c.JSON(http.StatusOK, api.ChatResponse{
+			Model:      req.Model,
+			CreatedAt:  time.Now().UTC(),
+			Message:    api.Message{Role: "assistant", Content: ""},
+			Done:       true,
+			DoneReason: "dry_run",
+			Debug:      debug,
+			Metrics: api.Metrics{
+				PromptEvalCount:    len(tokens),
+				PromptEvalDuration: 0,
+				EvalCount:          0,
+				EvalDuration:       0,
+			},
+		})
+		return
+	}
+
 	slog.Debug("chat request", "images", len(images), "prompt", prompt)

 	ch := make(chan any)
@@ -1571,6 +1599,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 			}

+			if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
+				res.Debug = map[string]any{"prompt": prompt}
+				if req.Stream != nil && !*req.Stream {
+					tempMsg := res.Message
+					res.Message = api.Message{Role: "assistant", Content: ""}
+					ch <- res
+					res.Message = tempMsg
+				}
+			}
+
 			// TODO: tool call checking and filtering should be moved outside of this callback once streaming
 			// however this was a simple change for now without reworking streaming logic of this (and other)
 			// handlers
Author	SHA1	Message	Date
ParthSareen	6556540655	User interface prototype	2024-12-19 16:43:36 -08:00
ParthSareen	3f60fd57e3	Remove /template API	2024-12-19 14:47:51 -08:00
ParthSareen	38cd80d52c	Add dry run option for chat request	2024-12-19 14:17:29 -08:00
ParthSareen	c9a46140e6	Warn user on truncation - ollama logs	2024-12-19 13:48:25 -08:00
ParthSareen	1d529d8b7b	Add /template endpoint	2024-12-18 15:23:27 -08:00
Jeffrey Morgan	a72f2dce45	scripts: sign renamed macOS binary (#8131 )	2024-12-17 18:03:49 -08:00
Jesse Gross	08a832b482	llama: Ensure KV cache is fully defragmented. Sometimes the KV cache requires defragmentation even without triggering the threshold heuristic. In this case, decoding will not be able to find a KV cache slot. This is particularly difficult for the caller to handle if it happens in between ubatches. To avoid this, we should immediately trigger a defrag. In addition, a heavily fragmented cache can require more than max_moves to defragment. Currently, we stop when we hit the limit but this can leave a cache that still does not have adequate space even after defragmentation is triggered. Instead, we should do multiple batches of processing until everything is complete. Fixes #7949	2024-12-17 14:01:19 -08:00
Blake Mizerany	2ddc32d5c5	llm: do not error on "null" format (#8139 ) This fixes another regression in the previous commit that fixed other known bugs.	2024-12-17 09:49:37 -08:00
Jascha Beste	2cde4b8817	readme: change getting started guide link for pgai (#8119 )	2024-12-16 22:13:23 -08:00
Blake Mizerany	87f0a49fe6	llm: do not silently fail for supplied, but invalid formats (#8130 ) Changes in #8002 introduced fixes for bugs with mangling JSON Schemas. It also fixed a bug where the server would silently fail when clients requested invalid formats. It also, unfortunately, introduced a bug where the server would reject requests with an empty format, which should be allowed. The change in #8127 updated the code to allow the empty format, but also reintroduced the regression where the server would silently fail when the format was set, but invalid. This commit fixes both regressions. The server does not reject the empty format, but it does reject invalid formats. It also adds tests to help us catch regressions in the future. Also, the updated code provides a more detailed error message when a client sends a non-empty, but invalid format, echoing the invalid format in the response. This commit also takes the opportunity to remove superfluous linter checks.	2024-12-16 21:57:49 -08:00
Jeffrey Morgan	0f06a6daa7	llm: loosen format check to default to no format (#8127 )	2024-12-16 18:45:46 -08:00
Daniel Hiltgen	8f805dd74b	darwin: restore multiple runners for x86 (#8125 ) In 0.5.2 we simplified packaging to have avx only for macos x86. It looks like there may still be some non-AVX systems out there, so this puts back the prior logic of building no-AVX for the primary binary, and now 2 runners for avx and avx2. These will be packaged in the App bundle only, so the stand-alone binary will now be without AVX support on macos. On arm, we'll also see these runners reported as available in the log, but they're dormant and will never be used at runtime.	2024-12-16 18:45:02 -08:00