llm: allow gemma 2 to context shift (#5534)

Update llama.cpp submodule to a8db2a9c (#5530)
llm: print caching notices in debug only (#5533)
2024-07-07 13:41:51 -04:00 · 2024-07-07 13:03:09 -04:00 · 2024-07-07 12:38:04 -04:00
3 changed files with 8 additions and 35 deletions
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1413,7 +1413,7 @@ struct llama_server_context
            return get_slot(-1);
        }

-        LOG_INFO("slot with common prefix found", {{
+        LOG_DEBUG("slot with common prefix found", {{
            "slot_id", slot->id,
            "characters", longest
        }});
@@ -1688,22 +1688,8 @@ struct llama_server_context
                    }
                    slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

-                    char buf[256];
-                    llama_model_meta_val_str(model, "general.architecture", buf, 256);
-                    bool gemma2 = strcmp(buf, "gemma2") == 0;
-
-                    int32_t truncate_at = slot.n_ctx;
-
-                    // truncate at 2/3 of the context length for gemma2 models
-                    // as they do not support context shifts (from the sliding window implementation).
-                    // this way, prompts that almost fit the context length can still generate a full
-                    // response without a sudden stop from hitting the context limit
-                    if (gemma2) {
-                        truncate_at = 2 * slot.n_ctx / 3;
-                    }
-
                    // if input prompt is too big, truncate it, if group attention self-extend is disabled
-                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
+                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
                    {
                        const int n_left = slot.n_ctx - slot.params.n_keep;
                        const int n_shift = n_left / 2;
@@ -1731,19 +1717,6 @@ struct llama_server_context
                        GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                    }

-                    // Models with sliding window attention do not work with context shifts, so
-                    // limit their prediction to the context length
-                    if (gemma2) {
-                        int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
-                        slot.n_predict = limit;
-                        slot.params.n_predict = limit;
-                        LOG_INFO("model does not support sliding window, limiting generation", {
-                            {"n_ctx", slot.n_ctx},
-                            {"n_prompt_tokens", slot.n_prompt_tokens},
-                            {"n_predict", slot.n_predict}
-                        });
-                    }
-
                    if (!slot.params.cache_prompt)
                    {
                        llama_sampling_reset(slot.ctx_sampling);
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,11 +1,11 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 73f52435..2b81b4bd 100644
+index 2b9ace28..172640e2 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
- 
-         // for now, only BPE models have pre-tokenizers
+@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+             vocab.tokenizer_add_space_prefix = false;
+             vocab.tokenizer_clean_spaces = true;
 -            if (tokenizer_pre.empty()) {
 -                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
 -                LLAMA_LOG_WARN("%s:                                             \n", __func__);
@@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
+@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
             } else {
Author	SHA1	Message	Date
Jeffrey Morgan	d8def1ff94	llm: allow gemma 2 to context shift (#5534)	2024-07-07 13:41:51 -04:00
Jeffrey Morgan	571dc61955	Update llama.cpp submodule to `a8db2a9c` (#5530)	2024-07-07 13:03:09 -04:00
Jeffrey Morgan	0e09c380fc	llm: print caching notices in debug only (#5533)	2024-07-07 12:38:04 -04:00