Compare commits
3 Commits
v0.1.49-rc
...
v0.1.49-rc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d8def1ff94 | ||
|
|
571dc61955 | ||
|
|
0e09c380fc |
31
llm/ext_server/server.cpp
vendored
31
llm/ext_server/server.cpp
vendored
@@ -1413,7 +1413,7 @@ struct llama_server_context
|
||||
return get_slot(-1);
|
||||
}
|
||||
|
||||
LOG_INFO("slot with common prefix found", {{
|
||||
LOG_DEBUG("slot with common prefix found", {{
|
||||
"slot_id", slot->id,
|
||||
"characters", longest
|
||||
}});
|
||||
@@ -1688,22 +1688,8 @@ struct llama_server_context
|
||||
}
|
||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||
|
||||
char buf[256];
|
||||
llama_model_meta_val_str(model, "general.architecture", buf, 256);
|
||||
bool gemma2 = strcmp(buf, "gemma2") == 0;
|
||||
|
||||
int32_t truncate_at = slot.n_ctx;
|
||||
|
||||
// truncate at 2/3 of the context length for gemma2 models
|
||||
// as they do not support context shifts (from the sliding window implementation).
|
||||
// this way, prompts that almost fit the context length can still generate a full
|
||||
// response without a sudden stop from hitting the context limit
|
||||
if (gemma2) {
|
||||
truncate_at = 2 * slot.n_ctx / 3;
|
||||
}
|
||||
|
||||
// if input prompt is too big, truncate it, if group attention self-extend is disabled
|
||||
if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
|
||||
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
|
||||
{
|
||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||
const int n_shift = n_left / 2;
|
||||
@@ -1731,19 +1717,6 @@ struct llama_server_context
|
||||
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
|
||||
}
|
||||
|
||||
// Models with sliding window attention do not work with context shifts, so
|
||||
// limit their prediction to the context length
|
||||
if (gemma2) {
|
||||
int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
|
||||
slot.n_predict = limit;
|
||||
slot.params.n_predict = limit;
|
||||
LOG_INFO("model does not support sliding window, limiting generation", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_prompt_tokens", slot.n_prompt_tokens},
|
||||
{"n_predict", slot.n_predict}
|
||||
});
|
||||
}
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
|
||||
Submodule llm/llama.cpp updated: d7fd29fff1...a8db2a9ce6
@@ -1,11 +1,11 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 73f52435..2b81b4bd 100644
|
||||
index 2b9ace28..172640e2 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
|
||||
|
||||
// for now, only BPE models have pre-tokenizers
|
||||
@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
vocab.tokenizer_clean_spaces = true;
|
||||
- if (tokenizer_pre.empty()) {
|
||||
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
@@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
|
||||
@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "jais") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user