Update sync with latest llama.cpp layout, and run against b3485
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -24,6 +24,10 @@
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "unicode.h"
|
||||
#include "unicode-data.h"
|
||||
|
||||
@@ -41,6 +45,12 @@
|
||||
#include <locale>
|
||||
#include <codecvt>
|
||||
|
||||
size_t unicode_len_utf8(char src) {
|
||||
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
||||
return lookup[highbits];
|
||||
}
|
||||
|
||||
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
@@ -49,7 +59,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
assert(offset < utf8.size());
|
||||
if (!(utf8[offset + 0] & 0x80)) {
|
||||
auto result = utf8[offset + 0];
|
||||
@@ -252,13 +262,13 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@@ -279,18 +289,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
};
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const char32_t cpt = _get_cpt(pos);
|
||||
const uint32_t cpt = _get_cpt(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: 's|'t|'re|'ve|'m|'ll|'d
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
char32_t cpt_next = _get_cpt(pos+1);
|
||||
uint32_t cpt_next = _get_cpt(pos+1);
|
||||
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||
pos += _add_token(pos+2);
|
||||
continue;
|
||||
}
|
||||
if (pos+2 < offset_end) {
|
||||
char32_t cpt_next_next = _get_cpt(pos+2);
|
||||
uint32_t cpt_next_next = _get_cpt(pos+2);
|
||||
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||
@@ -320,9 +330,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
continue;
|
||||
}
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+
|
||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
@@ -335,7 +345,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
@@ -370,13 +380,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@@ -397,18 +407,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
};
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const char32_t cpt = _get_cpt(pos);
|
||||
const uint32_t cpt = _get_cpt(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
||||
uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
||||
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||
pos += _add_token(pos+2);
|
||||
continue;
|
||||
}
|
||||
if (pos+2 < offset_end) {
|
||||
char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
||||
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
||||
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||
@@ -418,8 +428,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
}
|
||||
}
|
||||
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
||||
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
|
||||
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
|
||||
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
||||
pos++;
|
||||
while (_get_flags(pos).is_letter) {
|
||||
@@ -445,12 +455,12 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
char32_t cpt2 = _get_cpt(pos);
|
||||
uint32_t cpt2 = _get_cpt(pos);
|
||||
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||
cpt2 = _get_cpt(++pos);
|
||||
}
|
||||
@@ -461,7 +471,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
size_t num_whitespaces = 0;
|
||||
size_t last_end_r_or_n = 0;
|
||||
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
||||
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||
}
|
||||
@@ -476,7 +486,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
@@ -620,6 +630,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
std::vector<uint32_t> result;
|
||||
result.reserve(utf8.size());
|
||||
size_t offset = 0;
|
||||
while (offset < utf8.size()) {
|
||||
result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
||||
@@ -652,7 +663,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
||||
return map.at(utf8);
|
||||
}
|
||||
|
||||
char32_t unicode_tolower(char32_t cp) {
|
||||
uint32_t unicode_tolower(uint32_t cp) {
|
||||
auto it = unicode_map_lowercase.find(cp);
|
||||
return it == unicode_map_lowercase.end() ? cp : it->second;
|
||||
}
|
||||
@@ -705,10 +716,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
continue;
|
||||
}
|
||||
|
||||
const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
|
||||
const auto flags = unicode_cpt_flags(cpts[i]);
|
||||
|
||||
if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
|
||||
text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
|
||||
if (flags.is_whitespace) {
|
||||
//NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
|
||||
//text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
|
||||
text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
|
||||
} else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
|
||||
text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
|
||||
} else {
|
||||
text_collapsed[i] = (char) 0xD0; // fallback
|
||||
}
|
||||
@@ -792,9 +807,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
|
||||
} else {
|
||||
// no unicode category used, we can use std::wregex directly
|
||||
const std::wstring wtext = unicode_wstring_from_utf8(text);
|
||||
const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
|
||||
|
||||
// std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
|
||||
std::wstring wtext(cpts.begin(), cpts.end());
|
||||
for (size_t i = 0; i < wtext.size(); ++i) {
|
||||
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
|
||||
wtext[i] = 0x0B;
|
||||
}
|
||||
}
|
||||
|
||||
//printf("text: %s\n", text.c_str());
|
||||
//printf("regex_expr: %s\n", regex_expr.c_str());
|
||||
bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
|
||||
|
||||
Reference in New Issue
Block a user