Update to b7609

inforithmics 2026-01-02 00:55:25 +01:00
parent dfe3d70636
commit 25b43f8bb0
36 changed files with 575 additions and 242 deletions

View File

@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
-FETCH_HEAD=be47fb9285779e900915bd8246eb9664110d4ba5
FETCH_HEAD=e86f3c22211d9b5c3842e2961a022aac9cdbacad
.PHONY: help
help:

llama/build-info.cpp (generated, vendored)
View File

@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "be47fb9285779e900915bd8246eb9664110d4ba5";
char const *LLAMA_COMMIT = "e86f3c22211d9b5c3842e2961a022aac9cdbacad";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

View File

@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED }, { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
{ "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
}; };
llm_chat_template llm_chat_template_from_str(const std::string & name) { llm_chat_template llm_chat_template_from_str(const std::string & name) {
@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2; return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) { } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED; return LLM_CHAT_TEMPLATE_PANGU_EMBED;
} else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
} }
return LLM_CHAT_TEMPLATE_UNKNOWN; return LLM_CHAT_TEMPLATE_UNKNOWN;
} }
@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "[unused9]助手:"; ss << "[unused9]助手:";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
}
if (add_ass) {
ss << "<|begin|>assistant";
}
} else { } else {
// template not supported // template not supported
return -1; return -1;
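Note (illustration added for clarity, not part of the upstream change): with the handler above, a small made-up chat renders as follows when add_ass is true.

```cpp
// Approximate output of llm_chat_apply_template() for LLM_CHAT_TEMPLATE_SOLAR_OPEN,
// chat = { {"user", "Hi"}, {"assistant", "Hello!"} }, add_ass = true:
//
//   <|begin|>user<|content|>Hi<|end|><|begin|>assistant<|content|>Hello!<|end|><|begin|>assistant
//
// Every message is wrapped as <|begin|>{role}<|content|>{content}<|end|>, and the
// assistant header is left open for generation when add_ass is set.
```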

View File

@ -54,6 +54,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2, LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED, LLM_CHAT_TEMPLATE_PANGU_EMBED,
LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN, LLM_CHAT_TEMPLATE_UNKNOWN,
}; };

View File

@ -126,6 +126,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B"; case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B"; case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B"; case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B"; case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B"; case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_235B_A22B: return "235B.A22B";
@ -1682,7 +1683,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
-ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@ -1778,6 +1779,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer) case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer) case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
@ -3335,7 +3337,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@ -4791,7 +4800,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output // output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; auto & layer = layers[i];
@ -4854,7 +4867,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output // output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; auto & layer = layers[i];
@ -5221,9 +5238,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags); layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags); layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
-layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
-layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
@ -7483,7 +7500,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
} break; } break;
case LLM_ARCH_MODERN_BERT: case LLM_ARCH_MODERN_BERT:
{ {
-llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
llm = std::make_unique<llm_build_modern_bert>(*this, params);
} break; } break;
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
{ {

View File

@ -120,6 +120,7 @@ enum llm_type {
LLM_TYPE_31B_A3_5B, LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B, LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2 LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B, LLM_TYPE_235B_A22B,

View File

@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
regex_exprs = {
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
@ -355,6 +361,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2: case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = { regex_exprs = {
// original regex from tokenizer.json // original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@ -1849,6 +1856,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") { tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "youtu") {
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
clean_spaces = false;
ignore_merges = true;
} else if ( } else if (
tokenizer_pre == "falcon") { tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON; pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@ -2004,6 +2016,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") { tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@ -2348,6 +2364,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|end|>" || t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony || t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony || t.first == "<|call|>" // o200k_harmony
|| t.first == "<|flush|>" // solar-open
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>" || t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>" || t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>" || t.first == "<|eom_id|>"
@ -2394,13 +2412,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
} }
-// TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
-// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
-// we remove the "<|end|>" token from the EOG list
// TODO: workaround for the o200k_harmony and solar-open tokenizers: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
// we remove the "<|end|>" token from the EOG list
{ {
bool has_return = false; bool has_return = false;
bool has_call = false; bool has_call = false;
bool has_end = false; bool has_end = false;
bool has_flush = false;
llama_token end_id = LLAMA_TOKEN_NULL; llama_token end_id = LLAMA_TOKEN_NULL;
@ -2410,18 +2429,20 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (id_to_token[tid].text == "<|return|>") { if (id_to_token[tid].text == "<|return|>") {
has_return = true; has_return = true;
-} else if (id_to_token[tid].text == "<|call|>") {
} else if (id_to_token[tid].text == "<|call|>" || id_to_token[tid].text == "<|calls|>") {
has_call = true; has_call = true;
} else if (id_to_token[tid].text == "<|flush|>") {
has_flush = true;
} else if (id_to_token[tid].text == "<|end|>") { } else if (id_to_token[tid].text == "<|end|>") {
has_end = true; has_end = true;
end_id = tid; end_id = tid;
} }
} }
-if (has_return && has_call && has_end) {
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
special_eog_ids.erase(end_id); special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED; id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
-LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
} }
} }
} }
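Note (summary added for clarity, not part of the diff): the demotion of '<|end|>' now covers two tokenizer families.

```cpp
// Condition sketch, using the flags set in the loop above:
//   o200k_harmony: has_return && has_call  && has_end  -> drop <|end|> from special_eog_ids
//   solar-open:    has_call   && has_flush && has_end  -> drop <|end|> from special_eog_ids
// In both cases <|end|> is re-tagged LLAMA_TOKEN_ATTR_USER_DEFINED, so it still
// tokenizes as a special token but no longer terminates generation.
```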

View File

@ -51,6 +51,8 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
}; };
struct LLM_KV; struct LLM_KV;

View File

@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il); LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) { } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur,
-model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
type_op, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else { } else {
cur = build_ffn(cur, cur = build_ffn(cur,

View File

@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b, model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used, n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm, LLM_FFN_SILU, hparams.expert_weights_norm,
-true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func, (llama_expert_gating_func_type) hparams.expert_gating_func,
il); il);
cb(moe_out, "ffn_moe_out", il); cb(moe_out, "ffn_moe_out", il);

View File

@ -332,7 +332,6 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params); llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
}; };
-template <bool iswa>
struct llm_build_modern_bert : public llm_graph_context { struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params); llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
}; };

View File

@ -1,7 +1,6 @@
#include "models.h" #include "models.h"
-template <bool iswa>
-llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -24,13 +23,7 @@ llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, co
auto * inp_attn = build_attn_inp_no_cache(); auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
-float freq_base_l = 0.0f;
-if constexpr (iswa) {
-    freq_base_l = model.get_rope_freq_base(cparams, il);
-} else {
-    freq_base_l = freq_base;
-}
float freq_base_l = model.get_rope_freq_base(cparams, il);
cur = inpL; cur = inpL;
@ -120,7 +113,3 @@ llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, co
res->t_embd = cur; res->t_embd = cur;
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
-// Explicit template instantiations
-template struct llm_build_modern_bert<false>;
-template struct llm_build_modern_bert<true>;

View File

@ -985,6 +985,11 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION }, { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK }, { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
{ "\\p{S}", unicode_cpt_flags::SYMBOL }, { "\\p{S}", unicode_cpt_flags::SYMBOL },
{ "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
{ "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
{ "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
{ "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
{ "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
}; };
static const std::map<int, int> k_ucat_cpt = { static const std::map<int, int> k_ucat_cpt = {
@ -1095,22 +1100,26 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue;
}
-if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
-    regex_expr[i + 1] == 'p' &&
-    regex_expr[i + 2] == '{' &&
-    regex_expr[i + 4] == '}') {
-    const std::string pat = regex_expr.substr(i, 5);
-    if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
-        if (!inside) {
-            regex_expr_collapsed += '[';
-        }
-        regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
-        regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
-        if (!inside) {
-            regex_expr_collapsed += ']';
-        }
-        i += 4;
-        continue;
-    }
-}
// Match \p{...} Unicode properties of varying lengths
if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
    regex_expr[i + 1] == 'p' &&
    regex_expr[i + 2] == '{') {
    // Find the closing brace
    size_t closing_brace = regex_expr.find('}', i + 3);
    if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
        const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
            if (!inside) {
                regex_expr_collapsed += '[';
            }
            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
            if (!inside) {
                regex_expr_collapsed += ']';
            }
            i = closing_brace;
            continue;
        }
    }
}
}
}
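Note (added for clarity, not part of the diff): the previous code only recognised the fixed five-character form \p{X}, so multi-letter category names never matched; the rewritten scan locates the closing brace instead, which the new two-letter entries above rely on.

```cpp
// Rough before/after for the "\\p{Lu}" fragment used by the YOUTU pre-tokenizer regex:
//   before: the pattern was assumed to be exactly 5 chars ("\p{L}"), so "\p{Lu}"
//           never hit k_ucat_enum and fell through to the generic handling
//   after:  the closing '}' is found with find(), so "\p{Lu}", "\p{Ll}", "\p{Lt}",
//           "\p{Lm}" and "\p{Lo}" all collapse to the LETTER codepoint class
//           (a coarse approximation: all five new entries map to unicode_cpt_flags::LETTER)
```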

View File

@ -45,13 +45,14 @@
#define KEY_SPATIAL_MERGE_SIZE     "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS    "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"
// audio-specific // audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities #define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@ -188,6 +189,7 @@ enum projector_type {
PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@ -218,6 +220,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
}; };
static projector_type clip_projector_type_from_string(const std::string & str) { static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@ -61,6 +61,7 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer; std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0; int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0; int32_t n_wa_pattern = 0;
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
// audio // audio
int32_t n_mel_bins = 0; // whisper preprocessor int32_t n_mel_bins = 0; // whisper preprocessor

View File

@ -859,6 +859,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
builder = std::make_unique<clip_graph_glm4v>(ctx, img); builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
} break;
default: default:
GGML_ABORT("missing cgraph builder"); GGML_ABORT("missing cgraph builder");
} }
@ -1176,6 +1180,20 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
} }
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
std::vector<int> wa_layer_indexes_vec;
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
for (auto & layer : wa_layer_indexes_vec) {
hparams.wa_layer_indexes.insert(layer);
}
// support up to max_height * max_width = 8000 * 8000: 8000/16/2 = 250 merged patches per side, i.e. at most 62500 image tokens
hparams.set_limit_image_tokens(1, 62500);
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
hparams.rope_theta = 10000.0f; hparams.rope_theta = 10000.0f;
@ -1244,7 +1262,14 @@ struct clip_model_loader {
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (!hparams.wa_layer_indexes.empty()) {
LOG_INF("%s: wa_layer_indexes: ", __func__);
for (auto & layer : hparams.wa_layer_indexes) {
LOG_INF("%d ", layer);
}
LOG_INF("\n");
}
if (hparams.image_min_pixels > 0) { if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : ""); LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
} }
@ -1512,6 +1537,14 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
{ {
model.projection = get_tensor(TN_MM_PROJECTOR); model.projection = get_tensor(TN_MM_PROJECTOR);
@ -2740,6 +2773,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res; // res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
} break; } break;
case PROJECTOR_TYPE_YOUTUVL:
{
const int patch_size = params.patch_size; // typically 16
const int merge_size = params.n_merge; // typically 2
const int align_size = patch_size * merge_size; // 32
const int max_num_patches = params.image_max_pixels > 0 ?
params.image_max_pixels / (patch_size * patch_size) : 256;
// Linear search for optimal scale to fit within max_num_patches
float scale = 1.0f;
int target_height = original_size.height;
int target_width = original_size.width;
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
float scaled_size = size * scale;
// Round up to nearest multiple of align_size
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
// Ensure at least one patch
return std::max(align_size, aligned);
};
// Linear search with 0.02 step size
while (scale > 0.0f) {
target_height = get_scaled_image_size(scale, original_size.height);
target_width = get_scaled_image_size(scale, original_size.width);
int num_patches_h = target_height / patch_size;
int num_patches_w = target_width / patch_size;
int num_patches = num_patches_h * num_patches_w;
if (num_patches > max_num_patches) {
scale -= 0.02f;
} else {
break;
}
}
clip_image_size new_size = {target_width, target_height};
// Resize the image
clip_image_u8 resized;
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
// Normalize to float32
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
// Add to results
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
{ {
@ -2972,6 +3056,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2; return (img->nx / params.patch_size) / 2;
default: default:
break; break;
@ -2987,6 +3072,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2; return (img->ny / params.patch_size) / 2;
default: default:
break; break;
@ -3047,6 +3133,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
{ {
// dynamic size (2 conv, so double patch size) // dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2); int x_patch = img->nx / (params.patch_size * 2);
@ -3174,7 +3261,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int pos_w = image_size_width / patch_size; const int pos_w = image_size_width / patch_size;
const int pos_h = image_size_height / patch_size; const int pos_h = image_size_height / patch_size;
-const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) { auto get_inp_tensor = [&gf](const char * name) {
ggml_tensor * inp = ggml_graph_get_tensor(gf, name); ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@ -3323,9 +3409,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions); set_input_i32("positions", positions);
} break; } break;
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_YOUTUVL:
{ {
// pw * ph = number of tokens output by the ViT after applying the patch merger
// ipw * iph = number of vision tokens processed inside the ViT
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2; const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio; const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio; const int ph = image_size_height / patch_size / merge_ratio;
@ -3336,7 +3424,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
std::vector<int> inv_idx(ph * pw); std::vector<int> inv_idx(ph * pw);
if (use_window_attn) { if (use_window_attn) {
-const int attn_window_size = 112;
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
const int grid_window = attn_window_size / patch_size / merge_ratio; const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0; int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor // [num_vision_tokens, num_vision_tokens] attention mask tensor
@ -3574,6 +3662,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0]; return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths // main path + deepstack paths
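Note (worked example added for clarity, not part of the diff; assumes patch_size = 16, merge_size = 2 and the fallback max_num_patches = 256): the scale search in the PROJECTOR_TYPE_YOUTUVL preprocessing case above shrinks the image in 0.02 steps until the aligned size fits the patch budget.

```cpp
// Hypothetical 1000 x 800 input, align_size = 16 * 2 = 32:
//   scale 1.00 -> 1024 x 800 -> (1024/16) * (800/16) = 64 * 50 = 3200 patches (too many)
//   ...
//   scale 0.30 ->  320 x 256 -> 20 * 16 = 320 patches (still > 256)
//   scale 0.28 ->  288 x 224 -> 18 * 14 = 252 patches (fits)  -> resize to 288 x 224
// After the 2x2 patch merger this image yields 252 / 4 = 63 image tokens.
```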

View File

@ -27,6 +27,11 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph { struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;

View File

@ -0,0 +1,179 @@
#include "models.h"
ggml_cgraph * clip_graph_youtuvl::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4;
const int m = 2;
const int Wp = n_patches_x;
const int Hp = n_patches_y;
const int Hm = Hp / m;
const int Wm = Wp / m;
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp = build_inp_raw();
// implement the conv3d patch embedding as a linear projection over flattened patches
// reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
{
inp = ggml_reshape_4d(
ctx0, inp,
Wm * m * patch_size, m * patch_size, Hm, 3);
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, Wm, m * patch_size, Hm);
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, patch_size, m, Hm * Wm);
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
inp = ggml_cont_4d(
ctx0, inp,
patch_size, 3, patch_size, Hm * Wm * m * m);
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
}
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
nullptr, nullptr,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
inpL = cur;
}
ggml_tensor * embeddings = inpL;
if (use_window_attn) {
const int spatial_merge_unit = 4;
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
cb(embeddings, "window_order_restored", -1);
}
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
if (model.post_ln_w) {
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// Now apply merger (VLPatchMerger):
// 1. Apply RMS norm (ln_q in VLPatchMerger)
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cb(embeddings, "merger_normed", -1);
// 2. First reshape for spatial merge (merge 2x2 patches)
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
cb(embeddings, "merger_reshaped", -1);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}
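Note (shape sketch added for clarity; the ViT width is an assumed example value, not taken from the diff): the merger operates on groups of four patch embeddings, so the reshape before the projection MLP trades sequence length for channel width.

```cpp
// With an assumed ViT width n_embd = 1152 and the 252-patch example above (n_pos = 252):
//   encoder output:              [1152, 252]
//   reshape for 2x2 merge:       [1152 * 4, 252 / 4] = [4608, 63]
//   mm_0 (merger.mlp.0) + GELU:  [4608, 63] -> [hidden, 63]
//   mm_1 (merger.mlp.2):         [hidden, 63] -> [n_embd of the language model, 63]
// Each output image token therefore corresponds to a 2x2 block of ViT patches.
```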

View File

@ -293,7 +293,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]"; img_end = "[IMG_END]";
-} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|> // <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>"; img_beg = "<|vision_start|>";
img_end = "<|vision_end|>"; img_end = "<|vision_end|>";

View File

@ -84,10 +84,10 @@ index ef23ec78d..581f26ed3 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 55e1c20c9..da2eb6760 100644 index 84eccea3f..b388e363e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context { @@ -573,6 +573,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx; delete ctx;
@ -95,7 +95,7 @@ index 55e1c20c9..da2eb6760 100644
} }
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -838,6 +839,7 @@ struct ggml_backend_cuda_split_buffer_context { @@ -828,6 +829,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
@ -103,7 +103,7 @@ index 55e1c20c9..da2eb6760 100644
} }
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1119,6 +1121,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) { @@ -1109,6 +1111,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context)); CUDA_CHECK(cudaFreeHost(buffer->context));

View File

@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index cd4092ca0..af2276960 100644 index bd311bea4..a0188d392 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1832,16 +1832,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) { if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false; add_space_prefix = false;
clean_spaces = true; clean_spaces = true;
@ -31,8 +31,8 @@ index cd4092ca0..af2276960 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -2016,7 +2007,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -2032,7 +2023,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false; clean_spaces = false;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

View File

@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+) 1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index fb08dd258..25dd02272 100644 index 9f551e8f3..4e26cda95 100644
--- a/tools/mtmd/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@ @@ -24,6 +24,19 @@
@ -33,7 +33,7 @@ index fb08dd258..25dd02272 100644
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
@@ -1691,7 +1704,29 @@ struct clip_model_loader { @@ -1724,7 +1737,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
@ -63,7 +63,7 @@ index fb08dd258..25dd02272 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1718,7 +1753,11 @@ struct clip_model_loader { @@ -1751,7 +1786,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }

View File

@ -173,10 +173,10 @@ index 5003b4fbf..243b296b5 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5e664c8c5..1762850ed 100644 index 0450db6c9..9591c3686 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -2048,6 +2048,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -2050,6 +2050,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
@ -198,7 +198,7 @@ index 5e664c8c5..1762850ed 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5568,6 +5583,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -5585,6 +5600,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -233,7 +233,7 @@ index 5e664c8c5..1762850ed 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -7738,6 +7781,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { @@ -7755,6 +7798,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params); llm = std::make_unique<llm_build_chameleon>(*this, params);
} break; } break;
@ -244,7 +244,7 @@ index 5e664c8c5..1762850ed 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -8006,6 +8053,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -8023,6 +8070,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
@ -253,7 +253,7 @@ index 5e664c8c5..1762850ed 100644
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3: case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index f4f44a92b..3b54c83aa 100644 index 79200a0d9..740cb7094 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -79,6 +79,7 @@ enum llm_type { @@ -79,6 +79,7 @@ enum llm_type {
@ -264,7 +264,7 @@ index f4f44a92b..3b54c83aa 100644
LLM_TYPE_26B, LLM_TYPE_26B,
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
@@ -409,6 +410,8 @@ struct llama_layer { @@ -410,6 +411,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr; struct ggml_tensor * ffn_act_eps = nullptr;
@ -274,10 +274,10 @@ index f4f44a92b..3b54c83aa 100644
struct llama_layer_convnext convnext; struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h diff --git a/src/models/models.h b/src/models/models.h
index e2cd4e484..89afb5f24 100644 index e78a788d4..30f04d956 100644
--- a/src/models/models.h --- a/src/models/models.h
+++ b/src/models/models.h +++ b/src/models/models.h
@@ -530,6 +530,11 @@ struct llm_build_smollm3 : public llm_graph_context { @@ -529,6 +529,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params); llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
}; };

View File

@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index af2276960..e05314272 100644 index a0188d392..67dac0f05 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@ -25,7 +25,7 @@ index af2276960..e05314272 100644
"\\s+$", "\\s+$",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp diff --git a/src/unicode.cpp b/src/unicode.cpp
index bb44edfad..13ced055f 100644 index b47dcbe61..6d1084f26 100644
--- a/src/unicode.cpp --- a/src/unicode.cpp
+++ b/src/unicode.cpp +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@ @@ -2,6 +2,11 @@

View File

@ -53,10 +53,10 @@ index b165d8bdc..f91d4faba 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index e05314272..325ef9843 100644 index 67dac0f05..eed381aab 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1788,9 +1788,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) { if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);

View File

@ -234,10 +234,10 @@ index 7697c292d..00d773dd3 100644
+ *dst = *src; + *dst = *src;
+} +}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index c4ceb4fc5..0e53ecc39 100644 index ee84303ef..178e82d76 100644
--- a/ggml/src/ggml-cuda/cpy.cu --- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu
@@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda( @@ -369,6 +369,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
@ -281,7 +281,7 @@ index c4ceb4fc5..0e53ecc39 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
@@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg @@ -495,6 +532,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float> ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }

View File

@ -22,7 +22,7 @@ index 393c329be..609209459 100644
size_t memory_total; size_t memory_total;
// device type // device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index da2eb6760..ff0624b78 100644 index b388e363e..3d2afac43 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) { @@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@ -77,7 +77,7 @@ index da2eb6760..ff0624b78 100644
static ggml_cuda_device_info ggml_cuda_init() { static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {}; ggml_cuda_device_info info = {};
@@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -245,22 +290,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10; info.devices[id].cc += prop.minor * 0x10;
} }
} }
@ -108,7 +108,7 @@ index da2eb6760..ff0624b78 100644
std::string device_name(prop.name); std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") { if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name }); turing_devices_without_mma.push_back({ id, device_name });
@@ -4120,6 +4167,7 @@ struct ggml_backend_cuda_device_context { @@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
std::string name; std::string name;
std::string description; std::string description;
std::string pci_bus_id; std::string pci_bus_id;
@ -116,7 +116,7 @@ index da2eb6760..ff0624b78 100644
}; };
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4208,6 +4256,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k @@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
} }
#endif // defined(__linux__) #endif // defined(__linux__)
@ -128,7 +128,7 @@ index da2eb6760..ff0624b78 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
@@ -4248,6 +4301,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev); props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev); props->description = ggml_backend_cuda_device_get_description(dev);
@ -136,7 +136,7 @@ index da2eb6760..ff0624b78 100644
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -4854,6 +4908,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4844,6 +4898,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;

View File

@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+) 2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index b0b5ab42a..1b7aa90af 100644 index fca55b76f..d28643fef 100644
--- a/tools/mtmd/mtmd.cpp --- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl { @@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {

View File

@ -178,10 +178,10 @@ index f4713a421..92ba577a5 100644
static const struct ggml_backend_i ggml_backend_cpu_i = { static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ff0624b78..17464770d 100644 index 3d2afac43..1e09cf1f0 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2901,7 +2901,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { @@ -2891,7 +2891,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH #ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility(ggml_cgraph * cgraph, static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
@ -190,7 +190,7 @@ index ff0624b78..17464770d 100644
// Loop over nodes in GGML graph to obtain info needed for CUDA graph // Loop over nodes in GGML graph to obtain info needed for CUDA graph
@@ -2934,24 +2934,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph, @@ -2924,24 +2924,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
#endif #endif
} }
@ -241,7 +241,7 @@ index ff0624b78..17464770d 100644
} }
if (!use_cuda_graph) { if (!use_cuda_graph) {
@@ -3752,7 +3762,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3742,7 +3752,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
} }
} }
@ -250,7 +250,7 @@ index ff0624b78..17464770d 100644
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device); ggml_cuda_set_device(cuda_ctx->device);
@@ -3790,7 +3800,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, @@ -3780,7 +3790,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) { if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
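A rough sketch, under stated assumptions, of the gating pattern these hunks adjust: the CUDA-graph path is only used when every node in the compute graph passes a compatibility check, otherwise the backend falls back to normal stream execution. The struct and function below are simplified stand-ins for the real ggml-cuda code, not its actual API.

#include <cstdio>
#include <vector>

struct fake_node { bool graph_safe; };   // hypothetical stand-in for a ggml graph node

// One incompatible op is enough to disable CUDA-graph capture for the whole graph.
static bool check_graph_compatibility(const std::vector<fake_node> & nodes) {
    for (const fake_node & n : nodes) {
        if (!n.graph_safe) {
            return false;
        }
    }
    return true;
}

int main() {
    std::vector<fake_node> cgraph = {{true}, {true}, {false}};
    const bool use_cuda_graph = check_graph_compatibility(cgraph);
    printf("use_cuda_graph = %s\n", use_cuda_graph ? "yes" : "no");
    return 0;
}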

View File

@ -323,10 +323,10 @@ index 62e618850..dac9cfcdf 100644
struct ggml_cuda_mm_fusion_args_host { struct ggml_cuda_mm_fusion_args_host {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 17464770d..d73cb0e47 100644 index 1e09cf1f0..c0f42012d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -365,6 +365,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -355,6 +355,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
// #define DEBUG_CUDA_MALLOC // #define DEBUG_CUDA_MALLOC
@ -335,7 +335,7 @@ index 17464770d..d73cb0e47 100644
// buffer pool for cuda (legacy) // buffer pool for cuda (legacy)
struct ggml_cuda_pool_leg : public ggml_cuda_pool { struct ggml_cuda_pool_leg : public ggml_cuda_pool {
static const int MAX_BUFFERS = 256; static const int MAX_BUFFERS = 256;
@@ -377,9 +379,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { @@ -367,9 +369,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {}; ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
size_t pool_size = 0; size_t pool_size = 0;
@ -350,7 +350,7 @@ index 17464770d..d73cb0e47 100644
} }
~ggml_cuda_pool_leg() { ~ggml_cuda_pool_leg() {
@@ -387,7 +392,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { @@ -377,7 +382,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
for (int i = 0; i < MAX_BUFFERS; ++i) { for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cuda_buffer & b = buffer_pool[i]; ggml_cuda_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) { if (b.ptr != nullptr) {
@ -361,7 +361,7 @@ index 17464770d..d73cb0e47 100644
pool_size -= b.size; pool_size -= b.size;
} }
} }
@@ -435,8 +442,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { @@ -425,8 +432,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
void * ptr; void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size); size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256); look_ahead_size = 256 * ((look_ahead_size + 255)/256);
@ -379,7 +379,7 @@ index 17464770d..d73cb0e47 100644
*actual_size = look_ahead_size; *actual_size = look_ahead_size;
pool_size += look_ahead_size; pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC #ifdef DEBUG_CUDA_MALLOC
@@ -456,10 +470,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { @@ -446,10 +460,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
} }
} }
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
@ -402,7 +402,7 @@ index 17464770d..d73cb0e47 100644
}; };
// pool with virtual memory // pool with virtual memory
@@ -471,18 +495,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { @@ -461,18 +485,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
CUdeviceptr pool_addr = 0; CUdeviceptr pool_addr = 0;
size_t pool_used = 0; size_t pool_used = 0;
size_t pool_size = 0; size_t pool_size = 0;
@ -430,7 +430,7 @@ index 17464770d..d73cb0e47 100644
#if defined(GGML_USE_HIP) #if defined(GGML_USE_HIP)
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285 // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) { for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
@@ -509,35 +539,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { @@ -499,35 +529,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
@ -506,7 +506,7 @@ index 17464770d..d73cb0e47 100644
// add to the pool // add to the pool
pool_size += reserve_size; pool_size += reserve_size;
@@ -570,17 +614,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { @@ -560,17 +604,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
// all deallocations must be in reverse order of the allocations // all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used)); GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
} }
@ -537,7 +537,7 @@ index 17464770d..d73cb0e47 100644
} }
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
@@ -764,11 +818,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac @@ -754,11 +808,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
} }
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@ -559,7 +559,7 @@ index 17464770d..d73cb0e47 100644
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
size_t size = ggml_nbytes(tensor); size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0]; int64_t ne0 = tensor->ne[0];
@@ -792,6 +855,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface @@ -782,6 +845,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .is_host = */ NULL, /* .is_host = */ NULL,
@ -567,7 +567,7 @@ index 17464770d..d73cb0e47 100644
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -3284,6 +3348,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, @@ -3274,6 +3338,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@ -575,7 +575,7 @@ index 17464770d..d73cb0e47 100644
// flag used to determine whether it is an integrated_gpu // flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -3420,6 +3485,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3410,6 +3475,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue; continue;
} }
@ -586,7 +586,7 @@ index 17464770d..d73cb0e47 100644
// start of fusion operations // start of fusion operations
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
@@ -3764,6 +3833,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3754,6 +3823,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@ -594,7 +594,7 @@ index 17464770d..d73cb0e47 100644
ggml_cuda_set_device(cuda_ctx->device); ggml_cuda_set_device(cuda_ctx->device);
@@ -3839,6 +3909,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, @@ -3829,6 +3899,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
} }
@ -672,7 +672,7 @@ index 17464770d..d73cb0e47 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -4107,6 +4248,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { @@ -4097,6 +4238,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record, /* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait, /* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize, /* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
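For reference, a small self-contained sketch of the look-ahead sizing visible in the legacy-pool hunk above: allocation requests are padded by roughly 5% and rounded up to a 256-byte multiple, so slightly larger follow-up requests can reuse the same pooled buffer. Host-only C++, restating the arithmetic shown in the diff.

#include <cstddef>
#include <cstdio>

// Mirrors the arithmetic in the hunk:
//   look_ahead_size = (size_t)(1.05 * size);
//   look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
static size_t look_ahead(size_t size) {
    size_t s = (size_t) (1.05 * size);
    return 256 * ((s + 255) / 256);
}

int main() {
    printf("%zu -> %zu\n", (size_t) 1000, look_ahead(1000));   // 1000 -> 1280
    printf("%zu -> %zu\n", (size_t) 4096, look_ahead(4096));   // 4096 -> 4352
    return 0;
}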

View File

@ -62,7 +62,7 @@ index 4cf0ec913..4e83f6431 100644
GGML_ASSERT(device); GGML_ASSERT(device);
return device->iface.get_buffer_type(device); return device->iface.get_buffer_type(device);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d73cb0e47..547d9d366 100644 index c0f42012d..03cbdec8f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -113,6 +113,11 @@ int ggml_cuda_get_device() { @@ -113,6 +113,11 @@ int ggml_cuda_get_device() {
@ -77,7 +77,7 @@ index d73cb0e47..547d9d366 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
cudaError_t err; cudaError_t err;
@@ -4458,7 +4463,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -4448,7 +4453,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev); props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
@ -89,7 +89,7 @@ index d73cb0e47..547d9d366 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
@@ -4918,6 +4926,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g @@ -4908,6 +4916,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
} }
@ -101,7 +101,7 @@ index d73cb0e47..547d9d366 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = { static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name, /* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description, /* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -4934,6 +4947,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = { @@ -4924,6 +4937,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new, /* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free, /* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,

View File

@ -48,10 +48,10 @@ index 5a1403c4b..f0f734a6c 100644
set_target_properties(ggml-base PROPERTIES set_target_properties(ggml-base PROPERTIES
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 547d9d366..d7cf48691 100644 index 03cbdec8f..eb383bba7 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -267,6 +267,16 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -257,6 +257,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
@ -68,7 +68,7 @@ index 547d9d366..d7cf48691 100644
#if defined(GGML_USE_VMM) #if defined(GGML_USE_VMM)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
@@ -320,6 +330,11 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -310,6 +320,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
@ -80,7 +80,7 @@ index 547d9d366..d7cf48691 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str()); ggml_cuda_parse_uuid(prop, id).c_str());
@@ -4327,6 +4342,11 @@ struct ggml_backend_cuda_device_context { @@ -4317,6 +4332,11 @@ struct ggml_backend_cuda_device_context {
std::string description; std::string description;
std::string pci_bus_id; std::string pci_bus_id;
std::string id; std::string id;
@ -92,7 +92,7 @@ index 547d9d366..d7cf48691 100644
}; };
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4423,6 +4443,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { @@ -4413,6 +4433,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
@ -121,7 +121,7 @@ index 547d9d366..d7cf48691 100644
CUDA_CHECK(cudaMemGetInfo(free, total)); CUDA_CHECK(cudaMemGetInfo(free, total));
// ref: https://github.com/ggml-org/llama.cpp/pull/17368 // ref: https://github.com/ggml-org/llama.cpp/pull/17368
@@ -4455,6 +4497,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend @@ -4445,6 +4487,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
@ -129,7 +129,7 @@ index 547d9d366..d7cf48691 100644
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
@@ -4468,6 +4511,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -4458,6 +4501,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly. // If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0; props->memory_total = props->memory_free = 0;
@ -149,7 +149,7 @@ index 547d9d366..d7cf48691 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
bool events = false; bool events = false;
@@ -5067,6 +5123,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -5057,6 +5113,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
if (!initialized) { if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@ -157,7 +157,7 @@ index 547d9d366..d7cf48691 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) { for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -5082,6 +5139,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -5072,6 +5129,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id; dev_ctx->pci_bus_id = pci_bus_id;

View File

@ -10,10 +10,10 @@ fallback to cpu
1 file changed, 3 insertions(+) 1 file changed, 3 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d7cf48691..890be973c 100644 index eb383bba7..6a9d2746c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4643,6 +4643,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g @@ -4633,6 +4633,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false; return false;
} }
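A hedged sketch of the supports_op pattern this three-line hunk extends: when the backend reports that it cannot handle a particular operand-type combination, the scheduler routes that op elsewhere (ultimately the CPU backend) instead of failing at runtime. The enum and struct below are simplified stand-ins, not ggml's real types.

#include <cstdio>

enum fake_type { TYPE_F32, TYPE_F16 };   // hypothetical stand-in for ggml_type

struct fake_tensor { fake_type type; };

// Returns false for the operand combination the hunk rejects, forcing a fallback.
static bool device_supports_mul_mat(const fake_tensor & a, const fake_tensor & b) {
    if (b.type == TYPE_F16 && a.type != TYPE_F16) {
        return false;   // mirrors the added check: mixed F16/non-F16 is unsupported here
    }
    return true;
}

int main() {
    fake_tensor a{TYPE_F32}, b{TYPE_F16};
    printf("supported: %s\n", device_supports_mul_mat(a, b) ? "yes" : "no (fallback to CPU)");
    return 0;
}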

View File

@ -9,10 +9,10 @@ Revert to prior logic of assuming an empty projector type is mlp
1 file changed, 4 insertions(+) 1 file changed, 4 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 25dd02272..403e17625 100644 index 4e26cda95..ab61c6ea1 100644
--- a/tools/mtmd/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -965,6 +965,10 @@ struct clip_model_loader { @@ -969,6 +969,10 @@ struct clip_model_loader {
if (proj_type.empty()) { if (proj_type.empty()) {
if (modality == CLIP_MODALITY_VISION) { if (modality == CLIP_MODALITY_VISION) {
get_string(KEY_VISION_PROJ_TYPE, proj_type, false); get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
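A minimal sketch of the behaviour described in the commit message: when the model metadata leaves the projector type empty, treat it as "mlp" rather than rejecting the model. The helper below only restates that intent; the real logic lives in clip_model_loader and its projector-type handling.

#include <cstdio>
#include <string>

// Hypothetical helper: default an empty vision projector type to "mlp".
static std::string resolve_projector_type(std::string proj_type, bool is_vision) {
    if (proj_type.empty() && is_vision) {
        proj_type = "mlp";   // prior/default behaviour restored by the patch
    }
    return proj_type;
}

int main() {
    printf("resolved: %s\n", resolve_projector_type("", true).c_str());
    return 0;
}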

View File

@ -12,11 +12,11 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
template <cpy_kernel_t cpy_1> template <cpy_kernel_t cpy_1>
static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne, static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
const int nb12, const int nb13) { const int64_t nb12, const int64_t nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) { if (i >= ne) {
return; return;
@ -40,10 +40,10 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
} }
template <typename T> template <typename T>
static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne, static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
const int nb12, const int nb13) { const int64_t nb12, const int64_t nb13) {
const T* src = reinterpret_cast<const T*>(cx); const T* src = reinterpret_cast<const T*>(cx);
T* dst = reinterpret_cast<T*>(cdst); T* dst = reinterpret_cast<T*>(cdst);
@ -117,60 +117,60 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
} }
template <cpy_kernel_t cpy_blck, int qk> template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
const int nb12, const int nb13) { const int64_t nb12, const int64_t nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) { if (i >= ne) {
return; return;
} }
const int i03 = i/(ne00 * ne01 * ne02); const int64_t i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int i13 = i/(ne10 * ne11 * ne12); const int64_t i13 = i/(ne10 * ne11 * ne12);
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
cpy_blck(cx + x_offset, cdst + dst_offset); cpy_blck(cx + x_offset, cdst + dst_offset);
} }
template <cpy_kernel_t cpy_blck, int qk> template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne, static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
const int nb12, const int nb13) { const int64_t nb12, const int64_t nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) { if (i >= ne) {
return; return;
} }
const int i03 = i/(ne00 * ne01 * ne02); const int64_t i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int i13 = i/(ne10 * ne11 * ne12); const int64_t i13 = i/(ne10 * ne11 * ne12);
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
cpy_blck(cx + x_offset, cdst + dst_offset); cpy_blck(cx + x_offset, cdst + dst_offset);
} }
template<typename src_t, typename dst_t> template<typename src_t, typename dst_t>
static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) { static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) { if (i >= ne) {
return; return;
@ -188,19 +188,20 @@ static void ggml_cpy_scalar_contiguous_cuda(
cudaStream_t stream) { cudaStream_t stream) {
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne); (cx, cdst, ne);
} }
template<typename src_t, typename dst_t, bool transposed = false> template<typename src_t, typename dst_t, bool transposed = false>
static void ggml_cpy_scalar_cuda( static void ggml_cpy_scalar_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
if (transposed) { if (transposed) {
GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
int ne00n, ne01n, ne02n; int64_t ne00n, ne01n, ne02n;
if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
ne00n = ne00; ne00n = ne00;
ne01n = ne01; ne01n = ne01;
@ -211,143 +212,159 @@ static void ggml_cpy_scalar_cuda(
ne02n = 1; ne02n = 1;
} }
dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
GGML_ASSERT(grid_x < UINT_MAX);
GGML_ASSERT(grid_y < USHRT_MAX);
GGML_ASSERT(grid_z < USHRT_MAX);
dim3 dimGrid(grid_x, grid_y, grid_z);
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>> cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} else { } else {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
} }
static void ggml_cpy_f32_q8_0_cuda( static void ggml_cpy_f32_q8_0_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK8_0 == 0); GGML_ASSERT(ne % QK8_0 == 0);
const int num_blocks = ne / QK8_0; const int64_t num_blocks = ne / QK8_0;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>> cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_q8_0_f32_cuda( static void ggml_cpy_q8_0_f32_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
const int num_blocks = ne; const int64_t num_blocks = ne;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>> cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_f32_q4_0_cuda( static void ggml_cpy_f32_q4_0_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_0 == 0); GGML_ASSERT(ne % QK4_0 == 0);
const int num_blocks = ne / QK4_0; const int64_t num_blocks = ne / QK4_0;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>> cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_q4_0_f32_cuda( static void ggml_cpy_q4_0_f32_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int nb00, const int nb01, const int nb02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
const int nb10, const int nb11, const int nb12, const int nb13, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) { cudaStream_t stream) {
const int num_blocks = ne; const int64_t num_blocks = ne;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>( cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13); ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_f32_q4_1_cuda( static void ggml_cpy_f32_q4_1_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_1 == 0); GGML_ASSERT(ne % QK4_1 == 0);
const int num_blocks = ne / QK4_1; const int64_t num_blocks = ne / QK4_1;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>> cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_q4_1_f32_cuda( static void ggml_cpy_q4_1_f32_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int nb00, const int nb01, const int nb02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
const int nb10, const int nb11, const int nb12, const int nb13, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) { cudaStream_t stream) {
const int num_blocks = ne; const int64_t num_blocks = ne;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>( cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13); ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_f32_q5_0_cuda( static void ggml_cpy_f32_q5_0_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_0 == 0); GGML_ASSERT(ne % QK5_0 == 0);
const int num_blocks = ne / QK5_0; const int64_t num_blocks = ne / QK5_0;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>> cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_q5_0_f32_cuda( static void ggml_cpy_q5_0_f32_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int nb00, const int nb01, const int nb02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
const int nb10, const int nb11, const int nb12, const int nb13, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) { cudaStream_t stream) {
const int num_blocks = ne; const int64_t num_blocks = ne;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>( cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13); ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_f32_q5_1_cuda( static void ggml_cpy_f32_q5_1_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_1 == 0); GGML_ASSERT(ne % QK5_1 == 0);
const int num_blocks = ne / QK5_1; const int64_t num_blocks = ne / QK5_1;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>> cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_q5_1_f32_cuda( static void ggml_cpy_q5_1_f32_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int nb00, const int nb01, const int nb02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
const int nb10, const int nb11, const int nb12, const int nb13, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) { cudaStream_t stream) {
const int num_blocks = ne; const int64_t num_blocks = ne;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>( cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13); ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
static void ggml_cpy_f32_iq4_nl_cuda( static void ggml_cpy_f32_iq4_nl_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int64_t ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_NL == 0); GGML_ASSERT(ne % QK4_NL == 0);
const int num_blocks = ne / QK4_NL; const int64_t num_blocks = ne / QK4_NL;
GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>> cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
@ -393,9 +410,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2]; const int64_t ne02 = src0->ne[2];
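Taken together, this patch moves the copy kernels' index math from int to int64_t and adds GGML_ASSERT guards on the launch-grid sizes. A short standalone illustration of why, as host-only C++ with an assumed block size, not the kernels themselves:

#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne = (int64_t) 3 * 1024 * 1024 * 1024;   // more elements than INT_MAX

    // A flat element index computed in 32 bits wraps around for such tensors;
    // the 64-bit computation stays exact.
    const int64_t last_index_64 = ne - 1;
    const int     last_index_32 = (int) (ne - 1);           // truncated / overflowed
    printf("64-bit last index: %lld, 32-bit last index: %d\n",
           (long long) last_index_64, last_index_32);

    // Launch-grid sizing as in the patched host wrappers: the block count is
    // computed in 64 bits and then checked against the 32-bit grid limit,
    // mirroring the GGML_ASSERT(num_blocks < UINT_MAX) guards added above.
    const int64_t CUDA_CPY_BLOCK_SIZE = 64;                  // assumed block size
    const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    assert(num_blocks < UINT_MAX && "grid.x must fit in 32 bits");
    printf("num_blocks = %lld\n", (long long) num_blocks);
    return 0;
}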

View File

@ -251,16 +251,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES); GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0; int64_t total_vram = 0;
#ifdef GGML_CUDA_FORCE_MMQ
GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else
GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif // GGML_CUDA_FORCE_MMQ
#ifdef GGML_CUDA_FORCE_CUBLAS
GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
#else
GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
#endif // GGML_CUDA_FORCE_CUBLAS
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
std::vector<std::pair<int, std::string>> turing_devices_without_mma; std::vector<std::pair<int, std::string>> turing_devices_without_mma;